aboutsummaryrefslogtreecommitdiff
path: root/files/source
diff options
context:
space:
mode:
Diffstat (limited to 'files/source')
-rw-r--r--files/source/compare.cc14
-rw-r--r--files/source/compare_common.cc30
-rw-r--r--files/source/compare_gcc.cc347
-rw-r--r--files/source/compare_neon.cc70
-rw-r--r--files/source/compare_neon64.cc68
-rw-r--r--files/source/compare_win.cc14
-rw-r--r--files/source/convert.cc1863
-rw-r--r--files/source/convert_argb.cc5572
-rw-r--r--files/source/convert_from.cc1003
-rw-r--r--files/source/convert_from_argb.cc1125
-rw-r--r--files/source/convert_jpeg.cc134
-rw-r--r--files/source/convert_to_argb.cc89
-rw-r--r--files/source/convert_to_i420.cc29
-rw-r--r--files/source/cpu_id.cc82
-rw-r--r--files/source/mjpeg_decoder.cc3
-rw-r--r--files/source/planar_functions.cc2375
-rw-r--r--files/source/rotate.cc429
-rw-r--r--files/source/rotate_any.cc12
-rw-r--r--files/source/rotate_argb.cc97
-rw-r--r--files/source/rotate_dspr2.cc475
-rw-r--r--files/source/rotate_gcc.cc543
-rw-r--r--files/source/rotate_lsx.cc243
-rw-r--r--files/source/rotate_neon.cc218
-rw-r--r--files/source/rotate_neon64.cc281
-rw-r--r--files/source/rotate_win.cc5
-rw-r--r--files/source/row_any.cc1170
-rw-r--r--files/source/row_common.cc2245
-rw-r--r--files/source/row_dspr2.cc1721
-rw-r--r--files/source/row_gcc.cc9345
-rw-r--r--files/source/row_lasx.cc2230
-rw-r--r--files/source/row_lsx.cc1829
-rw-r--r--files/source/row_mmi.cc2450
-rw-r--r--files/source/row_msa.cc1507
-rw-r--r--files/source/row_neon.cc3755
-rw-r--r--files/source/row_neon64.cc4484
-rw-r--r--files/source/row_win.cc685
-rw-r--r--files/source/scale.cc852
-rw-r--r--files/source/scale_any.cc653
-rw-r--r--files/source/scale_argb.cc194
-rw-r--r--files/source/scale_common.cc555
-rw-r--r--files/source/scale_dspr2.cc668
-rw-r--r--files/source/scale_gcc.cc3104
-rw-r--r--files/source/scale_lsx.cc739
-rw-r--r--files/source/scale_mmi.cc55
-rw-r--r--files/source/scale_neon.cc1214
-rw-r--r--files/source/scale_neon64.cc1410
-rw-r--r--files/source/scale_rgb.cc66
-rw-r--r--files/source/scale_uv.cc1161
-rw-r--r--files/source/scale_win.cc5
-rwxr-xr-xfiles/source/test.sh35
50 files changed, 40530 insertions, 16723 deletions
diff --git a/files/source/compare.cc b/files/source/compare.cc
index 5aa3a4db..d4713b60 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
- if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
- if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
@@ -154,11 +154,6 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
HammingDistance = HammingDistance_MSA;
}
#endif
-#if defined(HAS_HAMMINGDISTANCE_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- HammingDistance = HammingDistance_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
@@ -216,11 +211,6 @@ uint64_t ComputeSumSquareError(const uint8_t* src_a,
SumSquareError = SumSquareError_MSA;
}
#endif
-#if defined(HAS_SUMSQUAREERROR_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SumSquareError = SumSquareError_MMI;
- }
-#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc
index d4b170ad..d1cab8d2 100644
--- a/files/source/compare_common.cc
+++ b/files/source/compare_common.cc
@@ -17,36 +17,6 @@ namespace libyuv {
extern "C" {
#endif
-#if ORIGINAL_OPT
-uint32_t HammingDistance_C1(const uint8_t* src_a,
- const uint8_t* src_b,
- int count) {
- uint32_t diff = 0u;
-
- int i;
- for (i = 0; i < count; ++i) {
- int x = src_a[i] ^ src_b[i];
- if (x & 1)
- ++diff;
- if (x & 2)
- ++diff;
- if (x & 4)
- ++diff;
- if (x & 8)
- ++diff;
- if (x & 16)
- ++diff;
- if (x & 32)
- ++diff;
- if (x & 64)
- ++diff;
- if (x & 128)
- ++diff;
- }
- return diff;
-}
-#endif
-
// Hakmem method for hamming distance.
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
index 676527c1..b834b42a 100644
--- a/files/source/compare_gcc.cc
+++ b/files/source/compare_gcc.cc
@@ -19,8 +19,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if defined(__x86_64__)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
@@ -29,38 +28,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
- "xor %3,%3 \n"
- "xor %%r8,%%r8 \n"
- "xor %%r9,%%r9 \n"
- "xor %%r10,%%r10 \n"
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%rcx \n"
- "mov 0x8(%0),%%rdx \n"
- "xor (%1),%%rcx \n"
- "xor 0x8(%1),%%rdx \n"
- "popcnt %%rcx,%%rcx \n"
- "popcnt %%rdx,%%rdx \n"
- "mov 0x10(%0),%%rsi \n"
- "mov 0x18(%0),%%rdi \n"
- "xor 0x10(%1),%%rsi \n"
- "xor 0x18(%1),%%rdi \n"
- "popcnt %%rsi,%%rsi \n"
- "popcnt %%rdi,%%rdi \n"
- "add $0x20,%0 \n"
- "add $0x20,%1 \n"
- "add %%rcx,%3 \n"
- "add %%rdx,%%r8 \n"
- "add %%rsi,%%r9 \n"
- "add %%rdi,%%r10 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "add %%r8, %3 \n"
- "add %%r9, %3 \n"
- "add %%r10, %3 \n"
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +79,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
- "mov (%0),%%ecx \n"
- "mov 0x4(%0),%%edx \n"
- "xor (%1),%%ecx \n"
- "xor 0x4(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "mov 0x8(%0),%%ecx \n"
- "mov 0xc(%0),%%edx \n"
- "xor 0x8(%1),%%ecx \n"
- "xor 0xc(%1),%%edx \n"
- "popcnt %%ecx,%%ecx \n"
- "add %%ecx,%3 \n"
- "popcnt %%edx,%%edx \n"
- "add %%edx,%3 \n"
- "add $0x10,%0 \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +120,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
- "movdqa %4,%%xmm2 \n"
- "movdqa %5,%%xmm3 \n"
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub %0,%1 \n"
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa 0x10(%0), %%xmm5 \n"
- "pxor (%0,%1), %%xmm4 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pand %%xmm2,%%xmm6 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm6,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "paddb %%xmm7,%%xmm6 \n"
- "pxor 0x10(%0,%1),%%xmm5 \n"
- "add $0x20,%0 \n"
- "movdqa %%xmm5,%%xmm4 \n"
- "pand %%xmm2,%%xmm5 \n"
- "psrlw $0x4,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "pshufb %%xmm5,%%xmm7 \n"
- "pand %%xmm2,%%xmm4 \n"
- "movdqa %%xmm3,%%xmm5 \n"
- "pshufb %%xmm4,%%xmm5 \n"
- "paddb %%xmm7,%%xmm5 \n"
- "paddb %%xmm5,%%xmm6 \n"
- "psadbw %%xmm1,%%xmm6 \n"
- "paddd %%xmm6,%%xmm0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
- "pshufd $0xaa,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0, %3 \n"
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +181,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
- "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
- "sub %0,%1 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqa (%0),%%ymm4 \n"
- "vmovdqa 0x20(%0), %%ymm5 \n"
- "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
- "vpand %%ymm2,%%ymm4,%%ymm6 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
- "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
- "add $0x40,%0 \n"
- "vpand %%ymm2,%%ymm4,%%ymm5 \n"
- "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
- "vpand %%ymm2,%%ymm4,%%ymm4 \n"
- "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
- "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
- "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
- "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
- "vpermq $0xb1,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xaa,%%ymm0,%%ymm1 \n"
- "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovd %%xmm0, %3 \n"
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +233,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +300,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
index 2a2181e0..afdd6012 100644
--- a/files/source/compare_neon.cc
+++ b/files/source/compare_neon.cc
@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
- "vmov.u16 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
- "vld1.8 {q0, q1}, [%0]! \n"
- "vld1.8 {q2, q3}, [%1]! \n"
- "veor.32 q0, q0, q2 \n"
- "veor.32 q1, q1, q3 \n"
- "vcnt.i8 q0, q0 \n"
- "vcnt.i8 q1, q1 \n"
- "subs %2, %2, #32 \n"
- "vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpadal.u8 q4, q0 \n" // 8 shorts
- "bgt 1b \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
- "vpaddl.u16 q0, q4 \n" // 4 ints
- "vpadd.u32 d0, d0, d1 \n"
- "vpadd.u32 d0, d0, d0 \n"
- "vmov.32 %3, d0[0] \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
"1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
index 6e8f672a..70fb9b91 100644
--- a/files/source/compare_neon64.cc
+++ b/files/source/compare_neon64.cc
@@ -27,22 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
- "movi v4.8h, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
- "eor v0.16b, v0.16b, v2.16b \n"
- "eor v1.16b, v1.16b, v3.16b \n"
- "cnt v0.16b, v0.16b \n"
- "cnt v1.16b, v1.16b \n"
- "subs %w2, %w2, #32 \n"
- "add v0.16b, v0.16b, v1.16b \n"
- "uadalp v4.8h, v0.16b \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "uaddlv s4, v4.8h \n"
- "fmov %w3, s4 \n"
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -54,28 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
index d57d3d9d..9bb27f1d 100644
--- a/files/source/compare_win.cc
+++ b/files/source/compare_win.cc
@@ -22,8 +22,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
@@ -77,8 +78,7 @@ __declspec(naked) uint32_t
}
}
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_SUMSQUAREERROR_AVX2
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
@@ -118,7 +118,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_SUMSQUAREERROR_AVX2
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
uvec32 kHashMul0 = {
@@ -196,7 +196,7 @@ __declspec(naked) uint32_t
}
// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#ifdef HAS_HASHDJB2_AVX2
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
@@ -231,7 +231,7 @@ __declspec(naked) uint32_t
ret
}
}
-#endif // _MSC_VER >= 1700
+#endif // HAS_HASHDJB2_AVX2
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
diff --git a/files/source/convert.cc b/files/source/convert.cc
index 614fa482..7178580f 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -15,7 +15,9 @@
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/row.h"
-#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/scale_row.h" // For FixedDiv
+#include "libyuv/scale_uv.h" // For UVScale()
#ifdef __cplusplus
namespace libyuv {
@@ -48,7 +50,7 @@ static int I4xxToI420(const uint8_t* src_y,
const int dst_y_height = Abs(src_y_height);
const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
- if (src_uv_width == 0 || src_uv_height == 0) {
+ if (src_uv_width <= 0 || src_uv_height == 0) {
return -1;
}
if (dst_y) {
@@ -82,7 +84,8 @@ int I420Copy(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -124,7 +127,8 @@ int I010Copy(const uint16_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -148,6 +152,53 @@ int I010Copy(const uint16_t* src_y,
return 0;
}
+static int Planar16bitTo8bit(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ int scale = 1 << (24 - depth);
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ uv_height = -uv_height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (uv_height - 1) * src_stride_u;
+ src_v = src_v + (uv_height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ // Convert UV planes.
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width,
+ uv_height);
+ Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width,
+ uv_height);
+ return 0;
+}
+
// Convert 10 bit YUV to 8 bit.
LIBYUV_API
int I010ToI420(const uint16_t* src_y,
@@ -164,34 +215,344 @@ int I010ToI420(const uint16_t* src_y,
int dst_stride_v,
int width,
int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 10);
+}
+
+LIBYUV_API
+int I210ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ const int depth = 10;
+ const int scale = 1 << (24 - depth);
+
+ if (width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
- halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- // Convert Y plane.
- Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
- height);
- // Convert UV planes.
- Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
- halfheight);
- Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
- halfheight);
+ {
+ const int uv_width = SUBSAMPLE(width, 1, 1);
+ const int uv_height = SUBSAMPLE(height, 1, 1);
+ const int dy = FixedDiv(height, uv_height);
+
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
+ height);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
+ dst_stride_u, src_u, dst_u, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
+ dst_stride_v, src_v, dst_v, 0, 32768, dy,
+ /*bpp=*/1, scale, kFilterBilinear);
+ }
return 0;
}
+LIBYUV_API
+int I210ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 10);
+}
+
+LIBYUV_API
+int I410ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 10);
+}
+
+LIBYUV_API
+int I012ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 1, 12);
+}
+
+LIBYUV_API
+int I212ToI422(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 1,
+ 0, 12);
+}
+
+LIBYUV_API
+int I412ToI444(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, width, height, 0,
+ 0, 12);
+}
+
+// Any Ix10 To I010 format with mirroring.
+static int Ix10ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y) {
+ const int dst_y_width = Abs(width);
+ const int dst_y_height = Abs(height);
+ const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+ if (dst_y) {
+ ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ dst_y_width, dst_y_height, kFilterBilinear);
+ }
+ ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+ dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+ ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+ dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
+ return 0;
+}
+
+LIBYUV_API
+int I410ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 0, 0);
+}
+
+LIBYUV_API
+int I210ToI010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, width, height, 1, 0);
+}
+
+// Any I[420]1[02] to P[420]1[02] format with mirroring.
+static int IxxxToPxxx(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int subsample_x,
+ int subsample_y,
+ int depth) {
+ const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
+ const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ depth);
+ MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, uv_width, uv_height, depth);
+ return 0;
+}
+
+LIBYUV_API
+int I010ToP010(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 10);
+}
+
+LIBYUV_API
+int I210ToP210(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 10);
+}
+
+LIBYUV_API
+int I012ToP012(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 1, 12);
+}
+
+LIBYUV_API
+int I212ToP212(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
+ width, height, 1, 0, 12);
+}
+
// 422 chroma is 1/2 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
@@ -215,6 +576,48 @@ int I422ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, src_uv_width, height);
}
+LIBYUV_API
+int I422ToI210(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ height);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ height);
+ return 0;
+}
+
// TODO(fbarchard): Implement row conversion.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
@@ -256,6 +659,60 @@ int I422ToNV21(const uint8_t* src_y,
return 0;
}
+LIBYUV_API
+int MM21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (!src_uv || !dst_uv || width <= 0) {
+ return -1;
+ }
+
+ int sign = height < 0 ? -1 : 1;
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1,
+ (height + sign) / 2, 16);
+
+ return 0;
+}
+
+LIBYUV_API
+int MM21ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int sign = height < 0 ? -1 : 1;
+
+ if (!src_uv || !dst_u || !dst_v || width <= 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
+ }
+ DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16);
+
+ return 0;
+}
+
#ifdef I422TONV21_ROW_VERSION
// Unittest fails for this version.
// 422 chroma is 1/2 width, 1x height
@@ -328,11 +785,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
}
}
#endif
@@ -368,11 +825,11 @@ int I422ToNV21(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -426,9 +883,8 @@ int I444ToI420(const uint8_t* src_y,
dst_v, dst_stride_v, width, height, width, height);
}
-// TODO(fbarchard): Implement row conversion.
LIBYUV_API
-int I444ToNV21(const uint8_t* src_y,
+int I444ToNV12(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
@@ -436,16 +892,16 @@ int I444ToNV21(const uint8_t* src_y,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_vu,
- int dst_stride_vu,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
int width,
int height) {
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
- halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (height - 1) * src_stride_u;
src_v = src_v + (height - 1) * src_stride_v;
@@ -453,19 +909,32 @@ int I444ToNV21(const uint8_t* src_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
- // Allocate u and v buffers
- align_buffer_64(plane_u, halfwidth * halfheight * 2);
- uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
- I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
- dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
- height);
- MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
- halfwidth, halfheight);
- free_aligned_buffer_64(plane_u);
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+ dst_stride_uv, width, height);
return 0;
}
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+ width, height);
+}
+
// I400 is greyscale typically used in MJPG
LIBYUV_API
int I400ToI420(const uint8_t* src_y,
@@ -527,70 +996,21 @@ int I400ToNV21(const uint8_t* src_y,
return 0;
}
-static void CopyPlane2(const uint8_t* src,
- int src_stride_0,
- int src_stride_1,
- uint8_t* dst,
- int dst_stride,
- int width,
- int height) {
- int y;
- void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_AVX)
- if (TestCpuFlag(kCpuHasAVX)) {
- CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src, dst, width);
- CopyRow(src + src_stride_0, dst + dst_stride, width);
- src += src_stride_0 + src_stride_1;
- dst += dst_stride * 2;
- }
- if (height & 1) {
- CopyRow(src, dst, width);
- }
-}
-
-// Support converting from FOURCC_M420
-// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
-// easy conversion to I420.
-// M420 format description:
-// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
-// Chroma is half width / half height. (420)
-// src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to
-// this as well as the two Y planes.
-static int X420ToI420(const uint8_t* src_y,
- int src_stride_y0,
- int src_stride_y1,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_y,
- int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
- int width,
- int height) {
+// Convert NV12 to I420.
+// TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm.
+LIBYUV_API
+int NV12ToI420(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -600,21 +1020,16 @@ static int X420ToI420(const uint8_t* src_y,
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- if (dst_y) {
- dst_y = dst_y + (height - 1) * dst_stride_y;
- }
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
}
// Coalesce rows.
- if (src_stride_y0 == width && src_stride_y1 == width &&
- dst_stride_y == width) {
+ if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
- src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ src_stride_y = dst_stride_y = 0;
}
// Coalesce rows.
if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
@@ -625,12 +1040,7 @@ static int X420ToI420(const uint8_t* src_y,
}
if (dst_y) {
- if (src_stride_y0 == src_stride_y1) {
- CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
- } else {
- CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
- width, height);
- }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// Split UV plane - NV12 / NV21
@@ -640,12 +1050,12 @@ static int X420ToI420(const uint8_t* src_y,
return 0;
}
-// Convert NV12 to I420.
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
-int NV12ToI420(const uint8_t* src_y,
+int NV21ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
+ const uint8_t* src_vu,
+ int src_stride_vu,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
@@ -654,46 +1064,107 @@ int NV12ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
- dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
- dst_stride_v, width, height);
+ return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
+ width, height);
}
-// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
-int NV21ToI420(const uint8_t* src_y,
+int NV12ToNV24(const uint8_t* src_y,
int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
+ const uint8_t* src_uv,
+ int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
int width,
int height) {
- return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
- dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
- dst_stride_u, width, height);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return 0;
}
-// Convert M420 to I420.
LIBYUV_API
-int M420ToI420(const uint8_t* src_m420,
- int src_stride_m420,
+int NV16ToNV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
- uint8_t* dst_u,
- int dst_stride_u,
- uint8_t* dst_v,
- int dst_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
int width,
int height) {
- return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
- src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
- dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
- width, height);
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return 0;
+}
+
+LIBYUV_API
+int P010ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
+ SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
+ Abs(height), kFilterBilinear);
+ return 0;
+}
+
+LIBYUV_API
+int P210ToP410(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ if (width <= 0 || height == 0) {
+ return -1;
+ }
+
+ if (dst_y) {
+ ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+ Abs(width), Abs(height), kFilterBilinear);
+ }
+ UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
+ dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
+ return 0;
}
// Convert YUY2 to I420.
@@ -750,7 +1221,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -760,15 +1231,13 @@ int YUY2ToI420(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_MMI;
- }
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUVRow = YUY2ToUVRow_LASX;
}
}
#endif
@@ -853,13 +1322,13 @@ int UYVYToI420(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUVRow = UYVYToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUVRow = UYVYToUVRow_MMI;
+#if defined(HAS_UYVYTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUVRow = UYVYToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUVRow = UYVYToUVRow_LASX;
}
}
#endif
@@ -1045,30 +1514,10 @@ int ARGBToI420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -1081,35 +1530,57 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
+ ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
@@ -1170,7 +1641,7 @@ int BGRAToI420(const uint8_t* src_bgra,
#if defined(HAS_BGRATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
BGRAToYRow = BGRAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
BGRAToYRow = BGRAToYRow_NEON;
}
}
@@ -1183,35 +1654,23 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_BGRATOYROW_MSA)
+#if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
BGRAToYRow = BGRAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- BGRAToYRow = BGRAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
BGRAToUVRow = BGRAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ BGRAToYRow = BGRAToYRow_MSA;
BGRAToUVRow = BGRAToUVRow_MSA;
}
}
#endif
-#if defined(HAS_BGRATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToYRow = BGRAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BGRAToUVRow = BGRAToUVRow_Any_MMI;
+#if defined(HAS_BGRATOYROW_LSX) && defined(HAS_BGRATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ BGRAToYRow = BGRAToYRow_Any_LSX;
+ BGRAToUVRow = BGRAToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_MMI;
+ BGRAToYRow = BGRAToYRow_LSX;
+ BGRAToUVRow = BGRAToUVRow_LSX;
}
}
#endif
@@ -1259,20 +1718,42 @@ int ABGRToI420(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON;
}
}
@@ -1285,35 +1766,23 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- ABGRToYRow = ABGRToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
ABGRToUVRow = ABGRToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
+#if defined(HAS_ABGRTOYROW_LSX) && defined(HAS_ABGRTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ABGRToYRow = ABGRToYRow_Any_LSX;
+ ABGRToUVRow = ABGRToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
+ ABGRToYRow = ABGRToYRow_LSX;
+ ABGRToUVRow = ABGRToUVRow_LSX;
}
}
#endif
@@ -1361,20 +1830,26 @@ int RGBAToI420(const uint8_t* src_rgba,
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
-#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+#if defined(HAS_RGBATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
RGBAToYRow = RGBAToYRow_SSSE3;
}
}
#endif
+#if defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_RGBATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYRow = RGBAToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
RGBAToYRow = RGBAToYRow_NEON;
}
}
@@ -1387,35 +1862,23 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
-#if defined(HAS_RGBATOYROW_MSA)
+#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYRow = RGBAToYRow_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- RGBAToYRow = RGBAToYRow_MSA;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
RGBAToUVRow = RGBAToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
+ RGBAToYRow = RGBAToYRow_MSA;
RGBAToUVRow = RGBAToUVRow_MSA;
}
}
#endif
-#if defined(HAS_RGBATOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToYRow = RGBAToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGBAToUVRow = RGBAToUVRow_Any_MMI;
+#if defined(HAS_RGBATOYROW_LSX) && defined(HAS_RGBATOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGBAToYRow = RGBAToYRow_Any_LSX;
+ RGBAToUVRow = RGBAToUVRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_MMI;
+ RGBAToYRow = RGBAToYRow_LSX;
+ RGBAToUVRow = RGBAToUVRow_LSX;
}
}
#endif
@@ -1436,6 +1899,12 @@ int RGBAToI420(const uint8_t* src_rgba,
return 0;
}
+// Enabled if 1 pass is available
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_LSX))
+#define HAS_RGB24TOYROW
+#endif
+
// Convert RGB24 to I420.
LIBYUV_API
int RGB24ToI420(const uint8_t* src_rgb24,
@@ -1449,8 +1918,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
+#if defined(HAS_RGB24TOYROW)
void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVRow_C;
@@ -1475,19 +1943,20 @@ int RGB24ToI420(const uint8_t* src_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
+#if defined(HAS_RGB24TOYROW)
+
// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYROW_NEON)
+#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
RGB24ToYRow = RGB24ToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
RGB24ToYRow = RGB24ToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
-#elif defined(HAS_RGB24TOYROW_MSA)
+#endif
+#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
RGB24ToYRow = RGB24ToYRow_Any_MSA;
@@ -1496,19 +1965,31 @@ int RGB24ToI420(const uint8_t* src_rgb24,
RGB24ToUVRow = RGB24ToUVRow_MSA;
}
}
-#elif defined(HAS_RGB24TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
- RGB24ToYRow = RGB24ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYRow = RGB24ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_MMI;
- }
+#endif
+#if defined(HAS_RGB24TOYROW_LSX) && defined(HAS_RGB24TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LSX;
+ RGB24ToYRow = RGB24ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYRow = RGB24ToYRow_LSX;
+ RGB24ToUVRow = RGB24ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYROW_LASX) && defined(HAS_RGB24TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_LASX;
+ RGB24ToYRow = RGB24ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYRow = RGB24ToYRow_LASX;
+ RGB24ToUVRow = RGB24ToUVRow_LASX;
}
}
+#endif
+
// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
+#else // HAS_RGB24TOYROW
+
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1517,39 +1998,49 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
#endif
+#endif // HAS_RGB24TOYROW
{
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
+#if !defined(HAS_RGB24TOYROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
+#if defined(HAS_RGB24TOYROW)
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1566,8 +2057,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
+#if defined(HAS_RGB24TOYROW)
RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
#else
@@ -1576,15 +2066,19 @@ int RGB24ToI420(const uint8_t* src_rgb24,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
- defined(HAS_RGB24TOYROW_MMI))
+#if !defined(HAS_RGB24TOYROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
+#undef HAS_RGB24TOYROW
+
+// Enabled if 1 pass is available
+#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA)
+#define HAS_RGB24TOYJROW
+#endif
-// TODO(fbarchard): Use Matrix version to implement I420 and J420.
// Convert RGB24 to J420.
LIBYUV_API
int RGB24ToJ420(const uint8_t* src_rgb24,
@@ -1598,8 +2092,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if defined(HAS_RGB24TOYJROW)
void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVJRow_C;
@@ -1624,19 +2117,20 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
+#if defined(HAS_RGB24TOYJROW)
+
// Neon version does direct RGB24 to YUV.
-#if defined(HAS_RGB24TOYJROW_NEON)
+#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
RGB24ToYJRow = RGB24ToYJRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_NEON;
- }
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
}
}
-#elif defined(HAS_RGB24TOYJROW_MSA)
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
@@ -1645,19 +2139,11 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
RGB24ToUVJRow = RGB24ToUVJRow_MSA;
}
}
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVJRow = RGB24ToUVJRow_MMI;
- }
- }
- }
+#endif
+
// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
+#else // HAS_RGB24TOYJROW
+
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1666,39 +2152,49 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVJRow = ARGBToUVJRow_AVX2;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
#endif
+#endif // HAS_RGB24TOYJROW
{
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if !defined(HAS_RGB24TOYJROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if defined(HAS_RGB24TOYJROW)
RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1715,8 +2211,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if defined(HAS_RGB24TOYJROW)
RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
#else
@@ -1725,13 +2220,19 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
ARGBToYJRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
+#if !defined(HAS_RGB24TOYJROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
+#undef HAS_RGB24TOYJROW
+
+// HAS_RAWTOYROW is defined if a 1 pass RAW to YUV row function is available.
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_LSX))
+#define HAS_RAWTOYROW
+#endif
// Convert RAW to I420.
LIBYUV_API
@@ -1746,8 +2247,7 @@ int RAWToI420(const uint8_t* src_raw,
int width,
int height) {
int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if defined(HAS_RAWTOYROW)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
uint8_t* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
@@ -1771,19 +2271,20 @@ int RAWToI420(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
+#if defined(HAS_RAWTOYROW)
+
// Neon version does direct RAW to YUV.
-#if defined(HAS_RAWTOYROW_NEON)
+#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
RAWToYRow = RAWToYRow_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
+ RAWToUVRow = RAWToUVRow_NEON;
}
}
-#elif defined(HAS_RAWTOYROW_MSA)
+#endif
+#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToUVRow = RAWToUVRow_Any_MSA;
RAWToYRow = RAWToYRow_Any_MSA;
@@ -1792,19 +2293,31 @@ int RAWToI420(const uint8_t* src_raw,
RAWToUVRow = RAWToUVRow_MSA;
}
}
-#elif defined(HAS_RAWTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToUVRow = RAWToUVRow_Any_MMI;
- RAWToYRow = RAWToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RAWToYRow = RAWToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_MMI;
- }
+#endif
+#if defined(HAS_RAWTOYROW_LSX) && defined(HAS_RAWTOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToUVRow = RAWToUVRow_Any_LSX;
+ RAWToYRow = RAWToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYRow = RAWToYRow_LSX;
+ RAWToUVRow = RAWToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYROW_LASX) && defined(HAS_RAWTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToUVRow = RAWToUVRow_Any_LASX;
+ RAWToYRow = RAWToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToYRow = RAWToYRow_LASX;
+ RAWToUVRow = RAWToUVRow_LASX;
}
}
+#endif
+
// Other platforms do intermediate conversion from RAW to ARGB.
-#else
+#else // HAS_RAWTOYROW
+
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -1813,39 +2326,49 @@ int RAWToI420(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYROW
{
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if !defined(HAS_RAWTOYROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if defined(HAS_RAWTOYROW)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -1862,8 +2385,7 @@ int RAWToI420(const uint8_t* src_raw,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if defined(HAS_RAWTOYROW)
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
@@ -1872,13 +2394,167 @@ int RAWToI420(const uint8_t* src_raw,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
- defined(HAS_RAWTOYROW_MMI))
+#if !defined(HAS_RAWTOYROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
+#undef HAS_RAWTOYROW
+
+// HAS_RAWTOYJROW is defined if a 1 pass RAW to YUV row function is available.
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA)
+#define HAS_RAWTOYJROW
+#endif
+
+// Convert RAW to J420.
+LIBYUV_API
+int RAWToJ420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// Neon and MSA versions do direct RAW to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+
+ {
+#if !defined(HAS_RAWTOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
// Convert RGB565 to I420.
LIBYUV_API
@@ -1894,7 +2570,7 @@ int RGB565ToI420(const uint8_t* src_rgb565,
int height) {
int y;
#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB565ToUVRow_C;
@@ -1931,7 +2607,10 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
}
-#elif defined(HAS_RGB565TOYROW_MSA)
+// MSA, LSX and LASX versions do direct RGB565 to YUV.
+#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \
+ defined(HAS_RGB565TOYROW_LASX))
+#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
RGB565ToYRow = RGB565ToYRow_Any_MSA;
@@ -1940,17 +2619,27 @@ int RGB565ToI420(const uint8_t* src_rgb565,
RGB565ToUVRow = RGB565ToUVRow_MSA;
}
}
-#elif defined(HAS_RGB565TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
- RGB565ToYRow = RGB565ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToYRow = RGB565ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_MMI;
- }
+#endif
+#if defined(HAS_RGB565TOYROW_LSX) && defined(HAS_RGB565TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LSX;
+ RGB565ToYRow = RGB565ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToYRow = RGB565ToYRow_LSX;
+ RGB565ToUVRow = RGB565ToUVRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOYROW_LASX) && defined(HAS_RGB565TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_LASX;
+ RGB565ToYRow = RGB565ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToYRow = RGB565ToYRow_LASX;
+ RGB565ToUVRow = RGB565ToUVRow_LASX;
}
}
+#endif
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1969,37 +2658,49 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#endif
{
#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -2017,7 +2718,7 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
if (height & 1) {
#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
#else
@@ -2027,7 +2728,7 @@ int RGB565ToI420(const uint8_t* src_rgb565,
#endif
}
#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
- defined(HAS_RGB565TOYROW_MMI))
+ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX))
free_aligned_buffer_64(row);
#endif
}
@@ -2048,7 +2749,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
int height) {
int y;
#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB1555ToUVRow_C;
@@ -2086,7 +2787,10 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
}
-#elif defined(HAS_ARGB1555TOYROW_MSA)
+// MSA, LSX and LASX versions do direct ARGB1555 to YUV.
+#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \
+ defined(HAS_ARGB1555TOYROW_LASX))
+#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
@@ -2095,17 +2799,27 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
}
}
-#elif defined(HAS_ARGB1555TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
- ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB1555ToYRow = ARGB1555ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
- }
+#endif
+#if defined(HAS_ARGB1555TOYROW_LSX) && defined(HAS_ARGB1555TOUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LSX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LSX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LSX;
}
}
+#endif
+#if defined(HAS_ARGB1555TOYROW_LASX) && defined(HAS_ARGB1555TOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_LASX;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_LASX;
+ ARGB1555ToUVRow = ARGB1555ToUVRow_LASX;
+ }
+ }
+#endif
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -2124,30 +2838,42 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#endif
{
#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
@@ -2155,7 +2881,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
for (y = 0; y < height - 1; y += 2) {
#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -2175,7 +2901,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
if (height & 1) {
#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
@@ -2185,7 +2911,7 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
#endif
}
#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
- defined(HAS_ARGB1555TOYROW_MMI))
+ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX))
free_aligned_buffer_64(row);
#endif
}
@@ -2205,7 +2931,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
int width,
int height) {
int y;
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+#if defined(HAS_ARGB4444TOYROW_NEON)
void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB4444ToUVRow_C;
@@ -2243,17 +2969,6 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
}
-#elif defined(HAS_ARGB4444TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
- ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGB4444ToYRow = ARGB4444ToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
- }
- }
- }
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
@@ -2280,27 +2995,55 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVRow = ARGBToUVRow_Any_MSA;
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -2312,29 +3055,27 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
- }
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
#endif
{
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+#if defined(HAS_ARGB4444TOYROW_NEON)
ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
@@ -2353,7 +3094,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+#if defined(HAS_ARGB4444TOYROW_NEON)
ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
@@ -2362,7 +3103,7 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+#if !(defined(HAS_ARGB4444TOYROW_NEON))
free_aligned_buffer_64(row);
#endif
}
@@ -2378,125 +3119,129 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
RGB24ToYJRow_C;
-#else
- void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
- RGB24ToARGBRow_C;
- void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
- ARGBToYJRow_C;
-#endif
if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
return -1;
}
- // Negative height means invert the image.
if (height < 0) {
height = -height;
src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
src_stride_rgb24 = -src_stride_rgb24;
}
-
-// Neon version does direct RGB24 to YUV.
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGB24TOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToYJRow = RGB24ToYJRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_RGB24TOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
RGB24ToYJRow = RGB24ToYJRow_NEON;
}
}
-#elif defined(HAS_RGB24TOYJROW_MSA)
+#endif
+#if defined(HAS_RGB24TOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
RGB24ToYJRow = RGB24ToYJRow_MSA;
}
}
-#elif defined(HAS_RGB24TOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToYJRow = RGB24ToYJRow_MMI;
- }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_yj += dst_stride_yj;
}
-// Other platforms do intermediate conversion from RGB24 to ARGB.
-#else
-#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
- }
+ return 0;
+}
+
+// Convert RAW to J400.
+LIBYUV_API
+int RAWToJ400(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) =
+ RAWToYJRow_C;
+ if (!src_raw || !dst_yj || width <= 0 || height == 0) {
+ return -1;
}
-#endif
-#if defined(HAS_ARGBTOYJROW_SSSE3)
+
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_yj = 0;
+ }
+
+#if defined(HAS_RAWTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ RAWToYJRow = RAWToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
+ RAWToYJRow = RAWToYJRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
+#if defined(HAS_RAWTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ RAWToYJRow = RAWToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToYJRow = ARGBToYJRow_AVX2;
+ RAWToYJRow = RAWToYJRow_AVX2;
}
}
#endif
-#endif
-
- {
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 31) & ~31;
- align_buffer_64(row, kRowSize * 2);
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
- RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToYJRow(row, dst_yj, width);
- ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_yj += dst_stride_yj * 2;
+#if defined(HAS_RAWTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
}
- if (height & 1) {
-#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- RGB24ToYJRow(src_rgb24, dst_yj, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToYJRow(row, dst_yj, width);
+ }
#endif
+#if defined(HAS_RAWTOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
}
-#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
- defined(HAS_RGB24TOYJROW_MMI))
- free_aligned_buffer_64(row);
-#endif
}
- return 0;
-}
+#endif
-static void SplitPixels(const uint8_t* src_u,
- int src_pixel_stride_uv,
- uint8_t* dst_u,
- int width) {
- int i;
- for (i = 0; i < width; ++i) {
- *dst_u = *src_u;
- ++dst_u;
- src_u += src_pixel_stride_uv;
+ for (y = 0; y < height; ++y) {
+ RAWToYJRow(src_raw, dst_yj, width);
+ src_raw += src_stride_raw;
+ dst_yj += dst_stride_yj;
}
+ return 0;
}
// Convert Android420 to I420.
@@ -2516,58 +3261,10 @@ int Android420ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- int y;
- const ptrdiff_t vu_off = src_v - src_u;
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // Copy UV planes as is - I420
- if (src_pixel_stride_uv == 1) {
- CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
- CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
- return 0;
- // Split UV planes - NV21
- }
- if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
- SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
- halfwidth, halfheight);
- return 0;
- // Split UV planes - NV12
- }
- if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
- SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
- halfwidth, halfheight);
- return 0;
- }
-
- for (y = 0; y < halfheight; ++y) {
- SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
- SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
- src_u += src_stride_u;
- src_v += src_stride_v;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- return 0;
+ return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_pixel_stride_uv, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height, kRotate0);
}
#ifdef __cplusplus
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index 54050333..71ef8c10 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -7,7 +7,6 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
@@ -17,6 +16,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
+#include "libyuv/scale_row.h" // For ScaleRowUp2_Linear and ScaleRowUp2_Bilinear
#include "libyuv/video_common.h"
#ifdef __cplusplus
@@ -47,18 +47,19 @@ int ARGBCopy(const uint8_t* src_argb,
return 0;
}
-// Convert I420 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert I420 to ARGB with matrix.
+LIBYUV_API
+int I420ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -89,6 +90,15 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -105,6 +115,14 @@ static int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -226,18 +244,55 @@ int H420ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U420 to ARGB.
+LIBYUV_API
+int U420ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U420 to ABGR.
+LIBYUV_API
+int U420ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I422 to ARGB with matrix.
+LIBYUV_API
+int I422ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, uint8_t* rgb_buf,
@@ -275,6 +330,15 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -291,6 +355,14 @@ static int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -410,20 +482,286 @@ int H422ToABGR(const uint8_t* src_y,
width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
+// Convert U422 to ARGB.
+LIBYUV_API
+int U422ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U422 to ABGR.
+LIBYUV_API
+int U422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I444 to ARGB with matrix.
+LIBYUV_API
+int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444ToARGBRow = I444ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I444ToARGBRow = I444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_LSX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J444 to ABGR.
+LIBYUV_API
+int J444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H444 to ARGB.
+LIBYUV_API
+int H444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H444 to ABGR.
+LIBYUV_API
+int H444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U444 to ARGB.
+LIBYUV_API
+int U444ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U444 to ABGR.
+LIBYUV_API
+int U444ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 10 bit YUV to AR30 with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits.
-static int I010ToAR30Matrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+LIBYUV_API
+int I010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -500,6 +838,23 @@ int H010ToAR30(const uint16_t* src_y,
&kYuvH709Constants, width, height);
}
+// Convert U010 to AR30.
+LIBYUV_API
+int U010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
// Convert I010 to AB30.
LIBYUV_API
int I010ToAB30(const uint16_t* src_y,
@@ -534,18 +889,302 @@ int H010ToAB30(const uint16_t* src_y,
&kYvuH709Constants, width, height);
}
-// Convert 10 bit YUV to ARGB with matrix
-static int I010ToARGBMatrix(const uint16_t* src_y,
- int src_stride_y,
- const uint16_t* src_u,
- int src_stride_u,
- const uint16_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to AB30.
+// Implemented by calling the AR30 matrix converter with the U and V planes
+// swapped. The swap must be paired with the mirrored Yvu matrix (as
+// H010ToAB30 does with kYvuH709Constants and U010ToABGR does with
+// kYvu2020Constants); otherwise the chroma planes are simply crossed.
+// Returns whatever I010ToAR30Matrix returns.
+LIBYUV_API
+int U010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+}
+
+// Convert 12 bit YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to
+// multiply 12 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I012ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I212ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I212TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I212ToAR30Row = I212ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I212ToAR30Row = I212ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I212TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I212ToAR30Row = I212ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I212ToAR30Row = I212ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to AR30 with matrix.
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+LIBYUV_API
+int I210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I210 to AR30.
+LIBYUV_API
+int I210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H210 to AR30.
+LIBYUV_API
+int H210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert U210 to AR30.
+LIBYUV_API
+int U210ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert I210 to AB30.
+LIBYUV_API
+int I210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H210 to AB30.
+LIBYUV_API
+int H210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
+
+// Convert U210 to AB30.
+// Implemented by calling the AR30 matrix converter with the U and V planes
+// swapped. The swap must be paired with the mirrored Yvu matrix (as
+// I210ToAB30 does with kYvuI601Constants and H210ToAB30 does with
+// kYvuH709Constants); otherwise the chroma planes are simply crossed.
+// Returns whatever I210ToAR30Matrix returns.
+LIBYUV_API
+int U210ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I210ToAR30Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+}
+
+LIBYUV_API
+int I410ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I410TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToAR30Row = I410ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToAR30Row = I410ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToAR30Row = I410ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert 10 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf,
@@ -660,23 +1299,60 @@ int H010ToABGR(const uint16_t* src_y,
width, height);
}
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert U010 to ARGB.
+LIBYUV_API
+int U010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U010 to ABGR.
+LIBYUV_API
+int U010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert 12 bit YUV to ARGB with matrix.
+LIBYUV_API
+int I012ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
+ void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
- I444ToARGBRow_C;
+ I212ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -686,48 +1362,79 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce rows.
- if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
- dst_stride_argb == width * 4) {
- width *= height;
- height = 1;
- src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
- }
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I212TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ I212ToARGBRow = I212ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
+ I212ToARGBRow = I212ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_AVX2)
+#if defined(HAS_I212TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ I212ToARGBRow = I212ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I444ToARGBRow = I444ToARGBRow_AVX2;
+ I212ToARGBRow = I212ToARGBRow_AVX2;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ for (y = 0; y < height; ++y) {
+ I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert 10 bit 422 YUV to ARGB with matrix.
+LIBYUV_API
+int I210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_NEON;
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
-#if defined(HAS_I444TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I444ToARGBRow = I444ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_MSA;
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
}
}
#endif
-
for (y = 0; y < height; ++y) {
- I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -736,74 +1443,378 @@ static int I444ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert I444 to ARGB.
+// Convert I210 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8_t* src_y,
+int I210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
&kYuvI601Constants, width, height);
}
-// Convert I444 to ABGR.
+// Convert I210 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8_t* src_y,
+int I210ToABGR(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
-// Convert J444 to ARGB.
+// Convert H210 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8_t* src_y,
+int H210ToARGB(const uint16_t* src_y,
int src_stride_y,
- const uint8_t* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8_t* src_v,
+ const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb,
- &kYuvJPEGConstants, width, height);
+ &kYuvH709Constants, width, height);
}
-// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- const uint8_t* src_a,
- int src_stride_a,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height,
- int attenuate) {
+// Convert H210 to ABGR.
+LIBYUV_API
+int H210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert U210 to ARGB.
+LIBYUV_API
+int U210ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuv2020Constants, width, height);
+}
+
+// Convert U210 to ABGR.
+LIBYUV_API
+int U210ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I210ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvu2020Constants, // Use Yvu matrix
+ width, height);
+}
+
+LIBYUV_API
+int I410ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I410ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I410TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410ToARGBRow = I410ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410ToARGBRow = I410ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I410ToARGBRow = I410ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410ToARGBRow = I410ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int P010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+LIBYUV_API
+int P210ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToARGBRow)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_P210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToARGBRow = P210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToARGBRow = P210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToARGBRow = P210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToARGBRow = P210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int P010ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+LIBYUV_API
+int P210ToAR30Matrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*P210ToAR30Row)(
+ const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
+ if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_P210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ P210ToAR30Row = P210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ P210ToAR30Row = P210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_P210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ P210ToAR30Row = P210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ P210ToAR30Row = P210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert I420 with Alpha to preattenuated ARGB with matrix.
+LIBYUV_API
+int I420AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* a_buf,
@@ -812,7 +1823,8 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
int width) = I422AlphaToARGBRow_C;
void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
int width) = ARGBAttenuateRow_C;
- if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -853,6 +1865,14 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@@ -885,14 +1905,6 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -911,6 +1923,242 @@ static int I420AlphaToARGBMatrix(const uint8_t* src_y,
return 0;
}
+// Convert I422 with Alpha to ARGB with matrix; each row is attenuated in place when attenuate != 0.
+LIBYUV_API
+int I422AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422AlphaToARGBRow_C;  // Default row converter.
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;  // Default attenuate function.
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;  // NULL plane, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last output row.
+ dst_stride_argb = -dst_stride_argb;  // Step upward through the destination.
+ }
+#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Later checks override earlier selections.
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422ALPHATOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {  // Attenuate function is selected independently of the row converter.
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {  // Optional pass over the row just written, in place.
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // U and V advance on every row.
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I444 with Alpha to ARGB with matrix; each row is attenuated in place when attenuate != 0.
+LIBYUV_API
+int I444AlphaToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I444AlphaToARGBRow_C;  // Default row converter.
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;  // Default attenuate function.
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;  // NULL plane, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last output row.
+ dst_stride_argb = -dst_stride_argb;  // Step upward through the destination.
+ }
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Later checks override earlier selections.
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I444ALPHATOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I444AlphaToARGBRow = I444AlphaToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {  // Attenuate function is selected independently of the row converter.
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {  // Optional pass over the row just written, in place.
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // U and V advance on every row.
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
// Convert I420 with Alpha to ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8_t* src_y,
@@ -954,16 +2202,400 @@ int I420AlphaToABGR(const uint8_t* src_y,
width, height, attenuate);
}
-// Convert I400 to ARGB.
+// Convert I422 with Alpha to ARGB.
LIBYUV_API
-int I400ToARGB(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+int I422AlphaToARGB(const uint8_t* src_y,  // Thin wrapper over I422AlphaToARGBMatrix.
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,  // Matrix is fixed to kYuvI601Constants.
+ height, attenuate);  // Result of the matrix call is returned unchanged.
+}
+
+// Convert I422 with Alpha to ABGR by calling I422AlphaToARGBMatrix with U/V swapped and the Yvu matrix.
+LIBYUV_API
+int I422AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I422AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V: V plane goes in the U slot
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix to match the swapped planes
+ width, height, attenuate);
+}
+
+// Convert I444 with Alpha to ARGB using kYuvI601Constants.
+LIBYUV_API
+int I444AlphaToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, &kYuvI601Constants, width,
+ height, attenuate);  // Result of the matrix call is returned unchanged.
+}
+
+// Convert I444 with Alpha to ABGR by calling I444AlphaToARGBMatrix with U/V swapped and the Yvu matrix.
+LIBYUV_API
+int I444AlphaToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height,
+ int attenuate) {
+ return I444AlphaToARGBMatrix(
+ src_y, src_stride_y, src_v, src_stride_v, // Swap U and V: V plane goes in the U slot
+ src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix to match the swapped planes
+ width, height, attenuate);
+}
+
+// Convert I010 with Alpha to ARGB with matrix; each row is attenuated in place when attenuate != 0.
+LIBYUV_API
+int I010AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
int y;
- void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,  // I210 row functions are used here; the row loop handles chroma stepping.
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;  // Default row converter.
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;  // Default attenuate function.
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;  // NULL plane, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last output row.
+ dst_stride_argb = -dst_stride_argb;  // Step upward through the destination.
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Later checks override earlier selections.
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {  // Attenuate function is selected independently of the row converter.
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {  // Optional pass over the row just written, in place.
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ if (y & 1) {  // U and V advance only after odd rows: one chroma row per two Y rows.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I210 with Alpha to ARGB with matrix; each row is attenuated in place when attenuate != 0.
+LIBYUV_API
+int I210AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I210AlphaToARGBRow_C;  // Default row converter.
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;  // Default attenuate function.
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;  // NULL plane, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last output row.
+ dst_stride_argb = -dst_stride_argb;  // Step upward through the destination.
+ }
+#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Later checks override earlier selections.
+ I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {  // Attenuate function is selected independently of the row converter.
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {  // Optional pass over the row just written, in place.
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // U and V advance on every row.
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I410 with Alpha to ARGB with matrix; each row is attenuated in place when attenuate != 0.
+LIBYUV_API
+int I410AlphaToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height,
+ int attenuate) {
+ int y;
+ void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) = I410AlphaToARGBRow_C;  // Default row converter.
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;  // Default attenuate function.
+ if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+ height == 0) {
+ return -1;  // NULL plane, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;  // Start at the last output row.
+ dst_stride_argb = -dst_stride_argb;  // Step upward through the destination.
+ }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Later checks override earlier selections.
+ I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {  // Attenuate function is selected independently of the row converter.
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
+ width);
+ if (attenuate) {  // Optional pass over the row just written, in place.
+ ARGBAttenuateRow(dst_argb, dst_argb, width);
+ }
+ dst_argb += dst_stride_argb;
+ src_a += src_stride_a;
+ src_y += src_stride_y;
+ src_u += src_stride_u;  // U and V advance on every row.
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB with matrix.
+LIBYUV_API
+int I400ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
I400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1012,23 +2644,35 @@ int I400ToARGB(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I400ToARGBRow = I400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_MMI;
+#if defined(HAS_I400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ I400ToARGBRow = I400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_LSX;
}
}
#endif
for (y = 0; y < height; ++y) {
- I400ToARGBRow(src_y, dst_argb, width);
+ I400ToARGBRow(src_y, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
+// Convert I400 to ARGB by calling I400ToARGBMatrix with kYuvI601Constants.
+LIBYUV_API
+int I400ToARGB(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);  // Result of the matrix call is returned unchanged.
+}
+
// Convert J400 to ARGB.
LIBYUV_API
int J400ToARGB(const uint8_t* src_y,
@@ -1087,11 +2731,11 @@ int J400ToARGB(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_J400TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- J400ToARGBRow = J400ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- J400ToARGBRow = J400ToARGBRow_MMI;
+#if defined(HAS_J400TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ J400ToARGBRow = J400ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_LSX;
}
}
#endif
@@ -1115,6 +2759,10 @@ static const uvec8 kShuffleMaskABGRToARGB = {
static const uvec8 kShuffleMaskRGBAToARGB = {
1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+// Shuffle table for converting AR64 to AB64: in each 8-byte group, byte pairs 0-1 and 4-5 trade places.
+static const uvec8 kShuffleMaskAR64ToAB64 = {
+ 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u};  // Two 8-byte groups; pairs 2-3 and 6-7 stay put.
+
// Convert BGRA to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8_t* src_bgra,
@@ -1124,7 +2772,7 @@ int BGRAToARGB(const uint8_t* src_bgra,
int width,
int height) {
return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
}
// Convert ARGB to BGRA (same as BGRAToARGB).
@@ -1136,7 +2784,7 @@ int ARGBToBGRA(const uint8_t* src_bgra,
int width,
int height) {
return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
+ (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height);
}
// Convert ABGR to ARGB.
@@ -1148,7 +2796,7 @@ int ABGRToARGB(const uint8_t* src_abgr,
int width,
int height) {
return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
}
// Convert ARGB to ABGR (same as ABGRToARGB).
@@ -1160,7 +2808,7 @@ int ARGBToABGR(const uint8_t* src_abgr,
int width,
int height) {
return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
+ (const uint8_t*)&kShuffleMaskABGRToARGB, width, height);
}
// Convert RGBA to ARGB.
@@ -1172,7 +2820,19 @@ int RGBAToARGB(const uint8_t* src_rgba,
int width,
int height) {
return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
- (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
+ (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height);
+}
+
+// Convert AR64 to AB64 by passing kShuffleMaskAR64ToAB64 to AR64Shuffle.
+LIBYUV_API
+int AR64ToAB64(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64,
+ (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height);  // Mask is handed over as raw bytes.
}
// Convert RGB24 to ARGB.
@@ -1225,11 +2885,19 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
-#if defined(HAS_RGB24TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB24ToARGBRow = RGB24ToARGBRow_MMI;
+#if defined(HAS_RGB24TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_LASX;
}
}
#endif
@@ -1292,11 +2960,19 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToARGBRow = RAWToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToARGBRow = RAWToARGBRow_MMI;
+#if defined(HAS_RAWTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RAWToARGBRow = RAWToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RAWToARGBRow = RAWToARGBRow_LASX;
}
}
#endif
@@ -1309,6 +2985,57 @@ int RAWToARGB(const uint8_t* src_raw,
return 0;
}
+// Convert RAW to RGBA; returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int RAWToRGBA(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ int y;
+ void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) =
+ RAWToRGBARow_C;  // Default row converter.
+ if (!src_raw || !dst_rgba || width <= 0 || height == 0) {
+ return -1;  // NULL buffer, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;  // Start at the last source row.
+ src_stride_raw = -src_stride_raw;  // Read the source bottom-up.
+ }
+ // Coalesce rows: with strides of exactly width * 3 and width * 4, convert the image as one long row.
+ if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_rgba = 0;
+ }
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToRGBARow = RAWToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {  // Tested against the possibly coalesced width.
+ RAWToRGBARow = RAWToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToRGBARow = RAWToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToRGBARow = RAWToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToRGBARow(src_raw, dst_rgba, width);
+ src_raw += src_stride_raw;
+ dst_rgba += dst_stride_rgba;
+ }
+ return 0;
+}
+
// Convert RGB565 to ARGB.
LIBYUV_API
int RGB565ToARGB(const uint8_t* src_rgb565,
@@ -1367,11 +3094,19 @@ int RGB565ToARGB(const uint8_t* src_rgb565,
}
}
#endif
-#if defined(HAS_RGB565TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RGB565ToARGBRow = RGB565ToARGBRow_MMI;
+#if defined(HAS_RGB565TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_LASX;
}
}
#endif
@@ -1442,11 +3177,19 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555,
}
}
#endif
-#if defined(HAS_ARGB1555TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
+#if defined(HAS_ARGB1555TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_LASX;
}
}
#endif
@@ -1517,11 +3260,19 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444,
}
}
#endif
-#if defined(HAS_ARGB4444TOARGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
+#if defined(HAS_ARGB4444TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_LASX;
}
}
#endif
@@ -1630,16 +3381,135 @@ int AR30ToAB30(const uint8_t* src_ar30,
return 0;
}
-// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert AR64 to ARGB; returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int AR64ToARGB(const uint16_t* src_ar64,
+ int src_stride_ar64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,
+ int width) = AR64ToARGBRow_C;  // Default row converter.
+ if (!src_ar64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;  // NULL buffer, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;  // Start at the last source row.
+ src_stride_ar64 = -src_stride_ar64;  // Read the source bottom-up.
+ }
+ // Coalesce rows: with both strides exactly width * 4, convert the image as one long row.
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) {  // Source stride counts uint16_t elements, destination stride counts bytes.
+ width *= height;
+ height = 1;
+ src_stride_ar64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AR64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AR64ToARGBRow = AR64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so this choice overrides it.
+ AR64ToARGBRow = AR64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AR64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AR64ToARGBRow = AR64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AR64ToARGBRow = AR64ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AR64ToARGBRow(src_ar64, dst_argb, width);
+ src_ar64 += src_stride_ar64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert AB64 to ARGB; returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int AB64ToARGB(const uint16_t* src_ab64,
+ int src_stride_ab64,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb,  // Parameter is named src_ar64 but is passed src_ab64 below.
+ int width) = AB64ToARGBRow_C;  // Default row converter.
+ if (!src_ab64 || !dst_argb || width <= 0 || height == 0) {
+ return -1;  // NULL buffer, non-positive width, or zero height.
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ab64 = src_ab64 + (height - 1) * src_stride_ab64;  // Start at the last source row.
+ src_stride_ab64 = -src_stride_ab64;  // Read the source bottom-up.
+ }
+ // Coalesce rows: with both strides exactly width * 4, convert the image as one long row.
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) {  // Source stride counts uint16_t elements, destination stride counts bytes.
+ width *= height;
+ height = 1;
+ src_stride_ab64 = dst_stride_argb = 0;
+ }
+#if defined(HAS_AB64TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ AB64ToARGBRow = AB64ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {  // Checked after SSSE3, so this choice overrides it.
+ AB64ToARGBRow = AB64ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_AB64TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AB64ToARGBRow = AB64ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ AB64ToARGBRow = AB64ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ AB64ToARGBRow(src_ab64, dst_argb, width);
+ src_ab64 += src_stride_ab64;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB with matrix.
+LIBYUV_API
+int NV12ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1685,6 +3555,22 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV12TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@@ -1697,16 +3583,17 @@ static int NV12ToARGBMatrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_argb,
- int dst_stride_argb,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to ARGB with matrix.
+LIBYUV_API
+int NV21ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToARGBRow)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1752,6 +3639,22 @@ static int NV21ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_LASX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
@@ -1823,16 +3726,17 @@ int NV21ToABGR(const uint8_t* src_y,
}
// TODO(fbarchard): Consider SSSE3 2 step conversion.
-// Convert NV12 to RGB24 with matrix
-static int NV12ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV12 to RGB24 with matrix.
+LIBYUV_API
+int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV12ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -1882,16 +3786,17 @@ static int NV12ToRGB24Matrix(const uint8_t* src_y,
return 0;
}
-// Convert NV21 to RGB24 with matrix
-static int NV21ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_vu,
- int src_stride_vu,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+// Convert NV21 to RGB24 with matrix.
+LIBYUV_API
+int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
void (*NV21ToRGB24Row)(
const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
@@ -2028,6 +3933,14 @@ int NV21ToYUV24(const uint8_t* src_y,
}
}
#endif
+#if defined(HAS_NV21TOYUV24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_NV21TOYUV24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
@@ -2047,75 +3960,6 @@ int NV21ToYUV24(const uint8_t* src_y,
return 0;
}
-// Convert M420 to ARGB.
-LIBYUV_API
-int M420ToARGB(const uint8_t* src_m420,
- int src_stride_m420,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
- int y;
- void (*NV12ToARGBRow)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
- if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
-#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
- dst_argb + dst_stride_argb, &kYuvI601Constants, width);
- dst_argb += dst_stride_argb * 2;
- src_m420 += src_stride_m420 * 3;
- }
- if (height & 1) {
- NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
- &kYuvI601Constants, width);
- }
- return 0;
-}
-
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8_t* src_yuy2,
@@ -2175,6 +4019,14 @@ int YUY2ToARGB(const uint8_t* src_yuy2,
}
}
#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_LSX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
src_yuy2 += src_stride_yuy2;
@@ -2242,6 +4094,14 @@ int UYVYToARGB(const uint8_t* src_uyvy,
}
}
#endif
+#if defined(HAS_UYVYTOARGBROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_LSX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
src_uyvy += src_stride_uyvy;
@@ -2264,7 +4124,7 @@ static void WeavePixels(const uint8_t* src_u,
}
}
-// Convert Android420 to ARGB.
+// Convert Android420 to ARGB with matrix.
LIBYUV_API
int Android420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
@@ -2365,6 +4225,3144 @@ int Android420ToABGR(const uint8_t* src_y,
height);
}
+// Convert I422 to RGBA with matrix. src_u/src_v advance on every row. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I422ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;  // Default row function; reassigned below when TestCpuFlag reports a matching CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One call of the selected row function per output row.
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA. Forwards to I422ToRGBAMatrix with kYuvI601Constants and returns its result.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I422 to BGRA. Forwards to I422ToRGBAMatrix with the U and V planes swapped and kYvuI601Constants.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert NV12 to RGB565 with matrix. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int NV12ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB565Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;  // Default; reassigned below per CPU feature.
+ if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LSX;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_uv advances only after odd rows, so each pair of Y rows shares one UV row.
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565. Forwards to NV12ToRGB565Matrix with kYuvI601Constants and returns its result.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb565, dst_stride_rgb565, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert I420 to RGBA with matrix. Reuses the I422 row functions, advancing U/V every second row. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToRGBAMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGBARow_C;  // Default row function; reassigned below when TestCpuFlag reports a matching CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGBARow = I422ToRGBARow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGBARow = I422ToRGBARow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGBARow = I422ToRGBARow_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA. Forwards to I420ToRGBAMatrix with kYuvI601Constants and returns its result.
+LIBYUV_API
+int I420ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgba, dst_stride_rgba,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to BGRA. Forwards to I420ToRGBAMatrix with the U and V planes swapped and kYvuI601Constants.
+LIBYUV_API
+int I420ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height) {
+ return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_bgra, dst_stride_bgra,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to RGB24 with matrix. Reuses the I422 row functions, advancing U/V every second row. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB24Row_C;  // Default row function; reassigned below when TestCpuFlag reports a matching CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB24Row = I422ToRGB24Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24. Forwards to I420ToRGB24Matrix with kYuvI601Constants and returns its result.
+LIBYUV_API
+int I420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I420 to RAW. Forwards to I420ToRGB24Matrix with the U and V planes swapped and kYvuI601Constants.
+LIBYUV_API
+int I420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert J420 to RGB24. Forwards to I420ToRGB24Matrix with kYuvJPEGConstants and returns its result.
+LIBYUV_API
+int J420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert J420 to RAW. Forwards to I420ToRGB24Matrix with the U and V planes swapped and kYvuJPEGConstants.
+LIBYUV_API
+int J420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuJPEGConstants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H420 to RGB24. Forwards to I420ToRGB24Matrix with kYuvH709Constants and returns its result.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW. Forwards to I420ToRGB24Matrix with the U and V planes swapped and kYvuH709Constants.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert I420 to ARGB1555. Always uses kYuvI601Constants. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToARGB1555(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB1555Row_C;  // Default; reassigned below per CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
+ width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444. Always uses kYuvI601Constants. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToARGB4444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) = I422ToARGB4444Row_C;  // Default; reassigned below per CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
+ width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565 with specified color matrix. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;  // Default row function; reassigned below when TestCpuFlag reports a matching CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565. Forwards to I420ToRGB565Matrix with kYuvI601Constants and returns its result.
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert J420 to RGB565. Forwards to I420ToRGB565Matrix with kYuvJPEGConstants and returns its result.
+LIBYUV_API
+int J420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
+}
+
+// Convert H420 to RGB565. Forwards to I420ToRGB565Matrix with kYuvH709Constants and returns its result.
+LIBYUV_API
+int H420ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I422 to RGB565. Always uses kYuvI601Constants; src_u/src_v advance on every row. Returns 0, or -1 if a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToRGB565Row_C;  // Default row function; reassigned below when TestCpuFlag reports a matching CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToRGB565Row = I422ToRGB565Row_LASX;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {  // One call of the selected row function per output row.
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565 (16 entries). Values from 0 to 7.
+static const uint8_t kDither565_4x4[16] = {
+ 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering. Each row goes I420 -> ARGB scratch row -> RGB565; a NULL dither4x4 selects kDither565_4x4. Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToARGBRow_C;  // Default YUV -> ARGB row function; reassigned below per CPU feature.
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
+ ARGBToRGB565DitherRow_C;  // Default ARGB -> RGB565 dither row function; reassigned below per CPU feature.
+ if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image: start at the last destination row and step backwards.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {  // Caller passed no dither table; fall back to the built-in one.
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {  // Both flags must be reported, not just one.
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ I422ToARGBRow = I422ToARGBRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb (width * 4 bytes) used as scratch between the two row functions. NOTE(review): no failure check on row_argb is visible here - confirm how align_buffer_64 reports allocation failure.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),  // 4 dither bytes for row (y & 3), read as one uint32_t. NOTE(review): the uint8_t* -> uint32_t* cast assumes a caller-supplied table is suitably aligned - confirm.
+ width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {  // src_u/src_v advance only after odd rows, so each pair of Y rows shares one U/V row.
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+ // Converts I420 to AR30 using the caller-supplied YUV matrix.
+ // The Y plane advances every row; the U and V planes advance after every odd
+ // row. Returns -1 when a plane pointer is null, width is not positive or
+ // height is zero; returns 0 otherwise. A negative height flips the output
+ // vertically.
+ LIBYUV_API
+ int I420ToAR30Matrix(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_ar30,
+                      int dst_stride_ar30,
+                      const struct YuvConstants* yuvconstants,
+                      int width,
+                      int height) {
+   // Start from the portable C kernel; SIMD variants may replace it below.
+   void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I422ToAR30Row_C;
+
+   if (!src_y || !src_u || !src_v || !dst_ar30) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_ar30 += (height - 1) * dst_stride_ar30;
+     dst_stride_ar30 = -dst_stride_ar30;
+   }
+
+   // Later checks override earlier ones, so the last matching kernel wins.
+ #if defined(HAS_I422TOAR30ROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I422ToAR30Row =
+         IS_ALIGNED(width, 8) ? I422ToAR30Row_SSSE3 : I422ToAR30Row_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I422TOAR30ROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I422ToAR30Row =
+         IS_ALIGNED(width, 16) ? I422ToAR30Row_AVX2 : I422ToAR30Row_Any_AVX2;
+   }
+ #endif
+
+   for (int y = 0; y < height; ++y) {
+     I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+     dst_ar30 += dst_stride_ar30;
+     src_y += src_stride_y;
+     // One chroma row serves two luma rows.
+     if ((y & 1) != 0) {
+       src_u += src_stride_u;
+       src_v += src_stride_v;
+     }
+   }
+   return 0;
+ }
+
+ // Converts I420 to AR30 with the kYuvI601Constants matrix.
+ // All work, including argument validation, is delegated to I420ToAR30Matrix
+ // and its return value is passed through unchanged.
+ LIBYUV_API
+ int I420ToAR30(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_ar30,
+                int dst_stride_ar30,
+                int width,
+                int height) {
+   const struct YuvConstants* const matrix = &kYuvI601Constants;
+   return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_ar30, dst_stride_ar30, matrix,
+                           width, height);
+ }
+
+ // Converts H420 to AR30 with the H709 matrix.
+ // All work, including argument validation, is delegated to I420ToAR30Matrix
+ // and its return value is passed through unchanged.
+ //
+ // The U and V planes are forwarded in their natural order, so the "Yuv"
+ // matrix is used here. The "Yvu" matrices are paired with swapped U/V planes
+ // by the AB30 wrappers in this file; combining unswapped planes with a "Yvu"
+ // matrix would mix the two conventions.
+ // NOTE(review): confirm against the kYuvH709Constants / kYvuH709Constants
+ // definitions, which are not visible from this function.
+ LIBYUV_API
+ int H420ToAR30(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_ar30,
+                int dst_stride_ar30,
+                int width,
+                int height) {
+   return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_ar30, dst_stride_ar30,
+                           &kYuvH709Constants, width, height);
+ }
+
+ // Converts I420 to AB30.
+ // Implemented by calling I420ToAR30Matrix with the U and V planes (and their
+ // strides) exchanged and the kYvuI601Constants matrix; the return value is
+ // passed through unchanged.
+ LIBYUV_API
+ int I420ToAB30(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_ab30,
+                int dst_stride_ab30,
+                int width,
+                int height) {
+   const struct YuvConstants* const matrix = &kYvuI601Constants;
+   // V is passed in the U slot and U in the V slot.
+   return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                           src_stride_u, dst_ab30, dst_stride_ab30, matrix,
+                           width, height);
+ }
+
+ // Converts H420 to AB30.
+ // Implemented by calling I420ToAR30Matrix with the U and V planes (and their
+ // strides) exchanged and the kYvuH709Constants matrix; the return value is
+ // passed through unchanged.
+ LIBYUV_API
+ int H420ToAB30(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_ab30,
+                int dst_stride_ab30,
+                int width,
+                int height) {
+   const struct YuvConstants* const matrix = &kYvuH709Constants;
+   // V is passed in the U slot and U in the V slot.
+   return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                           src_stride_u, dst_ab30, dst_stride_ab30, matrix,
+                           width, height);
+ }
+
+ // Converts I420 to ARGB. Chroma rows are expanded with a ScaleRowUp2_Bilinear
+ // kernel into temporary rows, which are then passed with the matching Y row
+ // to an I444ToARGB row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
+                                     int src_stride_y,
+                                     const uint8_t* src_u,
+                                     int src_stride_u,
+                                     const uint8_t* src_v,
+                                     int src_stride_v,
+                                     uint8_t* dst_argb,
+                                     int dst_stride_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width,
+                                     int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I444ToARGBRow_C;
+   void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                       uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+       ScaleRowUp2_Bilinear_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I444TOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_SSSE3 : I444ToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 16) ? I444ToARGBRow_AVX2 : I444ToARGBRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_NEON : I444ToARGBRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_MSA : I444ToARGBRow_Any_MSA;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_LASX)
+   if (TestCpuFlag(kCpuHasLASX)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 32) ? I444ToARGBRow_LASX : I444ToARGBRow_Any_LASX;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+   }
+ #endif
+
+   // Four temporary rows (two U, then two V), each kRowSize bytes, where
+   // kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 4);
+   uint8_t* const temp_u_1 = row;
+   uint8_t* const temp_u_2 = temp_u_1 + kRowSize;
+   uint8_t* const temp_v_1 = temp_u_2 + kRowSize;
+   uint8_t* const temp_v_2 = temp_v_1 + kRowSize;
+
+   // First output row: the upsampler is called with a source stride of 0.
+   Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+   Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+   I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+   dst_argb += dst_stride_argb;
+   src_y += src_stride_y;
+
+   // Interior rows, two per pass: (height - 1) / 2 passes in total.
+   for (int pairs_left = (height - 1) / 2; pairs_left > 0; --pairs_left) {
+     Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width);
+
+     I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+
+     I444ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+   }
+
+   // An even height leaves one last row, again upsampled with stride 0.
+   if ((height & 1) == 0) {
+     Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+     I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I422 to ARGB. For every output row the U and V rows are expanded
+ // with a ScaleRowUp2_Linear kernel into temporary rows, which are then passed
+ // with the Y row to an I444ToARGB row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I422ToARGBMatrixLinear(const uint8_t* src_y,
+                                   int src_stride_y,
+                                   const uint8_t* src_u,
+                                   int src_stride_u,
+                                   const uint8_t* src_v,
+                                   int src_stride_v,
+                                   uint8_t* dst_argb,
+                                   int dst_stride_argb,
+                                   const struct YuvConstants* yuvconstants,
+                                   int width,
+                                   int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I444ToARGBRow_C;
+   void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+       ScaleRowUp2_Linear_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I444TOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_SSSE3 : I444ToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 16) ? I444ToARGBRow_AVX2 : I444ToARGBRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_NEON : I444ToARGBRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 8) ? I444ToARGBRow_MSA : I444ToARGBRow_Any_MSA;
+   }
+ #endif
+ #if defined(HAS_I444TOARGBROW_LASX)
+   if (TestCpuFlag(kCpuHasLASX)) {
+     I444ToARGBRow =
+         IS_ALIGNED(width, 32) ? I444ToARGBRow_LASX : I444ToARGBRow_Any_LASX;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+   }
+ #endif
+
+   // Two temporary rows (U then V), each kRowSize bytes, where kRowSize is
+   // width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 2);
+   uint8_t* const temp_u = row;
+   uint8_t* const temp_v = temp_u + kRowSize;
+
+   for (int rows_left = height; rows_left > 0; --rows_left) {
+     ScaleRowUp(src_u, temp_u, width);
+     ScaleRowUp(src_v, temp_v, width);
+     I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+     // Every plane advances one row per output row.
+     src_y += src_stride_y;
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+     dst_argb += dst_stride_argb;
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I010 (16-bit samples) to AR30. Chroma rows are expanded with a
+ // 16-bit ScaleRowUp2_Bilinear kernel into temporary rows, which are then
+ // passed with the matching Y row to an I410ToAR30 row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
+                                     int src_stride_y,
+                                     const uint16_t* src_u,
+                                     int src_stride_u,
+                                     const uint16_t* src_v,
+                                     int src_stride_v,
+                                     uint8_t* dst_ar30,
+                                     int dst_stride_ar30,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width,
+                                     int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                         const uint16_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I410ToAR30Row_C;
+   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+       ScaleRowUp2_Bilinear_16_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_ar30) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_ar30 += (height - 1) * dst_stride_ar30;
+     dst_stride_ar30 = -dst_stride_ar30;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I410TOAR30ROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I410ToAR30Row =
+         IS_ALIGNED(width, 8) ? I410ToAR30Row_SSSE3 : I410ToAR30Row_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I410TOAR30ROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I410ToAR30Row =
+         IS_ALIGNED(width, 16) ? I410ToAR30Row_AVX2 : I410ToAR30Row_Any_AVX2;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+   }
+ #endif
+
+   // Four temporary rows (two U, then two V) of kRowSize uint16_t elements
+   // each, where kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t));
+   uint16_t* const temp_u_1 = (uint16_t*)(row);
+   uint16_t* const temp_u_2 = temp_u_1 + kRowSize;
+   uint16_t* const temp_v_1 = temp_u_2 + kRowSize;
+   uint16_t* const temp_v_2 = temp_v_1 + kRowSize;
+
+   // First output row: the upsampler is called with a source stride of 0.
+   Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+   Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+   I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+   dst_ar30 += dst_stride_ar30;
+   src_y += src_stride_y;
+
+   // Interior rows, two per pass: (height - 1) / 2 passes in total.
+   for (int pairs_left = (height - 1) / 2; pairs_left > 0; --pairs_left) {
+     Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width);
+
+     I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+     dst_ar30 += dst_stride_ar30;
+     src_y += src_stride_y;
+
+     I410ToAR30Row(src_y, temp_u_2, temp_v_2, dst_ar30, yuvconstants, width);
+     dst_ar30 += dst_stride_ar30;
+     src_y += src_stride_y;
+
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+   }
+
+   // An even height leaves one last row, again upsampled with stride 0.
+   if ((height & 1) == 0) {
+     Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+     I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width);
+   }
+
+   free_aligned_buffer_64(row);
+
+   return 0;
+ }
+
+ // Converts I210 (16-bit samples) to AR30. For every output row the U and V
+ // rows are expanded with a 16-bit ScaleRowUp2_Linear kernel into temporary
+ // rows, which are then passed with the Y row to an I410ToAR30 row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I210ToAR30MatrixLinear(const uint16_t* src_y,
+                                   int src_stride_y,
+                                   const uint16_t* src_u,
+                                   int src_stride_u,
+                                   const uint16_t* src_v,
+                                   int src_stride_v,
+                                   uint8_t* dst_ar30,
+                                   int dst_stride_ar30,
+                                   const struct YuvConstants* yuvconstants,
+                                   int width,
+                                   int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                         const uint16_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I410ToAR30Row_C;
+   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+                      int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_ar30) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_ar30 += (height - 1) * dst_stride_ar30;
+     dst_stride_ar30 = -dst_stride_ar30;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I410TOAR30ROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I410ToAR30Row =
+         IS_ALIGNED(width, 8) ? I410ToAR30Row_SSSE3 : I410ToAR30Row_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I410TOAR30ROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I410ToAR30Row =
+         IS_ALIGNED(width, 16) ? I410ToAR30Row_AVX2 : I410ToAR30Row_Any_AVX2;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+   }
+ #endif
+
+   // Two temporary rows (U then V) of kRowSize uint16_t elements each, where
+   // kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t));
+   uint16_t* const temp_u = (uint16_t*)(row);
+   uint16_t* const temp_v = temp_u + kRowSize;
+
+   for (int rows_left = height; rows_left > 0; --rows_left) {
+     ScaleRowUp(src_u, temp_u, width);
+     ScaleRowUp(src_v, temp_v, width);
+     I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width);
+     // Every plane advances one row per output row.
+     src_y += src_stride_y;
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+     dst_ar30 += dst_stride_ar30;
+   }
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I010 (16-bit samples) to ARGB. Chroma rows are expanded with a
+ // 16-bit ScaleRowUp2_Bilinear kernel into temporary rows, which are then
+ // passed with the matching Y row to an I410ToARGB row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
+                                     int src_stride_y,
+                                     const uint16_t* src_u,
+                                     int src_stride_u,
+                                     const uint16_t* src_v,
+                                     int src_stride_v,
+                                     uint8_t* dst_argb,
+                                     int dst_stride_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width,
+                                     int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                         const uint16_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I410ToARGBRow_C;
+   void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                       uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+       ScaleRowUp2_Bilinear_16_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I410TOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I410ToARGBRow =
+         IS_ALIGNED(width, 8) ? I410ToARGBRow_SSSE3 : I410ToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I410TOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I410ToARGBRow =
+         IS_ALIGNED(width, 16) ? I410ToARGBRow_AVX2 : I410ToARGBRow_Any_AVX2;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+   }
+ #endif
+
+   // Four temporary rows (two U, then two V) of kRowSize uint16_t elements
+   // each, where kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t));
+   uint16_t* const temp_u_1 = (uint16_t*)(row);
+   uint16_t* const temp_u_2 = temp_u_1 + kRowSize;
+   uint16_t* const temp_v_1 = temp_u_2 + kRowSize;
+   uint16_t* const temp_v_2 = temp_v_1 + kRowSize;
+
+   // First output row: the upsampler is called with a source stride of 0.
+   Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+   Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+   I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+   dst_argb += dst_stride_argb;
+   src_y += src_stride_y;
+
+   // Interior rows, two per pass: (height - 1) / 2 passes in total.
+   for (int pairs_left = (height - 1) / 2; pairs_left > 0; --pairs_left) {
+     Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width);
+
+     I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+
+     I410ToARGBRow(src_y, temp_u_2, temp_v_2, dst_argb, yuvconstants, width);
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+   }
+
+   // An even height leaves one last row, again upsampled with stride 0.
+   if ((height & 1) == 0) {
+     Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+     I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width);
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I210 (16-bit samples) to ARGB. For every output row the U and V
+ // rows are expanded with a 16-bit ScaleRowUp2_Linear kernel into temporary
+ // rows, which are then passed with the Y row to an I410ToARGB row kernel.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I210ToARGBMatrixLinear(const uint16_t* src_y,
+                                   int src_stride_y,
+                                   const uint16_t* src_u,
+                                   int src_stride_u,
+                                   const uint16_t* src_v,
+                                   int src_stride_v,
+                                   uint8_t* dst_argb,
+                                   int dst_stride_argb,
+                                   const struct YuvConstants* yuvconstants,
+                                   int width,
+                                   int height) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                         const uint16_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+       I410ToARGBRow_C;
+   void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+                      int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+
+   if (!src_y || !src_u || !src_v || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I410TOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I410ToARGBRow =
+         IS_ALIGNED(width, 8) ? I410ToARGBRow_SSSE3 : I410ToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I410TOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I410ToARGBRow =
+         IS_ALIGNED(width, 16) ? I410ToARGBRow_AVX2 : I410ToARGBRow_Any_AVX2;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+   }
+ #endif
+
+   // Two temporary rows (U then V) of kRowSize uint16_t elements each, where
+   // kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t));
+   uint16_t* const temp_u = (uint16_t*)(row);
+   uint16_t* const temp_v = temp_u + kRowSize;
+
+   for (int rows_left = height; rows_left > 0; --rows_left) {
+     ScaleRowUp(src_u, temp_u, width);
+     ScaleRowUp(src_v, temp_v, width);
+     I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width);
+     // Every plane advances one row per output row.
+     src_y += src_stride_y;
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+     dst_argb += dst_stride_argb;
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I420 with a separate alpha plane to ARGB. Chroma rows are expanded
+ // with a ScaleRowUp2_Bilinear kernel into temporary rows, which are then
+ // passed with the matching Y and alpha rows to an I444AlphaToARGB row kernel.
+ // When attenuate is non-zero, ARGBAttenuateRow is applied in place to each
+ // converted row.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I420AlphaToARGBMatrixBilinear(
+     const uint8_t* src_y,
+     int src_stride_y,
+     const uint8_t* src_u,
+     int src_stride_u,
+     const uint8_t* src_v,
+     int src_stride_v,
+     const uint8_t* src_a,
+     int src_stride_a,
+     uint8_t* dst_argb,
+     int dst_stride_argb,
+     const struct YuvConstants* yuvconstants,
+     int width,
+     int height,
+     int attenuate) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                              const uint8_t* v_buf, const uint8_t* a_buf,
+                              uint8_t* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) = I444AlphaToARGBRow_C;
+   void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                            int width) = ARGBAttenuateRow_C;
+   void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                       uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+       ScaleRowUp2_Bilinear_Any_C;
+
+   if (!src_y || !src_u || !src_v || !src_a || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_SSSE3
+                                               : I444AlphaToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 16) ? I444AlphaToARGBRow_AVX2
+                                                : I444AlphaToARGBRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_NEON
+                                               : I444AlphaToARGBRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_MSA
+                                               : I444AlphaToARGBRow_Any_MSA;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_LASX)
+   if (TestCpuFlag(kCpuHasLASX)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 16) ? I444AlphaToARGBRow_LASX
+                                                : I444AlphaToARGBRow_Any_LASX;
+   }
+ #endif
+
+   // Attenuation kernel selection.
+ #if defined(HAS_ARGBATTENUATEROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                             : ARGBAttenuateRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                             : ARGBAttenuateRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                             : ARGBAttenuateRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_MSA
+                                             : ARGBAttenuateRow_Any_MSA;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+   }
+ #endif
+
+   // Four temporary rows (two U, then two V), each kRowSize bytes, where
+   // kRowSize is width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 4);
+   uint8_t* const temp_u_1 = row;
+   uint8_t* const temp_u_2 = temp_u_1 + kRowSize;
+   uint8_t* const temp_v_1 = temp_u_2 + kRowSize;
+   uint8_t* const temp_v_2 = temp_v_1 + kRowSize;
+
+   // First output row: the upsampler is called with a source stride of 0.
+   Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+   Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+   I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+                      width);
+   if (attenuate) {
+     ARGBAttenuateRow(dst_argb, dst_argb, width);
+   }
+   dst_argb += dst_stride_argb;
+   src_y += src_stride_y;
+   src_a += src_stride_a;
+
+   // Interior rows, two per pass: (height - 1) / 2 passes in total.
+   for (int pairs_left = (height - 1) / 2; pairs_left > 0; --pairs_left) {
+     Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width);
+
+     I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb,
+                        yuvconstants, width);
+     if (attenuate) {
+       ARGBAttenuateRow(dst_argb, dst_argb, width);
+     }
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+     src_a += src_stride_a;
+
+     I444AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb,
+                        yuvconstants, width);
+     if (attenuate) {
+       ARGBAttenuateRow(dst_argb, dst_argb, width);
+     }
+     dst_argb += dst_stride_argb;
+     src_y += src_stride_y;
+     src_a += src_stride_a;
+
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+   }
+
+   // An even height leaves one last row, again upsampled with stride 0.
+   if ((height & 1) == 0) {
+     Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+     Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+     I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb,
+                        yuvconstants, width);
+     if (attenuate) {
+       ARGBAttenuateRow(dst_argb, dst_argb, width);
+     }
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+ // Converts I422 with a separate alpha plane to ARGB. For every output row the
+ // U and V rows are expanded with a ScaleRowUp2_Linear kernel into temporary
+ // rows, which are then passed with the Y and alpha rows to an I444AlphaToARGB
+ // row kernel. When attenuate is non-zero, ARGBAttenuateRow is applied in
+ // place to each converted row.
+ // Returns -1 when a plane pointer is null, width is not positive or height is
+ // zero; returns 0 otherwise. A negative height flips the output vertically.
+ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
+                                        int src_stride_y,
+                                        const uint8_t* src_u,
+                                        int src_stride_u,
+                                        const uint8_t* src_v,
+                                        int src_stride_v,
+                                        const uint8_t* src_a,
+                                        int src_stride_a,
+                                        uint8_t* dst_argb,
+                                        int dst_stride_argb,
+                                        const struct YuvConstants* yuvconstants,
+                                        int width,
+                                        int height,
+                                        int attenuate) {
+   // Portable C kernels; SIMD variants may replace them below.
+   void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                              const uint8_t* v_buf, const uint8_t* a_buf,
+                              uint8_t* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width) = I444AlphaToARGBRow_C;
+   void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                            int width) = ARGBAttenuateRow_C;
+   void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+       ScaleRowUp2_Linear_Any_C;
+
+   if (!src_y || !src_u || !src_v || !src_a || !dst_argb) {
+     return -1;
+   }
+   if (width <= 0 || height == 0) {
+     return -1;
+   }
+   if (height < 0) {
+     // Begin at the last destination row and walk backwards.
+     height = -height;
+     dst_argb += (height - 1) * dst_stride_argb;
+     dst_stride_argb = -dst_stride_argb;
+   }
+
+   // Row converter selection; later checks override earlier ones.
+ #if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_SSSE3
+                                               : I444AlphaToARGBRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 16) ? I444AlphaToARGBRow_AVX2
+                                                : I444AlphaToARGBRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_NEON
+                                               : I444AlphaToARGBRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 8) ? I444AlphaToARGBRow_MSA
+                                               : I444AlphaToARGBRow_Any_MSA;
+   }
+ #endif
+ #if defined(HAS_I444ALPHATOARGBROW_LASX)
+   if (TestCpuFlag(kCpuHasLASX)) {
+     I444AlphaToARGBRow = IS_ALIGNED(width, 16) ? I444AlphaToARGBRow_LASX
+                                                : I444AlphaToARGBRow_Any_LASX;
+   }
+ #endif
+
+   // Attenuation kernel selection.
+ #if defined(HAS_ARGBATTENUATEROW_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 4) ? ARGBAttenuateRow_SSSE3
+                                             : ARGBAttenuateRow_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_AVX2
+                                             : ARGBAttenuateRow_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_NEON
+                                             : ARGBAttenuateRow_Any_NEON;
+   }
+ #endif
+ #if defined(HAS_ARGBATTENUATEROW_MSA)
+   if (TestCpuFlag(kCpuHasMSA)) {
+     ARGBAttenuateRow = IS_ALIGNED(width, 8) ? ARGBAttenuateRow_MSA
+                                             : ARGBAttenuateRow_Any_MSA;
+   }
+ #endif
+
+   // Chroma upsampler selection; only the "Any" variants are used here.
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
+   if (TestCpuFlag(kCpuHasSSE2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3)
+   if (TestCpuFlag(kCpuHasSSSE3)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_AVX2)
+   if (TestCpuFlag(kCpuHasAVX2)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+   }
+ #endif
+ #if defined(HAS_SCALEROWUP2_LINEAR_NEON)
+   if (TestCpuFlag(kCpuHasNEON)) {
+     ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+   }
+ #endif
+
+   // Two temporary rows (U then V), each kRowSize bytes, where kRowSize is
+   // width rounded up to a multiple of 32.
+   const int kRowSize = (width + 31) & ~31;
+   align_buffer_64(row, kRowSize * 2);
+   uint8_t* const temp_u = row;
+   uint8_t* const temp_v = temp_u + kRowSize;
+
+   for (int rows_left = height; rows_left > 0; --rows_left) {
+     ScaleRowUp(src_u, temp_u, width);
+     ScaleRowUp(src_v, temp_v, width);
+     I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+                        width);
+     if (attenuate) {
+       ARGBAttenuateRow(dst_argb, dst_argb, width);
+     }
+     // Every plane advances one row per output row.
+     src_y += src_stride_y;
+     src_u += src_stride_u;
+     src_v += src_stride_v;
+     src_a += src_stride_a;
+     dst_argb += dst_stride_argb;
+   }
+
+   free_aligned_buffer_64(row);
+   return 0;
+ }
+
+// Convert I010 with an alpha plane to ARGB, running the chroma planes through
+// the Scale2RowUp (ScaleRowUp2_Bilinear_*) kernels before color conversion.
+//
+// Each Scale2RowUp call fills two temporary rows (temp_*_1 and temp_*_2, which
+// sit kRowSize elements apart). The first output row, and the last one when
+// height is even, are produced with a source stride of 0, i.e. from a single
+// chroma row.
+//
+// A negative height flips the destination vertically. When attenuate is
+// non-zero every converted row is additionally passed through ARGBAttenuateRow.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int I010AlphaToARGBMatrixBilinear(
+    const uint16_t* src_y,
+    int src_stride_y,
+    const uint16_t* src_u,
+    int src_stride_u,
+    const uint16_t* src_v,
+    int src_stride_v,
+    const uint16_t* src_a,
+    int src_stride_a,
+    uint8_t* dst_argb,
+    int dst_stride_argb,
+    const struct YuvConstants* yuvconstants,
+    int width,
+    int height,
+    int attenuate) {
+  int y;
+  void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                             const uint16_t* v_buf, const uint16_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I410AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+      ScaleRowUp2_Bilinear_16_Any_C;
+  if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+  }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+  }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+  }
+#endif
+
+  // alloc 4 lines temp
+  const int kRowSize = (width + 31) & ~31;
+  align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_u_1 = (uint16_t*)(row);
+  uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize;
+  uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2;
+  uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3;
+
+  // First output row: chroma comes from the first source row only (stride 0).
+  Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+  Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+  I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+                     width);
+  if (attenuate) {
+    ARGBAttenuateRow(dst_argb, dst_argb, width);
+  }
+  dst_argb += dst_stride_argb;
+  src_y += src_stride_y;
+  src_a += src_stride_a;
+
+  // Middle rows: one Scale2RowUp call per plane yields the chroma for two
+  // consecutive output rows.
+  for (y = 0; y < height - 2; y += 2) {
+    Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width);
+    Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width);
+    I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_a += src_stride_a;
+    I410AlphaToARGBRow(src_y, temp_u_2, temp_v_2, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_a += src_stride_a;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+
+  // Even height: the final output row again uses a single chroma row.
+  if (!(height & 1)) {
+    Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width);
+    Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width);
+    I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Convert I210 with an alpha plane to ARGB. For every output row the U and V
+// source rows are passed through ScaleRowUp (ScaleRowUp2_Linear_*) into two
+// temporary rows, which are then converted together with the Y and A rows.
+//
+// A negative height flips the destination vertically. When attenuate is
+// non-zero every converted row is additionally passed through ARGBAttenuateRow.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
+                                       int src_stride_y,
+                                       const uint16_t* src_u,
+                                       int src_stride_u,
+                                       const uint16_t* src_v,
+                                       int src_stride_v,
+                                       const uint16_t* src_a,
+                                       int src_stride_a,
+                                       uint8_t* dst_argb,
+                                       int dst_stride_argb,
+                                       const struct YuvConstants* yuvconstants,
+                                       int width,
+                                       int height,
+                                       int attenuate) {
+  int y;
+  void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                             const uint16_t* v_buf, const uint16_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width) = I410AlphaToARGBRow_C;
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+                     int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+  if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
+
+#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+  }
+#endif
+#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+  }
+#endif
+
+  // alloc 2 lines temp
+  const int kRowSize = (width + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_u = (uint16_t*)(row);
+  uint16_t* temp_v = (uint16_t*)(row) + kRowSize;
+
+  // Every plane advances by one row per output row.
+  for (y = 0; y < height; ++y) {
+    ScaleRowUp(src_u, temp_u, width);
+    ScaleRowUp(src_v, temp_v, width);
+    I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants,
+                       width);
+    if (attenuate) {
+      ARGBAttenuateRow(dst_argb, dst_argb, width);
+    }
+    dst_argb += dst_stride_argb;
+    src_a += src_stride_a;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Convert P010 (Y plane plus interleaved UV plane) to ARGB, running the UV
+// plane through the Scale2RowUp (ScaleUVRowUp2_Bilinear_16_*) kernels before
+// color conversion.
+//
+// Each Scale2RowUp call fills two temporary UV rows (temp_uv_1 and temp_uv_2,
+// kRowSize elements apart). The first output row, and the last one when height
+// is even, are produced with a source stride of 0, i.e. from a single UV row.
+//
+// A negative height flips the destination vertically.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
+                                    int src_stride_y,
+                                    const uint16_t* src_uv,
+                                    int src_stride_uv,
+                                    uint8_t* dst_argb,
+                                    int dst_stride_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width,
+                                    int height) {
+  int y;
+  void (*P410ToARGBRow)(
+      const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+      ScaleUVRowUp2_Bilinear_16_Any_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToARGBRow = P410ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      P410ToARGBRow = P410ToARGBRow_AVX2;
+    }
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+  }
+#endif
+
+  // alloc 2 lines temp
+  const int kRowSize = (2 * width + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_uv_1 = (uint16_t*)(row);
+  uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize;
+
+  // First output row: chroma comes from the first UV row only (stride 0).
+  Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width);
+  P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+  dst_argb += dst_stride_argb;
+  src_y += src_stride_y;
+
+  // Middle rows: one Scale2RowUp call yields the chroma for two output rows.
+  for (y = 0; y < height - 2; y += 2) {
+    Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width);
+    P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    P410ToARGBRow(src_y, temp_uv_2, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_uv += src_stride_uv;
+  }
+
+  // Even height: the final output row again uses a single UV row.
+  if (!(height & 1)) {
+    Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width);
+    P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width);
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Convert P210 (Y plane plus interleaved UV plane) to ARGB. For every output
+// row the UV source row is passed through ScaleRowUp
+// (ScaleUVRowUp2_Linear_16_*) into a temporary row, which is then converted
+// together with the Y row.
+//
+// A negative height flips the destination vertically.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int P210ToARGBMatrixLinear(const uint16_t* src_y,
+                                  int src_stride_y,
+                                  const uint16_t* src_uv,
+                                  int src_stride_uv,
+                                  uint8_t* dst_argb,
+                                  int dst_stride_argb,
+                                  const struct YuvConstants* yuvconstants,
+                                  int width,
+                                  int height) {
+  int y;
+  void (*P410ToARGBRow)(
+      const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C;
+  void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
+      ScaleUVRowUp2_Linear_16_Any_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_P410TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    P410ToARGBRow = P410ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToARGBRow = P410ToARGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_P410TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    P410ToARGBRow = P410ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      P410ToARGBRow = P410ToARGBRow_AVX2;
+    }
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+  }
+#endif
+
+  // alloc 1 line temp
+  const int kRowSize = (2 * width + 31) & ~31;
+  align_buffer_64(row, kRowSize * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_uv = (uint16_t*)(row);
+
+  // Both planes advance by one row per output row.
+  for (y = 0; y < height; ++y) {
+    ScaleRowUp(src_uv, temp_uv, width);
+    P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    src_uv += src_stride_uv;
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Convert P010 (Y plane plus interleaved UV plane) to AR30, running the UV
+// plane through the Scale2RowUp (ScaleUVRowUp2_Bilinear_16_*) kernels before
+// color conversion.
+//
+// Each Scale2RowUp call fills two temporary UV rows (temp_uv_1 and temp_uv_2,
+// kRowSize elements apart). The first output row, and the last one when height
+// is even, are produced with a source stride of 0, i.e. from a single UV row.
+//
+// A negative height flips the destination vertically.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
+                                    int src_stride_y,
+                                    const uint16_t* src_uv,
+                                    int src_stride_uv,
+                                    uint8_t* dst_ar30,
+                                    int dst_stride_ar30,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width,
+                                    int height) {
+  int y;
+  void (*P410ToAR30Row)(
+      const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+      ScaleUVRowUp2_Bilinear_16_Any_C;
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToAR30Row = P410ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      P410ToAR30Row = P410ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+  }
+#endif
+
+  // alloc 2 lines temp
+  const int kRowSize = (2 * width + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_uv_1 = (uint16_t*)(row);
+  uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize;
+
+  // First output row: chroma comes from the first UV row only (stride 0).
+  Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width);
+  P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+  dst_ar30 += dst_stride_ar30;
+  src_y += src_stride_y;
+
+  // Middle rows: one Scale2RowUp call yields the chroma for two output rows.
+  for (y = 0; y < height - 2; y += 2) {
+    Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width);
+    P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    P410ToAR30Row(src_y, temp_uv_2, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    src_uv += src_stride_uv;
+  }
+
+  // Even height: the final output row again uses a single UV row.
+  if (!(height & 1)) {
+    Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width);
+    P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width);
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Convert P210 (Y plane plus interleaved UV plane) to AR30. For every output
+// row the UV source row is passed through ScaleRowUp
+// (ScaleUVRowUp2_Linear_16_*) into a temporary row, which is then converted
+// together with the Y row.
+//
+// A negative height flips the destination vertically.
+//
+// Returns 0 on success, -1 on invalid arguments, 1 if the temporary row
+// buffer could not be allocated.
+static int P210ToAR30MatrixLinear(const uint16_t* src_y,
+                                  int src_stride_y,
+                                  const uint16_t* src_uv,
+                                  int src_stride_uv,
+                                  uint8_t* dst_ar30,
+                                  int dst_stride_ar30,
+                                  const struct YuvConstants* yuvconstants,
+                                  int width,
+                                  int height) {
+  int y;
+  void (*P410ToAR30Row)(
+      const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C;
+  void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
+      ScaleUVRowUp2_Linear_16_Any_C;
+  if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+#if defined(HAS_P410TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    P410ToAR30Row = P410ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToAR30Row = P410ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_P410TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    P410ToAR30Row = P410ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      P410ToAR30Row = P410ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+  if (TestCpuFlag(kCpuHasSSE41)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+  }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+  }
+#endif
+
+  // alloc 1 line temp
+  const int kRowSize = (2 * width + 31) & ~31;
+  align_buffer_64(row, kRowSize * sizeof(uint16_t));
+  // Bail out instead of writing through a NULL buffer when allocation fails.
+  if (!row) {
+    return 1;
+  }
+  uint16_t* temp_uv = (uint16_t*)(row);
+
+  // Both planes advance by one row per output row.
+  for (y = 0; y < height; ++y) {
+    ScaleRowUp(src_uv, temp_uv, width);
+    P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    src_uv += src_stride_uv;
+  }
+
+  free_aligned_buffer_64(row);
+  return 0;
+}
+
+// Selects the I420 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I420ToARGBMatrix; kFilterBilinear and kFilterBox use
+// I420ToARGBMatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int I420ToARGBMatrixFilter(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return I420ToARGBMatrixBilinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_argb, dst_stride_argb, yuvconstants, width, height);
+  }
+  // kFilterLinear could be supported, but there is probably no usage for it;
+  // it is rejected together with any unknown filter value.
+  return -1;
+}
+
+// Selects the I422 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I422ToARGBMatrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use I422ToARGBMatrixLinear. Any other value returns -1.
+LIBYUV_API
+int I422ToARGBMatrixFilter(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return I422ToARGBMatrixLinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_argb, dst_stride_argb, yuvconstants, width, height);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the I010 -> AR30 conversion routine according to |filter|.
+// kFilterNone uses I010ToAR30Matrix; kFilterBilinear and kFilterBox use
+// I010ToAR30MatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int I010ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_ar30, dst_stride_ar30,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return I010ToAR30MatrixBilinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the I210 -> AR30 conversion routine according to |filter|.
+// kFilterNone uses I210ToAR30Matrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use I210ToAR30MatrixLinear. Any other value returns -1.
+LIBYUV_API
+int I210ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_ar30, dst_stride_ar30,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return I210ToAR30MatrixLinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_ar30, dst_stride_ar30, yuvconstants, width, height);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the I010 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I010ToARGBMatrix; kFilterBilinear and kFilterBox use
+// I010ToARGBMatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int I010ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return I010ToARGBMatrixBilinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_argb, dst_stride_argb, yuvconstants, width, height);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the I210 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I210ToARGBMatrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use I210ToARGBMatrixLinear. Any other value returns -1.
+LIBYUV_API
+int I210ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_u,
+                           int src_stride_u,
+                           const uint16_t* src_v,
+                           int src_stride_v,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return I210ToARGBMatrixLinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+        dst_argb, dst_stride_argb, yuvconstants, width, height);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the I420 + alpha -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I420AlphaToARGBMatrix; kFilterBilinear and kFilterBox use
+// I420AlphaToARGBMatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int I420AlphaToARGBMatrixFilter(const uint8_t* src_y,
+                                int src_stride_y,
+                                const uint8_t* src_u,
+                                int src_stride_u,
+                                const uint8_t* src_v,
+                                int src_stride_v,
+                                const uint8_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+                                 src_v, src_stride_v, src_a, src_stride_a,
+                                 dst_argb, dst_stride_argb, yuvconstants,
+                                 width, height, attenuate);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return I420AlphaToARGBMatrixBilinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+        src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+        attenuate);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the I422 + alpha -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I422AlphaToARGBMatrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use I422AlphaToARGBMatrixLinear. Any other value
+// returns -1.
+LIBYUV_API
+int I422AlphaToARGBMatrixFilter(const uint8_t* src_y,
+                                int src_stride_y,
+                                const uint8_t* src_u,
+                                int src_stride_u,
+                                const uint8_t* src_v,
+                                int src_stride_v,
+                                const uint8_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I422AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+                                 src_v, src_stride_v, src_a, src_stride_a,
+                                 dst_argb, dst_stride_argb, yuvconstants,
+                                 width, height, attenuate);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return I422AlphaToARGBMatrixLinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+        src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+        attenuate);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the I010 + alpha -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I010AlphaToARGBMatrix; kFilterBilinear and kFilterBox use
+// I010AlphaToARGBMatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int I010AlphaToARGBMatrixFilter(const uint16_t* src_y,
+                                int src_stride_y,
+                                const uint16_t* src_u,
+                                int src_stride_u,
+                                const uint16_t* src_v,
+                                int src_stride_v,
+                                const uint16_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I010AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+                                 src_v, src_stride_v, src_a, src_stride_a,
+                                 dst_argb, dst_stride_argb, yuvconstants,
+                                 width, height, attenuate);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return I010AlphaToARGBMatrixBilinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+        src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+        attenuate);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the I210 + alpha -> ARGB conversion routine according to |filter|.
+// kFilterNone uses I210AlphaToARGBMatrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use I210AlphaToARGBMatrixLinear. Any other value
+// returns -1.
+LIBYUV_API
+int I210AlphaToARGBMatrixFilter(const uint16_t* src_y,
+                                int src_stride_y,
+                                const uint16_t* src_u,
+                                int src_stride_u,
+                                const uint16_t* src_v,
+                                int src_stride_v,
+                                const uint16_t* src_a,
+                                int src_stride_a,
+                                uint8_t* dst_argb,
+                                int dst_stride_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width,
+                                int height,
+                                int attenuate,
+                                enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return I210AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
+                                 src_v, src_stride_v, src_a, src_stride_a,
+                                 dst_argb, dst_stride_argb, yuvconstants,
+                                 width, height, attenuate);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return I210AlphaToARGBMatrixLinear(
+        src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
+        src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height,
+        attenuate);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the P010 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses P010ToARGBMatrix; kFilterBilinear and kFilterBox use
+// P010ToARGBMatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int P010ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_argb, dst_stride_argb, yuvconstants, width,
+                            height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv,
+                                    src_stride_uv, dst_argb, dst_stride_argb,
+                                    yuvconstants, width, height);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the P210 -> ARGB conversion routine according to |filter|.
+// kFilterNone uses P210ToARGBMatrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use P210ToARGBMatrixLinear. Any other value returns -1.
+LIBYUV_API
+int P210ToARGBMatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return P210ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_argb, dst_stride_argb, yuvconstants, width,
+                            height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return P210ToARGBMatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+                                  dst_argb, dst_stride_argb, yuvconstants,
+                                  width, height);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
+// Selects the P010 -> AR30 conversion routine according to |filter|.
+// kFilterNone uses P010ToAR30Matrix; kFilterBilinear and kFilterBox use
+// P010ToAR30MatrixBilinear. Any other value, including kFilterLinear,
+// returns -1.
+LIBYUV_API
+int P010ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_ar30, dst_stride_ar30, yuvconstants, width,
+                            height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox) {
+    return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv,
+                                    src_stride_uv, dst_ar30, dst_stride_ar30,
+                                    yuvconstants, width, height);
+  }
+  // kFilterLinear and unknown filter values are not supported.
+  return -1;
+}
+
+// Selects the P210 -> AR30 conversion routine according to |filter|.
+// kFilterNone uses P210ToAR30Matrix; kFilterBilinear, kFilterBox and
+// kFilterLinear all use P210ToAR30MatrixLinear. Any other value returns -1.
+LIBYUV_API
+int P210ToAR30MatrixFilter(const uint16_t* src_y,
+                           int src_stride_y,
+                           const uint16_t* src_uv,
+                           int src_stride_uv,
+                           uint8_t* dst_ar30,
+                           int dst_stride_ar30,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height,
+                           enum FilterMode filter) {
+  if (filter == kFilterNone) {
+    return P210ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                            dst_ar30, dst_stride_ar30, yuvconstants, width,
+                            height);
+  }
+  if (filter == kFilterBilinear || filter == kFilterBox ||
+      filter == kFilterLinear) {
+    return P210ToAR30MatrixLinear(src_y, src_stride_y, src_uv, src_stride_uv,
+                                  dst_ar30, dst_stride_ar30, yuvconstants,
+                                  width, height);
+  }
+  // Unknown filter value.
+  return -1;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index 60140cb4..8bd07e4c 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -30,6 +30,8 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
+// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
+
static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -83,7 +85,8 @@ int I420ToI010(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -109,6 +112,51 @@ int I420ToI010(const uint8_t* src_y,
return 0;
}
+// Convert 8 bit YUV to 12 bit.
+// Each plane is passed to Convert8To16Plane with a scale of 4096. The U and V
+// planes are (width + 1) / 2 by (height + 1) / 2.
+// dst_y may be NULL, in which case the Y plane is skipped and src_y is not
+// read. A negative height reads the source planes bottom-up.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I420ToI012(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    // src_y is optional; avoid pointer arithmetic on a NULL pointer.
+    if (src_y) {
+      src_y = src_y + (height - 1) * src_stride_y;
+    }
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane. The argument check above accepts a NULL dst_y, so the
+  // plane must be skipped in that case rather than written through NULL.
+  if (dst_y) {
+    Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width,
+                      height);
+  }
+  // Convert UV planes.
+  Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth,
+                    halfheight);
+  Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth,
+                    halfheight);
+  return 0;
+}
+
// 420 chroma is 1/2 width, 1/2 height
// 422 chroma is 1/2 width, 1x height
LIBYUV_API
@@ -159,6 +207,102 @@ int I420ToI444(const uint8_t* src_y,
dst_uv_height);
}
+// 420 chroma to 444 chroma, 10/12 bit version
+// The U and V planes, sized SUBSAMPLE(width, 1, 1) by SUBSAMPLE(height, 1, 1),
+// are scaled with ScalePlane_12 (kFilterBilinear) to Abs(width) by
+// Abs(height). The Y plane is passed through ScalePlane_12 at its own size,
+// and only when dst_y is not NULL.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I010ToI410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  // Reject NULL planes up front; src_y is only required when dst_y is given.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      width == 0 || height == 0) {
+    return -1;
+  }
+
+  if (dst_y) {
+    ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+                  Abs(width), Abs(height), kFilterBilinear);
+  }
+  ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
+                SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
+                Abs(height), kFilterBilinear);
+  ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
+                SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
+                Abs(height), kFilterBilinear);
+  return 0;
+}
+
+// 422 chroma to 444 chroma, 10/12 bit version
+// The U and V planes, sized SUBSAMPLE(width, 1, 1) by height, are scaled with
+// ScalePlane_12 (kFilterBilinear) to Abs(width) by Abs(height). The Y plane is
+// passed through ScalePlane_12 at its own size, and only when dst_y is not
+// NULL.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I210ToI410(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  // Reject NULL planes up front; src_y is only required when dst_y is given.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      width == 0 || height == 0) {
+    return -1;
+  }
+
+  if (dst_y) {
+    ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+                  Abs(width), Abs(height), kFilterBilinear);
+  }
+  ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+                dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+  ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+                dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+  return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 444 chroma is 1x width, 1x height
+// The U and V planes, sized SUBSAMPLE(width, 1, 1) by height, are scaled with
+// ScalePlane (kFilterBilinear) to Abs(width) by Abs(height). The Y plane is
+// passed through ScalePlane at its own size, and only when dst_y is not NULL.
+// Returns 0 on success, -1 on invalid arguments.
+LIBYUV_API
+int I422ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  // Reject NULL planes up front; src_y is only required when dst_y is given.
+  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
+      width == 0 || height == 0) {
+    return -1;
+  }
+
+  if (dst_y) {
+    ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
+               Abs(width), Abs(height), kFilterBilinear);
+  }
+  ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
+             dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
+             dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
+  return 0;
+}
+
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8_t* src_y,
@@ -302,11 +446,11 @@ int I420ToYUY2(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
}
}
#endif
@@ -389,11 +533,11 @@ int I422ToUYVY(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
}
}
#endif
@@ -464,11 +608,11 @@ int I420ToUYVY(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
}
}
#endif
@@ -488,7 +632,6 @@ int I420ToUYVY(const uint8_t* src_y,
return 0;
}
-// TODO(fbarchard): test negative height for invert.
LIBYUV_API
int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
@@ -502,12 +645,22 @@ int I420ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
- if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
- height == 0) {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ if (!src_y || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) {
return -1;
}
- int halfwidth = (width + 1) / 2;
- int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
@@ -534,807 +687,6 @@ int I420ToNV21(const uint8_t* src_y,
width, height);
}
-// Convert I422 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGBA.
-LIBYUV_API
-int I420ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to BGRA.
-LIBYUV_API
-int I420ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB24Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
- dst_stride_rgb24 = -dst_stride_rgb24;
- }
-#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- I422ToRGB24Row = I422ToRGB24Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB24Row = I422ToRGB24Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB24ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB24Row = I422ToRGB24Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
- dst_rgb24 += dst_stride_rgb24;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB24.
-LIBYUV_API
-int I420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I420 to RAW.
-LIBYUV_API
-int I420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert H420 to RGB24.
-LIBYUV_API
-int H420ToRGB24(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb24, dst_stride_rgb24,
- &kYuvH709Constants, width, height);
-}
-
-// Convert H420 to RAW.
-LIBYUV_API
-int H420ToRAW(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_raw,
- int dst_stride_raw,
- int width,
- int height) {
- return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_raw, dst_stride_raw,
- &kYvuH709Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert I420 to ARGB1555.
-LIBYUV_API
-int I420ToARGB1555(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb1555,
- int dst_stride_argb1555,
- int width,
- int height) {
- int y;
- void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB1555Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
- dst_stride_argb1555 = -dst_stride_argb1555;
- }
-#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB1555ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB1555Row = I422ToARGB1555Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
- width);
- dst_argb1555 += dst_stride_argb1555;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to ARGB4444.
-LIBYUV_API
-int I420ToARGB4444(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_argb4444,
- int dst_stride_argb4444,
- int width,
- int height) {
- int y;
- void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) = I422ToARGB4444Row_C;
- if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
- height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
- dst_stride_argb4444 = -dst_stride_argb4444;
- }
-#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGB4444ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGB4444Row = I422ToARGB4444Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
- width);
- dst_argb4444 += dst_stride_argb4444;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565 with specified color matrix.
-LIBYUV_API
-int I420ToRGB565Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to RGB565.
-LIBYUV_API
-int I420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvI601Constants, width, height);
-}
-
-// Convert J420 to RGB565.
-LIBYUV_API
-int J420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvJPEGConstants, width, height);
-}
-
-// Convert H420 to RGB565.
-LIBYUV_API
-int H420ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgb565, dst_stride_rgb565,
- &kYuvH709Constants, width, height);
-}
-
-// Convert I422 to RGB565.
-LIBYUV_API
-int I422ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGB565Row_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGB565Row = I422ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGB565Row = I422ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
-static const uint8_t kDither565_4x4[16] = {
- 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
-};
-
-// Convert I420 to RGB565 with dithering.
-LIBYUV_API
-int I420ToRGB565Dither(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- const uint8_t* dither4x4,
- int width,
- int height) {
- int y;
- void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
- const uint32_t dither4, int width) =
- ARGBToRGB565DitherRow_C;
- if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
- if (!dither4x4) {
- dither4x4 = kDither565_4x4;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToARGBRow = I422ToARGBRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_MSA;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
- }
- }
-#endif
- {
- // Allocate a row of argb.
- align_buffer_64(row_argb, width * 4);
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
- ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
- width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row_argb);
- }
- return 0;
-}
-
-// Convert I420 to AR30 with matrix
-static int I420ToAR30Matrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToAR30Row_C;
-
- if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
- dst_stride_ar30 = -dst_stride_ar30;
- }
-
-#if defined(HAS_I422TOAR30ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToAR30Row = I422ToAR30Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOAR30ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToAR30Row = I422ToAR30Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToAR30Row = I422ToAR30Row_AVX2;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
- dst_ar30 += dst_stride_ar30;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- return 0;
-}
-
-// Convert I420 to AR30.
-LIBYUV_API
-int I420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYuvI601Constants, width, height);
-}
-
-// Convert H420 to AR30.
-LIBYUV_API
-int H420ToAR30(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_ar30,
- int dst_stride_ar30,
- int width,
- int height) {
- return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_ar30, dst_stride_ar30,
- &kYvuH709Constants, width, height);
-}
-
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8_t* y,
@@ -1421,7 +773,8 @@ int ConvertFromI420(const uint8_t* y,
height);
break;
case FOURCC_NV12: {
- uint8_t* dst_uv = dst_sample + width * height;
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_uv = dst_sample + dst_y_stride * height;
r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_uv,
dst_sample_stride ? dst_sample_stride : width, width,
@@ -1429,14 +782,14 @@ int ConvertFromI420(const uint8_t* y,
break;
}
case FOURCC_NV21: {
- uint8_t* dst_vu = dst_sample + width * height;
+ int dst_y_stride = dst_sample_stride ? dst_sample_stride : width;
+ uint8_t* dst_vu = dst_sample + dst_y_stride * height;
r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_vu,
dst_sample_stride ? dst_sample_stride : width, width,
height);
break;
}
- // TODO(fbarchard): Add M420.
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
index fbcd039d..e50c2af3 100644
--- a/files/source/convert_from_argb.cc
+++ b/files/source/convert_from_argb.cc
@@ -76,11 +76,11 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOUV444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToUV444Row = ARGBToUV444Row_MMI;
+#if defined(HAS_ARGBTOUV444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV444Row = ARGBToUV444Row_LASX;
}
}
#endif
@@ -103,7 +103,7 @@ int ARGBToI444(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -116,11 +116,11 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
}
}
#endif
@@ -170,30 +170,42 @@ int ARGBToI422(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -206,37 +218,26 @@ int ARGBToI422(const uint8_t* src_argb,
}
}
#endif
-
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
@@ -279,30 +280,10 @@ int ARGBToNV12(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
- ARGBToYRow = ARGBToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
- ARGBToYRow = ARGBToYRow_AVX2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -315,35 +296,57 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToYRow = ARGBToYRow_Any_MSA;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_MSA;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_MSA;
+ ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+ ARGBToYRow = ARGBToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
@@ -379,11 +382,11 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
@@ -439,30 +442,42 @@ int ARGBToNV21(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -475,39 +490,28 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
-
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -540,11 +544,11 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
@@ -599,30 +603,42 @@ int ABGRToNV12(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON;
}
}
@@ -635,35 +651,167 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MSA)
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_MSA;
}
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
}
#endif
-#if defined(HAS_ABGRTOUVROW_MSA)
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
- ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+ {
+ // Allocate one row each of U and V (V follows U in the same buffer).
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ABGRToNV21(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ABGRToUVRow = ABGRToUVRow_MSA;
+ ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
-#if defined(HAS_ABGRTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToYRow = ABGRToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_MMI;
+#if defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
-#if defined(HAS_ABGRTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ABGRToUVRow = ABGRToUVRow_Any_MMI;
+#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_MMI;
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
}
}
#endif
@@ -699,11 +847,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow_ = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- MergeUVRow_ = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
}
}
#endif
@@ -714,16 +862,16 @@ int ABGRToNV12(const uint8_t* src_abgr,
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
+ dst_vu += dst_stride_vu;
}
if (height & 1) {
ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
ABGRToYRow(src_abgr, dst_y, width);
}
free_aligned_buffer_64(row_u);
@@ -764,30 +912,42 @@ int ARGBToYUY2(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -800,35 +960,25 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
@@ -864,11 +1014,11 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOYUY2ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToYUY2Row = I422ToYUY2Row_MMI;
+#if defined(HAS_I422TOYUY2ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_LASX;
}
}
#endif
@@ -925,30 +1075,42 @@ int ARGBToUYVY(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
- ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
+#if defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -961,35 +1123,25 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MSA)
+#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
+ ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVRow = ARGBToUVRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVRow = ARGBToUVRow_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ ARGBToUVRow = ARGBToUVRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
+ ARGBToUVRow = ARGBToUVRow_LASX;
}
}
#endif
@@ -1025,11 +1177,11 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_I422TOUYVYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- I422ToUYVYRow = I422ToUYVYRow_MMI;
+#if defined(HAS_I422TOUYVYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_LASX;
}
}
#endif
@@ -1097,7 +1249,7 @@ int ARGBToI400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
@@ -1110,11 +1262,11 @@ int ARGBToI400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYRow = ARGBToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_MMI;
+#if defined(HAS_ARGBTOYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYRow = ARGBToYRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_LASX;
}
}
#endif
@@ -1195,7 +1347,7 @@ int ARGBToRGB24(const uint8_t* src_argb,
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
}
}
@@ -1208,11 +1360,11 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_LASX;
}
}
#endif
@@ -1282,11 +1434,11 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORAWROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRAWRow = ARGBToRAWRow_MMI;
+#if defined(HAS_ARGBTORAWROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_LASX;
}
}
#endif
@@ -1360,11 +1512,11 @@ int ARGBToRGB565Dither(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LASX;
}
}
#endif
@@ -1437,11 +1589,11 @@ int ARGBToRGB565(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTORGB565ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+#if defined(HAS_ARGBTORGB565ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_LASX;
}
}
#endif
@@ -1511,11 +1663,11 @@ int ARGBToARGB1555(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB1555ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+#if defined(HAS_ARGBTOARGB1555ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_LASX;
}
}
#endif
@@ -1585,11 +1737,11 @@ int ARGBToARGB4444(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOARGB4444ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+#if defined(HAS_ARGBTOARGB4444ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_LASX;
}
}
#endif
@@ -1727,16 +1879,22 @@ int ARGBToJ420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1748,7 +1906,7 @@ int ARGBToJ420(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1761,35 +1919,35 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
@@ -1844,16 +2002,22 @@ int ARGBToJ422(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
}
-#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
@@ -1865,7 +2029,7 @@ int ARGBToJ422(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1878,35 +2042,35 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MSA)
+#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
+ ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_MSA;
}
- }
-#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVJROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOUVJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_MMI;
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ ARGBToUVJRow = ARGBToUVJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_LASX;
+ ARGBToUVJRow = ARGBToUVJRow_LASX;
}
}
#endif
@@ -1922,6 +2086,124 @@ int ARGBToJ422(const uint8_t* src_argb,
return 0;
}
+// Convert ARGB to AR64.
+LIBYUV_API
+int ARGBToAR64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
+ int width) = ARGBToAR64Row_C;
+ if (!src_argb || !dst_ar64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_ARGBTOAR64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAR64Row = ARGBToAR64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAR64Row = ARGBToAR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR64Row = ARGBToAR64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAR64Row(src_argb, dst_ar64, width);
+ src_argb += src_stride_argb;
+ dst_ar64 += dst_stride_ar64;
+ }
+ return 0;
+}
+
+// Convert ARGB to AB64.
+LIBYUV_API
+int ARGBToAB64(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint16_t* dst_ab64,
+ int dst_stride_ab64,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
+ int width) = ARGBToAB64Row_C;
+ if (!src_argb || !dst_ab64 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ab64 = 0;
+ }
+#if defined(HAS_ARGBTOAB64ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAB64Row = ARGBToAB64Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAB64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAB64Row = ARGBToAB64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAB64Row = ARGBToAB64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToAB64Row(src_argb, dst_ab64, width);
+ src_argb += src_stride_argb;
+ dst_ab64 += dst_stride_ab64;
+ }
+ return 0;
+}
+
// Convert ARGB to J400.
LIBYUV_API
int ARGBToJ400(const uint8_t* src_argb,
@@ -1966,7 +2248,7 @@ int ARGBToJ400(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -1979,23 +2261,282 @@ int ARGBToJ400(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+// Convert RGBA to J400.
+LIBYUV_API
+int RGBAToJ400(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+ void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
+ RGBAToYJRow_C;
+ if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgba = dst_stride_yj = 0;
+ }
+#if defined(HAS_RGBATOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGBAToYJRow = RGBAToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ RGBAToYJRow = RGBAToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYJRow = RGBAToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGBAToYJRow = RGBAToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToYJRow = RGBAToYJRow_MSA;
}
}
#endif
for (y = 0; y < height; ++y) {
- ARGBToYJRow(src_argb, dst_yj, width);
- src_argb += src_stride_argb;
+ RGBAToYJRow(src_rgba, dst_yj, width);
+ src_rgba += src_stride_rgba;
dst_yj += dst_stride_yj;
}
return 0;
}
+// Enabled if 1 pass is available
+#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA)
+#define HAS_RAWTOYJROW
+#endif
+
+// RAW to JNV21 full range NV21
+LIBYUV_API
+int RAWToJNV21(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+#if defined(HAS_RAWTOYJROW)
+ void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RAWToUVJRow_C;
+ void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
+ RAWToYJRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYJROW)
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVJRow = RAWToUVJRow_Any_NEON;
+ RAWToYJRow = RAWToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_NEON;
+ RAWToUVJRow = RAWToUVJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RAWToUVJRow = RAWToUVJRow_Any_MSA;
+ RAWToYJRow = RAWToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToYJRow = RAWToYJRow_MSA;
+ RAWToUVJRow = RAWToUVJRow_MSA;
+ }
+ }
+#endif
+
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else // HAS_RAWTOYJROW
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ }
+ }
+#endif
+#endif // HAS_RAWTOYJROW
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow_ = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_LSX;
+ }
+ }
+#endif
+ {
+ // Allocate a row of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+#if !defined(HAS_RAWTOYJROW)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+ RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYJROW)
+ RAWToUVJRow(src_raw, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ RAWToYJRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVJRow(row, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYJROW)
+ free_aligned_buffer_64(row);
+#endif
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+#undef HAS_RAWTOYJROW
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
index f440c7c2..d7556ee9 100644
--- a/files/source/convert_jpeg.cc
+++ b/files/source/convert_jpeg.cc
@@ -328,6 +328,140 @@ int MJPGToNV21(const uint8_t* src_mjpg,
return ret ? 0 : 1;
}
+static void JpegI420ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 with VU swapped.
+ I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV12(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ // Use NV21 since there is no UV plane.
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to NV12.
+LIBYUV_API
+int MJPGToNV12(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ // Use NV21Buffers but with UV instead of VU.
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_uv,
+ dst_stride_uv, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
struct ARGBBuffers {
uint8_t* argb;
int argb_stride;
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
index bde1aa88..84df16c8 100644
--- a/files/source/convert_to_argb.cc
+++ b/files/source/convert_to_argb.cc
@@ -32,9 +32,6 @@ extern "C" {
// TODO(fbarchard): Add the following:
// H010ToARGB
// I010ToARGB
-// J400ToARGB
-// J422ToARGB
-// J444ToARGB
LIBYUV_API
int ConvertToARGB(const uint8_t* sample,
@@ -161,6 +158,11 @@ int ConvertToARGB(const uint8_t* sample,
r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+ case FOURCC_J400:
+ src = sample + src_width * crop_y + crop_x;
+ r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
// Biplanar formats
case FOURCC_NV12:
@@ -178,12 +180,6 @@ int ConvertToARGB(const uint8_t* sample,
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
dst_stride_argb, crop_width, inv_crop_height);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
- inv_crop_height);
- break;
-
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
@@ -208,6 +204,19 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
@@ -221,7 +230,7 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
- case FOURCC_J420: {
+ case FOURCC_U420: {
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
@@ -229,7 +238,7 @@ int ConvertToARGB(const uint8_t* sample,
(halfwidth * crop_y + crop_x) / 2;
const uint8_t* src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
- r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
@@ -256,6 +265,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_J422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_H422: {
int halfwidth = (src_width + 1) / 2;
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -268,6 +289,18 @@ int ConvertToARGB(const uint8_t* sample,
break;
}
+ case FOURCC_U422: {  // Uses the U422 converter, matching U420 and U444.
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = U422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
const uint8_t* src_y = sample + src_width * crop_y + crop_x;
@@ -284,6 +317,40 @@ int ConvertToARGB(const uint8_t* sample,
dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_U444: {
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
#ifdef HAVE_JPEG
case FOURCC_MJPG:
r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
index 584be0ac..5869ecd7 100644
--- a/files/source/convert_to_i420.cc
+++ b/files/source/convert_to_i420.cc
@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
switch (format) {
// Single plane formats
- case FOURCC_YUY2:
+ case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix.
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
- case FOURCC_UYVY:
+ }
+ case FOURCC_UYVY: {
+ uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+ uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+ int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+ int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
- dst_stride_u, dst_v, dst_stride_v, crop_width,
- inv_crop_height);
+ r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+ stride_u, v, stride_v, crop_width, inv_crop_height);
break;
+ }
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
@@ -179,11 +187,6 @@ int ConvertToI420(const uint8_t* sample,
dst_stride_y, dst_v, dst_stride_v, dst_u,
dst_stride_u, crop_width, inv_crop_height, rotation);
break;
- case FOURCC_M420:
- src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
- dst_v, dst_stride_v, crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index 48e2b615..56fe60e4 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -20,7 +20,7 @@
#endif
// For ArmCpuCaps() but unittested on all platforms
-#include <stdio.h>
+#include <stdio.h> // For fopen()
#include <string.h>
#ifdef __cplusplus
@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
- "mov %%ebx, %%edi \n"
+ "mov %%ebx, %%edi \n"
"cpuid \n"
- "xchg %%edi, %%ebx \n"
+ "xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"
@@ -133,7 +133,7 @@ int GetXCR0() {
#pragma optimize("g", on)
#endif
-// based on libvpx arm_cpudetect.c
+// Based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
@@ -163,45 +163,54 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
}
// TODO(fbarchard): Consider read_msa_ir().
-// TODO(fbarchard): Add unittest.
-LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
- const char ase[]) {
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
+ int flag = 0x0;
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
- // ase enabled if /proc/cpuinfo is unavailable.
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
+ // Assume nothing if /proc/cpuinfo is unavailable.
+ // This will occur for Chrome sandbox for Pepper or Render process.
return 0;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
- if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
- char* p = strstr(cpuinfo_line, ase);
- if (p) {
- fclose(f);
- if (strcmp(ase, " msa") == 0) {
- return kCpuHasMSA;
- }
- return 0;
+ if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+ // Workaround early kernel without MSA in ASEs line.
+ if (strstr(cpuinfo_line, "Loongson-2K")) {
+ flag |= kCpuHasMSA;
}
- } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
- char* p = strstr(cpuinfo_line, "Loongson-3");
- if (p) {
- fclose(f);
- if (strcmp(ase, " mmi") == 0) {
- return kCpuHasMMI;
- }
- return 0;
+ }
+ if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+ if (strstr(cpuinfo_line, "msa")) {
+ flag |= kCpuHasMSA;
}
+ // ASEs is the last line, so we can break here.
+ break;
}
}
fclose(f);
- return 0;
+ return flag;
+}
+
+// TODO(fbarchard): Consider read_loongarch_ir().
+#define LOONGARCH_CFG2 0x2
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
+
+#if defined(__loongarch__)
+LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
+ int flag = 0x0;
+ uint32_t cfg2 = 0;
+
+ __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2));
+
+ if (cfg2 & LOONGARCH_CFG2_LSX)
+ flag |= kCpuHasLSX;
+
+ if (cfg2 & LOONGARCH_CFG2_LASX)
+ flag |= kCpuHasLASX;
+ return flag;
}
+#endif
static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
@@ -235,6 +244,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0;
cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
@@ -242,13 +252,13 @@ static SAFEBUFFERS int GetCpuFlags(void) {
}
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_msa)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
-#elif defined(_MIPS_ARCH_LOONGSON3A)
- cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
-#endif
+ cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
+#if defined(__loongarch__) && defined(__linux__)
+ cpu_info = LoongarchCpuCaps();
+ cpu_info |= kCpuHasLOONGARCH;
+#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index 5c5e5ead..4ccf00a3 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -417,7 +417,6 @@ void init_source(j_decompress_ptr cinfo) {
boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
- assert(0 && "No more data");
// ERROR: No more data
return FALSE;
}
@@ -430,7 +429,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
jpeg_source_mgr* src = cinfo->src;
size_t bytes = static_cast<size_t>(num_bytes);
- if(bytes > src->bytes_in_buffer) {
+ if (bytes > src->bytes_in_buffer) {
src->next_input_byte = nullptr;
src->bytes_in_buffer = 0;
} else {
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index 9cab230f..169d4a8f 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h"
+#include <assert.h>
#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
@@ -34,6 +35,9 @@ void CopyPlane(const uint8_t* src_y,
int height) {
int y;
void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -80,8 +84,6 @@ void CopyPlane(const uint8_t* src_y,
}
}
-// TODO(fbarchard): Consider support for negative height.
-// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
void CopyPlane_16(const uint16_t* src_y,
int src_stride_y,
@@ -89,36 +91,8 @@ void CopyPlane_16(const uint16_t* src_y,
int dst_stride_y,
int width,
int height) {
- int y;
- void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
- // Coalesce rows.
- if (src_stride_y == width && dst_stride_y == width) {
- width *= height;
- height = 1;
- src_stride_y = dst_stride_y = 0;
- }
-#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_16_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_16_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_16_NEON;
- }
-#endif
-
- // Copy plane
- for (y = 0; y < height; ++y) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
- }
+ CopyPlane((const uint8_t*)src_y, src_stride_y * 2, (uint8_t*)dst_y,
+ dst_stride_y * 2, width * 2, height);
}
// Convert a plane of 16 bit data to 8 bit
@@ -134,6 +108,9 @@ void Convert16To8Plane(const uint16_t* src_y,
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -146,6 +123,14 @@ void Convert16To8Plane(const uint16_t* src_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
+#if defined(HAS_CONVERT16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Convert16To8Row = Convert16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ Convert16To8Row = Convert16To8Row_NEON;
+ }
+ }
+#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
@@ -184,6 +169,9 @@ void Convert8To16Plane(const uint8_t* src_y,
void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
int width) = Convert8To16Row_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -238,9 +226,12 @@ int I422Copy(const uint8_t* src_y,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -276,7 +267,8 @@ int I444Copy(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
- if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -298,6 +290,49 @@ int I444Copy(const uint8_t* src_y,
return 0;
}
+// Copy I210.
+LIBYUV_API
+int I210Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
// Copy I400.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
@@ -349,6 +384,54 @@ int I420ToI400(const uint8_t* src_y,
return 0;
}
+// Copy NV12. Supports inverting.
+// Copies the Y plane and the interleaved UV plane with CopyPlane.
+// Returns 0 on success, or -1 if any pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int NV12Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ // Recompute from the now-positive height before locating the last UV row.
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ // Each UV row holds halfwidth interleaved pairs, i.e. halfwidth * 2 bytes.
+ CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2,
+ halfheight);
+ return 0;
+}
+
+
+// Copy NV21. Supports inverting.
+// Forwards every argument unchanged to NV12Copy and returns its result; the
+// interleaved chroma plane is passed in the same position as NV12's UV plane.
+LIBYUV_API
+int NV21Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
+ dst_stride_y, dst_vu, dst_stride_vu, width, height);
+}
+
// Support function for NV12 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
@@ -363,6 +446,9 @@ void SplitUVPlane(const uint8_t* src_uv,
int y;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -410,11 +496,11 @@ void SplitUVPlane(const uint8_t* src_uv,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
@@ -440,6 +526,9 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -485,11 +574,11 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
#endif
-#if defined(HAS_MERGEUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeUVRow = MergeUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MergeUVRow = MergeUVRow_MMI;
+#if defined(HAS_MERGEUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ MergeUVRow = MergeUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow = MergeUVRow_LSX;
}
}
#endif
@@ -503,6 +592,289 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
+// Support function for P010 etc UV channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitUVPlane_16(const uint16_t* src_uv,
+ int src_stride_uv,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
+ uint16_t* dst_v, int depth, int width) =
+ SplitUVRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow_16 = SplitUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow_16 = SplitUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow_16 = SplitUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Split a row of interleaved UV into separate U and V rows.
+ SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+}
+
+LIBYUV_API
+void MergeUVPlane_16(const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
+ uint16_t* dst_uv, int depth, int width) =
+ MergeUVRow_16_C;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uv = dst_uv + (height - 1) * dst_stride_uv;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_u == width && src_stride_v == width &&
+ dst_stride_uv == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeUVRow_16 = MergeUVRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_16 = MergeUVRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow_16 = MergeUVRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+}
+
+// Convert plane from lsb to msb; rows use MultiplyRow_16, scale 1 << (16 - depth).
+LIBYUV_API
+void ConvertToMSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << (16 - depth);
+ void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = MultiplyRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_MULTIPLYROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MultiplyRow_16 = MultiplyRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MULTIPLYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MultiplyRow_16 = MultiplyRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MultiplyRow_16 = MultiplyRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MultiplyRow_16(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert plane from msb to lsb; rows use DivideRow_16, scale 1 << depth.
+LIBYUV_API
+void ConvertToLSBPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ int scale = 1 << depth;
+ void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+ int width) = DivideRow_16_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+
+#if defined(HAS_DIVIDEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ DivideRow = DivideRow_16_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ DivideRow = DivideRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_DIVIDEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DivideRow = DivideRow_16_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DivideRow = DivideRow_16_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ DivideRow(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
+
// Convert NV21 to NV12.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
@@ -515,49 +887,150 @@ int NV21ToNV12(const uint8_t* src_y,
int dst_stride_uv,
int width,
int height) {
- int y;
- void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
- UVToVURow_C;
-
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src_vu || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
- src_stride_y = -src_stride_y;
src_stride_vu = -src_stride_vu;
}
- // Coalesce rows.
- if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) {
- halfwidth *= halfheight;
- halfheight = 1;
- src_stride_vu = dst_stride_uv = 0;
+
+ SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
+ halfheight);
+ return 0;
+}
+
+// Detile a plane of data.
+// Tile width is assumed to be 16.
+// tile_height is 16 or 32 for MM21; it is used as a mask, so must be a power of 2.
+// src_stride_y is bytes per row of source ignoring tiling, e.g. 640.
+// TODO: More detile row functions.
+
+LIBYUV_API
+void DetilePlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
+ int width) = DetileRow_C;
+ assert(src_stride_y >= 0);
+ assert(tile_height > 0);
+ assert(src_stride_y > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
}
-#if defined(HAS_UVToVUROW_NEON)
+#if defined(HAS_DETILEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ DetileRow = DetileRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_DETILEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- UVToVURow = UVToVURow_Any_NEON;
- if (IS_ALIGNED(halfwidth, 16)) {
- UVToVURow = UVToVURow_NEON;
+ DetileRow = DetileRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileRow = DetileRow_NEON;
}
}
#endif
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+ // Detile plane
+ for (y = 0; y < height; ++y) {
+ DetileRow(src_y, src_tile_stride, dst_y, width);
+ dst_y += dst_stride_y;
+ src_y += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_y = src_y - src_tile_stride + src_stride_y * tile_height;
+ }
}
+}
- for (y = 0; y < halfheight; ++y) {
- UVToVURow(src_vu, dst_uv, halfwidth);
- src_vu += src_stride_vu;
- dst_uv += dst_stride_uv;
+LIBYUV_API
+void DetileSplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int tile_height) {
+ const ptrdiff_t src_tile_stride = 16 * tile_height;
+ int y;
+ void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ DetileSplitUVRow_C;
+ assert(src_stride_uv >= 0);
+ assert(tile_height > 0);
+ assert(src_stride_uv > 0);
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_stride_u = -dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_v = -dst_stride_v;
+ }
+
+#if defined(HAS_DETILESPLITUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_DETILESPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ DetileSplitUVRow = DetileSplitUVRow_NEON;
+ }
+ }
+#endif
+
+ // Detile the UV plane, splitting it into U and V.
+ for (y = 0; y < height; ++y) {
+ DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += 16;
+ // Advance to next row of tiles.
+ if ((y & (tile_height - 1)) == (tile_height - 1)) {
+ src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
+ }
}
- return 0;
}
// Support function for NV12 etc RGB channels.
@@ -576,6 +1049,9 @@ void SplitRGBPlane(const uint8_t* src_rgb,
int y;
void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, int width) = SplitRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -609,14 +1085,6 @@ void SplitRGBPlane(const uint8_t* src_rgb,
}
}
#endif
-#if defined(HAS_SPLITRGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitRGBRow = SplitRGBRow_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- SplitRGBRow = SplitRGBRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
// Copy a row of RGB.
@@ -643,6 +1111,9 @@ void MergeRGBPlane(const uint8_t* src_r,
void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, uint8_t* dst_rgb, int width) =
MergeRGBRow_C;
+ if (width <= 0 || height == 0) {
+ return;
+ }
// Coalesce rows.
// Negative height means invert the image.
if (height < 0) {
@@ -673,86 +1144,673 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
#endif
-#if defined(HAS_MERGERGBROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MergeRGBRow = MergeRGBRow_Any_MMI;
+
+ for (y = 0; y < height; ++y) {
+ // Merge a row of R, G and B into a row of RGB.
+ MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_rgb += dst_stride_rgb;
+ }
+}
+
+LIBYUV_NOINLINE
+void SplitARGBPlaneAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ int y;
+ void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, uint8_t* dst_a, int width) =
+ SplitARGBRow_C;
+
+ assert(height > 0);
+
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
+ dst_stride_a = 0;
+ }
+
+#if defined(HAS_SPLITARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitARGBRow = SplitARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
- MergeRGBRow = MergeRGBRow_MMI;
+ SplitARGBRow = SplitARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitARGBRow = SplitARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitARGBRow = SplitARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitARGBRow = SplitARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitARGBRow = SplitARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitARGBRow = SplitARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
- // Merge a row of U and V into a row of RGB.
- MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+ SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ dst_a += dst_stride_a;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+void SplitARGBPlaneOpaque(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int y;
+ void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, int width) = SplitXRGBRow_C;
+ assert(height > 0);
+
+ if (src_stride_argb == width * 4 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+ }
+
+#if defined(HAS_SPLITXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ SplitXRGBRow = SplitXRGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitXRGBRow = SplitXRGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitXRGBRow = SplitXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitXRGBRow = SplitXRGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ src_argb += src_stride_argb;
+ }
+}
+
+LIBYUV_API
+void SplitARGBPlane(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_r = dst_r + (height - 1) * dst_stride_r;
+ dst_g = dst_g + (height - 1) * dst_stride_g;
+ dst_b = dst_b + (height - 1) * dst_stride_b;
+ dst_a = dst_a + (height - 1) * dst_stride_a;
+ dst_stride_r = -dst_stride_r;
+ dst_stride_g = -dst_stride_g;
+ dst_stride_b = -dst_stride_b;
+ dst_stride_a = -dst_stride_a;
+ }
+
+ if (dst_a == NULL) {
+ SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, width, height);
+ } else {
+ SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g,
+ dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a,
+ width, height);
+ }
+}
+
+LIBYUV_NOINLINE
+void MergeARGBPlaneAlpha(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, const uint8_t* src_a,
+ uint8_t* dst_argb, int width) = MergeARGBRow_C;
+
+ assert(height > 0);
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeARGBRow = MergeARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGBRow = MergeARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeARGBRow = MergeARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGBRow = MergeARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeARGBRow = MergeARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGBRow = MergeARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
- dst_rgb += dst_stride_rgb;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
}
}
-// Mirror a plane of data.
-void MirrorPlane(const uint8_t* src_y,
- int src_stride_y,
- uint8_t* dst_y,
- int dst_stride_y,
- int width,
- int height) {
+LIBYUV_NOINLINE
+void MergeARGBPlaneOpaque(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int y;
- void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, uint8_t* dst_argb, int width) =
+ MergeXRGBRow_C;
+
+ assert(height > 0);
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXRGBRow = MergeXRGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGBRow = MergeXRGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGBRow = MergeXRGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGBRow = MergeXRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGBRow = MergeXRGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ const uint8_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_stride_y = -src_stride_y;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
}
-#if defined(HAS_MIRRORROW_NEON)
+
+ if (src_a == NULL) {
+ MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height);
+ } else {
+ MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height);
+ }
+}
+
+// TODO(yuan): Support 2 bit alpha channel.
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_ar30, int depth,
+ int width) = MergeXR30Row_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_MERGEXR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXR30Row = MergeXR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXR30Row = MergeXR30Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- MirrorRow = MirrorRow_Any_NEON;
+ if (depth == 10) {
+ MergeXR30Row = MergeXR30Row_10_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_10_NEON;
+ }
+ } else {
+ MergeXR30Row = MergeXR30Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR30Row = MergeXR30Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar30 += dst_stride_ar30;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint16_t* dst_argb, int depth, int width) =
+ MergeAR64Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEAR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeAR64Row = MergeAR64Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ MergeAR64Row = MergeAR64Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- MirrorRow = MirrorRow_Any_SSSE3;
+#if defined(HAS_MERGEAR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeAR64Row = MergeAR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeAR64Row = MergeAR64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeAR64PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint16_t* dst_argb, int depth,
+ int width) = MergeXR64Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_ar64 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
+ }
+#if defined(HAS_MERGEXR64ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXR64Row = MergeXR64Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ MergeXR64Row = MergeXR64Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_AVX2)
+#if defined(HAS_MERGEXR64ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXR64Row = MergeXR64Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeXR64Row = MergeXR64Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_ar64 += dst_stride_ar64;
+ }
+}
+
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint16_t* dst_ar64,
+ int dst_stride_ar64,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
+ dst_stride_ar64 = -dst_stride_ar64;
+ }
+
+ if (src_a == NULL) {
+ MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_ar64, dst_stride_ar64, width, height,
+ depth);
+ } else {
+ MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_ar64,
+ dst_stride_ar64, width, height, depth);
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, const uint16_t* src_a,
+ uint8_t* dst_argb, int depth, int width) =
+ MergeARGB16To8Row_C;
+
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ src_stride_a == width && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = src_stride_a =
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEARGB16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- MirrorRow = MirrorRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_AVX2;
}
}
#endif
-#if defined(HAS_MIRRORROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- MirrorRow = MirrorRow_Any_MSA;
- if (IS_ALIGNED(width, 64)) {
- MirrorRow = MirrorRow_MSA;
+#if defined(HAS_MERGEARGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ MergeARGB16To8Row = MergeARGB16To8Row_NEON;
}
}
#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
+
+ for (y = 0; y < height; ++y) {
+ MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ src_a += src_stride_a;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_NOINLINE
+static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ int y;
+ void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
+ const uint16_t* src_b, uint8_t* dst_argb, int depth,
+ int width) = MergeXRGB16To8Row_C;
+
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
+ }
+#if defined(HAS_MERGEXRGB16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEXRGB16TO8ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
+ MergeXRGB16To8Row = MergeXRGB16To8Row_NEON;
}
}
#endif
- // Mirror plane
for (y = 0; y < height; ++y) {
- MirrorRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
+ MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_argb += dst_stride_argb;
+ }
+}
+
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+ int src_stride_r,
+ const uint16_t* src_g,
+ int src_stride_g,
+ const uint16_t* src_b,
+ int src_stride_b,
+ const uint16_t* src_a,
+ int src_stride_a,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int depth) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+
+ if (src_a == NULL) {
+ MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, dst_argb, dst_stride_argb, width,
+ height, depth);
+ } else {
+ MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b,
+ src_stride_b, src_a, src_stride_a, dst_argb,
+ dst_stride_argb, width, height, depth);
}
}
@@ -820,7 +1878,7 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MSA)
+#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToYRow = YUY2ToYRow_Any_MSA;
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
@@ -830,13 +1888,13 @@ int YUY2ToI422(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
- YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ YUY2ToYRow = YUY2ToYRow_Any_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToYRow = YUY2ToYRow_LASX;
+ YUY2ToUV422Row = YUY2ToUV422Row_LASX;
}
}
#endif
@@ -916,7 +1974,7 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MSA)
+#if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToYRow = UYVYToYRow_Any_MSA;
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
@@ -926,13 +1984,13 @@ int UYVYToI422(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_UYVYTOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- UYVYToYRow = UYVYToYRow_Any_MMI;
- UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
- if (IS_ALIGNED(width, 16)) {
- UYVYToYRow = UYVYToYRow_MMI;
- UYVYToUV422Row = UYVYToUV422Row_MMI;
+#if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ UYVYToYRow = UYVYToYRow_Any_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_LASX;
+ UYVYToUV422Row = UYVYToUV422Row_LASX;
}
}
#endif
@@ -1006,23 +2064,214 @@ int YUY2ToY(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_YUY2TOYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- YUY2ToYRow = YUY2ToYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- YUY2ToYRow = YUY2ToYRow_MMI;
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Convert UYVY to Y.
+// Runs the selected UYVYToYRow kernel once per row, reading from src_uyvy and
+// writing to dst_y.
+// Returns 0 on success, or -1 if src_uyvy or dst_y is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int UYVYToY(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height) {
+ int y;
+ // Row kernel: defaults to the C version; the guarded blocks below may
+ // replace it, and a later block overrides an earlier selection.
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+ UYVYToYRow_C;
+ if (!src_uyvy || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // The source is then read from its last row upwards (negated stride).
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ // When both buffers have no padding between rows (source stride is exactly
+ // width * 2, destination stride is exactly width), the whole image is
+ // processed as a single row of width * height pixels.
+ if (src_stride_uyvy == width * 2 && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = 0;
+ }
+ // In each block the _Any_ kernel is installed first and swapped for the
+ // plain kernel only when width is a multiple of the listed alignment.
+ // NOTE(review): presumably the _Any_ variants accept arbitrary widths -
+ // confirm against their definitions.
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ UYVYToYRow = UYVYToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToYRow = UYVYToYRow_MSA;
}
}
#endif
+ // Convert one row per iteration, then advance both pointers by their strides.
for (y = 0; y < height; ++y) {
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
}
return 0;
}
+// Mirror a plane of data.
+// See Also I400Mirror
+// Applies the selected MirrorRow kernel to each of |height| rows. No argument
+// validation is performed here.
+LIBYUV_API
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
+  // Row kernel: the C version unless a guarded block below replaces it. Later
+  // blocks take precedence over earlier ones.
+  void (*mirror_row)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  int rows_left;
+
+  // A negative height inverts the image: read the source from its last row
+  // upwards.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    mirror_row = IS_ALIGNED(width, 32) ? MirrorRow_NEON : MirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    mirror_row = IS_ALIGNED(width, 16) ? MirrorRow_SSSE3 : MirrorRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    mirror_row = IS_ALIGNED(width, 32) ? MirrorRow_AVX2 : MirrorRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    mirror_row = IS_ALIGNED(width, 64) ? MirrorRow_MSA : MirrorRow_Any_MSA;
+  }
+#endif
+#if defined(HAS_MIRRORROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    mirror_row = IS_ALIGNED(width, 64) ? MirrorRow_LASX : MirrorRow_Any_LASX;
+  }
+#endif
+
+  // Process one row at a time, stepping both planes by their strides.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    mirror_row(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Mirror a plane of UV data.
+// Applies the selected MirrorUVRow kernel to each of |height| rows. No
+// argument validation is performed here.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  // Row kernel: the C version unless a guarded block below replaces it. Later
+  // blocks take precedence over earlier ones.
+  void (*mirror_uv_row)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  int rows_left;
+
+  // A negative height inverts the image: read the source from its last row
+  // upwards.
+  if (height < 0) {
+    height = -height;
+    src_uv += (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    mirror_uv_row =
+        IS_ALIGNED(width, 32) ? MirrorUVRow_NEON : MirrorUVRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    mirror_uv_row =
+        IS_ALIGNED(width, 8) ? MirrorUVRow_SSSE3 : MirrorUVRow_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    mirror_uv_row =
+        IS_ALIGNED(width, 16) ? MirrorUVRow_AVX2 : MirrorUVRow_Any_AVX2;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    mirror_uv_row =
+        IS_ALIGNED(width, 8) ? MirrorUVRow_MSA : MirrorUVRow_Any_MSA;
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    mirror_uv_row =
+        IS_ALIGNED(width, 16) ? MirrorUVRow_LASX : MirrorUVRow_Any_LASX;
+  }
+#endif
+
+  // Process one row at a time, stepping both planes by their strides.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    mirror_uv_row(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1063,10 +2312,12 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+
+ if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -1087,6 +2338,43 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
+// NV12 mirror.
+// Mirrors the Y plane with MirrorPlane (skipped when dst_y is NULL) and the
+// UV plane with MirrorUVPlane.
+// Returns 0 on success, or -1 if src_y, src_uv or dst_uv is NULL, width <= 0
+// or height == 0.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth;
+  int halfheight;
+
+  if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) {
+    return -1;
+  }
+
+  // A negative height inverts the image: make height positive, then point
+  // each source plane at its last row and negate its stride.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+    src_uv += (((height + 1) >> 1) - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  // UV plane dimensions, rounded up, derived from the now-positive height.
+  halfwidth = (width + 1) >> 1;
+  halfheight = (height + 1) >> 1;
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+                halfheight);
+  return 0;
+}
+
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1110,7 +2398,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -1139,11 +2427,11 @@ int ARGBMirror(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -1157,6 +2445,52 @@ int ARGBMirror(const uint8_t* src_argb,
return 0;
}
+// RGB24 mirror.
+// Applies the selected RGB24MirrorRow kernel to each of |height| rows.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int RGB24Mirror(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  // Row kernel: the C version unless a guarded block below replaces it. Later
+  // blocks take precedence over earlier ones.
+  void (*mirror_row)(const uint8_t* src, uint8_t* dst, int width) =
+      RGB24MirrorRow_C;
+  int rows_left;
+
+  if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height inverts the image: read the source from its last row
+  // upwards.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 += (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+#if defined(HAS_RGB24MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    mirror_row =
+        IS_ALIGNED(width, 16) ? RGB24MirrorRow_NEON : RGB24MirrorRow_Any_NEON;
+  }
+#endif
+#if defined(HAS_RGB24MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    mirror_row =
+        IS_ALIGNED(width, 16) ? RGB24MirrorRow_SSSE3 : RGB24MirrorRow_Any_SSSE3;
+  }
+#endif
+
+  // Process one row at a time, stepping both images by their strides.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    mirror_row(src_rgb24, dst_rgb24, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_rgb24 += dst_stride_rgb24;
+  }
+  return 0;
+}
+
// Get a blender that optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
@@ -1180,9 +2514,9 @@ ARGBBlendRow GetARGBBlend() {
ARGBBlendRow = ARGBBlendRow_MSA;
}
#endif
-#if defined(HAS_ARGBBLENDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBBlendRow = ARGBBlendRow_MMI;
+#if defined(HAS_ARGBBLENDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBBlendRow = ARGBBlendRow_LSX;
}
#endif
return ARGBBlendRow;
@@ -1277,14 +2611,6 @@ int BlendPlane(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
@@ -1329,6 +2655,7 @@ int I420Blend(const uint8_t* src_y0,
BlendPlaneRow_C;
void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
!alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1361,14 +2688,6 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_BLENDPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- BlendPlaneRow = BlendPlaneRow_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- BlendPlaneRow = BlendPlaneRow_MMI;
- }
- }
-#endif
if (!IS_ALIGNED(width, 2)) {
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
}
@@ -1405,17 +2724,6 @@ int I420Blend(const uint8_t* src_y0,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
- if (IS_ALIGNED(width, 2)) {
- ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
- if (IS_ALIGNED(halfwidth, 8)) {
- ScaleRowDown2 = ScaleRowDown2Box_MMI;
- }
- }
- }
-#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
@@ -1501,11 +2809,11 @@ int ARGBMultiply(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBMULTIPLYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+#if defined(HAS_ARGBMULTIPLYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_LASX;
}
}
#endif
@@ -1549,12 +2857,12 @@ int ARGBAdd(const uint8_t* src_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2;
}
#endif
-#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+#if defined(HAS_ARGBADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
@@ -1586,11 +2894,11 @@ int ARGBAdd(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAddRow = ARGBAddRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAddRow = ARGBAddRow_MMI;
+#if defined(HAS_ARGBADDROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAddRow = ARGBAddRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_LASX;
}
}
#endif
@@ -1666,11 +2974,11 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
}
#endif
-#if defined(HAS_ARGBSUBTRACTROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBSubtractRow = ARGBSubtractRow_MMI;
+#if defined(HAS_ARGBSUBTRACTROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_LASX;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_LASX;
}
}
#endif
@@ -1684,177 +2992,6 @@ int ARGBSubtract(const uint8_t* src_argb0,
}
return 0;
}
-// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
- int y;
- void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
- const uint8_t* v_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- I422ToRGBARow_C;
- if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
- dst_stride_rgba = -dst_stride_rgba;
- }
-#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- I422ToRGBARow = I422ToRGBARow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToRGBARow = I422ToRGBARow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TORGBAROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- I422ToRGBARow = I422ToRGBARow_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
- dst_rgba += dst_stride_rgba;
- src_y += src_stride_y;
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- return 0;
-}
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
- src_stride_v, dst_rgba, dst_stride_rgba,
- &kYuvI601Constants, width, height);
-}
-
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_u,
- int src_stride_u,
- const uint8_t* src_v,
- int src_stride_v,
- uint8_t* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height) {
- return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
- src_stride_v, // Swap U and V
- src_u, src_stride_u, dst_bgra, dst_stride_bgra,
- &kYvuI601Constants, // Use Yvu matrix
- width, height);
-}
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8_t* src_y,
- int src_stride_y,
- const uint8_t* src_uv,
- int src_stride_uv,
- uint8_t* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height) {
- int y;
- void (*NV12ToRGB565Row)(
- const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
- if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
- dst_stride_rgb565 = -dst_stride_rgb565;
- }
-#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_NEON;
- }
- }
-#endif
-#if defined(HAS_NV12TORGB565ROW_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- NV12ToRGB565Row = NV12ToRGB565Row_MSA;
- }
- }
-#endif
-
- for (y = 0; y < height; ++y) {
- NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
- dst_rgb565 += dst_stride_rgb565;
- src_y += src_stride_y;
- if (y & 1) {
- src_uv += src_stride_uv;
- }
- }
- return 0;
-}
// Convert RAW to RGB24.
LIBYUV_API
@@ -1906,11 +3043,11 @@ int RAWToRGB24(const uint8_t* src_raw,
}
}
#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- RAWToRGB24Row = RAWToRGB24Row_MMI;
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToRGB24Row = RAWToRGB24Row_LSX;
}
}
#endif
@@ -1931,6 +3068,10 @@ void SetPlane(uint8_t* dst_y,
uint32_t value) {
int y;
void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
+
+ if (width <= 0 || height == 0) {
+ return;
+ }
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1968,6 +3109,14 @@ void SetPlane(uint8_t* dst_y,
SetRow = SetRow_MSA;
}
#endif
+#if defined(HAS_SETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SetRow = SetRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -1996,6 +3145,7 @@ int I420Rect(uint8_t* dst_y,
uint8_t* start_y = dst_y + y * dst_stride_y + x;
uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
@@ -2057,6 +3207,14 @@ int ARGBRect(uint8_t* dst_argb,
}
}
#endif
+#if defined(HAS_ARGBSETROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBSetRow = ARGBSetRow_Any_LSX;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_LSX;
+ }
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
@@ -2135,11 +3293,11 @@ int ARGBAttenuate(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBATTENUATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_LASX;
}
}
#endif
@@ -2243,9 +3401,9 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2293,9 +3451,9 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
-#if defined(HAS_ARGBGRAYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBGrayRow = ARGBGrayRow_MMI;
+#if defined(HAS_ARGBGRAYROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBGrayRow = ARGBGrayRow_LASX;
}
#endif
@@ -2341,9 +3499,9 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
-#if defined(HAS_ARGBSEPIAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBSepiaRow = ARGBSepiaRow_MMI;
+#if defined(HAS_ARGBSEPIAROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_LASX;
}
#endif
@@ -2397,9 +3555,9 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
}
#endif
-#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+#if defined(HAS_ARGBCOLORMATRIXROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_LSX;
}
#endif
for (y = 0; y < height; ++y) {
@@ -2567,6 +3725,11 @@ int ARGBQuantize(uint8_t* dst_argb,
ARGBQuantizeRow = ARGBQuantizeRow_MSA;
}
#endif
+#if defined(HAS_ARGBQUANTIZEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_LSX;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
dst += dst_stride_argb;
@@ -2596,11 +3759,6 @@ int ARGBComputeCumulativeSum(const uint8_t* src_argb,
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
@@ -2651,7 +3809,7 @@ int ARGBBlur(const uint8_t* src_argb,
if (radius > (width / 2 - 1)) {
radius = width / 2 - 1;
}
- if (radius <= 0) {
+ if (radius <= 0 || height <= 1) {
return -1;
}
#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
@@ -2660,11 +3818,6 @@ int ARGBBlur(const uint8_t* src_argb,
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
-#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
- }
-#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2771,9 +3924,9 @@ int ARGBShade(const uint8_t* src_argb,
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
-#if defined(HAS_ARGBSHADEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
- ARGBShadeRow = ARGBShadeRow_MMI;
+#if defined(HAS_ARGBSHADEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_LASX;
}
#endif
@@ -2847,11 +4000,11 @@ int InterpolatePlane(const uint8_t* src0,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -2865,6 +4018,86 @@ int InterpolatePlane(const uint8_t* src0,
return 0;
}
+// Interpolate 2 planes by specified amount (0 to 255).
+// 16-bit-sample variant: each output row is produced by the selected
+// InterpolateRow_16 kernel from the matching rows of src0 and src1.
+// Returns 0 on success, or -1 if a pointer is NULL, width <= 0 or
+// height == 0.
+LIBYUV_API
+int InterpolatePlane_16(const uint16_t* src0,
+                        int src_stride0,
+                        const uint16_t* src1,
+                        int src_stride1,
+                        uint16_t* dst,
+                        int dst_stride,
+                        int width,
+                        int height,
+                        int interpolation) {
+  // Row kernel: the C version unless a guarded block below replaces it. Later
+  // blocks take precedence over earlier ones.
+  void (*interpolate_row)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) = InterpolateRow_16_C;
+  int rows_left;
+
+  if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height inverts the image: here it is the destination that is
+  // written from its last row upwards.
+  if (height < 0) {
+    height = -height;
+    dst += (height - 1) * dst_stride;
+    dst_stride = -dst_stride;
+  }
+  // All three planes are tightly packed (stride == width), so treat the whole
+  // image as one long row.
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
+    width *= height;
+    height = 1;
+    src_stride0 = src_stride1 = dst_stride = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    interpolate_row = IS_ALIGNED(width, 16) ? InterpolateRow_16_SSSE3
+                                            : InterpolateRow_16_Any_SSSE3;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    interpolate_row = IS_ALIGNED(width, 32) ? InterpolateRow_16_AVX2
+                                            : InterpolateRow_16_Any_AVX2;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    interpolate_row = IS_ALIGNED(width, 8) ? InterpolateRow_16_NEON
+                                           : InterpolateRow_16_Any_NEON;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    interpolate_row = IS_ALIGNED(width, 32) ? InterpolateRow_16_MSA
+                                            : InterpolateRow_16_Any_MSA;
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    interpolate_row = IS_ALIGNED(width, 32) ? InterpolateRow_16_LSX
+                                            : InterpolateRow_16_Any_LSX;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    // The kernel takes one base pointer plus an element offset to the second
+    // row, so src1 is passed as its distance from src0.
+    interpolate_row(dst, src0, src1 - src0, width, interpolation);
+    src0 += src_stride0;
+    src1 += src_stride1;
+    dst += dst_stride;
+  }
+  return 0;
+}
+
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
int ARGBInterpolate(const uint8_t* src_argb0,
@@ -2906,10 +4139,12 @@ int I420Interpolate(const uint8_t* src0_y,
int interpolation) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
+
if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
!dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
+
InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
dst_stride_y, width, height, interpolation);
InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
@@ -2978,11 +4213,11 @@ int ARGBShuffle(const uint8_t* src_bgra,
}
}
#endif
-#if defined(HAS_ARGBSHUFFLEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBShuffleRow = ARGBShuffleRow_MMI;
+#if defined(HAS_ARGBSHUFFLEROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_LASX;
}
}
#endif
@@ -2995,6 +4230,142 @@ int ARGBShuffle(const uint8_t* src_bgra,
return 0;
}
+// Shuffle AR64 channel order. e.g. AR64 to AB64.
+// Runs the selected shuffle kernel once per row with the caller's |shuffler|
+// table. Strides are counted in uint16_t elements (a packed row has a stride
+// of width * 4).
+// Returns 0 on success, or -1 if src_ar64, dst_ar64 or shuffler is NULL,
+// width <= 0 or height == 0.
+LIBYUV_API
+int AR64Shuffle(const uint16_t* src_ar64,
+                int src_stride_ar64,
+                uint16_t* dst_ar64,
+                int dst_stride_ar64,
+                const uint8_t* shuffler,
+                int width,
+                int height) {
+  int y;
+  void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64,
+                         const uint8_t* shuffler, int width) = AR64ShuffleRow_C;
+  // |shuffler| is handed to the row kernel on every row, so a NULL table is
+  // rejected together with the other invalid arguments.
+  if (!src_ar64 || !dst_ar64 || !shuffler || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar64 = src_ar64 + (height - 1) * src_stride_ar64;
+    src_stride_ar64 = -src_stride_ar64;
+  }
+  // Coalesce rows.
+  if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar64 = dst_stride_ar64 = 0;
+  }
+  // Assembly versions can be reused if it's implemented with shuffle.
+  // The alignment tests below use the AR64 pixel count, while the kernels are
+  // called with width * 2.
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      AR64ShuffleRow = ARGBShuffleRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AR64ShuffleRow = ARGBShuffleRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      AR64ShuffleRow = ARGBShuffleRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AR64ShuffleRow = ARGBShuffleRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      AR64ShuffleRow = ARGBShuffleRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Reinterpret the 16-bit rows as bytes without dropping const on the
+    // source. NOTE(review): width * 2 presumably expresses each 8-byte AR64
+    // pixel as two 4-byte units for the ARGB kernels - confirm against
+    // AR64ShuffleRow_C.
+    AR64ShuffleRow((const uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler,
+                   width * 2);
+    src_ar64 += src_stride_ar64;
+    dst_ar64 += dst_stride_ar64;
+  }
+  return 0;
+}
+
+// Gauss blur a float plane using Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and interior is multiple of 4.
+// Returns 0 on success, -1 if src or dst is NULL, width <= 0 or height == 0,
+// and 1 if the temporary row buffer could not be allocated.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
+  int y;
+  // Column pass: combines five source rows into one intermediate row.
+  void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2,
+                       const float* src3, const float* src4, float* dst,
+                       int width) = GaussCol_F32_C;
+  // Row pass: filters the intermediate row horizontally into dst.
+  void (*GaussRow_F32)(const float* src, float* dst, int width) =
+      GaussRow_F32_C;
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussCol_F32 = GaussCol_F32_NEON;
+  }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussRow_F32 = GaussRow_F32_NEON;
+  }
+#endif
+  {
+    // 2 pixels on each side, but aligned out to 16 bytes.
+    align_buffer_64(rowbuf, (4 + width + 4) * 4);
+    // Bail out before touching the buffer if the allocation failed.
+    // NOTE(review): relies on align_buffer_64 leaving rowbuf NULL when the
+    // underlying allocation fails - confirm against the macro definition.
+    if (!rowbuf) {
+      return 1;
+    }
+    // Zero the 16-byte pads before and after the row.
+    memset(rowbuf, 0, 16);
+    memset(rowbuf + (4 + width) * 4, 0, 16);
+    float* row = (float*)(rowbuf + 16);
+    // Five-row window centred on the current row (src2). The two rows above
+    // start clamped to the first row; the rows below are clamped when the
+    // image has fewer than 3 rows.
+    const float* src0 = src;
+    const float* src1 = src;
+    const float* src2 = src;
+    const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+    const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+    for (y = 0; y < height; ++y) {
+      GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+      // Extrude edge by 2 floats
+      row[-2] = row[-1] = row[0];
+      row[width + 1] = row[width] = row[width - 1];
+
+      GaussRow_F32(row - 2, dst, width);
+
+      // Slide the window down one row; the bottom pointer stops advancing
+      // once it has reached the last source row.
+      src0 = src1;
+      src1 = src2;
+      src2 = src3;
+      src3 = src4;
+      if ((y + 2) < (height - 1)) {
+        src4 += src_stride;
+      }
+      dst += dst_stride;
+    }
+    free_aligned_buffer_64(rowbuf);
+  }
+  return 0;
+}
+
// Sobel ARGB effect.
static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
@@ -3044,7 +4415,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
@@ -3057,11 +4428,19 @@ static int ARGBSobelize(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBToYJRow = ARGBToYJRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYJRow = ARGBToYJRow_MMI;
+#if defined(HAS_ARGBTOYJROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBToYJRow = ARGBToYJRow_Any_LASX;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_LASX;
}
}
#endif
@@ -3081,11 +4460,6 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelYRow = SobelYRow_MSA;
}
#endif
-#if defined(HAS_SOBELYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelYRow = SobelYRow_MMI;
- }
-#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -3101,11 +4475,6 @@ static int ARGBSobelize(const uint8_t* src_argb,
SobelXRow = SobelXRow_MSA;
}
#endif
-#if defined(HAS_SOBELXROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXRow = SobelXRow_MMI;
- }
-#endif
{
// 3 rows with edges before/after.
const int kRowSize = (width + kEdge + 31) & ~31;
@@ -3188,11 +4557,11 @@ int ARGBSobel(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelRow = SobelRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelRow = SobelRow_MMI;
+#if defined(HAS_SOBELROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelRow = SobelRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_LSX;
}
}
#endif
@@ -3234,11 +4603,11 @@ int ARGBSobelToPlane(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELTOPLANEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelToPlaneRow = SobelToPlaneRow_MMI;
+#if defined(HAS_SOBELTOPLANEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SobelToPlaneRow = SobelToPlaneRow_LSX;
}
}
#endif
@@ -3281,11 +4650,11 @@ int ARGBSobelXY(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SOBELXYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SobelXYRow = SobelXYRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SobelXYRow = SobelXYRow_MMI;
+#if defined(HAS_SOBELXYROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SobelXYRow = SobelXYRow_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_LSX;
}
}
#endif
@@ -3412,6 +4781,14 @@ int HalfFloatPlane(const uint16_t* src_y,
}
}
#endif
+#if defined(HAS_HALFFLOATROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ HalfFloatRow = HalfFloatRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ HalfFloatRow = HalfFloatRow_LSX;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
@@ -3526,14 +4903,6 @@ int ARGBCopyAlpha(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBCOPYALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3592,10 +4961,10 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_MSA;
}
#endif
-#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
- : ARGBExtractAlphaRow_Any_MMI;
+#if defined(HAS_ARGBEXTRACTALPHAROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX
+ : ARGBExtractAlphaRow_Any_LSX;
}
#endif
@@ -3649,14 +5018,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
}
}
#endif
-#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
- }
- }
-#endif
for (y = 0; y < height; ++y) {
ARGBCopyYToAlphaRow(src_y, dst_argb, width);
@@ -3685,9 +5046,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
+
if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -3726,11 +5089,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
@@ -3766,11 +5129,11 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -3817,9 +5180,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
+
if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
+
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -3858,11 +5223,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_SPLITUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- SplitUVRow = SplitUVRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- SplitUVRow = SplitUVRow_MMI;
+#if defined(HAS_SPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ SplitUVRow = SplitUVRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_LSX;
}
}
#endif
@@ -3898,11 +5263,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -3933,6 +5298,56 @@ int UYVYToNV12(const uint8_t* src_uyvy,
return 0;
}
+// Calls the HalfMergeUVRow function once per pair of source U/V rows, writing one dst_uv row per pair. width and height are src size allowing odd size handling.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+ const uint8_t* src_v, int src_stride_v,
+ uint8_t* dst_uv, int width) = HalfMergeUVRow_C;  // C version unless a SIMD version is selected below.
+
+ // Negative height means invert the image: start at the last source row and step backwards through U and V.
+ if (height < 0) {
+ height = -height;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {  // Only taken when width is a multiple of 16.
+ HalfMergeUVRow = HalfMergeUVRow_NEON;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {  // Only taken when width is a multiple of 16.
+ HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {  // Only taken when width is a multiple of 32.
+ HalfMergeUVRow = HalfMergeUVRow_AVX2;
+ }
+#endif
+ for (y = 0; y < height - 1; y += 2) {  // Two source rows are consumed per destination row.
+ // Merge a row pair of U and V into one row of UV; the strides are passed so the row function can presumably reach the second row of the pair.
+ HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {  // Odd height: the last row has no partner, so stride 0 is passed.
+ HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index d414186a..f1e83cbd 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -29,16 +29,20 @@ void TransposePlane(const uint8_t* src,
int width,
int height) {
int i = height;
-#if defined(HAS_TRANSPOSEWX16_MSA)
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C;
#else
void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
+
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- TransposeWx8 = TransposeWx8_NEON;
+ TransposeWx8 = TransposeWx8_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
}
#endif
#if defined(HAS_TRANSPOSEWX8_SSSE3)
@@ -49,11 +53,6 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeWx8 = TransposeWx8_MMI;
- }
-#endif
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
@@ -70,8 +69,16 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
+#if defined(HAS_TRANSPOSEWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeWx16 = TransposeWx16_Any_LSX;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx16 = TransposeWx16_LSX;
+ }
+ }
+#endif
-#if defined(HAS_TRANSPOSEWX16_MSA)
+#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
// Work across the source in 16x16 tiles
while (i >= 16) {
TransposeWx16(src, src_stride, dst, dst_stride, width);
@@ -142,7 +149,7 @@ void RotatePlane180(const uint8_t* src,
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
@@ -171,11 +178,11 @@ void RotatePlane180(const uint8_t* src,
}
}
#endif
-#if defined(HAS_MIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- MirrorRow = MirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 8)) {
- MirrorRow = MirrorRow_MMI;
+#if defined(HAS_MIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ MirrorRow = MirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 64)) {
+ MirrorRow = MirrorRow_LASX;
}
}
#endif
@@ -199,19 +206,14 @@ void RotatePlane180(const uint8_t* src,
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
-#if defined(HAS_COPYROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
- }
-#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- MirrorRow(src, row, width); // Mirror first row into a buffer
- src += src_stride;
+ CopyRow(src, row, width); // Copy first row into buffer
MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ MirrorRow(row, dst_bot, width); // Mirror buffer into last row
+ src += src_stride;
dst += dst_stride;
- CopyRow(row, dst_bot, width); // Copy first mirrored row into last
src_bot -= src_stride;
dst_bot -= dst_stride;
}
@@ -219,24 +221,44 @@ void RotatePlane180(const uint8_t* src,
}
LIBYUV_API
-void TransposeUV(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
+void SplitTransposeUV(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
int i = height;
#if defined(HAS_TRANSPOSEUVWX16_MSA)
void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx16_C;
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx16_C;
#else
void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
+
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_MSA;
+ }
+ }
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ TransposeUVWx16 = TransposeUVWx16_Any_LSX;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeUVWx16 = TransposeUVWx16_LSX;
+ }
+ }
+#else
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -250,22 +272,7 @@ void TransposeUV(const uint8_t* src,
}
}
#endif
-#if defined(HAS_TRANSPOSEUVWX8_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- TransposeUVWx8 = TransposeUVWx8_Any_MMI;
- if (IS_ALIGNED(width, 4)) {
- TransposeUVWx8 = TransposeUVWx8_MMI;
- }
- }
-#endif
-#if defined(HAS_TRANSPOSEUVWX16_MSA)
- if (TestCpuFlag(kCpuHasMSA)) {
- TransposeUVWx16 = TransposeUVWx16_Any_MSA;
- if (IS_ALIGNED(width, 8)) {
- TransposeUVWx16 = TransposeUVWx16_MSA;
- }
- }
-#endif
+#endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */
#if defined(HAS_TRANSPOSEUVWX16_MSA)
// Work through the source in 8x8 tiles.
@@ -277,6 +284,16 @@ void TransposeUV(const uint8_t* src,
dst_b += 16; // Move over 8 columns.
i -= 16;
}
+#elif defined(HAS_TRANSPOSEUVWX16_LSX)
+ // Work through the source 16 rows at a time.
+ while (i >= 16) {
+ TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width);
+ src += 16 * src_stride; // Go down 16 rows.
+ dst_a += 16; // Move over 8 columns.
+ dst_b += 16; // Move over 8 columns.
+ i -= 16;
+ }
#else
// Work through the source in 8x8 tiles.
while (i >= 8) {
@@ -296,70 +313,70 @@ void TransposeUV(const uint8_t* src,
}
LIBYUV_API
-void RotateUV90(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
+void SplitRotateUV90(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
src += src_stride * (height - 1);
src_stride = -src_stride;
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
}
LIBYUV_API
-void RotateUV270(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
+void SplitRotateUV270(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
dst_a += dst_stride_a * (width - 1);
dst_b += dst_stride_b * (width - 1);
dst_stride_a = -dst_stride_a;
dst_stride_b = -dst_stride_b;
- TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
- height);
+ SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+ width, height);
}
// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
-void RotateUV180(const uint8_t* src,
- int src_stride,
- uint8_t* dst_a,
- int dst_stride_a,
- uint8_t* dst_b,
- int dst_stride_b,
- int width,
- int height) {
+void SplitRotateUV180(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
int i;
- void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
- int width) = MirrorUVRow_C;
-#if defined(HAS_MIRRORUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_NEON;
+ void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorSplitUVRow_C;
+#if defined(HAS_MIRRORSPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif
-#if defined(HAS_MIRRORUVROW_SSSE3)
+#if defined(HAS_MIRRORSPLITUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
- MirrorUVRow = MirrorUVRow_SSSE3;
+ MirrorSplitUVRow = MirrorSplitUVRow_SSSE3;
}
#endif
-#if defined(HAS_MIRRORUVROW_MSA)
+#if defined(HAS_MIRRORSPLITUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
- MirrorUVRow = MirrorUVRow_MSA;
+ MirrorSplitUVRow = MirrorSplitUVRow_MSA;
}
#endif
-#if defined(HAS_MIRRORUVROW_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
- MirrorUVRow = MirrorUVRow_MMI;
+#if defined(HAS_MIRRORSPLITUVROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) {
+ MirrorSplitUVRow = MirrorSplitUVRow_LSX;
}
#endif
@@ -367,13 +384,59 @@ void RotateUV180(const uint8_t* src,
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
- MirrorUVRow(src, dst_a, dst_b, width);
+ MirrorSplitUVRow(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;
}
}
+// Rotate a UV plane by the requested mode and split it into separate U and V planes.
+// width and height expected to be half size for NV12. Returns 0 on success, -1 for a NULL pointer, width <= 0, height == 0 or an unhandled mode.
+LIBYUV_API
+int SplitRotateUV(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) {  // Reject missing buffers and empty dimensions.
+ return -1;
+ }
+
+ // Negative height means invert the image: start from the last source row and walk upward.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:  // No rotation: split only.
+ SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate90:
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate270:
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ case kRotate180:
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, width, height);
+ return 0;
+ default:  // Any other mode falls through to the failure return below.
+ break;
+ }
+ return -1;
+}
+
LIBYUV_API
int RotatePlane(const uint8_t* src,
int src_stride,
@@ -431,8 +494,8 @@ int I420Rotate(const uint8_t* src_y,
enum RotationMode mode) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
- !dst_u || !dst_v) {
+ if ((!src_y && dst_y) || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
return -1;
}
@@ -482,6 +545,80 @@ int I420Rotate(const uint8_t* src_y,
}
LIBYUV_API
+int I422Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode mode) {
+ // Rotates an I422 frame (chroma planes are halfwidth wide and full height).
+ // Returns 0 on success, or -1 for a NULL plane, width <= 0, height == 0 or
+ // an unhandled mode. A negative height inverts the source image.
+ int halfwidth = (width + 1) >> 1;
+ int halfheight;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ // Derived only after the sign flip above so it can never be negative.
+ halfheight = (height + 1) >> 1;
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+ case kRotate90:
+ // Rotating a halfwidth x height chroma plane gives height x halfwidth, so
+ // it is rescaled to halfheight x width. Plane Y is used as temporary
+ // storage (packed rows of `height` bytes) and is filled with the rotated
+ // Y data last. Chroma is written with the caller's destination strides.
+ RotatePlane90(src_u, src_stride_u, dst_y, height, halfwidth, height);
+ ScalePlane(dst_y, height, height, halfwidth, dst_u, dst_stride_u,
+ halfheight, width, kFilterBilinear);
+ RotatePlane90(src_v, src_stride_v, dst_y, height, halfwidth, height);
+ // NOTE(review): V uses kFilterLinear while U uses kFilterBilinear; kept
+ // unchanged here - confirm whether the difference is intended.
+ ScalePlane(dst_y, height, height, halfwidth, dst_v, dst_stride_v,
+ halfheight, width, kFilterLinear);
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+ case kRotate270:
+ // Same rotate-then-rescale scheme as kRotate90, with plane Y as temporary
+ // storage.
+ RotatePlane270(src_u, src_stride_u, dst_y, height, halfwidth, height);
+ ScalePlane(dst_y, height, height, halfwidth, dst_u, dst_stride_u,
+ halfheight, width, kFilterBilinear);
+ RotatePlane270(src_v, src_stride_v, dst_y, height, halfwidth, height);
+ // NOTE(review): same U/V filter difference as in kRotate90.
+ ScalePlane(dst_y, height, height, halfwidth, dst_v, dst_stride_v,
+ halfheight, width, kFilterLinear);
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
int I444Rotate(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
@@ -496,7 +633,7 @@ int I444Rotate(const uint8_t* src_y,
int dst_stride_v,
int width,
int height,
- enum libyuv::RotationMode mode) {
+ enum RotationMode mode) {
if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
!dst_u || !dst_v) {
return -1;
@@ -514,23 +651,23 @@ int I444Rotate(const uint8_t* src_y,
}
switch (mode) {
- case libyuv::kRotate0:
+ case kRotate0:
// copy frame
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
- case libyuv::kRotate90:
+ case kRotate90:
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
- case libyuv::kRotate270:
+ case kRotate270:
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
return 0;
- case libyuv::kRotate180:
+ case kRotate180:
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
@@ -580,18 +717,18 @@ int NV12ToI420Rotate(const uint8_t* src_y,
width, height);
case kRotate90:
RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
+ SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate270:
RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
+ SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
case kRotate180:
RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
- dst_stride_v, halfwidth, halfheight);
+ SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight);
return 0;
default:
break;
@@ -599,6 +736,98 @@ int NV12ToI420Rotate(const uint8_t* src_y,
return -1;
}
+static void SplitPixels(const uint8_t* src_u,  // Copies strided samples from src_u into consecutive bytes of dst_u.
+ int src_pixel_stride_uv,  // Byte step between successive source samples.
+ uint8_t* dst_u,  // Destination, written one byte per sample.
+ int width) {  // Number of samples to copy.
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst_u = *src_u;
+ ++dst_u;
+ src_u += src_pixel_stride_uv;  // Advance to the next source sample.
+ }
+}
+
+// Convert Android420 to I420 with Rotate.
+// src_pixel_stride_uv is the byte step between successive U (or V) samples:
+// 1 is handled as planar chroma, 2 with V one byte after U as NV12 and 2 with
+// V one byte before U as NV21. Any other layout is only handled when
+// rotation == 0. dst_y may be NULL to skip the Y plane.
+// Returns 0 on success, or -1 for invalid arguments, an unsupported
+// layout/rotation combination, or a failure reported by the plane helpers.
+LIBYUV_API
+int Android420ToI420Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum RotationMode rotation) {
+ int y;
+ int r;
+ // Byte distance from the U pointer to the V pointer; +1 / -1 identifies
+ // interleaved chroma in the checks below.
+ const ptrdiff_t vu_off = src_v - src_u;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
+ height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ r = RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
+ rotation);
+ if (r != 0) {
+ return r;
+ }
+ }
+
+ // Copy UV planes - I420
+ if (src_pixel_stride_uv == 1) {
+ r = RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+ halfheight, rotation);
+ if (r != 0) {
+ return r;
+ }
+ return RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+ halfheight, rotation);
+ }
+ // Split UV planes - NV21
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
+ // V comes first in memory, so the V pointer is the start of the pair and
+ // the destinations are passed swapped.
+ return SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u,
+ dst_stride_u, halfwidth, halfheight, rotation);
+ }
+ // Split UV planes - NV12
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+ return SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, halfwidth, halfheight, rotation);
+ }
+
+ // Generic pixel stride: gather each channel sample by sample, unrotated only.
+ if (rotation == 0) {
+ for (y = 0; y < halfheight; ++y) {
+ SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth);
+ SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+ }
+ // unsupported type and/or rotation.
+ return -1;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
index b3baf084..88ca7876 100644
--- a/files/source/rotate_any.cc
+++ b/files/source/rotate_any.cc
@@ -35,15 +35,15 @@ TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
-#ifdef HAS_TRANSPOSEWX8_MMI
-TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
#ifdef HAS_TRANSPOSEWX16_MSA
TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
#endif
+#ifdef HAS_TRANSPOSEWX16_LSX
+TANY(TransposeWx16_Any_LSX, TransposeWx16_LSX, 15)
+#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
@@ -65,12 +65,12 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
-#ifdef HAS_TRANSPOSEUVWX8_MMI
-TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
-#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
#endif
+#ifdef HAS_TRANSPOSEUVWX16_LSX
+TUVANY(TransposeUVWx16_Any_LSX, TransposeUVWx16_LSX, 7)
+#endif
#undef TUVANY
#ifdef __cplusplus
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index a93fd55f..539cf98d 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
-static void ARGBTranspose(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
+ // Check stride is a multiple of 4.
+ if (src_stride_argb & 3) {
+ return -1;
+ }
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@@ -56,11 +60,11 @@ static void ARGBTranspose(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
if (IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
}
}
#endif
@@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
+ return 0;
}
-void ARGBRotate90(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate270(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
- ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
+ return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
}
-void ARGBRotate180(const uint8_t* src_argb,
- int src_stride_argb,
- uint8_t* dst_argb,
- int dst_stride_argb,
- int width,
- int height) {
+static int ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@@ -121,7 +126,7 @@ void ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
- if (IS_ALIGNED(width, 4)) {
+ if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -150,11 +155,11 @@ void ARGBRotate180(const uint8_t* src_argb,
}
}
#endif
-#if defined(HAS_ARGBMIRRORROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
- if (IS_ALIGNED(width, 2)) {
- ARGBMirrorRow = ARGBMirrorRow_MMI;
+#if defined(HAS_ARGBMIRRORROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_LASX;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_LASX;
}
}
#endif
@@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
+ return 0;
}
LIBYUV_API
@@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
- ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate270:
- ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
case kRotate180:
- ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
- height);
- return 0;
+ return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height);
default:
break;
}
diff --git a/files/source/rotate_dspr2.cc b/files/source/rotate_dspr2.cc
deleted file mode 100644
index 5d2338de..00000000
--- a/files/source/rotate_dspr2.cc
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/rotate_row.h"
-#include "libyuv/row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "sw $s0, 0(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "sw $s1, 4(%[dst]) \n"
- "bnez %[width], 1b \n"
- " addu %[dst], %[dst], %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lbu $t0, 0(%[src]) \n"
- "lbux $t1, %[src_stride](%[src]) \n"
- "lbux $t8, $t2(%[src]) \n"
- "lbux $t9, $t3(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s0, $t8, $t0 \n"
- "lbux $t0, $t4(%[src]) \n"
- "lbux $t1, $t5(%[src]) \n"
- "lbux $t8, $t6(%[src]) \n"
- "lbux $t9, $t7(%[src]) \n"
- "sll $t1, $t1, 16 \n"
- "sll $t9, $t9, 16 \n"
- "or $t0, $t0, $t1 \n"
- "or $t8, $t8, $t9 \n"
- "precr.qb.ph $s1, $t8, $t0 \n"
- "swr $s0, 0(%[dst]) \n"
- "swl $s0, 3(%[dst]) \n"
- "addiu %[width], -1 \n"
- "addiu %[src], 1 \n"
- "swr $s1, 4(%[dst]) \n"
- "swl $s1, 7(%[dst]) \n"
- "bnez %[width], 11b \n"
- "addu %[dst], %[dst], %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width) {
- __asm__ __volatile__(
- ".set noat \n"
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
-
- "srl $AT, %[width], 0x2 \n"
- "andi $t0, %[dst], 0x3 \n"
- "andi $t1, %[dst_stride], 0x3 \n"
- "or $t0, $t0, $t1 \n"
- "bnez $t0, 11f \n"
- " subu $t7, $t9, %[src_stride] \n"
- // dst + dst_stride word aligned
- "1: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "sw $s4, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $s6, 0($s0) \n"
- "sw $t8, 4($s0) \n"
- "sw $s5, 0($s1) \n"
- "sw $t1, 4($s1) \n"
- "sw $s7, 0($s2) \n"
- "sw $t9, 4($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 1b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "b 2f \n"
- // dst + dst_stride unaligned
- "11: \n"
- "lw $t0, 0(%[src]) \n"
- "lwx $t1, %[src_stride](%[src]) \n"
- "lwx $t8, $t2(%[src]) \n"
- "lwx $t9, $t3(%[src]) \n"
-
- // t0 = | 30 | 20 | 10 | 00 |
- // t1 = | 31 | 21 | 11 | 01 |
- // t8 = | 32 | 22 | 12 | 02 |
- // t9 = | 33 | 23 | 13 | 03 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 21 | 01 | 20 | 00 |
- // s1 = | 23 | 03 | 22 | 02 |
- // s2 = | 31 | 11 | 30 | 10 |
- // s3 = | 33 | 13 | 32 | 12 |
-
- "precr.qb.ph $s4, $s1, $s0 \n"
- "precrq.qb.ph $s5, $s1, $s0 \n"
- "precr.qb.ph $s6, $s3, $s2 \n"
- "precrq.qb.ph $s7, $s3, $s2 \n"
-
- // s4 = | 03 | 02 | 01 | 00 |
- // s5 = | 23 | 22 | 21 | 20 |
- // s6 = | 13 | 12 | 11 | 10 |
- // s7 = | 33 | 32 | 31 | 30 |
-
- "lwx $t0, $t4(%[src]) \n"
- "lwx $t1, $t5(%[src]) \n"
- "lwx $t8, $t6(%[src]) \n"
- "lwx $t9, $t7(%[src]) \n"
-
- // t0 = | 34 | 24 | 14 | 04 |
- // t1 = | 35 | 25 | 15 | 05 |
- // t8 = | 36 | 26 | 16 | 06 |
- // t9 = | 37 | 27 | 17 | 07 |
-
- "precr.qb.ph $s0, $t1, $t0 \n"
- "precr.qb.ph $s1, $t9, $t8 \n"
- "precrq.qb.ph $s2, $t1, $t0 \n"
- "precrq.qb.ph $s3, $t9, $t8 \n"
-
- // s0 = | 25 | 05 | 24 | 04 |
- // s1 = | 27 | 07 | 26 | 06 |
- // s2 = | 35 | 15 | 34 | 14 |
- // s3 = | 37 | 17 | 36 | 16 |
-
- "precr.qb.ph $t0, $s1, $s0 \n"
- "precrq.qb.ph $t1, $s1, $s0 \n"
- "precr.qb.ph $t8, $s3, $s2 \n"
- "precrq.qb.ph $t9, $s3, $s2 \n"
-
- // t0 = | 07 | 06 | 05 | 04 |
- // t1 = | 27 | 26 | 25 | 24 |
- // t8 = | 17 | 16 | 15 | 14 |
- // t9 = | 37 | 36 | 35 | 34 |
-
- "addu $s0, %[dst], %[dst_stride] \n"
- "addu $s1, $s0, %[dst_stride] \n"
- "addu $s2, $s1, %[dst_stride] \n"
-
- "swr $s4, 0(%[dst]) \n"
- "swl $s4, 3(%[dst]) \n"
- "swr $t0, 4(%[dst]) \n"
- "swl $t0, 7(%[dst]) \n"
- "swr $s6, 0($s0) \n"
- "swl $s6, 3($s0) \n"
- "swr $t8, 4($s0) \n"
- "swl $t8, 7($s0) \n"
- "swr $s5, 0($s1) \n"
- "swl $s5, 3($s1) \n"
- "swr $t1, 4($s1) \n"
- "swl $t1, 7($s1) \n"
- "swr $s7, 0($s2) \n"
- "swl $s7, 3($s2) \n"
- "swr $t9, 4($s2) \n"
- "swl $t9, 7($s2) \n"
-
- "addiu $AT, -1 \n"
- "addiu %[src], 4 \n"
-
- "bnez $AT, 11b \n"
- " addu %[dst], $s2, %[dst_stride] \n"
- "2: \n"
- ".set pop \n"
- ".set at \n"
- : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
- : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6", "s7");
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst_a,
- int dst_stride_a,
- uint8* dst_b,
- int dst_stride_b,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "beqz %[width], 2f \n"
- " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
- "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
- "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
- "addu $t3, $t2, %[src_stride] \n"
- "addu $t5, $t4, %[src_stride] \n"
- "addu $t6, $t2, $t4 \n"
- "subu $t7, $t9, %[src_stride] \n"
- "srl $t1, %[width], 1 \n"
-
- // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
- "andi $t0, %[dst_a], 0x3 \n"
- "andi $t8, %[dst_b], 0x3 \n"
- "or $t0, $t0, $t8 \n"
- "andi $t8, %[dst_stride_a], 0x3 \n"
- "andi $s5, %[dst_stride_b], 0x3 \n"
- "or $t8, $t8, $s5 \n"
- "or $t0, $t0, $t8 \n"
- "bnez $t0, 11f \n"
- " nop \n"
- // dst + dst_stride word aligned (both, a & b dst addresses)
- "1: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "sw $s3, 0($s5) \n"
- "sw $s4, 0($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "sw $s3, 0(%[dst_a]) \n"
- "sw $s4, 0(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
- "sw $s3, 4($s5) \n"
- "sw $s4, 4($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "sw $s3, 4(%[dst_a]) \n"
- "sw $s4, 4(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 1b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
- "b 2f \n"
- " nop \n"
-
- // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
- "11: \n"
- "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
- "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
- "addu $s5, %[dst_a], %[dst_stride_a] \n"
- "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
- "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
- "addu $s6, %[dst_b], %[dst_stride_b] \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
-
- "swr $s3, 0($s5) \n"
- "swl $s3, 3($s5) \n"
- "swr $s4, 0($s6) \n"
- "swl $s4, 3($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
-
- "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
- "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
- "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
- "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
- "swr $s3, 0(%[dst_a]) \n"
- "swl $s3, 3(%[dst_a]) \n"
- "swr $s4, 0(%[dst_b]) \n"
- "swl $s4, 3(%[dst_b]) \n"
-
- "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
- "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
- "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
-
- "sll $t0, $t0, 16 \n"
- "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
- "sll $t9, $t9, 16 \n"
- "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
-
- "swr $s3, 4($s5) \n"
- "swl $s3, 7($s5) \n"
- "swr $s4, 4($s6) \n"
- "swl $s4, 7($s6) \n"
-
- "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
- "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
-
- "addiu %[src], 4 \n"
- "addiu $t1, -1 \n"
- "sll $t0, %[dst_stride_a], 1 \n"
- "sll $t8, %[dst_stride_b], 1 \n"
- "swr $s3, 4(%[dst_a]) \n"
- "swl $s3, 7(%[dst_a]) \n"
- "swr $s4, 4(%[dst_b]) \n"
- "swl $s4, 7(%[dst_b]) \n"
- "addu %[dst_a], %[dst_a], $t0 \n"
- "bnez $t1, 11b \n"
- " addu %[dst_b], %[dst_b], $t8 \n"
-
- "2: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
- [width] "+r"(width), [src_stride] "+r"(src_stride)
- : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
- "s2", "s3", "s4", "s5", "s6");
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
index 04e19e29..1a3f8cbb 100644
--- a/files/source/rotate_gcc.cc
+++ b/files/source/rotate_gcc.cc
@@ -17,8 +17,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
@@ -31,75 +30,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movq (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "movq (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movq (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "movq (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movq (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "lea 0x8(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "neg %3 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +120,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%3),%%xmm1 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm9 \n"
- "palignr $0x8,%%xmm1,%%xmm1 \n"
- "palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqu (%0,%3),%%xmm3 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm2,%%xmm10 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm10 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "movdqa %%xmm10,%%xmm11 \n"
- "movdqu (%0),%%xmm4 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqu (%0,%3),%%xmm5 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm4,%%xmm12 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm12 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "movdqa %%xmm12,%%xmm13 \n"
- "movdqu (%0),%%xmm6 \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqu (%0,%3),%%xmm7 \n"
- "lea (%0,%3,2),%0 \n"
- "movdqa %%xmm6,%%xmm14 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "punpckhbw %%xmm7,%%xmm14 \n"
- "neg %3 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "movdqa %%xmm14,%%xmm15 \n"
- "lea 0x10(%0,%3,8),%0 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "neg %3 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
// Second round of bit swap.
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "palignr $0x8,%%xmm2,%%xmm2 \n"
- "palignr $0x8,%%xmm3,%%xmm3 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "movdqa %%xmm5,%%xmm7 \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "punpcklwd %%xmm10,%%xmm8 \n"
- "punpcklwd %%xmm11,%%xmm9 \n"
- "movdqa %%xmm8,%%xmm10 \n"
- "movdqa %%xmm9,%%xmm11 \n"
- "palignr $0x8,%%xmm10,%%xmm10 \n"
- "palignr $0x8,%%xmm11,%%xmm11 \n"
- "punpcklwd %%xmm14,%%xmm12 \n"
- "punpcklwd %%xmm15,%%xmm13 \n"
- "movdqa %%xmm12,%%xmm14 \n"
- "movdqa %%xmm13,%%xmm15 \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "punpckldq %%xmm4,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "palignr $0x8,%%xmm4,%%xmm4 \n"
- "movq %%xmm4,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "movq %%xmm2,(%1) \n"
- "palignr $0x8,%%xmm6,%%xmm6 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movq %%xmm6,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm1,%%xmm5 \n"
- "movq %%xmm1,(%1) \n"
- "palignr $0x8,%%xmm5,%%xmm5 \n"
- "movq %%xmm5,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movq %%xmm3,(%1) \n"
- "movdqa %%xmm3,%%xmm7 \n"
- "palignr $0x8,%%xmm7,%%xmm7 \n"
- "movq %%xmm7,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm12,%%xmm8 \n"
- "movq %%xmm8,(%1) \n"
- "movdqa %%xmm8,%%xmm12 \n"
- "palignr $0x8,%%xmm12,%%xmm12 \n"
- "movq %%xmm12,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm14,%%xmm10 \n"
- "movdqa %%xmm10,%%xmm14 \n"
- "movq %%xmm10,(%1) \n"
- "palignr $0x8,%%xmm14,%%xmm14 \n"
- "punpckldq %%xmm13,%%xmm9 \n"
- "movq %%xmm14,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "movdqa %%xmm9,%%xmm13 \n"
- "movq %%xmm9,(%1) \n"
- "palignr $0x8,%%xmm13,%%xmm13 \n"
- "movq %%xmm13,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "punpckldq %%xmm15,%%xmm11 \n"
- "movq %%xmm11,(%1) \n"
- "movdqa %%xmm11,%%xmm15 \n"
- "palignr $0x8,%%xmm15,%%xmm15 \n"
- "sub $0x10,%2 \n"
- "movq %%xmm15,(%1,%4) \n"
- "lea (%1,%4,2),%1 \n"
- "jg 1b \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +265,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%4),%%xmm1 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm0,%%xmm8 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu (%0,%4),%%xmm3 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "punpckhbw %%xmm3,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm3 \n"
- "movdqu (%0),%%xmm4 \n"
- "movdqu (%0,%4),%%xmm5 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "punpcklbw %%xmm5,%%xmm4 \n"
- "punpckhbw %%xmm5,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu (%0,%4),%%xmm7 \n"
- "lea (%0,%4,2),%0 \n"
- "movdqa %%xmm6,%%xmm8 \n"
- "punpcklbw %%xmm7,%%xmm6 \n"
- "neg %4 \n"
- "lea 0x10(%0,%4,8),%0 \n"
- "punpckhbw %%xmm7,%%xmm8 \n"
- "movdqa %%xmm8,%%xmm7 \n"
- "neg %4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
// Second round of bit swap.
- "movdqa %%xmm0,%%xmm8 \n"
- "movdqa %%xmm1,%%xmm9 \n"
- "punpckhwd %%xmm2,%%xmm8 \n"
- "punpckhwd %%xmm3,%%xmm9 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpcklwd %%xmm3,%%xmm1 \n"
- "movdqa %%xmm8,%%xmm2 \n"
- "movdqa %%xmm9,%%xmm3 \n"
- "movdqa %%xmm4,%%xmm8 \n"
- "movdqa %%xmm5,%%xmm9 \n"
- "punpckhwd %%xmm6,%%xmm8 \n"
- "punpckhwd %%xmm7,%%xmm9 \n"
- "punpcklwd %%xmm6,%%xmm4 \n"
- "punpcklwd %%xmm7,%%xmm5 \n"
- "movdqa %%xmm8,%%xmm6 \n"
- "movdqa %%xmm9,%%xmm7 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
- "movdqa %%xmm0,%%xmm8 \n"
- "punpckldq %%xmm4,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n" // Write back U channel
- "movhpd %%xmm0,(%2) \n" // Write back V channel
- "punpckhdq %%xmm4,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm2,%%xmm8 \n"
- "punpckldq %%xmm6,%%xmm2 \n"
- "movlpd %%xmm2,(%1) \n"
- "movhpd %%xmm2,(%2) \n"
- "punpckhdq %%xmm6,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm1,%%xmm8 \n"
- "punpckldq %%xmm5,%%xmm1 \n"
- "movlpd %%xmm1,(%1) \n"
- "movhpd %%xmm1,(%2) \n"
- "punpckhdq %%xmm5,%%xmm8 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "movdqa %%xmm3,%%xmm8 \n"
- "punpckldq %%xmm7,%%xmm3 \n"
- "movlpd %%xmm3,(%1) \n"
- "movhpd %%xmm3,(%2) \n"
- "punpckhdq %%xmm7,%%xmm8 \n"
- "sub $0x8,%3 \n"
- "movlpd %%xmm8,(%1,%5) \n"
- "lea (%1,%5,2),%1 \n"
- "movhpd %%xmm8,(%2,%6) \n"
- "lea (%2,%6,2),%2 \n"
- "jg 1b \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
diff --git a/files/source/rotate_lsx.cc b/files/source/rotate_lsx.cc
new file mode 100644
index 00000000..94a2b91c
--- /dev/null
+++ b/files/source/rotate_lsx.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ILVLH_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do { /* 8-bit vilvl/vilvh over the (in0,in1) and (in2,in3) pairs */ \
+    DUP2_ARG2(__lsx_vilvl_b, in1, in0, in3, in2, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_b, in1, in0, in3, in2, out1, out3); \
+  } while (0) /* one statement, so "ILVLH_B(...);" is safe before an else */
+
+#define ILVLH_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do { /* 16-bit vilvl/vilvh over the (in0,in1) and (in2,in3) pairs */ \
+    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, out1, out3); \
+  } while (0) /* one statement, so "ILVLH_H(...);" is safe before an else */
+
+#define ILVLH_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do { /* 32-bit vilvl/vilvh over the (in0,in1) and (in2,in3) pairs */ \
+    DUP2_ARG2(__lsx_vilvl_w, in1, in0, in3, in2, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_w, in1, in0, in3, in2, out1, out3); \
+  } while (0) /* one statement, so "ILVLH_W(...);" is safe before an else */
+
+#define ILVLH_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do { /* 64-bit vilvl/vilvh over the (in0,in1) and (in2,in3) pairs */ \
+    DUP2_ARG2(__lsx_vilvl_d, in1, in0, in3, in2, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_d, in1, in0, in3, in2, out1, out3); \
+  } while (0) /* one statement, so "ILVLH_D(...);" is safe before an else */
+
+#define LSX_ST_4(_dst0, _dst1, _dst2, _dst3, _dst, _stride, _stride2, \
+                 _stride3, _stride4) \
+  do { /* vst/vstx at offsets 0, _stride, _stride2, _stride3 from _dst */ \
+    __lsx_vst(_dst0, _dst, 0); \
+    __lsx_vstx(_dst1, _dst, _stride); \
+    __lsx_vstx(_dst2, _dst, _stride2); \
+    __lsx_vstx(_dst3, _dst, _stride3); \
+    (_dst) += (_stride4); /* side effect: moves the caller's pointer */ \
+  } while (0) /* one statement, so "LSX_ST_4(...);" is safe before an else */
+
+#define LSX_ST_2(_dst0, _dst1, _dst, _stride, _stride2) \
+  do { /* vst/vstx at offsets 0 and _stride from _dst */ \
+    __lsx_vst(_dst0, _dst, 0); \
+    __lsx_vstx(_dst1, _dst, _stride); \
+    (_dst) += (_stride2); /* side effect: moves the caller's pointer */ \
+  } while (0) /* one statement, so "LSX_ST_2(...);" is safe before an else */
+
+// 16-row variant built from two TransposeWx8_C calls: source rows 0..7, then
+// rows 8..15 with the output starting 8 bytes into each destination row.
+// width is forwarded unchanged to both calls.
+void TransposeWx16_C(const uint8_t* src, int src_stride,
+                     uint8_t* dst, int dst_stride, int width) {
+  const uint8_t* lower_rows = src + 8 * src_stride;
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C(lower_rows, src_stride, dst + 8, dst_stride, width);
+}
+
+// 16-row variant built from two TransposeUVWx8_C calls: source rows 0..7,
+// then rows 8..15 with both outputs starting 8 bytes into each dst_a / dst_b
+// row. width is forwarded unchanged to both calls.
+void TransposeUVWx16_C(const uint8_t* src, int src_stride,
+                       uint8_t* dst_a, int dst_stride_a,
+                       uint8_t* dst_b, int dst_stride_b, int width) {
+  const uint8_t* lower_rows = src + 8 * src_stride;
+  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+  TransposeUVWx8_C(lower_rows, src_stride, dst_a + 8, dst_stride_a, dst_b + 8,
+                   dst_stride_b, width);
+}
+// LSX kernel: each loop pass loads one vector from each of 16 source rows and stores 16 vectors to 16 dst rows.
+void TransposeWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ int x;
+ int len = width / 16;  // whole 16-column tiles only; a width % 16 remainder is not processed here
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;  // 2x, 3x and 4x strides: vldx/vstx offsets and per-group advance
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride2 = dst_stride << 1;
+ int dst_stride3 = dst_stride + dst_stride2;
+ int dst_stride4 = dst_stride2 << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+ // Per tile: four groups of four row loads, interleaved at b/h/w/d element width, then four LSX_ST_4 stores.
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;  // row cursor, restarted at the first row for every tile
+ src0 = __lsx_vld(s, 0);  // source rows 0-3
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);  // source rows 4-7
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);  // merge rows 0-3 with rows 4-7
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);  // source rows 8-11
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);  // reg0-3 reused; res0-7 already hold rows 0-7
+ src0 = __lsx_vld(s, 0);  // source rows 12-15
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);  // rows 8-15 word interleave, paired with res0/res1 from rows 0-7
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);  // 4 stores; dst moves on by dst_stride4
+ res8 = __lsx_vilvl_w(reg5, reg1);  // paired with res2/res3
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg6, reg2);  // paired with res4/res5
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ res8 = __lsx_vilvl_w(reg7, reg3);  // paired with res6/res7
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_4(dst0, dst1, dst2, dst3, dst, dst_stride, dst_stride2, dst_stride3,
+ dst_stride4);
+ src += 16;  // next tile: 16 bytes further along every source row
+ }
+}
+// LSX kernel: each loop pass loads one vector from each of 16 source rows and stores 8 vectors to dst_a and 8 to dst_b.
+void TransposeUVWx16_LSX(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ int x;
+ int len = width / 8;  // whole groups of 8 only; a width % 8 remainder is not processed here
+ uint8_t* s;
+ int src_stride2 = src_stride << 1;  // 2x, 3x and 4x strides: vldx offsets and per-group advance
+ int src_stride3 = src_stride + src_stride2;
+ int src_stride4 = src_stride2 << 1;
+ int dst_stride_a2 = dst_stride_a << 1;  // LSX_ST_2 advances each output by two rows
+ int dst_stride_b2 = dst_stride_b << 1;
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+ // Per pass: four groups of four row loads, interleaved at b/h/w/d element width, then stores split between dst_a and dst_b.
+ for (x = 0; x < len; x++) {
+ s = (uint8_t*)src;  // row cursor, restarted at the first row for every pass
+ src0 = __lsx_vld(s, 0);  // source rows 0-3
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);
+ src0 = __lsx_vld(s, 0);  // source rows 4-7
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ ILVLH_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);  // merge rows 0-3 with rows 4-7
+ ILVLH_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+ src0 = __lsx_vld(s, 0);  // source rows 8-11
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg0, reg1, reg2, reg3);  // reg0-3 reused; res0-7 already hold rows 0-7
+ src0 = __lsx_vld(s, 0);  // source rows 12-15
+ src1 = __lsx_vldx(s, src_stride);
+ src2 = __lsx_vldx(s, src_stride2);
+ src3 = __lsx_vldx(s, src_stride3);
+ s += src_stride4;
+ ILVLH_B(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
+ ILVLH_H(tmp0, tmp2, tmp1, tmp3, reg4, reg5, reg6, reg7);
+ res8 = __lsx_vilvl_w(reg4, reg0);  // rows 8-15 word interleave, paired with res0/res1 from rows 0-7
+ res9 = __lsx_vilvh_w(reg4, reg0);
+ ILVLH_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);  // even results -> 2 rows of dst_a
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);  // odd results -> 2 rows of dst_b
+ res8 = __lsx_vilvl_w(reg5, reg1);  // paired with res2/res3
+ res9 = __lsx_vilvh_w(reg5, reg1);
+ ILVLH_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg6, reg2);  // paired with res4/res5
+ res9 = __lsx_vilvh_w(reg6, reg2);
+ ILVLH_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ res8 = __lsx_vilvl_w(reg7, reg3);  // paired with res6/res7
+ res9 = __lsx_vilvh_w(reg7, reg3);
+ ILVLH_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+ LSX_ST_2(dst0, dst2, dst_a, dst_stride_a, dst_stride_a2);
+ LSX_ST_2(dst1, dst3, dst_b, dst_stride_b, dst_stride_b2);
+ src += 16;  // 16 bytes per pass, i.e. 2 bytes per counted column (presumably one U/V pair - confirm with caller)
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index fdc0dd47..844df2bf 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld1.8 {d0}, [%0], %2 \n"
- "vld1.8 {d1}, [%0], %2 \n"
- "vld1.8 {d2}, [%0], %2 \n"
- "vld1.8 {d3}, [%0], %2 \n"
- "vld1.8 {d4}, [%0], %2 \n"
- "vld1.8 {d5}, [%0], %2 \n"
- "vld1.8 {d6}, [%0], %2 \n"
- "vld1.8 {d7}, [%0] \n"
-
- "vtrn.8 d1, d0 \n"
- "vtrn.8 d3, d2 \n"
- "vtrn.8 d5, d4 \n"
- "vtrn.8 d7, d6 \n"
-
- "vtrn.16 d1, d3 \n"
- "vtrn.16 d0, d2 \n"
- "vtrn.16 d5, d7 \n"
- "vtrn.16 d4, d6 \n"
-
- "vtrn.32 d1, d5 \n"
- "vtrn.32 d0, d4 \n"
- "vtrn.32 d3, d7 \n"
- "vtrn.32 d2, d6 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d1}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d3}, [%0], %4 \n"
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d5}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d7}, [%0], %4 \n"
- "vst1.8 {d6}, [%0] \n"
-
- "add %1, #8 \n" // src += 8
- "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
- "subs %5, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld1.8 {d0}, [%0], %2 \n"
+ "vld1.8 {d1}, [%0], %2 \n"
+ "vld1.8 {d2}, [%0], %2 \n"
+ "vld1.8 {d3}, [%0], %2 \n"
+ "vld1.8 {d4}, [%0], %2 \n"
+ "vld1.8 {d5}, [%0], %2 \n"
+ "vld1.8 {d6}, [%0], %2 \n"
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d1}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d3}, [%0], %4 \n"
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d5}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d7}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "vld2.8 {d0, d1}, [%0], %2 \n"
- "vld2.8 {d2, d3}, [%0], %2 \n"
- "vld2.8 {d4, d5}, [%0], %2 \n"
- "vld2.8 {d6, d7}, [%0], %2 \n"
- "vld2.8 {d16, d17}, [%0], %2 \n"
- "vld2.8 {d18, d19}, [%0], %2 \n"
- "vld2.8 {d20, d21}, [%0], %2 \n"
- "vld2.8 {d22, d23}, [%0] \n"
-
- "vtrn.8 q1, q0 \n"
- "vtrn.8 q3, q2 \n"
- "vtrn.8 q9, q8 \n"
- "vtrn.8 q11, q10 \n"
-
- "vtrn.16 q1, q3 \n"
- "vtrn.16 q0, q2 \n"
- "vtrn.16 q9, q11 \n"
- "vtrn.16 q8, q10 \n"
-
- "vtrn.32 q1, q9 \n"
- "vtrn.32 q0, q8 \n"
- "vtrn.32 q3, q11 \n"
- "vtrn.32 q2, q10 \n"
-
- "vrev16.8 q0, q0 \n"
- "vrev16.8 q1, q1 \n"
- "vrev16.8 q2, q2 \n"
- "vrev16.8 q3, q3 \n"
- "vrev16.8 q8, q8 \n"
- "vrev16.8 q9, q9 \n"
- "vrev16.8 q10, q10 \n"
- "vrev16.8 q11, q11 \n"
-
- "mov %0, %3 \n"
-
- "vst1.8 {d2}, [%0], %4 \n"
- "vst1.8 {d0}, [%0], %4 \n"
- "vst1.8 {d6}, [%0], %4 \n"
- "vst1.8 {d4}, [%0], %4 \n"
- "vst1.8 {d18}, [%0], %4 \n"
- "vst1.8 {d16}, [%0], %4 \n"
- "vst1.8 {d22}, [%0], %4 \n"
- "vst1.8 {d20}, [%0] \n"
-
- "mov %0, %5 \n"
-
- "vst1.8 {d3}, [%0], %6 \n"
- "vst1.8 {d1}, [%0], %6 \n"
- "vst1.8 {d7}, [%0], %6 \n"
- "vst1.8 {d5}, [%0], %6 \n"
- "vst1.8 {d19}, [%0], %6 \n"
- "vst1.8 {d17}, [%0], %6 \n"
- "vst1.8 {d23}, [%0], %6 \n"
- "vst1.8 {d21}, [%0] \n"
-
- "add %1, #8*2 \n" // src += 8*2
- "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %7, #8 \n" // w -= 8
- "bge 1b \n"
+ "mov %0, %1 \n"
+
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ "vst1.8 {d2}, [%0], %4 \n"
+ "vst1.8 {d0}, [%0], %4 \n"
+ "vst1.8 {d6}, [%0], %4 \n"
+ "vst1.8 {d4}, [%0], %4 \n"
+ "vst1.8 {d18}, [%0], %4 \n"
+ "vst1.8 {d16}, [%0], %4 \n"
+ "vst1.8 {d22}, [%0], %4 \n"
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ "vst1.8 {d3}, [%0], %6 \n"
+ "vst1.8 {d1}, [%0], %6 \n"
+ "vst1.8 {d7}, [%0], %6 \n"
+ "vst1.8 {d5}, [%0], %6 \n"
+ "vst1.8 {d19}, [%0], %6 \n"
+ "vst1.8 {d17}, [%0], %6 \n"
+ "vst1.8 {d23}, [%0], %6 \n"
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
index f469baac..43c15817 100644
--- a/files/source/rotate_neon64.cc
+++ b/files/source/rotate_neon64.cc
@@ -34,58 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w3, %w3, #8 \n"
+ "sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
- "ld1 {v0.8b}, [%0], %5 \n"
- "ld1 {v1.8b}, [%0], %5 \n"
- "ld1 {v2.8b}, [%0], %5 \n"
- "ld1 {v3.8b}, [%0], %5 \n"
- "ld1 {v4.8b}, [%0], %5 \n"
- "ld1 {v5.8b}, [%0], %5 \n"
- "ld1 {v6.8b}, [%0], %5 \n"
- "ld1 {v7.8b}, [%0] \n"
-
- "trn2 v16.8b, v0.8b, v1.8b \n"
- "trn1 v17.8b, v0.8b, v1.8b \n"
- "trn2 v18.8b, v2.8b, v3.8b \n"
- "trn1 v19.8b, v2.8b, v3.8b \n"
- "trn2 v20.8b, v4.8b, v5.8b \n"
- "trn1 v21.8b, v4.8b, v5.8b \n"
- "trn2 v22.8b, v6.8b, v7.8b \n"
- "trn1 v23.8b, v6.8b, v7.8b \n"
-
- "trn2 v3.4h, v17.4h, v19.4h \n"
- "trn1 v1.4h, v17.4h, v19.4h \n"
- "trn2 v2.4h, v16.4h, v18.4h \n"
- "trn1 v0.4h, v16.4h, v18.4h \n"
- "trn2 v7.4h, v21.4h, v23.4h \n"
- "trn1 v5.4h, v21.4h, v23.4h \n"
- "trn2 v6.4h, v20.4h, v22.4h \n"
- "trn1 v4.4h, v20.4h, v22.4h \n"
-
- "trn2 v21.2s, v1.2s, v5.2s \n"
- "trn1 v17.2s, v1.2s, v5.2s \n"
- "trn2 v20.2s, v0.2s, v4.2s \n"
- "trn1 v16.2s, v0.2s, v4.2s \n"
- "trn2 v23.2s, v3.2s, v7.2s \n"
- "trn1 v19.2s, v3.2s, v7.2s \n"
- "trn2 v22.2s, v2.2s, v6.2s \n"
- "trn1 v18.2s, v2.2s, v6.2s \n"
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 1
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 2
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "add %0, %0, %5 \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 3
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+ "add %0, %0, %5 \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 4
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 5
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 6
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "add %0, %0, %5 \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "prfm pldl1keep, [%0, 448] \n" // row 7
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
- "st1 {v17.8b}, [%0], %6 \n"
- "st1 {v16.8b}, [%0], %6 \n"
- "st1 {v19.8b}, [%0], %6 \n"
- "st1 {v18.8b}, [%0], %6 \n"
- "st1 {v21.8b}, [%0], %6 \n"
- "st1 {v20.8b}, [%0], %6 \n"
- "st1 {v23.8b}, [%0], %6 \n"
- "st1 {v22.8b}, [%0] \n"
+ "st1 {v17.8b}, [%0], %6 \n"
+ "st1 {v16.8b}, [%0], %6 \n"
+ "st1 {v19.8b}, [%0], %6 \n"
+ "st1 {v18.8b}, [%0], %6 \n"
+ "st1 {v21.8b}, [%0], %6 \n"
+ "st1 {v20.8b}, [%0], %6 \n"
+ "st1 {v23.8b}, [%0], %6 \n"
+ "st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -94,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w3, %w3, #8 \n"
- "b.eq 4f \n"
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w3, #2 \n"
- "b.lt 3f \n"
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %w3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
// 4x8 block
- "mov %0, %1 \n"
- "ld1 {v0.s}[0], [%0], %5 \n"
- "ld1 {v0.s}[1], [%0], %5 \n"
- "ld1 {v0.s}[2], [%0], %5 \n"
- "ld1 {v0.s}[3], [%0], %5 \n"
- "ld1 {v1.s}[0], [%0], %5 \n"
- "ld1 {v1.s}[1], [%0], %5 \n"
- "ld1 {v1.s}[2], [%0], %5 \n"
- "ld1 {v1.s}[3], [%0] \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -212,89 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
- "sub %w4, %w4, #8 \n"
+ "sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
- "mov %0, %1 \n"
-
- "ld1 {v0.16b}, [%0], %5 \n"
- "ld1 {v1.16b}, [%0], %5 \n"
- "ld1 {v2.16b}, [%0], %5 \n"
- "ld1 {v3.16b}, [%0], %5 \n"
- "ld1 {v4.16b}, [%0], %5 \n"
- "ld1 {v5.16b}, [%0], %5 \n"
- "ld1 {v6.16b}, [%0], %5 \n"
- "ld1 {v7.16b}, [%0] \n"
-
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
-
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
-
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "mov %0, %1 \n"
- "mov %0, %2 \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
+ "mov %0, %1 \n"
- "st1 {v16.d}[0], [%0], %6 \n"
- "st1 {v18.d}[0], [%0], %6 \n"
- "st1 {v17.d}[0], [%0], %6 \n"
- "st1 {v19.d}[0], [%0], %6 \n"
- "st1 {v16.d}[1], [%0], %6 \n"
- "st1 {v18.d}[1], [%0], %6 \n"
- "st1 {v17.d}[1], [%0], %6 \n"
- "st1 {v19.d}[1], [%0] \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %3 \n"
+ "mov %0, %2 \n"
- "st1 {v20.d}[0], [%0], %7 \n"
- "st1 {v22.d}[0], [%0], %7 \n"
- "st1 {v21.d}[0], [%0], %7 \n"
- "st1 {v23.d}[0], [%0], %7 \n"
- "st1 {v20.d}[1], [%0], %7 \n"
- "st1 {v22.d}[1], [%0], %7 \n"
- "st1 {v21.d}[1], [%0], %7 \n"
- "st1 {v23.d}[1], [%0] \n"
-
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
- "subs %w4, %w4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
- "adds %w4, %w4, #8 \n"
- "b.eq 4f \n"
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
- "cmp %w4, #2 \n"
- "b.lt 3f \n"
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %w4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
index e887dd52..a78873f8 100644
--- a/files/source/rotate_win.cc
+++ b/files/source/rotate_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
index 06ca723a..3781a9f2 100644
--- a/files/source/row_any.cc
+++ b/files/source/row_any.cc
@@ -30,6 +30,39 @@ extern "C" {
// Subsampled source needs to be increase by 1 of not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+// Any 4 planes to 1: lets a fixed-step row kernel be called with any width.
+#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+               int width) { \
+    int tail = width & MASK;  /* pixels left for the staged pass */ \
+    int body = width & ~MASK; /* pixels handed to the kernel directly */ \
+    SIMD_ALIGNED(uint8_t buf[64 * 5]); /* 4 input slots + 1 output slot */ \
+    memset(buf, 0, 64 * 4); /* for msan */ \
+    if (body > 0) { \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, body); \
+    } \
+    memcpy(buf, y_buf + body, tail); \
+    memcpy(buf + 64, u_buf + (body >> UVSHIFT), SS(tail, UVSHIFT)); \
+    memcpy(buf + 128, v_buf + (body >> UVSHIFT), SS(tail, UVSHIFT)); \
+    memcpy(buf + 192, a_buf + body, tail); \
+    ANY_SIMD(buf, buf + 64, buf + 128, buf + 192, buf + 256, MASK + 1); \
+    memcpy(dst_ptr + (body >> DUVSHIFT) * BPP, buf + 256, \
+           SS(tail, DUVSHIFT) * BPP); \
+  }
+
+#ifdef HAS_MERGEARGBROW_SSE2  // args: UVSHIFT 0, DUVSHIFT 0, BPP 4, MASK 7 (staged pass runs 8 pixels)
+ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEARGBROW_AVX2  // MASK 15: staged pass runs 16 pixels
+ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEARGBROW_NEON  // MASK 15: staged pass runs 16 pixels
+ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
+#endif
+
+// Note that odd width replication includes 444 due to implementation
+// on arm that subsamples 444 to 422 internally.
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
@@ -46,26 +79,163 @@ extern "C" {
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
+ if (width & 1) { \
+ temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ } \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7)
+#endif
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_AVX2
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#endif
+#ifdef HAS_I444ALPHATOARGBROW_NEON
+ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7)
+#endif
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
+#ifdef HAS_I444ALPHATOARGBROW_MSA
+ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7)
+#endif
#ifdef HAS_I422ALPHATOARGBROW_MSA
ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422ALPHATOARGBROW_LASX
+ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
+#endif
#undef ANY41C
+// Any 4 planes of type T to 1 plane of 8 bit, with yuvconstants, any width.
+#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \
+               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+               int width) { \
+    int tail = width & MASK;  /* pixels left for the staged pass */ \
+    int body = width & ~MASK; /* pixels handed to the kernel directly */ \
+    SIMD_ALIGNED(T buf[16 * 4]); /* four input slots of 16 elements */ \
+    SIMD_ALIGNED(uint8_t result[64]); \
+    memset(buf, 0, 16 * 4 * SBPP); /* zero all four input slots (msan) */ \
+    if (body > 0) { \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, body); \
+    } \
+    memcpy(buf, y_buf + body, tail * SBPP); \
+    memcpy(buf + 16, u_buf + (body >> UVSHIFT), SS(tail, UVSHIFT) * SBPP); \
+    memcpy(buf + 32, v_buf + (body >> UVSHIFT), SS(tail, UVSHIFT) * SBPP); \
+    memcpy(buf + 48, a_buf + body, tail * SBPP); \
+    ANY_SIMD(buf, buf + 16, buf + 32, buf + 48, result, yuvconstants, \
+             MASK + 1); \
+    memcpy(dst_ptr + (body >> DUVSHIFT) * BPP, result, SS(tail, DUVSHIFT) * BPP); \
+  }
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+ANY41CT(I210AlphaToARGBRow_Any_SSSE3,  // NAMEANY: generated wrapper
+ I210AlphaToARGBRow_SSSE3,  // ANY_SIMD: kernel being wrapped
+ 1,  // UVSHIFT
+ 0,  // DUVSHIFT
+ uint16_t,  // T: source element type
+ 2,  // SBPP: bytes per source element
+ 4,  // BPP: destination bytes per pixel
+ 7)  // MASK: staged pass runs MASK + 1 = 8 pixels
+#endif
+
+#ifdef HAS_I210ALPHATOARGBROW_AVX2
+ANY41CT(I210AlphaToARGBRow_Any_AVX2,  // NAMEANY
+ I210AlphaToARGBRow_AVX2,  // ANY_SIMD
+ 1,  // UVSHIFT
+ 0,  // DUVSHIFT
+ uint16_t,  // T
+ 2,  // SBPP
+ 4,  // BPP
+ 15)  // MASK: staged pass runs 16 pixels
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+ANY41CT(I410AlphaToARGBRow_Any_SSSE3,  // NAMEANY
+ I410AlphaToARGBRow_SSSE3,  // ANY_SIMD
+ 0,  // UVSHIFT
+ 0,  // DUVSHIFT
+ uint16_t,  // T
+ 2,  // SBPP
+ 4,  // BPP
+ 7)  // MASK: staged pass runs 8 pixels
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_AVX2
+ANY41CT(I410AlphaToARGBRow_Any_AVX2,  // NAMEANY
+ I410AlphaToARGBRow_AVX2,  // ANY_SIMD
+ 0,  // UVSHIFT
+ 0,  // DUVSHIFT
+ uint16_t,  // T
+ 2,  // SBPP
+ 4,  // BPP
+ 15)  // MASK: staged pass runs 16 pixels
+#endif
+
+#undef ANY41CT
+
+// Any 4 planes to 1 plane with a depth parameter, for any width.
+#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+  void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+               const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \
+    int tail = width & MASK;  /* pixels left for the staged pass */ \
+    int body = width & ~MASK; /* pixels handed to the kernel directly */ \
+    SIMD_ALIGNED(STYPE buf[16 * 4]); /* four input slots of 16 elements */ \
+    SIMD_ALIGNED(DTYPE result[64]); \
+    memset(buf, 0, 16 * 4 * SBPP); /* zero all four input slots (msan) */ \
+    if (body > 0) { \
+      ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, body); \
+    } \
+    memcpy(buf, r_buf + body, tail * SBPP); \
+    memcpy(buf + 16, g_buf + body, tail * SBPP); \
+    memcpy(buf + 32, b_buf + body, tail * SBPP); \
+    memcpy(buf + 48, a_buf + body, tail * SBPP); \
+    ANY_SIMD(buf, buf + 16, buf + 32, buf + 48, result, depth, MASK + 1); \
+    memcpy((uint8_t*)dst_ptr + body * BPP, result, tail * BPP); \
+  }
+
+#ifdef HAS_MERGEAR64ROW_AVX2  // args: STYPE uint16_t, SBPP 2, DTYPE uint16_t, BPP 8 (a byte count: dst is offset via uint8_t*), MASK 15
+ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEAR64ROW_NEON  // same as above but MASK 7: staged pass runs 8 pixels
+ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+ANY41PT(MergeARGB16To8Row_Any_AVX2,  // NAMEANY: generated wrapper
+ MergeARGB16To8Row_AVX2,  // ANY_SIMD: kernel being wrapped
+ uint16_t,  // STYPE: source element type
+ 2,  // SBPP: bytes per source element
+ uint8_t,  // DTYPE: destination element type
+ 4,  // BPP: destination bytes per pixel
+ 15)  // MASK: staged pass runs MASK + 1 = 16 pixels
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_NEON
+ANY41PT(MergeARGB16To8Row_Any_NEON,  // NAMEANY
+ MergeARGB16To8Row_NEON,  // ANY_SIMD
+ uint16_t,  // STYPE
+ 2,  // SBPP
+ uint8_t,  // DTYPE
+ 4,  // BPP
+ 7)  // MASK: staged pass runs 8 pixels
+#endif
+
+#undef ANY41PT
+
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
@@ -92,8 +262,14 @@ ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
-#ifdef HAS_MERGERGBROW_MMI
-ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#ifdef HAS_MERGEXRGBROW_SSE2
+ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
+#endif
+#ifdef HAS_MERGEXRGBROW_AVX2
+ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_MERGEXRGBROW_NEON
+ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
@@ -109,8 +285,8 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOYUY2ROW_MSA
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
-#ifdef HAS_I422TOYUY2ROW_MMI
-ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
+#ifdef HAS_I422TOYUY2ROW_LASX
+ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
@@ -118,8 +294,8 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOUYVYROW_MSA
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
-#ifdef HAS_I422TOUYVYROW_MMI
-ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
+#ifdef HAS_I422TOUYVYROW_LASX
+ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31)
#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
@@ -127,9 +303,6 @@ ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
-#ifdef HAS_BLENDPLANEROW_MMI
-ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
-#endif
#undef ANY31
// Note that odd width replication includes 444 due to implementation
@@ -162,6 +335,21 @@ ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422TORGBAROW_SSSE3
+ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_SSSE3
+ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_SSSE3
+ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_SSSE3
+ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
+#endif
#ifdef HAS_I422TOAR30ROW_SSSE3
ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
#endif
@@ -170,18 +358,16 @@ ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
-ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
-#endif // HAS_I444TOARGBROW_SSSE3
+#endif
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
+#ifdef HAS_I422TOARGBROW_AVX512BW
+ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
+#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
@@ -215,6 +401,17 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif
+#ifdef HAS_I422TOARGBROW_LASX
+ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
+ANY31C(I422ToRGB24Row_Any_LASX, I422ToRGB24Row_LASX, 1, 0, 3, 31)
+ANY31C(I422ToRGB565Row_Any_LASX, I422ToRGB565Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB4444Row_Any_LASX, I422ToARGB4444Row_LASX, 1, 0, 2, 31)
+ANY31C(I422ToARGB1555Row_Any_LASX, I422ToARGB1555Row_LASX, 1, 0, 2, 31)
+#endif
+#ifdef HAS_I444TOARGBROW_LSX
+ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15)
+#endif
#undef ANY31C
// Any 3 planes of 16 bit to 1 with yuvconstants
@@ -250,24 +447,112 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I210TOAR30ROW_AVX2
ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
+#ifdef HAS_I410TOAR30ROW_SSSE3
+ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_SSSE3
+ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I410TOARGBROW_AVX2
+ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I410TOAR30ROW_AVX2
+ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_SSSE3
+ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_SSSE3
+ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I212TOARGBROW_AVX2
+ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I212TOAR30ROW_AVX2
+ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
#undef ANY31CT
+// Any 3 planes to 1 plane with parameter (depth): generates NAMEANY, which runs ANY_SIMD on the MASK-aligned part of the row and stages the leftover pixels through stack buffers.
+#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
+ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
+ DTYPE* dst_ptr, int depth, int width) { \
+ SIMD_ALIGNED(STYPE temp[16 * 3]); /* three staged source planes, 16 elements apart */ \
+ SIMD_ALIGNED(DTYPE out[64]); /* staged destination for the leftover group */ \
+ memset(temp, 0, 16 * 3 * SBPP); /* for msan: kernel reads MASK + 1 elements, only r are copied in */ \
+ int r = width & MASK; /* leftover pixels */ \
+ int n = width & ~MASK; /* pixels handled directly by ANY_SIMD */ \
+ if (n > 0) { \
+ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \
+ } \
+ memcpy(temp, r_buf + n, r * SBPP); /* SBPP is used as bytes per source element */ \
+ memcpy(temp + 16, g_buf + n, r * SBPP); \
+ memcpy(temp + 32, b_buf + n, r * SBPP); \
+ ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); /* runs even when r is 0 */ \
+ memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); /* BPP is a byte count per output pixel */ \
+ }
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
+#endif
+
+#ifdef HAS_MERGEXR30ROW_NEON
+ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
+ANY31PT(MergeXR30Row_10_Any_NEON,
+ MergeXR30Row_10_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 3)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
+#endif
+
+#ifdef HAS_MERGEXR64ROW_NEON
+ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+ANY31PT(MergeXRGB16To8Row_Any_AVX2,
+ MergeXRGB16To8Row_AVX2,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 15)
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_NEON
+ANY31PT(MergeXRGB16To8Row_Any_NEON,
+ MergeXRGB16To8Row_NEON,
+ uint16_t,
+ 2,
+ uint8_t,
+ 4,
+ 7)
+#endif
+
+#undef ANY31PT
+
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
int width) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); /* y | uv | output regions, 128 bytes each */ \
+ memset(temp, 0, 128 * 2); /* for msan: zero the two input regions only */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); /* one full group on the staged leftover */ \
+ memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); /* keep only the r valid output pixels */ \
}
// Merge functions.
@@ -283,12 +568,15 @@ ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
-#ifdef HAS_MERGEUVROW_MMI
-ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#ifdef HAS_MERGEUVROW_LSX
+ANY21(MergeUVRow_Any_LSX, MergeUVRow_LSX, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_NV21TOYUV24ROW_NEON
ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
#endif
+#ifdef HAS_NV21TOYUV24ROW_SSSE3
+ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
#ifdef HAS_NV21TOYUV24ROW_AVX2
ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
#endif
@@ -323,20 +611,20 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
-#ifdef HAS_ARGBMULTIPLYROW_MMI
-ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#ifdef HAS_ARGBMULTIPLYROW_LASX
+ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
-#ifdef HAS_ARGBADDROW_MMI
-ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#ifdef HAS_ARGBADDROW_LASX
+ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
-#ifdef HAS_ARGBSUBTRACTROW_MMI
-ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#ifdef HAS_ARGBSUBTRACTROW_LASX
+ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
@@ -347,8 +635,8 @@ ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELROW_MSA
ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
#endif
-#ifdef HAS_SOBELROW_MMI
-ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#ifdef HAS_SOBELROW_LSX
+ANY21(SobelRow_Any_LSX, SobelRow_LSX, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
@@ -359,8 +647,8 @@ ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#ifdef HAS_SOBELTOPLANEROW_MSA
ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
#endif
-#ifdef HAS_SOBELTOPLANEROW_MMI
-ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#ifdef HAS_SOBELTOPLANEROW_LSX
+ANY21(SobelToPlaneRow_Any_LSX, SobelToPlaneRow_LSX, 0, 1, 1, 1, 31)
#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
@@ -371,8 +659,8 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#ifdef HAS_SOBELXYROW_MSA
ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#endif
-#ifdef HAS_SOBELXYROW_MMI
-ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#ifdef HAS_SOBELXYROW_LSX
+ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15)
#endif
#undef ANY21
@@ -407,6 +695,12 @@ ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV12TOARGBROW_MSA
ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV12TOARGBROW_LSX
+ANY21C(NV12ToARGBRow_Any_LSX, NV12ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_LASX
+ANY21C(NV12ToARGBRow_Any_LASX, NV12ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
@@ -419,6 +713,12 @@ ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#ifdef HAS_NV21TOARGBROW_MSA
ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV21TOARGBROW_LSX
+ANY21C(NV21ToARGBRow_Any_LSX, NV21ToARGBRow_LSX, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV21TOARGBROW_LASX
+ANY21C(NV21ToARGBRow_Any_LASX, NV21ToARGBRow_LASX, 1, 1, 2, 4, 15)
+#endif
#ifdef HAS_NV12TORGB24ROW_NEON
ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
#endif
@@ -449,8 +749,85 @@ ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#ifdef HAS_NV12TORGB565ROW_MSA
ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
#endif
+#ifdef HAS_NV12TORGB565ROW_LSX
+ANY21C(NV12ToRGB565Row_Any_LSX, NV12ToRGB565Row_LSX, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_LASX
+ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15)
+#endif
#undef ANY21C
+// Any 2 planes of 16 bit to 1 with yuvconstants: generates NAMEANY, which runs ANY_SIMD on the MASK-aligned part of the row and stages the leftover pixels through stack buffers.
+#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(T temp[16 * 3]); /* y at temp, interleaved uv from temp + 16 */ \
+ SIMD_ALIGNED(uint8_t out[64]); /* staged destination for the leftover group */ \
+ memset(temp, 0, 16 * 3 * SBPP); /* for msan: kernel reads past the r copied elements */ \
+ int r = width & MASK; /* leftover pixels */ \
+ int n = width & ~MASK; /* pixels handled directly by ANY_SIMD */ \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r * SBPP); \
+ memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); /* SS is defined outside this chunk; presumably a rounded-up shift - TODO confirm */ \
+ ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); /* runs even when r is 0 */ \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_P210TOAR30ROW_SSSE3
+ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_SSSE3
+ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_AVX2
+ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P210TOAR30ROW_AVX2
+ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_SSSE3
+ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_SSSE3
+ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_AVX2
+ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_P410TOAR30ROW_AVX2
+ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
+#endif
+
+#undef ANY21CT
+
+// Any 2 16 bit planes with parameter (depth) to 1 interleaved plane: generates NAMEANY, which runs ANY_SIMD on the MASK-aligned part and stages the leftover through temp.
+#define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \
+ int width) { \
+ SIMD_ALIGNED(T temp[16 * 4]); /* u at temp, v at temp + 16, output from temp + 32 */ \
+ memset(temp, 0, 16 * 4 * BPP); /* for msan; BPP is used as bytes per T element */ \
+ int r = width & MASK; /* leftover elements */ \
+ int n = width & ~MASK; /* elements handled directly by ANY_SIMD */ \
+ if (n > 0) { \
+ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \
+ } \
+ memcpy(temp, src_u + n, r * BPP); \
+ memcpy(temp + 16, src_v + n, r * BPP); \
+ ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); /* runs even when r is 0 */ \
+ memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); /* two output elements per input position */ \
+ }
+
+#ifdef HAS_MERGEUVROW_16_AVX2
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_16_NEON
+ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+#undef ANY21PT  // retire the generator used just above (ANY21CT was already undefined earlier)
+
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
@@ -516,12 +893,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_J400TOARGBROW_AVX2)
ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
-#if defined(HAS_I400TOARGBROW_SSE2)
-ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
-#endif
-#if defined(HAS_I400TOARGBROW_AVX2)
-ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
-#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
@@ -529,6 +900,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
+#if defined(HAS_RAWTORGBAROW_SSSE3)
+ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
+#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
@@ -542,13 +916,12 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
-ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15)
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
-ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
@@ -557,16 +930,16 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
-ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif
-#if defined(HAS_ARGBTORGB24ROW_MMI)
-ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
-ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
-ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
-ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
-ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7)
+#if defined(HAS_ARGBTORGB24ROW_LASX)
+ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31)
+ANY11(ARGBToRGB565Row_Any_LASX, ARGBToRGB565Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB1555Row_Any_LASX, ARGBToARGB1555Row_LASX, 0, 4, 2, 15)
+ANY11(ARGBToARGB4444Row_Any_LASX, ARGBToARGB4444Row_LASX, 0, 4, 2, 15)
+#endif
+#if defined(HAS_J400TOARGBROW_LSX)
+ANY11(J400ToARGBRow_Any_LSX, J400ToARGBRow_LSX, 0, 1, 4, 15)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
@@ -574,15 +947,21 @@ ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#if defined(HAS_RAWTORGB24ROW_MSA)
ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
#endif
-#if defined(HAS_RAWTORGB24ROW_MMI)
-ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
+#if defined(HAS_RAWTORGB24ROW_LSX)
+ANY11(RAWToRGB24Row_Any_LSX, RAWToRGB24Row_LSX, 0, 3, 3, 15)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_ABGRTOYROW_AVX2
+ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
+#ifdef HAS_RGBATOYJROW_AVX2
+ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
+#endif
#ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif
@@ -596,74 +975,109 @@ ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOYROW_SSE2
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYJROW_SSSE3
+ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
#ifdef HAS_ARGBTOYROW_NEON
-ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif
-#ifdef HAS_ARGBTOYROW_MMI
-ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_ARGBTOYROW_LASX
+ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
-ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_RGBATOYJROW_NEON
+ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
-#ifdef HAS_ARGBTOYJROW_MMI
-ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_ARGBTOYJROW_LSX
+ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_LASX
+ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31)
#endif
#ifdef HAS_BGRATOYROW_NEON
-ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
-#ifdef HAS_BGRATOYROW_MMI
-ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_BGRATOYROW_LSX
+ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15)
#endif
#ifdef HAS_ABGRTOYROW_NEON
-ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_ABGRTOYROW_MSA
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif
-#ifdef HAS_ABGRTOYROW_MMI
-ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_ABGRTOYROW_LSX
+ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYROW_NEON
-ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
-#ifdef HAS_RGBATOYROW_MMI
-ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_RGBATOYROW_LSX
+ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_NEON
-ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RGB24TOYJROW_SSSE3
+ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYJROW_NEON
+ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
-#ifdef HAS_RGB24TOYROW_MMI
-ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#ifdef HAS_RGB24TOYROW_LSX
+ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RGB24TOYROW_LASX
+ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYROW_NEON
-ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_AVX2
+ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
+#endif
+#ifdef HAS_RAWTOYJROW_SSSE3
+ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYJROW_NEON
+ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
-#ifdef HAS_RAWTOYROW_MMI
-ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#ifdef HAS_RAWTOYROW_LSX
+ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15)
+#endif
+#ifdef HAS_RAWTOYROW_LASX
+ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
@@ -671,8 +1085,11 @@ ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#ifdef HAS_RGB565TOYROW_MSA
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
#endif
-#ifdef HAS_RGB565TOYROW_MMI
-ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#ifdef HAS_RGB565TOYROW_LSX
+ANY11(RGB565ToYRow_Any_LSX, RGB565ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB565TOYROW_LASX
+ANY11(RGB565ToYRow_Any_LASX, RGB565ToYRow_LASX, 0, 2, 1, 31)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
@@ -680,15 +1097,15 @@ ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif
-#ifdef HAS_ARGB1555TOYROW_MMI
-ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#ifdef HAS_ARGB1555TOYROW_LSX
+ANY11(ARGB1555ToYRow_Any_LSX, ARGB1555ToYRow_LSX, 0, 2, 1, 15)
+#endif
+#ifdef HAS_ARGB1555TOYROW_LASX
+ANY11(ARGB1555ToYRow_Any_LASX, ARGB1555ToYRow_LASX, 0, 2, 1, 31)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
-#ifdef HAS_ARGB4444TOYROW_MMI
-ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
-#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
@@ -698,20 +1115,26 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
-#ifdef HAS_YUY2TOYROW_MMI
-ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#ifdef HAS_YUY2TOYROW_LASX
+ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
-#ifdef HAS_UYVYTOYROW_MMI
-ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#ifdef HAS_UYVYTOYROW_LASX
+ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31)
#endif
#ifdef HAS_AYUVTOYROW_NEON
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
#endif
-#ifdef HAS_AYUVTOYROW_NEON
-ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
+#ifdef HAS_SWAPUVROW_SSSE3
+ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
+#endif
+#ifdef HAS_SWAPUVROW_AVX2
+ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
+#endif
+#ifdef HAS_SWAPUVROW_NEON
+ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
@@ -719,17 +1142,26 @@ ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#ifdef HAS_RGB24TOARGBROW_MSA
ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
#endif
-#ifdef HAS_RGB24TOARGBROW_MMI
-ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#ifdef HAS_RGB24TOARGBROW_LSX
+ANY11(RGB24ToARGBRow_Any_LSX, RGB24ToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_LASX
+ANY11(RGB24ToARGBRow_Any_LASX, RGB24ToARGBRow_LASX, 0, 3, 4, 31)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
+#ifdef HAS_RAWTORGBAROW_NEON
+ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
+#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
-#ifdef HAS_RAWTOARGBROW_MMI
-ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#ifdef HAS_RAWTOARGBROW_LSX
+ANY11(RAWToARGBRow_Any_LSX, RAWToARGBRow_LSX, 0, 3, 4, 15)
+#endif
+#ifdef HAS_RAWTOARGBROW_LASX
+ANY11(RAWToARGBRow_Any_LASX, RAWToARGBRow_LASX, 0, 3, 4, 31)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
@@ -737,8 +1169,11 @@ ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_RGB565TOARGBROW_MSA
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
#endif
-#ifdef HAS_RGB565TOARGBROW_MMI
-ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#ifdef HAS_RGB565TOARGBROW_LSX
+ANY11(RGB565ToARGBRow_Any_LSX, RGB565ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_RGB565TOARGBROW_LASX
+ANY11(RGB565ToARGBRow_Any_LASX, RGB565ToARGBRow_LASX, 0, 2, 4, 31)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
@@ -746,8 +1181,11 @@ ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
#endif
-#ifdef HAS_ARGB1555TOARGBROW_MMI
-ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#ifdef HAS_ARGB1555TOARGBROW_LSX
+ANY11(ARGB1555ToARGBRow_Any_LSX, ARGB1555ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_LASX
+ANY11(ARGB1555ToARGBRow_Any_LASX, ARGB1555ToARGBRow_LASX, 0, 2, 4, 31)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
@@ -755,8 +1193,11 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
-#ifdef HAS_ARGB4444TOARGBROW_MMI
-ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#ifdef HAS_ARGB4444TOARGBROW_LSX
+ANY11(ARGB4444ToARGBRow_Any_LSX, ARGB4444ToARGBRow_LSX, 0, 2, 4, 15)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_LASX
+ANY11(ARGB4444ToARGBRow_Any_LASX, ARGB4444ToARGBRow_LASX, 0, 2, 4, 31)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
@@ -776,8 +1217,8 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
-#ifdef HAS_ARGBATTENUATEROW_MMI
-ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#ifdef HAS_ARGBATTENUATEROW_LASX
+ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
@@ -791,8 +1232,8 @@ ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
#endif
-#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
-ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#ifdef HAS_ARGBEXTRACTALPHAROW_LSX
+ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15)
#endif
#undef ANY11
@@ -818,18 +1259,12 @@ ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
-#ifdef HAS_ARGBCOPYALPHAROW_MMI
-ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
-#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
-#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
-ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
-#endif
#undef ANY11B
// Any 1 to 1 with parameter.
@@ -847,6 +1282,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11P(I400ToARGBRow_Any_SSE2,
+ I400ToARGBRow_SSE2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11P(I400ToARGBRow_Any_AVX2,
+ I400ToARGBRow_AVX2,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ANY11P(I400ToARGBRow_Any_NEON,
+ I400ToARGBRow_NEON,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 7)
+#endif
+#if defined(HAS_I400TOARGBROW_MSA)
+ANY11P(I400ToARGBRow_Any_MSA,
+ I400ToARGBRow_MSA,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+#if defined(HAS_I400TOARGBROW_LSX)
+ANY11P(I400ToARGBRow_Any_LSX,
+ I400ToARGBRow_LSX,
+ const struct YuvConstants*,
+ 1,
+ 4,
+ 15)
+#endif
+
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
@@ -879,13 +1355,13 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA,
2,
7)
#endif
-#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
-ANY11P(ARGBToRGB565DitherRow_Any_MMI,
- ARGBToRGB565DitherRow_MMI,
+#if defined(HAS_ARGBTORGB565DITHERROW_LASX)
+ANY11P(ARGBToRGB565DitherRow_Any_LASX,
+ ARGBToRGB565DitherRow_LASX,
const uint32_t,
4,
2,
- 3)
+ 15)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
@@ -899,12 +1375,78 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
#ifdef HAS_ARGBSHUFFLEROW_MSA
ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
#endif
-#ifdef HAS_ARGBSHUFFLEROW_MMI
-ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#ifdef HAS_ARGBSHUFFLEROW_LASX
+ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15)
#endif
#undef ANY11P
#undef ANY11P
+// Any 1 to 1 with type.
+// Generates NAMEANY(src_ptr, dst_ptr, width), a wrapper that lets ANY_SIMD be
+// used for any width:
+//   - the first (width & ~MASK) pixels are passed straight to ANY_SIMD;
+//   - the last (width & MASK) pixels are copied into a zeroed scratch buffer,
+//     run through ANY_SIMD as one batch of MASK + 1 pixels, and only the bytes
+//     that belong to those pixels are copied to dst_ptr.
+// SBPP and BPP are the source and destination bytes per pixel (they scale
+// byte offsets on uint8_t-cast pointers); STYPE and DTYPE are the element
+// types of the two pointers. MASK is used as a bit mask, so MASK + 1 must be
+// a power of 2. The scratch batch is executed even when the remainder is 0.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \
+ SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \
+ memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \
+ ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \
+ memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \
+ }
+
+// ARGB <-> AR64/AB64 wrappers. Each wrapper is guarded by the HAS_ macro of
+// the kernel it wraps, so it is only emitted when that kernel exists.
+#ifdef HAS_ARGBTOAR64ROW_SSSE3
+ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_SSSE3
+ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_SSSE3
+ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_AB64TOARGBROW_SSSE3
+ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_AB64TOARGBROW_AVX2
+ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAR64ROW_NEON
+ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_NEON
+ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7)
+#endif
+
+#ifdef HAS_AR64TOARGBROW_NEON
+ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#ifdef HAS_AB64TOARGBROW_NEON
+ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
+#endif
+
+#undef ANY11T
+
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
@@ -939,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
uint8_t,
31)
#endif
+#ifdef HAS_CONVERT16TO8ROW_NEON
+ANY11C(Convert16To8Row_Any_NEON,
+ Convert16To8Row_NEON,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
#ifdef HAS_CONVERT8TO16ROW_SSE2
ANY11C(Convert8To16Row_Any_SSE2,
Convert8To16Row_SSE2,
@@ -957,6 +1508,30 @@ ANY11C(Convert8To16Row_Any_AVX2,
uint16_t,
31)
#endif
+#ifdef HAS_MULTIPLYROW_16_AVX2
+ANY11C(MultiplyRow_16_Any_AVX2,
+ MultiplyRow_16_AVX2,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 31)
+#endif
+#ifdef HAS_MULTIPLYROW_16_NEON
+ANY11C(MultiplyRow_16_Any_NEON,
+ MultiplyRow_16_NEON,
+ 2,
+ 2,
+ uint16_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_DIVIDEROW_16_AVX2
+ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31)
+#endif
+#ifdef HAS_DIVIDEROW_16_NEON
+ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
+#endif
#undef ANY11C
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
@@ -1007,6 +1582,9 @@ ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
#ifdef HAS_BYTETOFLOATROW_NEON
ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
#endif
+#ifdef HAS_HALFFLOATROW_LSX
+ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31)
+#endif
#undef ANY11P16
// Any 1 to 1 with yuvconstants
@@ -1040,41 +1618,107 @@ ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
#endif
+#if defined(HAS_YUY2TOARGBROW_LSX)
+ANY11C(YUY2ToARGBRow_Any_LSX, YUY2ToARGBRow_LSX, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
+#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
- ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
- SIMD_ALIGNED(uint8_t temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+// Generates NAMEANY(dst_ptr, src_ptr, src_stride, width, source_y_fraction).
+// TD and TS are the destination and source element types; SBPP and BPP scale
+// element (not byte) offsets on the typed pointers.
+// The first (width & ~MASK) pixels go straight to ANY_SIMD. The remainder is
+// staged in temps, which holds two rows of 64 elements (second row at
+// temps + 64), so the scratch call uses a stride of 64 and writes to tempd.
+// The second source row is only copied when source_y_fraction is non-zero,
+// which avoids reading src_ptr + src_stride in that case; temps is zeroed
+// beforehand, so the kernel then sees zeros for that row.
+#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS temps[64 * 2]); \
+ SIMD_ALIGNED(TD tempd[64]); \
+ memset(temps, 0, sizeof(temps)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
+ } \
+ memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
+ }
#ifdef HAS_INTERPOLATEROW_AVX2
-ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
-ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+ANY11I(InterpolateRow_Any_SSSE3,
+ InterpolateRow_SSSE3,
+ uint8_t,
+ uint8_t,
+ 1,
+ 1,
+ 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
-ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MSA
-ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
+ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
#endif
-#ifdef HAS_INTERPOLATEROW_MMI
-ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
+#ifdef HAS_INTERPOLATEROW_LSX
+ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
#endif
-#undef ANY11T
+
+#ifdef HAS_INTERPOLATEROW_16_NEON
+ANY11I(InterpolateRow_16_Any_NEON,
+ InterpolateRow_16_NEON,
+ uint16_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#undef ANY11I
+
+// Any 1 to 1 interpolate with scale param.
+// Generates NAMEANY(dst_ptr, src_ptr, src_stride, scale, width,
+// source_y_fraction); scale is forwarded unchanged to ANY_SIMD.
+// TD and TS are the destination and source element types; SBPP and BPP scale
+// element (not byte) offsets on the typed pointers.
+// The remainder (width & MASK) is staged in temps, two rows of 64 elements
+// with the second row at temps + 64, and processed as one batch of MASK + 1
+// pixels with a stride of 64. The second source row is only copied when
+// source_y_fraction is non-zero.
+#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
+ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
+ int scale, int width, int source_y_fraction) { \
+ SIMD_ALIGNED(TS temps[64 * 2]); \
+ SIMD_ALIGNED(TD tempd[64]); \
+ memset(temps, 0, sizeof(temps)); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
+ } \
+ memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
+ if (source_y_fraction) { \
+ memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
+ r * SBPP * sizeof(TS)); \
+ } \
+ ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_16TO8_NEON
+ANY11IS(InterpolateRow_16To8_Any_NEON,
+ InterpolateRow_16To8_NEON,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 7)
+#endif
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+ANY11IS(InterpolateRow_16To8_Any_AVX2,
+ InterpolateRow_16To8_AVX2,
+ uint8_t,
+ uint16_t,
+ 1,
+ 1,
+ 31)
+#endif
+
+#undef ANY11IS
// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
@@ -1098,13 +1742,28 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
-ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31)
#endif
#ifdef HAS_MIRRORROW_MSA
ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#endif
-#ifdef HAS_MIRRORROW_MMI
-ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#ifdef HAS_MIRRORROW_LASX
+ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63)
+#endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
+#ifdef HAS_MIRRORUVROW_MSA
+ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_LASX
+ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15)
#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
@@ -1113,13 +1772,19 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#endif
-#ifdef HAS_ARGBMIRRORROW_MMI
-ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#ifdef HAS_ARGBMIRRORROW_LASX
+ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_RGB24MIRRORROW_NEON
+ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15)
#endif
#undef ANY11M
@@ -1127,6 +1792,7 @@ ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
SIMD_ALIGNED(uint8_t temp[64]); \
+ memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@@ -1142,12 +1808,18 @@ ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
#ifdef HAS_SETROW_NEON
ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
#endif
+#ifdef HAS_SETROW_LSX
+ANY1(SetRow_Any_LSX, SetRow_LSX, uint8_t, 1, 15)
+#endif
#ifdef HAS_ARGBSETROW_NEON
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
#endif
#ifdef HAS_ARGBSETROW_MSA
ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
#endif
+#ifdef HAS_ARGBSETROW_LSX
+ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3)
+#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
@@ -1179,8 +1851,8 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
-#ifdef HAS_SPLITUVROW_MMI
-ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
+#ifdef HAS_SPLITUVROW_LSX
+ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -1203,13 +1875,39 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
-#ifdef HAS_YUY2TOUV422ROW_MMI
-ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
-ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
-ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#ifdef HAS_YUY2TOUV422ROW_LASX
+ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31)
+ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31)
#endif
#undef ANY12
+// Any 1 to 2 with parameter. Splits 1 interleaved 16 bit UV plane into 2.
+// T is the element type and BPP its size in bytes; depth is forwarded to
+// ANY_SIMD unchanged. temp holds 64 elements: [0, 32) is the zeroed staging
+// area for the interleaved remainder (up to 16 UV pairs), [32, 48) receives
+// the U output and [48, 64) the V output, so MASK may be at most 15.
+#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK)                            \
+  void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
+    SIMD_ALIGNED(T temp[16 * 4]);                                           \
+    memset(temp, 0, 16 * 4 * BPP); /* for msan */                           \
+    int r = width & MASK;                                                   \
+    int n = width & ~MASK;                                                  \
+    if (n > 0) {                                                            \
+      ANY_SIMD(src_uv, dst_u, dst_v, depth, n);                             \
+    }                                                                       \
+    memcpy(temp, src_uv + n * 2, r * BPP * 2);                              \
+    ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1);                  \
+    memcpy(dst_u + n, temp + 32, r * BPP);                                  \
+    memcpy(dst_v + n, temp + 48, r * BPP);                                  \
+  }
+
+#ifdef HAS_SPLITUVROW_16_AVX2
+ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
+#endif
+
+#ifdef HAS_SPLITUVROW_16_NEON
+ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
+#endif
+
+// Undefine the macro defined above so it does not leak into later code.
+#undef ANY12PT
+
// Any 1 to 3. Outputs RGB planes.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
@@ -1234,24 +1932,66 @@ ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
-#ifdef HAS_SPLITRGBROW_MMI
-ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#ifdef HAS_SPLITXRGBROW_SSE2
+ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_SSSE3
+ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITXRGBROW_AVX2
+ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITXRGBROW_NEON
+ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
+#endif
+
+// Any 1 to 4. Outputs ARGB planes.
+// Generates NAMEANY(src_ptr, dst_r, dst_g, dst_b, dst_a, width). The first
+// (width & ~MASK) pixels go straight to ANY_SIMD; the remaining
+// (width & MASK) pixels are staged in temp and processed as one batch of
+// MASK + 1 pixels. BPP is the source bytes per pixel.
+// temp layout (128 bytes): [0, 64) zeroed staging area for the packed
+// remainder, followed by four 16 byte output planes at offsets 64, 80, 96
+// and 112. The layout therefore fits at most 16 pixels per batch
+// (MASK <= 15) at up to 4 bytes per pixel.
+#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, uint8_t* dst_a, int width) { \
+ SIMD_ALIGNED(uint8_t temp[16 * 8]); \
+ memset(temp, 0, 16 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \
+ } \
+ memcpy(temp, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \
+ MASK + 1); \
+ memcpy(dst_r + n, temp + 16 * 4, r); \
+ memcpy(dst_g + n, temp + 16 * 5, r); \
+ memcpy(dst_b + n, temp + 16 * 6, r); \
+ memcpy(dst_a + n, temp + 16 * 7, r); \
+ }
+
+#ifdef HAS_SPLITARGBROW_SSE2
+ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_SSSE3
+ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
+#endif
+#ifdef HAS_SPLITARGBROW_AVX2
+ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
+#endif
+#ifdef HAS_SPLITARGBROW_NEON
+ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
#endif
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \
uint8_t* dst_v, int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
@@ -1267,6 +2007,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
+#ifdef HAS_ABGRTOUVROW_AVX2
+ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31)
+#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
@@ -1291,8 +2034,8 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
-#ifdef HAS_ARGBTOUVROW_MMI
-ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#ifdef HAS_ARGBTOUVROW_LASX
+ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
@@ -1300,53 +2043,68 @@ ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
-#ifdef HAS_ARGBTOUVJROW_MMI
-ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#ifdef HAS_ARGBTOUVJROW_LSX
+ANY12S(ARGBToUVJRow_Any_LSX, ARGBToUVJRow_LSX, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_LASX
+ANY12S(ARGBToUVJRow_Any_LASX, ARGBToUVJRow_LASX, 0, 4, 31)
#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
-ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
#endif
-#ifdef HAS_BGRATOUVROW_MMI
-ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
+#ifdef HAS_BGRATOUVROW_LSX
+ANY12S(BGRAToUVRow_Any_LSX, BGRAToUVRow_LSX, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
-ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
#endif
-#ifdef HAS_ABGRTOUVROW_MMI
-ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
+#ifdef HAS_ABGRTOUVROW_LSX
+ANY12S(ABGRToUVRow_Any_LSX, ABGRToUVRow_LSX, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
-ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
#endif
-#ifdef HAS_RGBATOUVROW_MMI
-ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
+#ifdef HAS_RGBATOUVROW_LSX
+ANY12S(RGBAToUVRow_Any_LSX, RGBAToUVRow_LSX, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
+#ifdef HAS_RGB24TOUVJROW_NEON
+ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15)
+#endif
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
-#ifdef HAS_RGB24TOUVROW_MMI
-ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#ifdef HAS_RGB24TOUVROW_LSX
+ANY12S(RGB24ToUVRow_Any_LSX, RGB24ToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_LASX
+ANY12S(RGB24ToUVRow_Any_LASX, RGB24ToUVRow_LASX, 0, 3, 31)
#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
+#ifdef HAS_RAWTOUVJROW_NEON
+ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15)
+#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif
-#ifdef HAS_RAWTOUVROW_MMI
-ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#ifdef HAS_RAWTOUVROW_LSX
+ANY12S(RAWToUVRow_Any_LSX, RAWToUVRow_LSX, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_LASX
+ANY12S(RAWToUVRow_Any_LASX, RAWToUVRow_LASX, 0, 3, 31)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
@@ -1354,8 +2112,11 @@ ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#ifdef HAS_RGB565TOUVROW_MSA
ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
#endif
-#ifdef HAS_RGB565TOUVROW_MMI
-ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#ifdef HAS_RGB565TOUVROW_LSX
+ANY12S(RGB565ToUVRow_Any_LSX, RGB565ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_LASX
+ANY12S(RGB565ToUVRow_Any_LASX, RGB565ToUVRow_LASX, 0, 2, 31)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
@@ -1363,15 +2124,15 @@ ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#ifdef HAS_ARGB1555TOUVROW_MSA
ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
#endif
-#ifdef HAS_ARGB1555TOUVROW_MMI
-ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#ifdef HAS_ARGB1555TOUVROW_LSX
+ANY12S(ARGB1555ToUVRow_Any_LSX, ARGB1555ToUVRow_LSX, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_LASX
+ANY12S(ARGB1555ToUVRow_Any_LASX, ARGB1555ToUVRow_LASX, 0, 2, 31)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
-#ifdef HAS_ARGB4444TOUVROW_MMI
-ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
-#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
@@ -1381,31 +2142,31 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
-#ifdef HAS_YUY2TOUVROW_MMI
-ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#ifdef HAS_YUY2TOUVROW_LASX
+ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31)
#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
-#ifdef HAS_UYVYTOUVROW_MMI
-ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#ifdef HAS_UYVYTOUVROW_LASX
+ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31)
#endif
#undef ANY12S
// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \
int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
- ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
+ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
@@ -1423,6 +2184,51 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
#endif
#undef ANY11S
+// Generates NAMEANY(src, src_tile_stride, dst, width) for detile kernels.
+// The first (width & ~MASK) bytes go straight to ANY_SIMD. The remaining
+// (width & MASK) bytes are read from src + (n / 16) * src_tile_stride, i.e.
+// each 16 bytes of output advance the source by src_tile_stride bytes.
+// temp is 32 bytes: [0, 16) is the zeroed staging area for the remainder and
+// [16, 32) receives the kernel output. The hard-coded 16 means the macro only
+// fits MASK == 15.
+#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[16 * 2]); \
+ memset(temp, 0, 16); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src, src_tile_stride, dst, n); \
+ } \
+ memcpy(temp, src + (n / 16) * src_tile_stride, r); \
+ ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \
+ memcpy(dst + n, temp + 16, r); \
+ }
+
+#ifdef HAS_DETILEROW_NEON
+ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15)
+#endif
+#ifdef HAS_DETILEROW_SSE2
+ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15)
+#endif
+
+// Generates NAMEANY(src_uv, src_tile_stride, dst_u, dst_v, width) for kernels
+// that detile and split UV in one pass.
+// The first (width & ~MASK) bytes go straight to ANY_SIMD. The remaining
+// (width & MASK) bytes are read from src_uv + (n / 16) * src_tile_stride into
+// temp[0, 16); the kernel writes U to temp[16, 24) and V to temp[24, 32), and
+// (r + 1) / 2 bytes of each are copied to dst_u + n / 2 and dst_v + n / 2.
+// NOTE(review): the scratch call passes r as the width, whereas ANYDETILE
+// passes MASK + 1. Presumably the kernels still process a full 16 byte
+// batch for any width in 0..15; confirm against the kernels.
+#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
+ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t temp[16 * 2]); \
+ memset(temp, 0, 16 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \
+ ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \
+ memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \
+ memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \
+ }
+
+#ifdef HAS_DETILESPLITUVROW_NEON
+ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15)
+#endif
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
+#endif
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index 8951d003..83442496 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -10,34 +10,67 @@
#include "libyuv/row.h"
-#include <stdio.h>
+#include <assert.h>
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
+#include "libyuv/convert_argb.h" // For kYuvI601Constants
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// This macro controls YUV to RGB using unsigned math to extend range of
+// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
+// LIBYUV_UNLIMITED_DATA
+
+// Macros to enable unlimited data for each colorspace
+// LIBYUV_UNLIMITED_BT601
+// LIBYUV_UNLIMITED_BT709
+// LIBYUV_UNLIMITED_BT2020
+
+// The following macro from row_win makes the C code match the row_win code,
+// which is 7 bit fixed point for ARGBToI420:
+#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
+ defined(_MSC_VER) && !defined(__clang__) && \
+ (defined(_M_IX86) || defined(_M_X64))
+#define LIBYUV_RGB7 1
+#endif
+
+#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86))
+#define LIBYUV_ARGBTOUV_PAVGB 1
+#define LIBYUV_RGBTOU_TRUNCATE 1
+#define LIBYUV_ATTENUATE_DUP 1
+#endif
+#if defined(LIBYUV_BIT_EXACT)
+#define LIBYUV_UNATTENUATE_DUP 1
+#endif
+
// llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
static __inline int32_t clamp0(int32_t v) {
- return ((-(v) >> 31) & (v));
+ return -(v >= 0) & v;
}
-
+// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
- return (((255 - (v)) >> 31) | (v)) & 255;
+ return (-(v >= 255) | v) & 255;
}
static __inline int32_t clamp1023(int32_t v) {
- return (((1023 - (v)) >> 31) | (v)) & 1023;
+ return (-(v >= 1023) | v) & 1023;
+}
+
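+// Clamp to max. This branchless form is only correct when max is 2^n - 1
+// (all low bits set, e.g. 255 or 1023) and v is not negative.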
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (-(v >= max) | v) & max;
}
static __inline uint32_t Abs(int32_t v) {
- int m = v >> 31;
+ int m = -(v < 0);
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
@@ -53,6 +86,10 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
+static __inline int32_t ClampMax(int32_t v, int32_t max) {
+ return (v > max) ? max : v;
+}
+
static __inline uint32_t Abs(int32_t v) {
return (v < 0) ? -v : v;
}
@@ -111,6 +148,21 @@ void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
}
+// Converts a row of 3 byte RAW pixels to 4 byte RGBA pixels. For each of the
+// width pixels, the source bytes 0, 1, 2 (named red, green, blue below) are
+// stored in reverse order after a leading 255: {255, blue, green, red}.
+void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+  int remaining = width;
+  while (remaining > 0) {
+    // Read the whole pixel before storing anything.
+    const uint8_t red = src_raw[0];
+    const uint8_t green = src_raw[1];
+    const uint8_t blue = src_raw[2];
+    dst_rgba[0] = 255u;
+    dst_rgba[1] = blue;
+    dst_rgba[2] = green;
+    dst_rgba[3] = red;
+    src_raw += 3;
+    dst_rgba += 4;
+    --remaining;
+  }
+}
+
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -181,7 +233,8 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -195,7 +248,8 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
@@ -209,7 +263,8 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t ar30;
+ memcpy(&ar30, src_ar30, sizeof ar30);
uint32_t b = ar30 & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
@@ -291,8 +346,8 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
- WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
- (r1 << 27));
+ *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11);
dst_rgb += 4;
src_argb += 8;
}
@@ -316,8 +371,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 3;
uint8_t r1 = src_argb[6] >> 3;
uint8_t a1 = src_argb[7] >> 7;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15);
dst_rgb += 4;
src_argb += 8;
}
@@ -341,8 +396,8 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
uint8_t g1 = src_argb[5] >> 4;
uint8_t r1 = src_argb[6] >> 4;
uint8_t a1 = src_argb[7] >> 4;
- *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
- (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12);
dst_rgb += 4;
src_argb += 8;
}
@@ -381,56 +436,208 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
}
}
+// Widens a row of 8 bit ARGB to 16 bits per channel, keeping channel order.
+// Multiplying by 0x0101 replicates the byte into both halves of the 16 bit
+// value (0xAB becomes 0xABAB), so 0 maps to 0 and 255 maps to 65535.
+void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int channel;
+    for (channel = 0; channel < 4; ++channel) {
+      dst_ar64[channel] = src_argb[channel] * 0x0101;
+    }
+    src_argb += 4;
+    dst_ar64 += 4;
+  }
+}
+
+// Widens a row of 8 bit ARGB to 16 bits per channel while swapping channels
+// 0 and 2. Multiplying by 0x0101 replicates the byte into both halves of the
+// 16 bit value (0xAB becomes 0xABAB).
+void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
+  // Source channel feeding each destination channel.
+  static const int kSrcChannel[4] = {2, 1, 0, 3};
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int channel;
+    for (channel = 0; channel < 4; ++channel) {
+      dst_ab64[channel] = src_argb[kSrcChannel[channel]] * 0x0101;
+    }
+    src_argb += 4;
+    dst_ab64 += 4;
+  }
+}
+
+// Narrows a row of 16 bit per channel AR64 to 8 bit ARGB, keeping channel
+// order. Each output byte is the high byte of the 16 bit input (no rounding).
+void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int channel;
+    for (channel = 0; channel < 4; ++channel) {
+      dst_argb[channel] = src_ar64[channel] >> 8;
+    }
+    src_ar64 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Narrows a row of 16 bit per channel AB64 to 8 bit ARGB while swapping
+// channels 0 and 2. Each output byte is the high byte of the 16 bit input
+// (no rounding).
+void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
+  // Source channel feeding each destination channel.
+  static const int kSrcChannel[4] = {2, 1, 0, 3};
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int channel;
+    for (channel = 0; channel < 4; ++channel) {
+      dst_argb[channel] = src_ab64[kSrcChannel[channel]] >> 8;
+    }
+    src_ab64 += 4;
+    dst_argb += 4;
+  }
+}
+
+// TODO(fbarchard): Make shuffle compatible with SIMD versions
+// Reorders the four 16 bit channels of each pixel in a row.
+// shuffler holds byte offsets; only entries 0, 2, 4 and 6 are read, and each
+// is halved to get the index of the 16 bit source channel. The loop runs
+// width / 2 times and moves one 4 channel pixel per iteration. Both buffers
+// are accessed as uint16_t, so they are assumed to be suitably aligned.
+void AR64ShuffleRow_C(const uint8_t* src_ar64,
+                      uint8_t* dst_ar64,
+                      const uint8_t* shuffler,
+                      int width) {
+  const uint16_t* in = (const uint16_t*)src_ar64;
+  uint16_t* out = (uint16_t*)dst_ar64;
+  const int sel0 = shuffler[0] / 2;
+  const int sel1 = shuffler[2] / 2;
+  const int sel2 = shuffler[4] / 2;
+  const int sel3 = shuffler[6] / 2;
+  int pixels = width / 2;
+  while (pixels-- > 0) {
+    // Load all four channels first so that in-place conversion works.
+    const uint16_t c0 = in[sel0];
+    const uint16_t c1 = in[sel1];
+    const uint16_t c2 = in[sel2];
+    const uint16_t c3 = in[sel3];
+    out[0] = c0;
+    out[1] = c1;
+    out[2] = c2;
+    out[3] = c3;
+    in += 4;
+    out += 4;
+  }
+}
+
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+ return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
+}
+#else
+// 8 bit
+// Intel SSE/AVX uses the following equivalent formula
+// 0x7e80 = (66 + 129 + 25) * 128 + 0x1000 (for +16) + 0x0080 (for round).
+// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
+// 0x7e80) >> 8;
+
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
+#endif
+
// Rounding average of two values: (a + b + 1) / 2.
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round.
+#ifdef LIBYUV_RGBTOU_TRUNCATE
// 0x8000 = 128 << 8: the +128 chroma offset only, with no rounding term.
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * b - 74 * g - 38 * r + 0x8000) >> 8;
+}
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
+ return (112 * r - 94 * g - 18 * b + 0x8000) >> 8;
+}
+#else
+// TODO(fbarchard): Add rounding to x86 SIMD and use this
// 0x8080 = (128 << 8) chroma offset + 0x80 rounding before the >> 8.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
+#endif
+
+// LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb.
// When it is not defined, U and V are computed from uint16_t channel values
// using coefficients halved with integer division (56/37/19 for U, 56/47/9
// for V). The halved coefficients still sum to zero, so equal r, g and b give
// 0x8080 >> 8 = 128.
// NOTE(review): this guard tests !defined(...), while the MAKEROWY variants are
// selected with `#if LIBYUV_ARGBTOUV_PAVGB`; the two disagree if the macro is
// defined as 0 - confirm how the macro is defined.
+#if !defined(LIBYUV_ARGBTOUV_PAVGB)
+static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
+ return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToY_C and ARGBToUV_C
-#define MAKEROWY(NAME, R, G, B, BPP) \
- void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
// MAKEROWY(NAME, R, G, B, BPP) defines NAME##ToYRow_C and NAME##ToUVRow_C.
// R, G and B are the byte offsets of each channel within a pixel and BPP is
// the number of bytes per pixel.
// NAME##ToYRow_C writes one Y per pixel. NAME##ToUVRow_C reads two rows
// (src_rgb and src_rgb + src_stride_rgb) and writes one U and one V per 2x2
// block of pixels; when width is odd, the last column combines only the two
// vertically adjacent pixels.
+// Intel version mimics SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
}
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
// The 4-pixel sum is rounded and halved ((sum + 1) >> 1); the 2-pixel sum for
// an odd last column is passed through unchanged. Both are handed to
// RGB2xToU / RGB2xToV as uint16_t values.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = src_rgb[B] + src_rgb1[B]; \
+ uint16_t ag = src_rgb[G] + src_rgb1[G]; \
+ uint16_t ar = src_rgb[R] + src_rgb1[R]; \
+ dst_u[0] = RGB2xToU(ar, ag, ab); \
+ dst_v[0] = RGB2xToV(ar, ag, ab); \
+ } \
+ }
+#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -448,14 +655,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66
-// JPeg 8 bit Y (not used):
-// b 0.11400 * 256 = 29.184 = 29
-// g 0.58700 * 256 = 150.272 = 150
-// r 0.29900 * 256 = 76.544 = 77
-// JPeg 7 bit Y:
+// JPeg 7 bit Y (deprecated)
// b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit Y:
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84
@@ -465,57 +672,119 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
// RGBToYJ: fixed-point luma from 8-bit r, g, b with no +16 offset.
+#ifdef LIBYUV_RGB7
+// Old 7 bit math for compatibility on unsupported platforms.
// Weights sum to 128 (38 + 75 + 15); + 64 rounds before the >> 7.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
+#else
+// 8 bit
// Weights sum to 256 (77 + 150 + 29); + 128 rounds before the >> 8.
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+ return (77 * r + 150 * g + 29 * b + 128) >> 8;
+}
+#endif
// U/V helpers for the YJ/UVJ row functions. With LIBYUV_ARGBTOUV_PAVGB defined
// they take 8-bit channel values; otherwise they take uint16_t values and use
// coefficients halved with integer division (63/42/21 for U, 63/53/10 for V),
// which still sum to zero. 0x8080 = (128 << 8) offset + 0x80 rounding.
// NOTE(review): this guard tests defined(...), while MAKEROWYJ is selected
// with `#if LIBYUV_ARGBTOUV_PAVGB`; the two disagree if the macro is defined
// as 0 - confirm how the macro is defined.
+#if defined(LIBYUV_ARGBTOUV_PAVGB)
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
-
-#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+#else
+static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8;
+}
+static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
+ return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8;
+}
+#endif
// ARGBToYJ_C and ARGBToUVJ_C
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
- void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
- uint8_t* dst_u, uint8_t* dst_v, int width) { \
- const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
// MAKEROWYJ(NAME, R, G, B, BPP) defines NAME##ToYJRow_C and NAME##ToUVJRow_C.
// R, G and B are the byte offsets of each channel within a pixel and BPP is
// the number of bytes per pixel.
// NAME##ToYJRow_C writes one Y per pixel. NAME##ToUVJRow_C reads two rows
// (src_rgb and src_rgb + src_stride_rgb) and writes one U and one V per 2x2
// block of pixels; when width is odd, the last column combines only the two
// vertically adjacent pixels.
+// Intel version mimics SSE/AVX which does 2 pavgb
+#if LIBYUV_ARGBTOUV_PAVGB
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
+ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
+ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
+ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
}
+#else
+// ARM version does sum / 2 then multiply by 2x smaller coefficients
// The 4-pixel sum is rounded and halved ((sum + 1) >> 1); the 2-pixel sum for
// an odd last column is passed through unchanged. Both are handed to
// RGB2xToUJ / RGB2xToVJ as uint16_t values.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
+ src_rgb += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP] + 1) >> \
+ 1; \
+ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP] + 1) >> \
+ 1; \
+ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP] + 1) >> \
+ 1; \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ src_rgb += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint16_t ab = (src_rgb[B] + src_rgb1[B]); \
+ uint16_t ag = (src_rgb[G] + src_rgb1[G]); \
+ uint16_t ar = (src_rgb[R] + src_rgb1[R]); \
+ dst_u[0] = RGB2xToUJ(ar, ag, ab); \
+ dst_v[0] = RGB2xToVJ(ar, ag, ab); \
+ } \
+ }
+
+#endif
MAKEROWYJ(ARGB, 2, 1, 0, 4)
+MAKEROWYJ(RGBA, 3, 2, 1, 4)
+MAKEROWYJ(RGB24, 2, 1, 0, 3)
+MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@@ -583,13 +852,34 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b3 = next_rgb565[2] & 0x1f;
uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
uint8_t r3 = next_rgb565[3] >> 3;
- uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 787 -> 888.
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 2) | (g1 >> 4);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 2) | (g3 >> 4);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
@@ -602,14 +892,27 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
uint8_t b2 = next_rgb565[0] & 0x1f;
uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
uint8_t r2 = next_rgb565[1] >> 3;
- uint8_t b = (b0 + b2); // 565 * 2 = 676.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 676 -> 888
- g = (g << 1) | (g >> 6);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 2) | (g0 >> 4);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 2) | (g2 >> 4);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -633,14 +936,34 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t b3 = next_argb1555[2] & 0x1f;
uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 1) | (b >> 6); // 777 -> 888.
- g = (g << 1) | (g >> 6);
- r = (r << 1) | (r >> 6);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b1 = (b1 << 3) | (b1 >> 2);
+ g1 = (g1 << 3) | (g1 >> 2);
+ r1 = (r1 << 3) | (r1 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+ b3 = (b3 << 3) | (b3 >> 2);
+ g3 = (g3 << 3) | (g3 >> 2);
+ r3 = (r3 << 3) | (r3 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
@@ -652,15 +975,28 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
uint8_t b2 = next_argb1555[0] & 0x1f;
uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8_t r2 = next_argb1555[1] >> 3;
- uint8_t b = (b0 + b2); // 555 * 2 = 666.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+ uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
+
+ b0 = (b0 << 3) | (b0 >> 2);
+ g0 = (g0 << 3) | (g0 >> 2);
+ r0 = (r0 << 3) | (r0 >> 2);
+ b2 = (b2 << 3) | (b2 >> 2);
+ g2 = (g2 << 3) | (g2 >> 2);
+ r2 = (r2 << 3) | (r2 >> 2);
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -684,14 +1020,34 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b3 = next_argb4444[2] & 0x0f;
uint8_t g3 = next_argb4444[2] >> 4;
uint8_t r3 = next_argb4444[3] & 0x0f;
- uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8_t g = (g0 + g1 + g2 + g3);
- uint8_t r = (r0 + r1 + r2 + r3);
- b = (b << 2) | (b >> 4); // 666 -> 888.
- g = (g << 2) | (g >> 4);
- r = (r << 2) | (r >> 4);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b1 = (b1 << 4) | b1;
+ g1 = (g1 << 4) | g1;
+ r1 = (r1 << 4) | r1;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+ b3 = (b3 << 4) | b3;
+ g3 = (g3 << 4) | g3;
+ r3 = (r3 << 4) | r3;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
+ uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
+ uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
+ uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
+ uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
+
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
@@ -704,14 +1060,27 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
uint8_t b2 = next_argb4444[0] & 0x0f;
uint8_t g2 = next_argb4444[0] >> 4;
uint8_t r2 = next_argb4444[1] & 0x0f;
- uint8_t b = (b0 + b2); // 444 * 2 = 555.
- uint8_t g = (g0 + g2);
- uint8_t r = (r0 + r2);
- b = (b << 3) | (b >> 2); // 555 -> 888.
- g = (g << 3) | (g >> 2);
- r = (r << 3) | (r >> 2);
- dst_u[0] = RGBToU(r, g, b);
- dst_v[0] = RGBToV(r, g, b);
+
+ b0 = (b0 << 4) | b0;
+ g0 = (g0 << 4) | g0;
+ r0 = (r0 << 4) | r0;
+ b2 = (b2 << 4) | b2;
+ g2 = (g2 << 4) | g2;
+ r2 = (r2 << 4) | r2;
+
+#if LIBYUV_ARGBTOUV_PAVGB
+ uint8_t ab = AVGB(b0, b2);
+ uint8_t ag = AVGB(g0, g2);
+ uint8_t ar = AVGB(r0, r2);
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+#else
+ uint16_t b = b0 + b2;
+ uint16_t g = g0 + g2;
+ uint16_t r = r0 + r2;
+ dst_u[0] = RGB2xToU(r, g, b);
+ dst_v[0] = RGB2xToV(r, g, b);
+#endif
}
}
@@ -877,16 +1246,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 16
-void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const uint32_t b = REPEAT8(src_argb0[0]);
- const uint32_t g = REPEAT8(src_argb0[1]);
- const uint32_t r = REPEAT8(src_argb0[2]);
- const uint32_t a = REPEAT8(src_argb0[3]);
+ const uint32_t b = REPEAT8(src_argb[0]);
+ const uint32_t g = REPEAT8(src_argb[1]);
+ const uint32_t r = REPEAT8(src_argb[2]);
+ const uint32_t a = REPEAT8(src_argb[3]);
const uint32_t b_scale = src_argb1[0];
const uint32_t g_scale = src_argb1[1];
const uint32_t r_scale = src_argb1[2];
@@ -895,7 +1264,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale);
- src_argb0 += 4;
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -905,16 +1274,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp255(v + f)
-void ARGBAddRow_C(const uint8_t* src_argb0,
+void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_add = src_argb1[0];
const int g_add = src_argb1[1];
const int r_add = src_argb1[2];
@@ -923,7 +1292,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_add);
dst_argb[2] = SHADE(r, r_add);
dst_argb[3] = SHADE(a, a_add);
- src_argb0 += 4;
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -932,16 +1301,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp0(f - v)
-void ARGBSubtractRow_C(const uint8_t* src_argb0,
+void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const int b = src_argb0[0];
- const int g = src_argb0[1];
- const int r = src_argb0[2];
- const int a = src_argb0[3];
+ const int b = src_argb[0];
+ const int g = src_argb[1];
+ const int r = src_argb[2];
+ const int a = src_argb[3];
const int b_sub = src_argb1[0];
const int g_sub = src_argb1[1];
const int r_sub = src_argb1[2];
@@ -950,7 +1319,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_sub);
dst_argb[2] = SHADE(r, r_sub);
dst_argb[3] = SHADE(a, a_sub);
- src_argb0 += 4;
+ src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@@ -1058,257 +1427,244 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
}
}
-// TODO(fbarchard): Unify these structures to be platform independent.
-// TODO(fbarchard): Generate SIMD structures from float matrix.
+// Macros to create SIMD specific yuv to rgb conversion constants.
-// BT.601 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.596
-// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
-// B = (Y - 16) * 1.164 - U * -2.018
+// clang-format off
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+#if defined(__aarch64__) || defined(__arm__)
// ARM layout: a 16-entry array holding {UB, VR, UG, VG} followed by zeros, then
// an 8-entry array holding YG, the three bias terms, YB and zeros.
+// Bias values include subtracting 128 from U and V, the bias from Y and rounding.
+// For B and R bias is negative. For G bias is positive.
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
+ {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
+ 0, 0}}
+#else
// Other targets: three 32-entry arrays of interleaved pairs ({UB, 0}, {UG, VG},
// {0, VR}), then YG repeated 16 times and YB repeated 16 times.
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
+ {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
+ {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
+ {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
+ {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
+ {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
+#endif
+
+// clang-format on
+
// Defines two struct YuvConstants objects: kYuv<name>Constants, and
// kYvu<name>Constants built with the U and V coefficients exchanged
// (UB <-> VR and UG <-> VG). The expansion already ends with a semicolon.
+#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
+ const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
+ const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
+ YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
+
+// TODO(fbarchard): Generate SIMD structures from float matrix.
+
+// BT.601 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 + U * 2.018
+// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
-#define VR -102 /* round(-1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__) // 64 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__) // 32 bit arm
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
+#define UB 129 /* round(2.018 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR 102 /* round(1.596 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// JPEG YUV to RGB reference
-// * R = Y - V * -1.40200
-// * G = Y - U * 0.34414 - V * 0.71414
-// * B = Y - U * -1.77200
+// BT.601 full range YUV to RGB reference (aka JPEG)
+// * R = Y + V * 1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y + U * 1.77200
+// KR = 0.299; KB = 0.114
+
+// U and V contributions to R,G,B.
+#define UB 113 /* round(1.77200 * 64) */
+#define UG 22 /* round(0.34414 * 64) */
+#define VG 46 /* round(0.71414 * 64) */
+#define VR 90 /* round(1.40200 * 64) */
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32 /* 64 / 2 */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.709 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164 + V * 1.793
+// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
+// B = (Y - 16) * 1.164 + U * 2.112
+// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
-#define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414 * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
+#define UB 135 /* round(2.112 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
+#define UG 14 /* round(0.213 * 64) */
+#define VG 34 /* round(0.533 * 64) */
+#define VR 115 /* round(1.793 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
-#undef YG
-// BT.709 YUV to RGB reference
-// R = (Y - 16) * 1.164 - V * -1.793
-// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
-// B = (Y - 16) * 1.164 - U * -2.112
-// See also http://www.equasys.de/colorconversion.html
+// BT.709 full range YUV to RGB reference
+// R = Y + V * 1.5748
+// G = Y - U * 0.18732 - V * 0.46812
+// B = Y + U * 1.8556
+// KR = 0.2126, KB = 0.0722
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+// U and V contributions to R,G,B.
+#define UB 119 /* round(1.8556 * 64) */
+#define UG 12 /* round(0.18732 * 64) */
+#define VG 30 /* round(0.46812 * 64) */
+#define VR 101 /* round(1.5748 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+// BT.2020 limited range YUV to RGB reference
+// R = (Y - 16) * 1.164384 + V * 1.67867
+// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
+// B = (Y - 16) * 1.164384 + U * 2.14177
+// KR = 0.2627; KB = 0.0593
-// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.112 * 64)) */
-#define UG 14 /* round(0.213 * 64) */
-#define VG 34 /* round(0.533 * 64) */
-#define VR -115 /* round(-1.793 * 64) */
-
-// Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128 + YGB)
-#define BG (UG * 128 + VG * 128 + YGB)
-#define BR (VR * 128 + YGB)
-
-#if defined(__aarch64__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {UG, VG, UG, VG, UG, VG, UG, VG},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {VG, UG, VG, UG, VG, UG, VG, UG},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-#elif defined(__arm__)
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
- {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BB, BG, BR, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
- {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
- {BR, BG, BB, 0, 0, 0, 0, 0},
- {0x0101 * YG, 0, 0, 0}};
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
+#define UB 137 /* round(2.142 * 64) */
#else
-const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
- {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
- UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
- {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
- {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
- 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
-const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
- {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
- VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
- {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
- {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
- 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
- {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
- {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
- {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
- {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
+#define UB 128 /* min(128, round(2.142 * 64)): capped at 128 */
#endif
+#define UG 12 /* round(0.187326 * 64) */
+#define VG 42 /* round(0.65042 * 64) */
+#define VR 107 /* round(1.67867 * 64) */
-#undef BB
-#undef BG
-#undef BR
-#undef YGB
+// Y contribution to R,G,B. Scale and bias.
+#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
+#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
+
+MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
+
+#undef YG
+#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
+
+// BT.2020 full range YUV to RGB reference
+// R = Y + V * 1.474600
+// G = Y - U * 0.164553 - V * 0.571353
+// B = Y + U * 1.881400
+// KR = 0.2627; KB = 0.0593
+
+#define UB 120 /* round(1.881400 * 64) */
+#define UG 11 /* round(0.164553 * 64) */
+#define VG 37 /* round(0.571353 * 64) */
+#define VR 94 /* round(1.474600 * 64) */
+
+// Y contribution to R,G,B. Scale and bias. (same as jpeg)
+#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
+#define YB 32 /* 64 / 2 */
+
+MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
+
#undef YG
+#undef YB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+
+#undef BB
+#undef BG
+#undef BR
+
+#undef MAKEYUVCONSTANTS
+
+#if defined(__aarch64__) || defined(__arm__)  // ARM: kUVCoeff / kRGBCoeffBias layout.
+#define LOAD_YUV_CONSTANTS /* expects `yuvconstants` in scope */ \
+ int ub = yuvconstants->kUVCoeff[0]; \
+ int vr = yuvconstants->kUVCoeff[1]; \
+ int ug = yuvconstants->kUVCoeff[2]; \
+ int vg = yuvconstants->kUVCoeff[3]; \
+ int yg = yuvconstants->kRGBCoeffBias[0]; \
+ int bb = yuvconstants->kRGBCoeffBias[1]; \
+ int bg = yuvconstants->kRGBCoeffBias[2]; \
+ int br = yuvconstants->kRGBCoeffBias[3]
+
+#define CALC_RGB16 /* expects y32, u, v; declares y1, b16, g16, r16 */ \
+ int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
+ int b16 = y1 + (u * ub) - bb; \
+ int g16 = y1 + bg - (u * ug + v * vg); \
+ int r16 = y1 + (v * vr) - br
+#else  // Other targets: kUVToB / kUVToG / kUVToR / kYToRgb / kYBiasToRgb layout.
+#define LOAD_YUV_CONSTANTS /* expects `yuvconstants` in scope */ \
+ int ub = yuvconstants->kUVToB[0]; \
+ int ug = yuvconstants->kUVToG[0]; \
+ int vg = yuvconstants->kUVToG[1]; \
+ int vr = yuvconstants->kUVToR[1]; \
+ int yg = yuvconstants->kYToRgb[0]; \
+ int yb = yuvconstants->kYBiasToRgb[0]
+
+#define CALC_RGB16 /* expects y32, u, v; declares y1, ui, vi, b16, g16, r16 */ \
+ int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
+ int8_t ui = (int8_t)u; \
+ int8_t vi = (int8_t)v; \
+ ui -= 0x80; \
+ vi -= 0x80; \
+ int b16 = y1 + (ui * ub); \
+ int g16 = y1 - (ui * ug + vi * vg); \
+ int r16 = y1 + (vi * vr)
+#endif  // defined(__aarch64__) || defined(__arm__)
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
-
static __inline void YuvPixel(uint8_t y,
uint8_t u,
uint8_t v,
@@ -1316,39 +1672,12 @@ static __inline void YuvPixel(uint8_t y,
uint8_t* g,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
- *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = Clamp((int32_t)(b16) >> 6);
+ *g = Clamp((int32_t)(g16) >> 6);
+ *r = Clamp((int32_t)(r16) >> 6);
}
// Reads 8 bit YUV and leaves result as 16 bit.
@@ -1359,85 +1688,50 @@ static __inline void YuvPixel8_16(uint8_t y,
int* g,
int* r,
const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y * 0x0101;
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
// Reads 10 bit YUV and leaves result as 16 bit.
-static __inline void YuvPixel16(int16_t y,
- int16_t u,
- int16_t v,
- int* b,
- int* g,
- int* r,
- const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = -yuvconstants->kUVToRB[1];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#elif defined(__arm__)
- int ub = -yuvconstants->kUVToRB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[4];
- int vr = -yuvconstants->kUVToRB[4];
- int bb = yuvconstants->kUVBiasBGR[0];
- int bg = yuvconstants->kUVBiasBGR[1];
- int br = yuvconstants->kUVBiasBGR[2];
- int yg = yuvconstants->kYToRgb[0] / 0x0101;
-#else
- int ub = yuvconstants->kUVToB[0];
- int ug = yuvconstants->kUVToG[0];
- int vg = yuvconstants->kUVToG[1];
- int vr = yuvconstants->kUVToR[1];
- int bb = yuvconstants->kUVBiasB[0];
- int bg = yuvconstants->kUVBiasG[0];
- int br = yuvconstants->kUVBiasR[0];
- int yg = yuvconstants->kYToRgb[0];
-#endif
-
- uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+static __inline void YuvPixel10_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y << 6;
u = clamp255(u >> 2);
v = clamp255(v >> 2);
- *b = (int)(-(u * ub) + y1 + bb);
- *g = (int)(-(u * ug + v * vg) + y1 + bg);
- *r = (int)(-(v * vr) + y1 + br);
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 12 bit YUV and leaves result as 16 bit.
+// y, u and v are unsigned, matching YuvPixel10_16 and the uint16_t samples
+// the callers pass in; a signed type would turn samples >= 0x8000 negative
+// and make the shifts below undefined.
+// b, g and r receive the unclamped results scaled by 64.
+static __inline void YuvPixel12_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y << 4;  // Scale 12 bit Y up to 16 bit.
+ u = clamp255(u >> 4);  // Reduce 12 bit U and V to 8 bit.
+ v = clamp255(v >> 4);
+ CALC_RGB16;
+ *b = b16;
+ *g = g16;
+ *r = r16;
}
// C reference code that mimics the YUV 10 bit assembly.
@@ -1452,59 +1746,88 @@ static __inline void YuvPixel10(uint16_t y,
int b16;
int g16;
int r16;
- YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants);
*b = Clamp(b16 >> 6);
*g = Clamp(g16 >> 6);
*r = Clamp(r16 >> 6);
}
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
-
-// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
- uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32_t)(y1 + YGB) >> 6);
- *g = Clamp((int32_t)(y1 + YGB) >> 6);
- *r = Clamp((int32_t)(y1 + YGB) >> 6);
+// C reference code that mimics the YUV 12 bit assembly.
+// Reads 12 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel12(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ int b16;
+ int g16;
+ int r16;
+ YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants);  // Unclamped results.
+ *b = Clamp(b16 >> 6);  // >> 6 undoes the x64 scale of the coefficients.
+ *g = Clamp(g16 >> 6);
+ *r = Clamp(r16 >> 6);
}
-#undef YG
-#undef YGB
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 8 bit.
+static __inline void YuvPixel16_8(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;  // Y is used as 16 bit directly; no shift.
+ u = clamp255(u >> 8);  // Keep the upper 8 bits of U and V.
+ v = clamp255(v >> 8);
+ CALC_RGB16;
+ *b = Clamp((int32_t)(b16) >> 6);  // >> 6 undoes the x64 scale of the coefficients.
+ *g = Clamp((int32_t)(g16) >> 6);
+ *r = Clamp((int32_t)(r16) >> 6);
+}
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
-// C mimic assembly.
-// TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8_t* src_y,
- const uint8_t* src_u,
- const uint8_t* src_v,
- uint8_t* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- for (x = 0; x < width - 1; x += 2) {
- uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
- uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
- YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
- yuvconstants);
- rgb_buf[3] = 255;
- YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
- yuvconstants);
- rgb_buf[7] = 255;
- src_y += 2;
- src_u += 2;
- src_v += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
- if (width & 1) {
- YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
- rgb_buf + 2, yuvconstants);
- rgb_buf[3] = 255;
- }
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 16 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16_16(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+ LOAD_YUV_CONSTANTS;
+ uint32_t y32 = y;  // Y is used as 16 bit directly; no shift.
+ u = clamp255(u >> 8);  // Keep the upper 8 bits of U and V.
+ v = clamp255(v >> 8);
+ CALC_RGB16;
+ *b = b16;  // Results are stored without clamping or the final >> 6.
+ *g = g16;
+ *r = r16;
}
+
+// C reference code that mimics the YUV assembly.
+// Reads 8 bit YUV and leaves result as 8 bit.
+// Only Y is used; the same value is written to b, g and r.
+static __inline void YPixel(uint8_t y,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__) || defined(__arm__)
+ int yg = yuvconstants->kRGBCoeffBias[0];
+ int ygb = yuvconstants->kRGBCoeffBias[4];
#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;  // y * 0x0101 repeats the byte into 16 bits.
+ *b = Clamp(((int32_t)(y1) + ygb) >> 6);  // Add bias, then drop the x64 scale.
+ *g = Clamp(((int32_t)(y1) + ygb) >> 6);
+ *r = Clamp(((int32_t)(y1) + ygb) >> 6);
+}
+
void I444ToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1522,7 +1845,6 @@ void I444ToARGBRow_C(const uint8_t* src_y,
rgb_buf += 4; // Advance 1 pixel.
}
}
-#endif
// Also used for 420
void I422ToARGBRow_C(const uint8_t* src_y,
@@ -1578,9 +1900,102 @@ void I210ToARGBRow_C(const uint16_t* src_y,
}
}
+void I410ToARGBRow_C(const uint16_t* src_y,  // One Y, U and V sample is read per pixel.
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;  // Alpha byte is always 255.
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+void I210AlphaToARGBRow_C(const uint16_t* src_y,  // Y and A per pixel; U and V shared by 2 pixels.
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = clamp255(src_a[0] >> 2);  // Reduce 10 bit alpha to 8 bit.
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = clamp255(src_a[1] >> 2);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ src_a += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {  // Odd width: convert the last pixel on its own.
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = clamp255(src_a[0] >> 2);
+ }
+}
+
+void I410AlphaToARGBRow_C(const uint16_t* src_y,  // One Y, U, V and A sample is read per pixel.
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ const uint16_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = clamp255(src_a[0] >> 2);  // Reduce 10 bit alpha to 8 bit.
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// 12 bit YUV to ARGB. Each U and V sample is shared by 2 pixels.
+void I212ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;  // Alpha byte is always 255.
+ YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {  // Odd width: convert the last pixel on its own.
+ YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
uint32_t ar30;
- b = b >> 4; // convert 10.6 to 10 bit.
+ b = b >> 4; // convert 8 bit 10.6 to 10 bit.
g = g >> 4;
r = r >> 4;
b = Clamp10(b);
@@ -1602,9 +2017,9 @@ void I210ToAR30Row_C(const uint16_t* src_y,
int g;
int r;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
- YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf + 4, b, g, r);
src_y += 2;
src_u += 1;
@@ -1612,11 +2027,141 @@ void I210ToAR30Row_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
StoreAR30(rgb_buf, b, g, r);
}
}
+// 12 bit YUV to 10 bit AR30. Each U and V sample is shared by 2 pixels.
+void I212ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);  // Each AR30 pixel is 4 bytes.
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {  // Odd width: convert the last pixel on its own.
+ YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
+void I410ToAR30Row_C(const uint16_t* src_y,  // One Y, U and V sample is read per pixel.
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);  // Each AR30 pixel is 4 bytes.
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
+// P210 has 10 bits in msb of 16 bit NV12 style layout.
+// src_uv holds interleaved U, V pairs; each pair is shared by 2 pixels.
+void P210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;  // Alpha byte is always 255.
+ YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5,
+ dst_argb + 6, yuvconstants);
+ dst_argb[7] = 255;
+ src_y += 2;
+ src_uv += 2;  // Next U, V pair.
+ dst_argb += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {  // Odd width: convert the last pixel on its own.
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;
+ }
+}
+
+void P410ToARGBRow_C(const uint16_t* src_y,  // One Y and one interleaved U, V pair per pixel.
+ const uint16_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1,
+ dst_argb + 2, yuvconstants);
+ dst_argb[3] = 255;  // Alpha byte is always 255.
+ src_y += 1;
+ src_uv += 2;  // Next U, V pair.
+ dst_argb += 4; // Advance 1 pixel.
+ }
+}
+
+void P210ToAR30Row_C(const uint16_t* src_y,  // Y per pixel; each U, V pair is shared by 2 pixels.
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30 + 4, b, g, r);  // Each AR30 pixel is 4 bytes.
+ src_y += 2;
+ src_uv += 2;  // Next U, V pair.
+ dst_ar30 += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {  // Odd width: convert the last pixel on its own.
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);
+ }
+}
+
+void P410ToAR30Row_C(const uint16_t* src_y,  // One Y and one interleaved U, V pair per pixel.
+ const uint16_t* src_uv,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width; ++x) {
+ YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants);
+ StoreAR30(dst_ar30, b, g, r);  // Each AR30 pixel is 4 bytes.
+ src_y += 1;
+ src_uv += 2;  // Next U, V pair.
+ dst_ar30 += 4; // Advance 1 pixel.
+ }
+}
+
// 8 bit YUV to 10 bit AR30
// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
void I422ToAR30Row_C(const uint8_t* src_y,
@@ -1645,6 +2190,26 @@ void I422ToAR30Row_C(const uint8_t* src_y,
}
}
+void I444AlphaToARGBRow_C(const uint8_t* src_y,  // One Y, U, V and A byte is read per pixel.
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = src_a[0];  // Alpha is copied through unchanged.
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ src_a += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+
void I422AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@@ -1718,8 +2283,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
b1 = b1 >> 4;
g1 = g1 >> 4;
r1 = r1 >> 4;
- *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
- (g1 << 20) | (r1 << 24) | 0xf000f000;
+ *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
+ *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1756,8 +2321,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 3;
r1 = r1 >> 3;
- *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
- (g1 << 21) | (r1 << 26) | 0x80008000;
+ *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
+ *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1794,8 +2359,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan
+ *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11);
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1921,8 +2486,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32_t*)(dst_rgb565) =
- b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
+ *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11);
src_y += 2;
src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels.
@@ -2006,18 +2571,21 @@ void I422ToRGBARow_C(const uint8_t* src_y,
}
}
-void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
- YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255;
}
}
@@ -2035,10 +2603,21 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
-void MirrorUVRow_C(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {  // width counts 2 byte pairs.
+ int x;
+ src_uv += (width - 1) << 1;  // Start at the last pair of the source row.
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_uv[0];  // Byte order inside each pair is kept.
+ dst_uv[1] = src_uv[1];
+ src_uv -= 2;
+ dst_uv += 2;
+ }
+}
+
+void MirrorSplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -2069,6 +2648,21 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) {  // width counts 3 byte pixels.
+ int x;
+ src_rgb24 += width * 3 - 3;  // Start at the last pixel of the source row.
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_rgb24[0];  // All 3 bytes are read before any is written.
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
+ dst_rgb24[0] = b;
+ dst_rgb24[1] = g;
+ dst_rgb24[2] = r;
+ src_rgb24 -= 3;
+ dst_rgb24 += 3;
+ }
+}
+
void SplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2105,6 +2699,38 @@ void MergeUVRow_C(const uint8_t* src_u,
}
}
+void DetileRow_C(const uint8_t* src,  // Gathers one row from 16 byte wide chunks.
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ memcpy(dst, src, 16);
+ dst += 16;
+ src += src_tile_stride;  // Next 16 byte chunk is src_tile_stride bytes further on.
+ }
+ if (width & 15) {  // Copy the remaining 1 to 15 bytes.
+ memcpy(dst, src, width & 15);
+ }
+}
+
+void DetileSplitUVRow_C(const uint8_t* src_uv,  // width is consumed 16 at a time, 8 outputs per plane.
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ for (x = 0; x < width - 15; x += 16) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, 8);  // NOTE(review): presumably 8 U,V pairs = 16 source bytes; confirm.
+ dst_u += 8;
+ dst_v += 8;
+ src_uv += src_tile_stride;
+ }
+ if (width & 15) {
+ SplitUVRow_C(src_uv, dst_u, dst_v, ((width & 15) + 1) / 2);  // Leftover halved, rounded up.
+ }
+}
+
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
@@ -2133,27 +2759,197 @@ void MergeRGBRow_C(const uint8_t* src_r,
}
}
-// Use scale to convert lsb formats to msb, depending how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
+void SplitARGBRow_C(const uint8_t* src_argb,  // 4 bytes per source pixel, 1 byte per plane.
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];  // Source byte order is B, G, R, A.
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ dst_a[x] = src_argb[3];
+ src_argb += 4;
+ }
+}
+
+void MergeARGBRow_C(const uint8_t* src_r,  // 1 byte per plane, 4 bytes per output pixel.
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];  // Output byte order is B, G, R, A.
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = src_a[x];
+ dst_argb += 4;
+ }
+}
+
+// Merge 16 bit R, G and B planes into 4 byte AR30 pixels.
+// depth is the number of valid bits per sample (10 to 16); samples are
+// shifted down to 10 bits and clamped with clamp1023. The top 2 bits of
+// every output pixel are set (0xc0000000).
+void MergeXR30Row_C(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ assert(depth >= 10);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 10;
+ for (x = 0; x < width; ++x) {
+ uint32_t r = clamp1023(src_r[x] >> shift);
+ uint32_t g = clamp1023(src_g[x] >> shift);
+ uint32_t b = clamp1023(src_b[x] >> shift);
+ uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000;
+ // dst_ar30 is a byte pointer with no alignment guarantee, so store with
+ // memcpy (as ARGBSetRow_C does) instead of through a uint32_t* cast.
+ memcpy(dst_ar30 + x * sizeof ar30, &ar30, sizeof ar30);
+ }
+}
+
+void MergeAR64Row_C(const uint16_t* src_r,  // 4 planes in, 4 x 16 bit values per output pixel.
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;  // Moves a depth bit sample to the top of 16 bits.
+ int max = (1 << depth) - 1;  // Largest value representable in depth bits.
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = ClampMax(src_b[x], max) << shift;  // Output order is B, G, R, A.
+ dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+ dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+ dst_ar64[3] = ClampMax(src_a[x], max) << shift;
+ dst_ar64 += 4;
+ }
+}
+
+void MergeARGB16To8Row_C(const uint16_t* src_r,  // 4 x 16 bit planes in, 4 bytes per output pixel.
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;  // Keeps the top 8 of the depth valid bits.
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = clamp255(src_b[x] >> shift);  // Output byte order is B, G, R, A.
+ dst_argb[1] = clamp255(src_g[x] >> shift);
+ dst_argb[2] = clamp255(src_r[x] >> shift);
+ dst_argb[3] = clamp255(src_a[x] >> shift);
+ dst_argb += 4;
+ }
+}
+
+void MergeXR64Row_C(const uint16_t* src_r,  // 3 planes in, 4 x 16 bit values per output pixel.
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ assert(depth >= 1);
+ assert(depth <= 16);
+ int x;
+ int shift = 16 - depth;  // Moves a depth bit sample to the top of 16 bits.
+ int max = (1 << depth) - 1;  // Largest value representable in depth bits.
+ for (x = 0; x < width; ++x) {
+ dst_ar64[0] = ClampMax(src_b[x], max) << shift;  // Output order is B, G, R, then alpha.
+ dst_ar64[1] = ClampMax(src_g[x], max) << shift;
+ dst_ar64[2] = ClampMax(src_r[x], max) << shift;
+ dst_ar64[3] = 0xffff;  // No alpha plane: the maximum 16 bit value is written.
+ dst_ar64 += 4;
+ }
+}
+
+void MergeXRGB16To8Row_C(const uint16_t* src_r,  // 3 x 16 bit planes in, 4 bytes per output pixel.
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ assert(depth >= 8);
+ assert(depth <= 16);
+ int x;
+ int shift = depth - 8;  // Keeps the top 8 of the depth valid bits.
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = clamp255(src_b[x] >> shift);  // Output byte order is B, G, R, then alpha.
+ dst_argb[1] = clamp255(src_g[x] >> shift);
+ dst_argb[2] = clamp255(src_r[x] >> shift);
+ dst_argb[3] = 0xff;  // No alpha plane: 0xff is written.
+ dst_argb += 4;
+ }
+}
+
+void SplitXRGBRow_C(const uint8_t* src_argb,  // 4 bytes per source pixel; the 4th byte is ignored.
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_b[x] = src_argb[0];  // Source byte order is B, G, R, then one skipped byte.
+ dst_g[x] = src_argb[1];
+ dst_r[x] = src_argb[2];
+ src_argb += 4;
+ }
+}
+
+void MergeXRGBRow_C(const uint8_t* src_r,  // 3 planes in, 4 bytes per output pixel.
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_argb[0] = src_b[x];  // Output byte order is B, G, R, then alpha.
+ dst_argb[1] = src_g[x];
+ dst_argb[2] = src_r[x];
+ dst_argb[3] = 255;  // No alpha plane: 255 is written.
+ dst_argb += 4;
+ }
+}
+
+// Convert lsb formats to msb, depending on sample depth.
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
int width) {
+ int shift = 16 - depth;
+ assert(depth >= 8);
+ assert(depth <= 16);
int x;
- for (x = 0; x < width - 1; x += 2) {
- dst_uv[0] = src_u[x] * scale;
- dst_uv[1] = src_v[x] * scale;
- dst_uv[2] = src_u[x + 1] * scale;
- dst_uv[3] = src_v[x + 1] * scale;
- dst_uv += 4;
+ for (x = 0; x < width; ++x) {
+ dst_uv[0] = src_u[x] << shift;
+ dst_uv[1] = src_v[x] << shift;
+ dst_uv += 2;
}
- if (width & 1) {
- dst_uv[0] = src_u[width - 1] * scale;
- dst_uv[1] = src_v[width - 1] * scale;
+}
+
+// Convert msb formats to lsb, depending on sample depth.
+// Splits interleaved 16 bit U, V pairs into separate U and V rows.
+void SplitUVRow_16_C(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ int shift = 16 - depth;  // Moves the top depth bits down to the bottom.
+ int x;
+ assert(depth >= 8);
+ assert(depth <= 16);
+ for (x = 0; x < width; ++x) {
+ dst_u[x] = src_uv[0] >> shift;
+ dst_v[x] = src_uv[1] >> shift;
+ src_uv += 2;  // Next U, V pair.
}
}
@@ -2167,18 +2963,34 @@ void MultiplyRow_16_C(const uint16_t* src_y,
}
}
+// Scale each 16 bit sample: dst_y[x] = (src_y[x] * scale) >> 16.
+// The product is formed in uint32_t. As signed int it overflows (undefined
+// behavior) once it exceeds INT_MAX, e.g. scale 65536 with samples >= 32768.
+// For products that fit in int the stored result is unchanged.
+void DivideRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = (uint16_t)(((uint32_t)src_y[x] * (uint32_t)scale) >> 16);
+ }
+}
+
// Use scale to convert lsb formats to 8 bit, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
+
void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
int x;
+ assert(scale >= 256);
+ assert(scale <= 32768);
+
for (x = 0; x < width; ++x) {
- dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+ dst_y[x] = C16TO8(src_y[x], scale);
}
}
@@ -2208,10 +3020,9 @@ void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
}
void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
- uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
- d[x] = v32;
+ memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32);
}
}
@@ -2309,21 +3120,21 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
-#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8_t* src_argb0,
+void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
@@ -2332,10 +3143,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
dst_argb[2] = BLEND(fr, br, a);
dst_argb[3] = 255u;
- fb = src_argb0[4 + 0];
- fg = src_argb0[4 + 1];
- fr = src_argb0[4 + 2];
- a = src_argb0[4 + 3];
+ fb = src_argb[4 + 0];
+ fg = src_argb[4 + 1];
+ fr = src_argb[4 + 2];
+ a = src_argb[4 + 3];
bb = src_argb1[4 + 0];
bg = src_argb1[4 + 1];
br = src_argb1[4 + 2];
@@ -2343,16 +3154,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
dst_argb[4 + 1] = BLEND(fg, bg, a);
dst_argb[4 + 2] = BLEND(fr, br, a);
dst_argb[4 + 3] = 255u;
- src_argb0 += 8;
+ src_argb += 8;
src_argb1 += 8;
dst_argb += 8;
}
if (width & 1) {
- uint32_t fb = src_argb0[0];
- uint32_t fg = src_argb0[1];
- uint32_t fr = src_argb0[2];
- uint32_t a = src_argb0[3];
+ uint32_t fb = src_argb[0];
+ uint32_t fg = src_argb[1];
+ uint32_t fr = src_argb[2];
+ uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
@@ -2385,10 +3196,14 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
+#if LIBYUV_ATTENUATE_DUP
+// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+#else
+#define ATTENUATE(f, a) (f * a + 128) >> 8
+#endif
// Multiply source RGB by alpha and store to destination.
-// This code mimics the SSSE3 version for better testability.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
@@ -2472,6 +3287,14 @@ const uint32_t fixed_invtbl8[256] = {
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
+#if LIBYUV_UNATTENUATE_DUP
+// This code mimics the Intel SIMD version for better testability.
+#define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16)
+#else
+#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8)
+#endif
+
+// mimics the Intel SIMD code for exactness.
void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
@@ -2482,13 +3305,11 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb,
uint32_t r = src_argb[2];
const uint32_t a = src_argb[3];
const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
- b = (b * ia) >> 8;
- g = (g * ia) >> 8;
- r = (r * ia) >> 8;
+
// Clamping should not be necessary but is free in assembly.
- dst_argb[0] = clamp255(b);
- dst_argb[1] = clamp255(g);
- dst_argb[2] = clamp255(r);
+ dst_argb[0] = UNATTENUATE(b, ia);
+ dst_argb[1] = UNATTENUATE(g, ia);
+ dst_argb[2] = UNATTENUATE(r, ia);
dst_argb[3] = a;
src_argb += 4;
dst_argb += 4;
@@ -2519,8 +3340,11 @@ void CumulativeSumToAverageRow_C(const int32_t* tl,
int area,
uint8_t* dst,
int count) {
- float ooa = 1.0f / area;
+ float ooa;
int i;
+ assert(area != 0);
+
+ ooa = 1.0f / area;
for (i = 0; i < count; ++i) {
dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
@@ -2576,6 +3400,17 @@ static void HalfRow_16_C(const uint16_t* src_uv,
}
}
+static void HalfRow_16To8_C(const uint16_t* src_uv,
+ ptrdiff_t src_uv_stride,
+ uint8_t* dst_uv,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
+ }
+}
+
// C version 2x2 -> 2x1.
void InterpolateRow_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -2586,6 +3421,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
@@ -2594,21 +3432,16 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] =
- (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
- dst_ptr[1] =
- (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
- }
- if (width & 1) {
+ for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
}
+// C version 2x2 -> 2x1.
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
@@ -2618,23 +3451,62 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
- if (source_y_fraction == 0) {
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
- if (source_y_fraction == 128) {
+ if (y1_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
- for (x = 0; x < width - 1; x += 2) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- src_ptr += 2;
- src_ptr1 += 2;
- dst_ptr += 2;
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] =
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
+ ++src_ptr;
+ ++src_ptr1;
+ ++dst_ptr;
}
- if (width & 1) {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+}
+
+// C version 2x2 16 bit -> 2x1 8 bit.
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+
+void InterpolateRow_16To8_C(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+ int x;
+ assert(source_y_fraction >= 0);
+ assert(source_y_fraction < 256);
+
+ if (source_y_fraction == 0) {
+ Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
+ return;
+ }
+ for (x = 0; x < width; ++x) {
+ dst_ptr[0] = C16TO8(
+ (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
+ scale);
+ src_ptr += 1;
+ src_ptr1 += 1;
+ dst_ptr += 1;
}
}
@@ -2873,7 +3745,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
-#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
+#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
@@ -3175,12 +4047,93 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
+#ifdef HAS_RGB24TOYJROW_AVX2
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_AVX2
+
+#ifdef HAS_RAWTOYJROW_AVX2
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_AVX2(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_AVX2
+
+#ifdef HAS_RGB24TOYJROW_SSSE3
+// Convert 16 RGB24 pixels (48 bytes) to 16 YJ values.
+void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_rgb24 += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RGB24TOYJROW_SSSE3
+
+#ifdef HAS_RAWTOYJROW_SSSE3
+// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ RAWToARGBRow_SSSE3(src_raw, row, twidth);
+ ARGBToYJRow_SSSE3(row, dst_yj, twidth);
+ src_raw += twidth * 3;
+ dst_yj += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_RAWTOYJROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
+void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int width,
+ int source_y_fraction) {
+ // Row buffer for intermediate 16 bit pixels.
+ SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
+ Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
+ src_ptr += twidth;
+ dst_ptr += twidth;
+ width -= twidth;
+ }
+}
+#endif // HAS_INTERPOLATEROW_16TO8_AVX2
+
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;
-#if defined(__clang__)
-#pragma clang loop vectorize_width(4)
-#endif
for (i = 0; i < width; ++i) {
float v = *src++;
fsum += v * v;
@@ -3231,6 +4184,29 @@ void GaussCol_C(const uint16_t* src0,
}
}
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+ (1.0f / 256.0f);
+ ++src;
+ }
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
+
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
@@ -3256,13 +4232,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
}
// Filter 2 rows of AYUV UV's (444) into UV (420).
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
// Output a row of UV values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 5] + 2) >>
2;
@@ -3273,12 +4250,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv,
dst_uv += 2;
}
if (width & 1) {
- dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
}
}
@@ -3289,7 +4262,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
int width) {
// Output a row of VU values, filtering 2x2 rows of AYUV.
int x;
- for (x = 0; x < width; x += 2) {
+ for (x = 0; x < width - 1; x += 2) {
dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 4] + 2) >>
2;
@@ -3300,12 +4273,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
dst_vu += 2;
}
if (width & 1) {
- dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
- src_ayuv[src_stride_ayuv + 0] + 2) >>
- 2;
- dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
- src_ayuv[src_stride_ayuv + 1] + 2) >>
- 2;
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1;
}
}
@@ -3319,7 +4288,8 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
}
}
-void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t u = src_uv[0];
@@ -3331,16 +4301,27 @@ void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
}
}
-// divide values by weights and provide mask to indicate weight of 0.
-void FloatDivToByteRow_C(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
+void HalfMergeUVRow_C(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
int x;
- for (x = 0; x < width; ++x) {
- dst_out[x] = Clamp(src_values[x] / src_weights[x]);
- dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+ src_u[src_stride_u + 1] + 2) >>
+ 2;
+ dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+ src_v[src_stride_v + 1] + 2) >>
+ 2;
+ src_u += 2;
+ src_v += 2;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+ dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
}
}
diff --git a/files/source/row_dspr2.cc b/files/source/row_dspr2.cc
deleted file mode 100644
index 11f78e0d..00000000
--- a/files/source/row_dspr2.cc
+++ /dev/null
@@ -1,1721 +0,0 @@
-/*
- * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
- __asm__ __volatile__(
- ".set noreorder \n"
- ".set noat \n"
- "slti $at, %[count], 8 \n"
- "bne $at ,$zero, $last8 \n"
- "xor $t8, %[src], %[dst] \n"
- "andi $t8, $t8, 0x3 \n"
-
- "bne $t8, $zero, unaligned \n"
- "negu $a3, %[dst] \n"
- // make dst/src aligned
- "andi $a3, $a3, 0x3 \n"
- "beq $a3, $zero, $chk16w \n"
- // word-aligned now count is the remining bytes count
- "subu %[count], %[count], $a3 \n"
-
- "lwr $t8, 0(%[src]) \n"
- "addu %[src], %[src], $a3 \n"
- "swr $t8, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
-
- // Now the dst/src are mutually word-aligned with word-aligned addresses
- "$chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, chk8w \n"
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n"
- // t0 is the "past the end" address
-
- // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be
- // past
- // the "t0-32" address
- // This means: for x=128 the last "safe" a1 address is "t0-160"
- // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
- // we will use "pref 30,128(a1)", so "t0-160" is the limit
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line of src
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $loop16w \n"
- "nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lw $t0, 0(%[src]) \n"
- "bgtz $v1, $skip_pref30_96 \n" // skip
- "lw $t1, 4(%[src]) \n"
- "pref 30, 96(%[dst]) \n" // continue
- "$skip_pref30_96: \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lw $t0, 32(%[src]) \n"
- "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
- "lw $t1, 36(%[src]) \n"
- "pref 30, 128(%[dst]) \n" // set dest, addr 128
- "$skip_pref30_128: \n"
- "lw $t2, 40(%[src]) \n"
- "lw $t3, 44(%[src]) \n"
- "lw $t4, 48(%[src]) \n"
- "lw $t5, 52(%[src]) \n"
- "lw $t6, 56(%[src]) \n"
- "lw $t7, 60(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
- "sltu $v1, $t9, %[dst] \n"
- "bne %[dst], $a3, $loop16w \n"
- " addiu %[src], %[src], 64 \n" // adding 64 to src
- "move %[count], $t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // the t8 is the reminder count past 32-bytes
- "beq %[count], $t8, chk1w \n"
- // count=t8,no 32-byte chunk
- " nop \n"
-
- "lw $t0, 0(%[src]) \n"
- "lw $t1, 4(%[src]) \n"
- "lw $t2, 8(%[src]) \n"
- "lw $t3, 12(%[src]) \n"
- "lw $t4, 16(%[src]) \n"
- "lw $t5, 20(%[src]) \n"
- "lw $t6, 24(%[src]) \n"
- "lw $t7, 28(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the reminder past 1w chunks
- "beq %[count], $t8, $last8 \n"
- " subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
- // copying in words (4-byte chunks)
- "$wordCopy_loop: \n"
- "lw $t3, 0(%[src]) \n"
- // the first t3 may be equal t0 ... optimize?
- "addiu %[src], %[src],4 \n"
- "addiu %[dst], %[dst],4 \n"
- "bne %[dst], $a3,$wordCopy_loop \n"
- " sw $t3, -4(%[dst]) \n"
-
- // For the last (<8) bytes
- "$last8: \n"
- "blez %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 -last dst address
- "$last8loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst], $a3, $last8loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "leave: \n"
- " j $ra \n"
- " nop \n"
-
- //
- // UNALIGNED case
- //
-
- "unaligned: \n"
- // got here with a3="negu a1"
- "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
- "beqz $a3, $ua_chk16w \n"
- " subu %[count], %[count], $a3 \n"
- // bytes left after initial a3 bytes
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
- "swr $v1, 0(%[dst]) \n"
- "addu %[dst], %[dst], $a3 \n"
- // below the dst will be word aligned (NOTE1)
- "$ua_chk16w: \n"
- "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
- // t8 is the byte count after 64-byte chunks
- "beq %[count], $t8, ua_chk8w \n"
- // if a2==t8, no 64-byte chunks
- // There will be at most 1 32-byte chunk after it
- "subu $a3, %[count], $t8 \n" // the reminder
- // Here a3 counts bytes in 16w chunks
- "addu $a3, %[dst], $a3 \n"
- // Now a3 is the final dst after 64-byte chunks
- "addu $t0, %[dst], %[count] \n" // t0 "past the end"
- "subu $t9, $t0, 160 \n"
- // t9 is the "last safe pref 30,128(a1)" address
- "pref 0, 0(%[src]) \n" // first line of src
- "pref 0, 32(%[src]) \n" // second line addr 32
- "pref 0, 64(%[src]) \n"
- "pref 30, 32(%[dst]) \n"
- // safe, as we have at least 64 bytes ahead
- // In case the a1 > t9 don't use "pref 30" at all
- "sltu $v1, $t9, %[dst] \n"
- "bgtz $v1, $ua_loop16w \n"
- // skip "pref 30,64(a1)" for too short arrays
- " nop \n"
- // otherwise, start with using pref30
- "pref 30, 64(%[dst]) \n"
- "$ua_loop16w: \n"
- "pref 0, 96(%[src]) \n"
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "bgtz $v1, $ua_skip_pref30_96 \n"
- " lwl $t1, 7(%[src]) \n"
- "pref 30, 96(%[dst]) \n"
- // continue setting up the dest, addr 96
- "$ua_skip_pref30_96: \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "pref 0, 128(%[src]) \n"
- // bring the next lines of src, addr 128
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "lwr $t0, 32(%[src]) \n"
- "lwl $t0, 35(%[src]) \n"
- "lwr $t1, 36(%[src]) \n"
- "bgtz $v1, ua_skip_pref30_128 \n"
- " lwl $t1, 39(%[src]) \n"
- "pref 30, 128(%[dst]) \n"
- // continue setting up the dest, addr 128
- "ua_skip_pref30_128: \n"
-
- "lwr $t2, 40(%[src]) \n"
- "lwl $t2, 43(%[src]) \n"
- "lwr $t3, 44(%[src]) \n"
- "lwl $t3, 47(%[src]) \n"
- "lwr $t4, 48(%[src]) \n"
- "lwl $t4, 51(%[src]) \n"
- "lwr $t5, 52(%[src]) \n"
- "lwl $t5, 55(%[src]) \n"
- "lwr $t6, 56(%[src]) \n"
- "lwl $t6, 59(%[src]) \n"
- "lwr $t7, 60(%[src]) \n"
- "lwl $t7, 63(%[src]) \n"
- "pref 0, 160(%[src]) \n"
- // bring the next lines of src, addr 160
- "sw $t0, 32(%[dst]) \n"
- "sw $t1, 36(%[dst]) \n"
- "sw $t2, 40(%[dst]) \n"
- "sw $t3, 44(%[dst]) \n"
- "sw $t4, 48(%[dst]) \n"
- "sw $t5, 52(%[dst]) \n"
- "sw $t6, 56(%[dst]) \n"
- "sw $t7, 60(%[dst]) \n"
-
- "addiu %[dst],%[dst],64 \n" // adding 64 to dest
- "sltu $v1,$t9,%[dst] \n"
- "bne %[dst],$a3,$ua_loop16w \n"
- " addiu %[src],%[src],64 \n" // adding 64 to src
- "move %[count],$t8 \n"
-
- // Here we have src and dest word-aligned but less than 64-bytes to go
-
- "ua_chk8w: \n"
- "pref 0, 0x0(%[src]) \n"
- "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
- // the t8 is the reminder count
- "beq %[count], $t8, $ua_chk1w \n"
- // when count==t8, no 32-byte chunk
-
- "lwr $t0, 0(%[src]) \n"
- "lwl $t0, 3(%[src]) \n"
- "lwr $t1, 4(%[src]) \n"
- "lwl $t1, 7(%[src]) \n"
- "lwr $t2, 8(%[src]) \n"
- "lwl $t2, 11(%[src]) \n"
- "lwr $t3, 12(%[src]) \n"
- "lwl $t3, 15(%[src]) \n"
- "lwr $t4, 16(%[src]) \n"
- "lwl $t4, 19(%[src]) \n"
- "lwr $t5, 20(%[src]) \n"
- "lwl $t5, 23(%[src]) \n"
- "lwr $t6, 24(%[src]) \n"
- "lwl $t6, 27(%[src]) \n"
- "lwr $t7, 28(%[src]) \n"
- "lwl $t7, 31(%[src]) \n"
- "addiu %[src], %[src], 32 \n"
-
- "sw $t0, 0(%[dst]) \n"
- "sw $t1, 4(%[dst]) \n"
- "sw $t2, 8(%[dst]) \n"
- "sw $t3, 12(%[dst]) \n"
- "sw $t4, 16(%[dst]) \n"
- "sw $t5, 20(%[dst]) \n"
- "sw $t6, 24(%[dst]) \n"
- "sw $t7, 28(%[dst]) \n"
- "addiu %[dst], %[dst], 32 \n"
-
- "$ua_chk1w: \n"
- "andi %[count], $t8, 0x3 \n"
- // now count is the reminder past 1w chunks
- "beq %[count], $t8, ua_smallCopy \n"
- "subu $a3, $t8, %[count] \n"
- // a3 is count of bytes in 1w chunks
- "addu $a3, %[dst], $a3 \n"
- // now a3 is the dst address past the 1w chunks
-
- // copying in words (4-byte chunks)
- "$ua_wordCopy_loop: \n"
- "lwr $v1, 0(%[src]) \n"
- "lwl $v1, 3(%[src]) \n"
- "addiu %[src], %[src], 4 \n"
- "addiu %[dst], %[dst], 4 \n"
- // note: dst=a1 is word aligned here, see NOTE1
- "bne %[dst], $a3, $ua_wordCopy_loop \n"
- " sw $v1,-4(%[dst]) \n"
-
- // Now less than 4 bytes (value in count) left to copy
- "ua_smallCopy: \n"
- "beqz %[count], leave \n"
- " addu $a3, %[dst], %[count] \n" // a3 = last dst address
- "$ua_smallCopy_loop: \n"
- "lb $v1, 0(%[src]) \n"
- "addiu %[src], %[src], 1 \n"
- "addiu %[dst], %[dst], 1 \n"
- "bne %[dst],$a3,$ua_smallCopy_loop \n"
- " sb $v1, -1(%[dst]) \n"
-
- "j $ra \n"
- " nop \n"
- ".set at \n"
- ".set reorder \n"
- : [dst] "+r"(dst), [src] "+r"(src)
- : [count] "r"(count)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
- "at");
-}
-#endif // HAS_COPYROW_MIPS
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
- (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 |
- // U10
- "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 |
- // U12
- "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 |
- // U14
- "addiu %[src_uv], %[src_uv], 32 \n"
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 |
- // V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 |
- // U12
- "sw $t9, 0(%[dst_v]) \n"
- "sw $t0, 0(%[dst_u]) \n"
- "sw $t1, 4(%[dst_v]) \n"
- "sw $t2, 4(%[dst_u]) \n"
- "sw $t3, 8(%[dst_v]) \n"
- "sw $t5, 8(%[dst_u]) \n"
- "sw $t6, 12(%[dst_v]) \n"
- "sw $t7, 12(%[dst_u]) \n"
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
- [dst_v] "+r"(dst_v)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "andi $t5, %[width], 0xf \n"
- "blez $t4, 2f \n"
- " addu %[src], %[src], %[width] \n" // src += width
-
- "1: \n"
- "lw $t0, -16(%[src]) \n" // |3|2|1|0|
- "lw $t1, -12(%[src]) \n" // |7|6|5|4|
- "lw $t2, -8(%[src]) \n" // |11|10|9|8|
- "lw $t3, -4(%[src]) \n" // |15|14|13|12|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t1, $t1 \n" // |6|7|4|5|
- "wsbh $t2, $t2 \n" // |10|11|8|9|
- "wsbh $t3, $t3 \n" // |14|15|12|13|
- "rotr $t0, $t0, 16 \n" // |0|1|2|3|
- "rotr $t1, $t1, 16 \n" // |4|5|6|7|
- "rotr $t2, $t2, 16 \n" // |8|9|10|11|
- "rotr $t3, $t3, 16 \n" // |12|13|14|15|
- "addiu %[src], %[src], -16 \n"
- "addiu $t4, $t4, -1 \n"
- "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
- "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
- "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
- "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
- "bgtz $t4, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
- "beqz $t5, 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -1(%[src]) \n"
- "addiu $t5, $t5, -1 \n"
- "addiu %[src], %[src], -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgez $t5, 2b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src] "+r"(src), [dst] "+r"(dst)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5");
-}
-
-void MirrorUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- int x;
- int y;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "addu $t4, %[width], %[width] \n"
- "srl %[x], %[width], 4 \n"
- "andi %[y], %[width], 0xf \n"
- "blez %[x], 2f \n"
- " addu %[src_uv], %[src_uv], $t4 \n"
-
- "1: \n"
- "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
- "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
- "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
- "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
- "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
- "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
- "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
- "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
-
- "rotr $t0, $t0, 16 \n" // |1|0|3|2|
- "rotr $t1, $t1, 16 \n" // |5|4|7|6|
- "rotr $t2, $t2, 16 \n" // |9|8|11|10|
- "rotr $t3, $t3, 16 \n" // |13|12|15|14|
- "rotr $t4, $t4, 16 \n" // |17|16|19|18|
- "rotr $t6, $t6, 16 \n" // |21|20|23|22|
- "rotr $t7, $t7, 16 \n" // |25|24|27|26|
- "rotr $t8, $t8, 16 \n" // |29|28|31|30|
- "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
- "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
- "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
- "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
- "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
- "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
- "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
- "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
- "addiu %[src_uv], %[src_uv], -32 \n"
- "addiu %[x], %[x], -1 \n"
- "swr $t4, 0(%[dst_u]) \n"
- "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
- "swr $t6, 0(%[dst_v]) \n"
- "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
- "swr $t3, 4(%[dst_v]) \n"
- "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
- "swr $t0, 8(%[dst_u]) \n"
- "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
- "swr $t1, 8(%[dst_v]) \n"
- "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
- "swr $t9, 12(%[dst_u]) \n"
- "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
- "swr $t5, 12(%[dst_v]) \n"
- "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
- "addiu %[dst_v], %[dst_v], 16 \n"
- "bgtz %[x], 1b \n"
- " addiu %[dst_u], %[dst_u], 16 \n"
- "beqz %[y], 3f \n"
- " nop \n"
- "b 2f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, -2(%[src_uv]) \n"
- "lbu $t1, -1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], -2 \n"
- "addiu %[y], %[y], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[y], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
- [x] "=&r"(x), [y] "=&r"(y)
- : [width] "r"(width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
-}
-
-// Converts a row of planar YUV to 32-bit ARGB with MIPS DSPr2 inline assembly.
-// Each iteration consumes two Y bytes plus one U and one V byte and stores two
-// 32-bit pixels (8 bytes) at rgb_buf.
-//
-// src_y, src_u, src_v: input rows (advanced by 2, 1 and 1 bytes per iteration).
-// rgb_buf: output row (advanced by 8 bytes per iteration).
-// yuvconstants: supplies kUVToB[0], kUVToG[0..1], kUVToR[1], kUVBias{B,G,R}[0]
-// and kYToRgb[0].
-// width: pixel count; the loop runs while x < width - 1, so a trailing odd
-// pixel is not converted here.
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- // 0x7fff in each halfword saturates to 0xff in precrqu_s.qb.ph below.
- uint32 tmp_mask = 0x7fff7fff;
- // Replicate each 16-bit coefficient into both halfwords so the paired-halfword
- // (.ph) instructions process two pixels at once.
- tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- // ~x + 0x00010001 adds one to the complement of each halfword (lane negate).
- tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" // pack both scaled Y values
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [rgb_buf] "r"(rgb_buf),
- [tmp_mask] "r"(tmp_mask)
- // The asm reads and writes memory through plain register operands, so the
- // compiler must be told that memory is touched.
- : "memory");
- src_y += 2;
- src_u += 1;
- src_v += 1;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-// Bilinear filter 8x2 -> 8x1.
-// Blends the row at src_ptr with the row at src_ptr + src_stride, 8 bytes per
-// iteration: dst = (row0 * (256 - source_y_fraction) +
-// row1 * source_y_fraction) >> 8 with rounding.
-//
-// dst_ptr: output row. src_ptr / src_stride: first input row and the byte
-// offset to the second. dst_width: bytes to produce; it is decremented by 8
-// after each pass and tested afterwards, so at least 8 bytes are always
-// written. source_y_fraction: weight of the second row (the first row gets
-// 256 - source_y_fraction).
-// NOTE(review): lw/sw need word-aligned addresses; confirm callers guarantee it.
-void InterpolateRow_DSPR2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride,
- int dst_width,
- int source_y_fraction) {
- int y0_fraction = 256 - source_y_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "replv.ph $t0, %[y0_fraction] \n"
- "replv.ph $t1, %[source_y_fraction] \n"
-
- "1: \n"
- "lw $t2, 0(%[src_ptr]) \n"
- "lw $t3, 0(%[src_ptr1]) \n"
- "lw $t4, 4(%[src_ptr]) \n"
- "lw $t5, 4(%[src_ptr1]) \n"
- "muleu_s.ph.qbl $t6, $t2, $t0 \n"
- "muleu_s.ph.qbr $t7, $t2, $t0 \n"
- "muleu_s.ph.qbl $t8, $t3, $t1 \n"
- "muleu_s.ph.qbr $t9, $t3, $t1 \n"
- "muleu_s.ph.qbl $t2, $t4, $t0 \n"
- "muleu_s.ph.qbr $t3, $t4, $t0 \n"
- "muleu_s.ph.qbl $t4, $t5, $t1 \n"
- "muleu_s.ph.qbr $t5, $t5, $t1 \n"
- "addq.ph $t6, $t6, $t8 \n"
- "addq.ph $t7, $t7, $t9 \n"
- "addq.ph $t2, $t2, $t4 \n"
- "addq.ph $t3, $t3, $t5 \n"
- "shra_r.ph $t6, $t6, 8 \n"
- "shra_r.ph $t7, $t7, 8 \n"
- "shra_r.ph $t2, $t2, 8 \n"
- "shra_r.ph $t3, $t3, 8 \n"
- "precr.qb.ph $t6, $t6, $t7 \n"
- "precr.qb.ph $t2, $t2, $t3 \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[src_ptr1], %[src_ptr1], 8 \n"
- "addiu %[dst_width], %[dst_width], -8 \n"
- "sw $t6, 0(%[dst_ptr]) \n"
- "sw $t2, 4(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[dst_ptr], %[dst_ptr], 8 \n" // branch delay slot
-
- ".set pop \n"
- : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
- [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
- : [source_y_fraction] "r"(source_y_fraction),
- [y0_fraction] "r"(y0_fraction)
- // "memory": the asm loads and stores through the pointer operands.
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "memory");
-}
-#include <stdio.h>
-// Expands a row of 3-byte RGB24 pixels to 4-byte ARGB pixels.
-// The first width - 1 pixels use an unaligned 4-byte load (ulw) whose top byte
-// is replaced with 0xff; the last pixel is copied byte by byte, presumably so
-// the 4-byte load never reads past the end of the source row.
-//
-// src_rgb24: input row, 3 bytes per pixel. dst_argb: output row, 4 bytes per
-// pixel (stored with sw; NOTE(review): assumes dst_argb is word aligned).
-// width: pixel count; nothing is read or written when width <= 0.
-void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1;
- if (width <= 0) {
- // Without this guard the scalar tail below would still touch one pixel.
- return;
- }
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb24], %[src_rgb24], 3 \n"
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" // bits 24..31 = 0xff
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- // Last pixel: plain byte copies, alpha forced to 255.
- uint8 b = src_rgb24[0];
- uint8 g = src_rgb24[1];
- uint8 r = src_rgb24[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-// Expands a row of 3-byte RAW pixels to 4-byte ARGB pixels, swapping the first
-// and third byte of every pixel and setting the fourth byte to 0xff.
-// The first width - 1 pixels use an unaligned 4-byte load (ulw); the last pixel
-// is handled byte by byte, presumably so the 4-byte load never reads past the
-// end of the source row.
-//
-// src_raw: input row, 3 bytes per pixel. dst_argb: output row, 4 bytes per
-// pixel (stored with sw; NOTE(review): assumes dst_argb is word aligned).
-// width: pixel count; nothing is read or written when width <= 0.
-void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2;
- if (width <= 0) {
- // Without this guard the scalar tail below would still touch one pixel.
- return;
- }
- for (x = 0; x < (width - 1); ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0(%[src_raw]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_raw], %[src_raw], 3 \n"
- "srl %[tmp_t2], %[tmp_t1], 16 \n" // keep bits 16..23 for later
- "ins %[tmp_t1], %[tmp_mask], 24, 8 \n" // bits 24..31 = 0xff
- "ins %[tmp_t1], %[tmp_t1], 16, 8 \n" // bits 0..7 -> bits 16..23
- "ins %[tmp_t1], %[tmp_t2], 0, 8 \n" // old bits 16..23 -> bits 0..7
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
- : [tmp_mask] "r"(tmp_mask)
- : "memory");
- }
- // Last pixel: plain byte copies with the same byte swap, alpha forced to 255.
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
- dst_argb[0] = b;
- dst_argb[1] = g;
- dst_argb[2] = r;
- dst_argb[3] = 255u;
-}
-
-// Expands a row of 16-bit RGB565 pixels to 32-bit ARGB, one pixel per
-// iteration. Each 5- or 6-bit field is moved to the top of its output byte and
-// the byte's low bits are filled with the field's high bits; bits 24..31 are
-// set to 0xff.
-//
-// src_rgb565: input row, 2 bytes per pixel (read with lhu).
-// dst_argb: output row, 4 bytes per pixel (written with sw).
-// width: number of pixels to convert.
-void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_mask = 0xff;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_rgb565], %[src_rgb565], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 8 \n" // bits 11..15 -> bits 19..23
- "ins %[tmp_t2], %[tmp_mask], 24,8 \n" // bits 24..31 = 0xff
- "ins %[tmp_t2], %[tmp_t1], 3, 16 \n" // bits 13..15 -> bits 16..18
- "ins %[tmp_t2], %[tmp_t1], 5, 11 \n" // bits 5..10 -> bits 10..15
- "srl %[tmp_t3], %[tmp_t1], 9 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 2 \n" // bits 9..10 -> bits 8..9
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" // bits 0..4 -> bits 3..7
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" // bits 2..4 -> bits 0..2
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
- [dst_argb] "+r"(dst_argb)
- : [tmp_mask] "r"(tmp_mask)
- // The asm loads and stores through the pointer operands.
- : "memory");
- }
-}
-
-// Expands a row of 16-bit ARGB1555 pixels to 32-bit ARGB, one pixel per
-// iteration. The signed halfword load (lh) copies bit 15 into the upper bits,
-// so after the shift by 9 the top output byte is all ones or all zeros. Each
-// 5-bit field is moved to the top of its output byte and the byte's low three
-// bits are filled with the field's high bits.
-//
-// src_argb1555: input row, 2 bytes per pixel.
-// dst_argb: output row, 4 bytes per pixel (written with sw).
-// width: number of pixels to convert.
-void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1, tmp_t2, tmp_t3;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb1555]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb1555], %[src_argb1555], 2 \n"
- "sll %[tmp_t2], %[tmp_t1], 9 \n" // bits 10..14 -> bits 19..23
- "ins %[tmp_t2], %[tmp_t1], 4, 15 \n" // bits 12..14 -> bits 16..18
- "ins %[tmp_t2], %[tmp_t1], 6, 10 \n" // bits 5..9 -> bits 11..15
- "srl %[tmp_t3], %[tmp_t1], 7 \n"
- "ins %[tmp_t2], %[tmp_t3], 8, 3 \n" // bits 7..9 -> bits 8..10
- "ins %[tmp_t2], %[tmp_t1], 3, 5 \n" // bits 0..4 -> bits 3..7
- "srl %[tmp_t3], %[tmp_t1], 2 \n"
- "ins %[tmp_t2], %[tmp_t3], 0, 3 \n" // bits 2..4 -> bits 0..2
- "sw %[tmp_t2], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
- [dst_argb] "+r"(dst_argb)
- :
- // The asm loads and stores through the pointer operands.
- : "memory");
- }
-}
-
-// Expands a row of 16-bit ARGB4444 pixels to 32-bit ARGB, one pixel per
-// iteration. The chain of self-inserts leaves every 4-bit field duplicated in
-// both nibbles of its output byte.
-//
-// src_argb4444: input row, 2 bytes per pixel (read with lh).
-// dst_argb: output row, 4 bytes per pixel (written with sw).
-// width: number of pixels to convert.
-void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width) {
- int x;
- uint32 tmp_t1;
- for (x = 0; x < width; ++x) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lh %[tmp_t1], 0(%[src_argb4444]) \n"
- "addiu %[dst_argb], %[dst_argb], 4 \n"
- "addiu %[src_argb4444], %[src_argb4444], 2 \n"
- "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
- "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
- "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
- "sw %[tmp_t1], -4(%[dst_argb]) \n"
- ".set pop \n"
- : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
- [tmp_t1] "=&r"(tmp_t1)
- :
- // The asm loads and stores through the pointer operands.
- : "memory");
- }
-}
-
-// Converts a row of planar YUV with one U and one V sample per pixel to 32-bit
-// ARGB with MIPS DSPr2 inline assembly. Each iteration consumes two bytes from
-// each of y_buf, u_buf and v_buf and stores two 32-bit pixels (8 bytes).
-//
-// y_buf, u_buf, v_buf: input rows (each advanced by 2 bytes per iteration).
-// rgb_buf: output row (advanced by 8 bytes per iteration).
-// yuvconstants: supplies kUVToB[0], kUVToG[0..1], kUVToR[1], kUVBias{B,G,R}[0]
-// and kYToRgb[0].
-// width: pixel count; the loop runs while x < width - 1, so a trailing odd
-// pixel is not converted here.
-// NOTE(review): u_buf and v_buf are read with lh, which needs a 2-byte-aligned
-// address; confirm callers guarantee that.
-void I444ToARGBRow_DSPR2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- // 0x7fff in each halfword saturates to 0xff in precrqu_s.qb.ph below.
- uint32 tmp_mask = 0x7fff7fff;
-
- // Replicate each 16-bit coefficient into both halfwords so the paired-halfword
- // (.ph) instructions process two pixels at once.
- tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- // ~x + 0x00010001 adds one to the complement of each halfword (lane negate).
- tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[y_buf]) \n"
- "lbu %[tmp_t1], 1(%[y_buf]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lh %[tmp_t2], 0(%[u_buf]) \n"
- "lh %[tmp_t3], 0(%[v_buf]) \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n" // two U bytes -> halfwords
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n" // two V bytes -> halfwords
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" // pack both scaled Y values
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
- [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
- [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
- [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [rgb_buf] "r"(rgb_buf),
- [tmp_mask] "r"(tmp_mask)
- // The asm reads and writes memory through plain register operands, so the
- // compiler must be told that memory is touched.
- : "memory");
- y_buf += 2;
- u_buf += 2;
- v_buf += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-// Converts a row of planar YUV to 16-bit ARGB4444 with MIPS DSPr2 inline
-// assembly. Each iteration consumes two Y bytes plus one U and one V byte,
-// builds two 32-bit pixels, keeps the high nibble of every byte and stores
-// the two resulting 16-bit pixels as one word (4 bytes).
-//
-// src_y, src_u, src_v: input rows (advanced by 2, 1 and 1 bytes per iteration).
-// dst_argb4444: output row (advanced by 4 bytes per iteration).
-// yuvconstants: supplies kUVToB[0], kUVToG[0..1], kUVToR[1], kUVBias{B,G,R}[0]
-// and kYToRgb[0].
-// width: pixel count; the loop runs while x < width - 1, so a trailing odd
-// pixel is not converted here.
-void I422ToARGB4444Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- // 0x7fff in each halfword saturates to 0xff in precrqu_s.qb.ph below.
- uint32 tmp_mask = 0x7fff7fff;
- // Replicate each 16-bit coefficient into both halfwords so the paired-halfword
- // (.ph) instructions process two pixels at once.
- tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- // ~x + 0x00010001 adds one to the complement of each halfword (lane negate).
- tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" // pack both scaled Y values
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n" // each byte >> 4
- "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
- "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
- "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
- "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
- "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_mask] "r"(tmp_mask)
- // The asm reads and writes memory through plain register operands, so the
- // compiler must be told that memory is touched.
- : "memory");
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb4444 += 4; // Advance 2 pixels.
- }
-}
-
-// Converts a row of planar YUV to 16-bit ARGB1555 with MIPS DSPr2 inline
-// assembly. Each iteration consumes two Y bytes plus one U and one V byte,
-// builds two 32-bit pixels, squeezes each into 16 bits with a chain of ins
-// instructions, ORs in bit 15 of both halfwords and stores one word (4 bytes).
-//
-// src_y, src_u, src_v: input rows (advanced by 2, 1 and 1 bytes per iteration).
-// dst_argb1555: output row (advanced by 4 bytes per iteration).
-// yuvconstants: supplies kUVToB[0], kUVToG[0..1], kUVToR[1], kUVBias{B,G,R}[0]
-// and kYToRgb[0].
-// width: pixel count; the loop runs while x < width - 1, so a trailing odd
-// pixel is not converted here.
-void I422ToARGB1555Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- // Bit 15 of each halfword; ORed into the packed result just before the store.
- uint32 tmp_mask = 0x80008000;
- // Replicate each 16-bit coefficient into both halfwords so the paired-halfword
- // (.ph) instructions process two pixels at once.
- tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- // ~x + 0x00010001 adds one to the complement of each halfword (lane negate).
- tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_u]) \n"
- "lbu %[tmp_t3], 0(%[src_v]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" // pack both scaled Y values
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
- "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
- "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
- "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
- "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
- "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
- "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
- "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n" // set bit 15 of each pixel
- "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
- [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
- [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
- [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
- [tmp_mask] "r"(tmp_mask)
- // The asm reads and writes memory through plain register operands, so the
- // compiler must be told that memory is touched.
- : "memory");
- src_y += 2;
- src_u += 1;
- src_v += 1;
- dst_argb1555 += 4; // Advance 2 pixels.
- }
-}
-
-// Converts a row of Y plus interleaved chroma to 32-bit ARGB with MIPS DSPr2
-// inline assembly. Each iteration consumes two Y bytes and one chroma pair
-// (src_uv[0] is used as U, src_uv[1] as V) and stores two 32-bit pixels.
-//
-// src_y: luma row (advanced by 2 bytes per iteration).
-// src_uv: interleaved chroma row (advanced by 2 bytes per iteration).
-// rgb_buf: output row (advanced by 8 bytes per iteration).
-// yuvconstants: supplies kUVToB[0], kUVToG[0..1], kUVToR[1], kUVBias{B,G,R}[0]
-// and kYToRgb[0].
-// width: pixel count; the loop runs while x < width - 1, so a trailing odd
-// pixel is not converted here.
-void NV12ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
- const struct YuvConstants* yuvconstants,
- int width) {
- int x;
- uint32 tmp_ub = yuvconstants->kUVToB[0];
- uint32 tmp_ug = yuvconstants->kUVToG[0];
- uint32 tmp_vg = yuvconstants->kUVToG[1];
- uint32 tmp_vr = yuvconstants->kUVToR[1];
- uint32 tmp_bb = yuvconstants->kUVBiasB[0];
- uint32 tmp_bg = yuvconstants->kUVBiasG[0];
- uint32 tmp_br = yuvconstants->kUVBiasR[0];
- uint32 yg = yuvconstants->kYToRgb[0];
- // 0x7fff in each halfword saturates to 0xff in precrqu_s.qb.ph below.
- uint32 tmp_mask = 0x7fff7fff;
- // Replicate each 16-bit coefficient into both halfwords so the paired-halfword
- // (.ph) instructions process two pixels at once.
- tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
- tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
- tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
- // ~x + 0x00010001 adds one to the complement of each halfword (lane negate).
- tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
- tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
- tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
- tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
- yg = yg * 0x0101;
-
- for (x = 0; x < width - 1; x += 2) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lbu %[tmp_t7], 0(%[src_y]) \n"
- "lbu %[tmp_t1], 1(%[src_y]) \n"
- "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
- "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
- "lbu %[tmp_t2], 0(%[src_uv]) \n"
- "lbu %[tmp_t3], 1(%[src_uv]) \n"
- "replv.ph %[tmp_t2], %[tmp_t2] \n"
- "replv.ph %[tmp_t3], %[tmp_t3] \n"
- "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
- "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
- "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
- "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
- "srl %[tmp_t7], %[tmp_t7], 16 \n"
- "ins %[tmp_t1], %[tmp_t7], 0, 16 \n" // pack both scaled Y values
- "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
- "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
- "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
- "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
- "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
- "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
- "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
- "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
- "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
- "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
- "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
- "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
- "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
- "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
- "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
- "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
- "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
- "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
- "sw %[tmp_t8], 0(%[rgb_buf]) \n"
- "sw %[tmp_t7], 4(%[rgb_buf]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
- : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
- [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
- [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
- [tmp_br] "r"(tmp_br), [rgb_buf] "r"(rgb_buf),
- [tmp_mask] "r"(tmp_mask)
- // The asm reads and writes memory through plain register operands, so the
- // compiler must be told that memory is touched.
- : "memory");
-
- src_y += 2;
- src_uv += 2;
- rgb_buf += 8; // Advance 2 pixels.
- }
-}
-
-// Produces one U and one V byte for every 2x2 block of 32-bit source pixels.
-// Each iteration loads two pixels from the row at src_rgb0 and two from the
-// row at src_rgb0 + src_stride_rgb, averages the four pixels per channel
-// (sum >> 2) and forms two weighted sums with the packed 16-bit weights in
-// const1..const4.
-//
-// src_rgb0 / src_stride_rgb: first source row and byte offset to the second.
-// dst_u, dst_v: output rows, one byte each per iteration.
-// width: source pixel count; the loop runs while x < width - 1.
-// NOTE(review): lw needs word-aligned source rows; confirm with callers.
-void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffda0000;
- int const2 = 0x0070ffb6;
- int const3 = 0x00700000;
- int const4 = 0xffeeffa2;
- // mult seeds each accumulator with 0x100 * 0x100 = 0x10000, i.e. 128 after the
- // final shift by 9.
- int const5 = 0x100;
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- // "memory": the asm loads and stores through the pointer operands.
- : "hi", "lo", "$ac1lo", "$ac1hi", "memory");
- }
-}
-
-// Computes one Y byte per 32-bit source pixel, four pixels per iteration.
-// Each pixel's bytes are widened to halfwords and combined in a dot product
-// with the packed weights in const1/const2 (0x42, 0x81 and 0x19; one lane is
-// zero), added to a seed of 0x40 * 0x40 = 0x1000 and shifted right by 8 with
-// rounding.
-//
-// src_argb0: input row, 16 bytes consumed per iteration (lw; NOTE(review):
-// assumes word alignment).
-// dst_y: output row, 4 bytes written per iteration.
-// width: pixel count; the loop steps by 4 while x < width, so a width that is
-// not a multiple of 4 is processed as if rounded up to the next multiple.
-void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420000;
- int const2 = 0x00190081;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- // "memory": the asm loads and stores through the pointer operands.
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi", "memory");
- }
-}
-
-// Produces one U and one V byte for every 2x2 block of 32-bit source pixels.
-// Each iteration loads two pixels from the row at src_rgb0 and two from the
-// row at src_rgb0 + src_stride_rgb, averages the four pixels per channel
-// (sum >> 2) and forms two weighted sums with the packed 16-bit weights in
-// const1..const4.
-//
-// src_rgb0 / src_stride_rgb: first source row and byte offset to the second.
-// dst_u, dst_v: output rows, one byte each per iteration.
-// width: source pixel count; the loop runs while x < width - 1.
-// NOTE(review): lw needs word-aligned source rows; confirm with callers.
-void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb6ffda;
- int const2 = 0x00000070;
- int const3 = 0xffa20070;
- int const4 = 0x0000ffee;
- // mult seeds each accumulator with 0x100 * 0x100 = 0x10000, i.e. 128 after the
- // final shift by 9.
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- // "memory": the asm loads and stores through the pointer operands.
- : "hi", "lo", "$ac1lo", "$ac1hi", "memory");
- }
-}
-
-// Computes one Y byte per 32-bit source pixel, four pixels per iteration.
-// Each pixel's bytes are widened to halfwords and combined in a dot product
-// with the packed weights in const1/const2 (0x81, 0x19 and 0x42; one lane is
-// zero), added to a seed of 0x40 * 0x40 = 0x1000 and shifted right by 8 with
-// rounding.
-//
-// src_argb0: input row, 16 bytes consumed per iteration (lw; NOTE(review):
-// assumes word alignment).
-// dst_y: output row, 4 bytes written per iteration.
-// width: pixel count; the loop steps by 4 while x < width, so a width that is
-// not a multiple of 4 is processed as if rounded up to the next multiple.
-void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810019;
- int const2 = 0x00000042;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- // "memory": the asm loads and stores through the pointer operands.
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi", "memory");
- }
-}
-
-void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00810042;
- int const2 = 0x00000019;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
- "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
- "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
- "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
- int x;
- int const1 = 0x00420081;
- int const2 = 0x00190000;
- int const5 = 0x40;
- for (x = 0; x < width; x += 4) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_argb0]) \n"
- "lw %[tmp_t2], 4(%[src_argb0]) \n"
- "lw %[tmp_t3], 8(%[src_argb0]) \n"
- "lw %[tmp_t4], 12(%[src_argb0]) \n"
- "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "mult $ac2, %[const5], %[const5] \n"
- "mult $ac3, %[const5], %[const5] \n"
- "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
- "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
- "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
- "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
- "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
- "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
- "extr_r.w %[tmp_t1], $ac0, 8 \n"
- "extr_r.w %[tmp_t2], $ac1, 8 \n"
- "extr_r.w %[tmp_t3], $ac2, 8 \n"
- "extr_r.w %[tmp_t4], $ac3, 8 \n"
- "addiu %[dst_y], %[dst_y], 4 \n"
- "addiu %[src_argb0],%[src_argb0], 16 \n"
- "sb %[tmp_t1], -4(%[dst_y]) \n"
- "sb %[tmp_t2], -3(%[dst_y]) \n"
- "sb %[tmp_t3], -2(%[dst_y]) \n"
- "sb %[tmp_t4], -1(%[dst_y]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
- : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
- "$ac3hi");
- }
-}
-
-void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
- int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
- int width) {
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
- int x;
- int const1 = 0xffb60070;
- int const2 = 0x0000ffda;
- int const3 = 0xffa2ffee;
- int const4 = 0x00000070;
- int const5 = 0x100;
-
- for (x = 0; x < width - 1; x += 2) {
- int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
- int tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t1], 0(%[src_rgb0]) \n"
- "lw %[tmp_t2], 4(%[src_rgb0]) \n"
- "lw %[tmp_t3], 0(%[src_rgb1]) \n"
- "lw %[tmp_t4], 4(%[src_rgb1]) \n"
- "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
- "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
- "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
- "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
- "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
- "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
- "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
- "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
- "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
- "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
- "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
- "mult $ac0, %[const5], %[const5] \n"
- "mult $ac1, %[const5], %[const5] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
- "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
- "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
- "extr_r.w %[tmp_t7], $ac0, 9 \n"
- "extr_r.w %[tmp_t8], $ac1, 9 \n"
- "addiu %[src_rgb0], %[src_rgb0], 8 \n"
- "addiu %[src_rgb1], %[src_rgb1], 8 \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "addiu %[dst_v], %[dst_v], 1 \n"
- "sb %[tmp_t7], -1(%[dst_u]) \n"
- "sb %[tmp_t8], -1(%[dst_v]) \n"
- ".set pop \n"
- : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
- [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
- [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
- [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
- [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
- : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
- [const4] "r"(const4), [const5] "r"(const5)
- : "hi", "lo", "$ac1lo", "$ac1hi");
- }
-}
-
-#endif // __mips_dsp_rev >= 2
-
-#endif // defined(__mips__)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index decd3d2e..dce8c439 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -9,25 +9,26 @@
*/
#include "libyuv/row.h"
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
+static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
+ 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPeg full range.
-static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
+static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
+ 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
+
+static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
+ 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@@ -45,8 +46,8 @@ static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA
-static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
+static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
+ 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
0, -38, -74, 112, 0, -38, -74, 112};
@@ -55,8 +56,8 @@ static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
-static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
+static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
+ 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
-38, -74, 112, 0, -38, -74, 112, 0};
@@ -65,8 +66,8 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
+static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
+ 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
0, 112, -74, -38, 0, 112, -74, -38};
@@ -74,17 +75,15 @@ static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
0, -18, -94, 112, 0, -18, -94, 112};
-static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
-
-// 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
+ 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
@@ -97,6 +96,10 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+// Shuffle table for converting RAW to RGBA.
+static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
+ 14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
@@ -154,24 +157,24 @@ static const lvec8 kShuffleNV21 = {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -185,35 +188,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -223,35 +226,35 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -259,29 +262,68 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
+// Same code as RAWToARGB with different shuffler and A in low bits
+void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
+ "psrld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGBA) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
uint8_t* dst_rgb24,
int width) {
asm volatile(
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x4(%0),%%xmm1 \n"
- "movdqu 0x8(%0),%%xmm2 \n"
- "lea 0x18(%0),%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -293,44 +335,44 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -341,47 +383,47 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,0x00(%1,%0,2) \n"
- "movdqu %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -392,34 +434,34 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,0x00(%1,%0,2) \n"
- "movdqu %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -430,35 +472,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -469,35 +511,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm6 \n"
+ "movdqa %3,%%xmm6 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "movdqu %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -512,37 +554,37 @@ static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -571,26 +613,26 @@ static const ulvec8 kPermARGBToRGB24_2 = {
void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vmovdqa %3,%%ymm5 \n"
- "vmovdqa %4,%%ymm6 \n"
- "vmovdqa %5,%%ymm7 \n"
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
- "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
- "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -606,37 +648,37 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n"
- "vmovdqa %4,%%ymm7 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
- "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
- "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
- "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
- "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
- "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
- "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
- "vpermq $0x4f,%%ymm2,%%ymm4 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
- "vpermq $0x93,%%ymm3,%%ymm3 \n"
- "vpor %%ymm3,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm2,0x40(%1) \n"
- "lea 0x60(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -650,34 +692,34 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -690,40 +732,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
const uint32_t dither4,
int width) {
asm volatile(
- "movd %3,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm6 \n"
- "movdqa %%xmm6,%%xmm7 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "punpckhwd %%xmm7,%%xmm7 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "paddusb %%xmm6,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movd %3,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "punpcklwd %%xmm6,%%xmm6 \n"
+ "punpckhwd %%xmm7,%%xmm7 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "paddusb %%xmm6,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -739,35 +781,35 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
- "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
- "vpermq $0xd8,%%ymm6,%%ymm6 \n"
- "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
- "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
- "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
- "vpslld $0x5,%%ymm4,%%ymm4 \n"
- "vpslld $0xb,%%ymm3,%%ymm5 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
- "vpsrld $0x5,%%ymm0,%%ymm2 \n"
- "vpsrld $0x3,%%ymm0,%%ymm1 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm4,%%ymm2,%%ymm2 \n"
- "vpand %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpor %%ymm2,%%ymm1,%%ymm1 \n"
- "vpor %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
+ "vpermq $0xd8,%%ymm6,%%ymm6 \n"
+ "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
+ "vpslld $0x5,%%ymm4,%%ymm4 \n"
+ "vpslld $0xb,%%ymm3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x5,%%ymm0,%%ymm2 \n"
+ "vpsrld $0x3,%%ymm0,%%ymm1 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpand %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -780,38 +822,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -821,26 +863,26 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -884,31 +926,31 @@ static const uint32_t kMulAG10 = 64 * 65536 + 1028;
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -923,31 +965,31 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "movdqa %3,%%xmm2 \n" // shuffler for RB
- "movd %4,%%xmm3 \n" // multipler for RB
- "movd %5,%%xmm4 \n" // mask for R10 B10
- "movd %6,%%xmm5 \n" // mask for AG
- "movd %7,%%xmm6 \n" // multipler for AG
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "sub %0,%1 \n"
-
- "1: \n"
- "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n" // R0B0
- "pand %%xmm5,%%xmm0 \n" // A0G0
- "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
- "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
- "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
- "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
- "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
- "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
- "add $0x10,%0 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multipler for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multipler for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -964,25 +1006,25 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1001,25 +1043,25 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
- "vbroadcastss %4,%%ymm3 \n" // multipler for RB
- "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
- "vbroadcastss %6,%%ymm5 \n" // mask for AG
- "vbroadcastss %7,%%ymm6 \n" // multipler for AG
- "sub %0,%1 \n"
-
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
- "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
- "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
- "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
- "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
- "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
- "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
- "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
- "add $0x20,%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vbroadcastss %4,%%ymm3 \n" // multipler for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multipler for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -1034,222 +1076,490 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif
+// Byte-shuffle control (used below as a pshufb / vpshufb mask) that swaps
+// bytes 0 and 2 of every 4-byte pixel and leaves bytes 1 and 3 in place.
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+// Byte-shuffle controls that write every selected source byte twice (so each
+// 8-bit channel fills both halves of a 16-bit output channel) while swapping
+// bytes 0 and 2 of each pixel. Lo selects source bytes 0..7 (first two
+// pixels of a 16-byte group); Hi selects source bytes 8..15 (last two).
+static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3,
+ 6, 6, 5, 5, 4, 4, 7, 7};
+static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
+ 14, 14, 13, 13, 12, 12, 15, 15};
+
+// Widen one row of 4-byte pixels to 16 bits per channel, keeping channel
+// order. Each loop pass reads 16 source bytes (4 pixels), interleaves every
+// byte with itself so byte v becomes the 16-bit word v * 0x0101, and writes
+// 32 bytes.
+//   src_argb: source row, 4 bytes per pixel.
+//   dst_ar64: destination row, 8 bytes per pixel.
+//   width:    pixel count; decremented by 4 per pass, loop runs while > 0,
+//             so the last pass always moves a full 4 pixels.
+// NOTE(review): presumably callers round width up or handle the remainder
+// elsewhere - confirm against the dispatch code.
+void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 4 pixels (16 bytes)
+ "movdqa %%xmm0,%%xmm1 \n" // keep a copy for the upper half
+ "punpcklbw %%xmm0,%%xmm0 \n" // bytes 0..7, each duplicated
+ "punpckhbw %%xmm1,%%xmm1 \n" // bytes 8..15, each duplicated
+ "movdqu %%xmm0,(%1) \n" // store pixels 0-1 (16 bytes)
+ "movdqu %%xmm1,0x10(%1) \n" // store pixels 2-3 (16 bytes)
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+// Widen one row of 4-byte ARGB pixels to 16 bits per channel while swapping
+// bytes 0 and 2 of every pixel (ARGB to AB64). Each loop pass reads 16
+// source bytes (4 pixels) and writes 32 bytes; both the byte duplication and
+// the channel swap are done by a single pshufb per output register.
+//   src_argb: source row, 4 bytes per pixel.
+//   dst_ab64: destination row, 8 bytes per pixel.
+//   width:    pixel count; decremented by 4 per pass, loop runs while > 0,
+//             so the last pass always moves a full 4 pixels.
+void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
+                         uint16_t* dst_ab64,
+                         int width) {
+  asm volatile(
+
+      "movdqa %3,%%xmm2 \n"  // shuffle mask for pixels 0-1
+      "movdqa %4,%%xmm3 \n"  // shuffle mask for pixels 2-3
+
+      LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"  // load 4 pixels (16 bytes)
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pshufb %%xmm2,%%xmm0 \n"  // expand + swap pixels 0-1
+      "pshufb %%xmm3,%%xmm1 \n"  // expand + swap pixels 2-3
+      "movdqu %%xmm0,(%1) \n"
+      "movdqu %%xmm1,0x10(%1) \n"
+      "lea 0x10(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x4,%2 \n"
+      "jg 1b \n"
+      : "+r"(src_argb),             // %0
+        "+r"(dst_ab64),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleARGBToAB64Lo),  // %3
+        "m"(kShuffleARGBToAB64Hi)   // %4
+      // xmm3 holds the second shuffle mask, so it must be listed as
+      // clobbered along with xmm0-xmm2.
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
+// Narrow one row of 16-bit-per-channel pixels to 8 bits per channel, keeping
+// channel order. Each loop pass reads 32 source bytes (4 pixels), keeps the
+// high byte of every 16-bit channel, and writes 16 bytes.
+//   src_ar64: source row, 8 bytes per pixel.
+//   dst_argb: destination row, 4 bytes per pixel.
+//   width:    pixel count; decremented by 4 per pass, loop runs while > 0,
+//             so the last pass always moves a full 4 pixels.
+void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // pixels 0-1 (8 words)
+ "movdqu 0x10(%0),%%xmm1 \n" // pixels 2-3 (8 words)
+ "psrlw $8,%%xmm0 \n" // keep the high byte of each word
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 words to 16 bytes
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
+
+// Narrow one row of 16-bit-per-channel pixels to 8 bits per channel and swap
+// bytes 0 and 2 of every output pixel (AB64 to ARGB). Each loop pass reads
+// 32 source bytes (4 pixels), keeps the high byte of every 16-bit channel,
+// reorders the channels with kShuffleARGBToABGR, and writes 16 bytes.
+//   src_ab64: source row, 8 bytes per pixel.
+//   dst_argb: destination row, 4 bytes per pixel.
+//   width:    pixel count; decremented by 4 per pass, loop runs while > 0,
+//             so the last pass always moves a full 4 pixels.
+void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm2 \n" LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // pixels 0-1 (8 words)
+ "movdqu 0x10(%0),%%xmm1 \n" // pixels 2-3 (8 words)
+ "psrlw $8,%%xmm0 \n" // keep the high byte of each word
+ "psrlw $8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 words to 16 bytes
+ "pshufb %%xmm2,%%xmm0 \n" // swap bytes 0 and 2 of each pixel
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleARGBToABGR) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+
+#ifdef HAS_ARGBTOAR64ROW_AVX2
+// Widen one row of 4-byte pixels to 16 bits per channel, keeping channel
+// order. Each loop pass reads 32 source bytes (8 pixels), interleaves every
+// byte with itself so byte v becomes the 16-bit word v * 0x0101, and writes
+// 64 bytes.
+//   src_argb: source row, 4 bytes per pixel.
+//   dst_ar64: destination row, 8 bytes per pixel.
+//   width:    pixel count; decremented by 8 per pass, loop runs while > 0,
+//             so the last pass always moves a full 8 pixels.
+void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
+                        uint16_t* dst_ar64,
+                        int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // load 8 pixels (32 bytes)
+      // Reorder quadwords to 0,2,1,3 so the per-lane unpacks below emit
+      // the pixels in their original order.
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"  // pixels 4-7, bytes doubled
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"  // pixels 0-3, bytes doubled
+      "vmovdqu %%ymm0,(%1) \n"
+      "vmovdqu %%ymm1,0x20(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      // Clear the upper ymm halves before returning, as the other AVX2 row
+      // functions in this file do, to avoid AVX/SSE transition stalls.
+      "vzeroupper \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_ar64),  // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_ARGBTOAB64ROW_AVX2
+// Widen one row of 4-byte ARGB pixels to 16 bits per channel while swapping
+// bytes 0 and 2 of every pixel (ARGB to AB64). Each loop pass reads 32
+// source bytes (8 pixels) and writes 64 bytes.
+//   src_argb: source row, 4 bytes per pixel.
+//   dst_ab64: destination row, 8 bytes per pixel.
+//   width:    pixel count; decremented by 8 per pass, loop runs while > 0,
+//             so the last pass always moves a full 8 pixels.
+void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
+                        uint16_t* dst_ab64,
+                        int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm2 \n"  // mask for the low 8 bytes of a lane
+      "vbroadcastf128 %4,%%ymm3 \n"  // mask for the high 8 bytes of a lane
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // load 8 pixels (32 bytes)
+      // Reorder quadwords to 0,2,1,3 so the per-lane shuffles below emit
+      // the pixels in their original order.
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+      "vpshufb %%ymm3,%%ymm0,%%ymm1 \n"  // expand + swap pixels 4-7
+      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"  // expand + swap pixels 0-3
+      "vmovdqu %%ymm0,(%1) \n"
+      "vmovdqu %%ymm1,0x20(%1) \n"
+      "lea 0x20(%0),%0 \n"
+      "lea 0x40(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      // Clear the upper ymm halves before returning, as the other AVX2 row
+      // functions in this file do, to avoid AVX/SSE transition stalls.
+      "vzeroupper \n"
+      : "+r"(src_argb),             // %0
+        "+r"(dst_ab64),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleARGBToAB64Lo),  // %3
+        "m"(kShuffleARGBToAB64Hi)   // %4
+      // ymm3 holds the second shuffle mask, so xmm3 must be listed as
+      // clobbered along with xmm0-xmm2.
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif
+
+#ifdef HAS_AR64TOARGBROW_AVX2
+// Narrow one row of 16-bit-per-channel pixels to 8 bits per channel, keeping
+// channel order. Each loop pass reads 64 source bytes (8 pixels), keeps the
+// high byte of every 16-bit channel, and writes 32 bytes.
+//   src_ar64: source row, 8 bytes per pixel.
+//   dst_argb: destination row, 4 bytes per pixel.
+//   width:    pixel count; decremented by 8 per pass, loop runs while > 0,
+//             so the last pass always moves a full 8 pixels.
+void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"      // pixels 0-3 (16 words)
+      "vmovdqu 0x20(%0),%%ymm1 \n"  // pixels 4-7 (16 words)
+      "vpsrlw $8,%%ymm0,%%ymm0 \n"  // keep the high byte of each word
+      "vpsrlw $8,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // packs per 128-bit lane
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"      // restore pixel order
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x40(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      // Clear the upper ymm halves before returning, as the other AVX2 row
+      // functions in this file do, to avoid AVX/SSE transition stalls.
+      "vzeroupper \n"
+      : "+r"(src_ar64),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif
+
+#ifdef HAS_AB64TOARGBROW_AVX2
+// Narrow one row of 16-bit-per-channel pixels to 8 bits per channel and swap
+// bytes 0 and 2 of every output pixel (AB64 to ARGB). Each loop pass reads
+// 64 source bytes (8 pixels) and writes 32 bytes.
+//   src_ab64: source row, 8 bytes per pixel.
+//   dst_argb: destination row, 4 bytes per pixel.
+//   width:    pixel count; decremented by 8 per pass, loop runs while > 0,
+//             so the last pass always moves a full 8 pixels.
+void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm2 \n"  // channel-swap mask in both lanes
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"      // pixels 0-3 (16 words)
+      "vmovdqu 0x20(%0),%%ymm1 \n"  // pixels 4-7 (16 words)
+      "vpsrlw $8,%%ymm0,%%ymm0 \n"  // keep the high byte of each word
+      "vpsrlw $8,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // packs per 128-bit lane
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n"      // restore pixel order
+      "vpshufb %%ymm2,%%ymm0,%%ymm0 \n"    // swap bytes 0 and 2 per pixel
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x40(%0),%0 \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      // Clear the upper ymm halves before returning, as the other AVX2 row
+      // functions in this file do, to avoid AVX/SSE transition stalls.
+      "vzeroupper \n"
+      : "+r"(src_ab64),          // %0
+        "+r"(dst_argb),          // %1
+        "+r"(width)              // %2
+      : "m"(kShuffleARGBToABGR)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+// clang-format off
+
+// RGBTOY: loop body shared by the SSSE3 row functions that reduce 4-byte
+// pixels to one 8-bit luma value each. It expands to asm string text and
+// must be pasted inside an asm statement that has set up:
+//   %0    source pointer, advanced by 64 bytes (16 pixels) per pass
+//   %1    destination pointer, advanced by 16 bytes per pass
+//   %2    remaining width, decremented by 16; the loop runs while > 0
+//   xmm4  per-byte multipliers; copied into the destination operand of
+//         pmaddubsw, so they are the unsigned operand of that multiply
+//   xmm5  value subtracted from every source byte before the multiply
+//         (the callers in this file load kSub128 here)
+// xmm0-xmm3 and xmm6 are used as scratch, so callers must list them as
+// clobbered.
+// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
+// round parameter is register containing value to add before shift.
+// The sum of each pixel's four byte products plus `round` is shifted right
+// by 8 and packed with unsigned saturation to form the output byte.
+#define RGBTOY(round) \
+ "1: \n" \
+ "movdqu (%0),%%xmm0 \n" \
+ "movdqu 0x10(%0),%%xmm1 \n" \
+ "movdqu 0x20(%0),%%xmm2 \n" \
+ "movdqu 0x30(%0),%%xmm3 \n" \
+ "psubb %%xmm5,%%xmm0 \n" \
+ "psubb %%xmm5,%%xmm1 \n" \
+ "psubb %%xmm5,%%xmm2 \n" \
+ "psubb %%xmm5,%%xmm3 \n" \
+ "movdqu %%xmm4,%%xmm6 \n" \
+ "pmaddubsw %%xmm0,%%xmm6 \n" \
+ "movdqu %%xmm4,%%xmm0 \n" \
+ "pmaddubsw %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm4,%%xmm1 \n" \
+ "pmaddubsw %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm4,%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "lea 0x40(%0),%0 \n" \
+ "phaddw %%xmm0,%%xmm6 \n" \
+ "phaddw %%xmm2,%%xmm1 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "paddw %%" #round ",%%xmm6 \n" \
+ "paddw %%" #round ",%%xmm1 \n" \
+ "psrlw $0x8,%%xmm6 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm6 \n" \
+ "movdqu %%xmm6,(%1) \n" \
+ "lea 0x10(%1),%1 \n" \
+ "sub $0x10,%2 \n" \
+ "jg 1b \n"
+
+// RGBTOY_AVX2: AVX2 counterpart of RGBTOY. It expands to asm string text and
+// must be pasted inside an asm statement that has set up:
+//   %0    source pointer, advanced by 128 bytes (32 pixels) per pass
+//   %1    destination pointer, advanced by 32 bytes per pass
+//   %2    remaining width, decremented by 32; the loop runs while > 0
+//   ymm4  per-byte multipliers (the unsigned operand of vpmaddubsw)
+//   ymm5  value subtracted from every source byte before the multiply
+//         (the callers in this file load kSub128 here)
+//   ymm6  dword permutation applied by vpermd to undo the per-lane
+//         reordering of vphaddw + vpackuswb (callers load kPermdARGBToY_AVX)
+//   round name of the ymm register added to the 16-bit sums before the
+//         shift right by 8
+// ymm0-ymm3 are used as scratch. The expansion already ends with
+// vzeroupper, so callers do not need to emit their own.
+#define RGBTOY_AVX2(round) \
+ "1: \n" \
+ "vmovdqu (%0),%%ymm0 \n" \
+ "vmovdqu 0x20(%0),%%ymm1 \n" \
+ "vmovdqu 0x40(%0),%%ymm2 \n" \
+ "vmovdqu 0x60(%0),%%ymm3 \n" \
+ "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
+ "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
+ "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
+ "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
+ "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
+ "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
+ "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
+ "lea 0x80(%0),%0 \n" \
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "prefetcht0 1280(%0) \n" \
+ "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+ "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
+ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
+ "vmovdqu %%ymm0,(%1) \n" \
+ "lea 0x20(%1),%1 \n" \
+ "sub $0x20,%2 \n" \
+ "jg 1b \n" \
+ "vzeroupper \n"
+
+// clang-format on
+
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
-// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
+// Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBTOYJROW_SSSE3
-#ifdef HAS_ARGBTOYROW_AVX2
+#ifdef HAS_RGBATOYJROW_SSSE3
+// Convert 16 RGBA pixels (64 bytes) to 16 YJ values per loop pass.
+// Same loop as ARGBToYRow_SSSE3 but with the kRGBAToYJ coefficients and no
+// separate kAddY16 term: the kSub128 register (xmm5) is passed to RGBTOY as
+// the value added before the shift.
+//   src_rgba: source row, 4 bytes per pixel.
+//   dst_y:    destination row, 1 byte per pixel.
+//   width:    pixel count; RGBTOY consumes 16 pixels per pass while > 0.
+void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n" // multipliers
+ "movdqa %4,%%xmm5 \n" // source bias, reused as the rounding term
+
+ LABELALIGN RGBTOY(xmm5)
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_SSSE3
+
+#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
+#endif
+
+#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "vbroadcastf128 %5,%%ymm7 \n"
+ "vmovdqu %6,%%ymm6 \n"
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ABGRTOYROW_AVX2
+// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
+// Loads the register set RGBTOY_AVX2 expects and runs its loop with ymm7
+// (kAddY16) as the value added before the shift.
+//   src_abgr: source row, 4 bytes per pixel.
+//   dst_y:    destination row, 1 byte per pixel.
+//   width:    pixel count; RGBTOY_AVX2 consumes 32 pixels per pass while > 0.
+void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n" // multipliers
+ "vbroadcastf128 %4,%%ymm5 \n" // source bias
+ "vbroadcastf128 %5,%%ymm7 \n" // value added before the shift
+ "vmovdqu %6,%%ymm6 \n" // vpermd control
+
+ LABELALIGN RGBTOY_AVX2(ymm7)
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kSub128), // %4
+ "m"(kAddY16), // %5
+ "m"(kPermdARGBToY_AVX) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOYROW_AVX2
+
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+ LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
+ "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
+#ifdef HAS_RGBATOYJROW_AVX2
+// Convert 32 RGBA pixels (128 bytes) to 32 Y values.
+void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+
+ LABELALIGN RGBTOY_AVX2(
+ ymm5) "vzeroupper \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToYJ), // %3
+ "m"(kSub128), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif // HAS_RGBATOYJROW_AVX2
+
#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@@ -1266,7 +1576,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
-void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1275,46 +1585,46 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@@ -1328,8 +1638,71 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVROW_AVX2
+#ifdef HAS_ABGRTOUVROW_AVX2
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kABGRToV), // %6
+ "m"(kABGRToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif // HAS_ABGRTOUVROW_AVX2
+
#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1338,52 +1711,52 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x40(%0),%%ymm2 \n"
- "vmovdqu 0x60(%0),%%ymm3 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
- "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
- "lea 0x80(%0),%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
-
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
+ "m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@@ -1393,67 +1766,67 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
+ "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
@@ -1464,47 +1837,47 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1518,91 +1891,74 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
-void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@@ -1615,125 +1971,91 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %5,%%xmm7 \n"
+
+ LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128), // %4
+ "m"(kAddY16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
-void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@@ -1744,59 +2066,59 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
-void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
-
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@@ -1811,21 +2133,21 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
// Read 8 UV from 444
#define READYUV444 \
- "movq (%[u_buf]),%%xmm0 \n" \
+ "movq (%[u_buf]),%%xmm3 \n" \
"movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
"movq (%[y_buf]),%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
- "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf]),%%xmm3 \n" \
"movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
"movq (%[y_buf]),%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n"
@@ -1835,24 +2157,99 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210 \
- "movq (%[u_buf]),%%xmm0 \n" \
+ "movq (%[u_buf]),%%xmm3 \n" \
"movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklwd %%xmm1,%%xmm0 \n" \
- "psraw $0x2,%%xmm0 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+#define READYUVA210 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $2,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444 10 bit
+#define READYUV410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 444 10 bit. With 8 Alpha.
+#define READYUVA410 \
+ "movdqu (%[u_buf]),%%xmm3 \n" \
+ "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "psraw $2,%%xmm3 \n" \
+ "psraw $2,%%xmm2 \n" \
+ "movdqa %%xmm3,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm3 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
"movdqu (%[y_buf]),%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "movdqu (%[a_buf]),%%xmm5 \n" \
+ "psraw $2,%%xmm5 \n" \
+ "packuswb %%xmm5,%%xmm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
+// Read 4 UV from 422 12 bit, upsample to 8 UV
+#define READYUV212 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm3 \n" \
+ "psraw $0x4,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $0x4,%%xmm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf]),%%xmm3 \n" \
"movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ "movq (%[u_buf]),%%xmm3 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm3 \n" \
"movq (%[y_buf]),%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n" \
@@ -1861,18 +2258,18 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
- "movq (%[uv_buf]),%%xmm0 \n" \
+ "movq (%[uv_buf]),%%xmm3 \n" \
"lea 0x8(%[uv_buf]),%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
"movq (%[y_buf]),%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 VU from NV21, upsample to 8 UV
#define READNV21 \
- "movq (%[vu_buf]),%%xmm0 \n" \
+ "movq (%[vu_buf]),%%xmm3 \n" \
"lea 0x8(%[vu_buf]),%[vu_buf] \n" \
- "pshufb %[kShuffleNV21], %%xmm0 \n" \
+ "pshufb %[kShuffleNV21], %%xmm3 \n" \
"movq (%[y_buf]),%%xmm4 \n" \
"punpcklbw %%xmm4,%%xmm4 \n" \
"lea 0x8(%[y_buf]),%[y_buf] \n"
@@ -1881,68 +2278,92 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
#define READYUY2 \
"movdqu (%[yuy2_buf]),%%xmm4 \n" \
"pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
- "movdqu (%[yuy2_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
+ "movdqu (%[yuy2_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
"lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
"movdqu (%[uyvy_buf]),%%xmm4 \n" \
"pshufb %[kShuffleUYVYY], %%xmm4 \n" \
- "movdqu (%[uyvy_buf]),%%xmm0 \n" \
- "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
+ "movdqu (%[uyvy_buf]),%%xmm3 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
"lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+// Read 4 UV from P210, upsample to 8 UV
+#define READP210 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "packuswb %%xmm3,%%xmm3 \n" \
+ "punpcklwd %%xmm3,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from P410
+#define READP410 \
+ "movdqu (%[uv_buf]),%%xmm3 \n" \
+ "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "psrlw $0x8,%%xmm3 \n" \
+ "psrlw $0x8,%%xmm1 \n" \
+ "packuswb %%xmm1,%%xmm3 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
+ "pcmpeqb %%xmm13,%%xmm13 \n" \
"movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "pxor %%xmm12,%%xmm12 \n" \
"movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+ "psllw $7,%%xmm13 \n" \
"movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+ "pshufb %%xmm12,%%xmm13 \n" \
"movdqa 96(%[yuvconstants]),%%xmm11 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
- "movdqa 192(%[yuvconstants]),%%xmm14 \n"
+ "movdqa 128(%[yuvconstants]),%%xmm12 \n"
+
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa %%xmm11,%%xmm0 \n" \
- "pmaddubsw %%xmm8,%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa %%xmm12,%%xmm1 \n" \
- "pmaddubsw %%xmm9,%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa %%xmm13,%%xmm2 \n" \
- "pmaddubsw %%xmm10,%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw %%xmm14,%%xmm4 \n" \
+ "psubb %%xmm13,%%xmm3 \n" \
+ "pmulhuw %%xmm11,%%xmm4 \n" \
+ "movdqa %%xmm8,%%xmm0 \n" \
+ "movdqa %%xmm9,%%xmm1 \n" \
+ "movdqa %%xmm10,%%xmm2 \n" \
+ "paddw %%xmm12,%%xmm4 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
-#define YUVTORGB_REGS \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
+#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
- "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
- "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
- "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
+ "pcmpeqb %%xmm0,%%xmm0 \n" \
+ "pxor %%xmm1,%%xmm1 \n" \
+ "psllw $7,%%xmm0 \n" \
+ "pshufb %%xmm1,%%xmm0 \n" \
+ "psubb %%xmm0,%%xmm3 \n" \
+ "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
+ "movdqa (%[yuvconstants]),%%xmm0 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
+ "pmaddubsw %%xmm3,%%xmm0 \n" \
+ "pmaddubsw %%xmm3,%%xmm1 \n" \
+ "pmaddubsw %%xmm3,%%xmm2 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
+ "paddw %%xmm3,%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n"
+ "paddsw %%xmm4,%%xmm2 \n" \
+ "psubsw %%xmm1,%%xmm4 \n" \
+ "movdqa %%xmm4,%%xmm1 \n"
+
#define YUVTORGB_REGS
#endif
@@ -2012,16 +2433,16 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2033,6 +2454,44 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
);
}
+#ifdef HAS_I444ALPHATOARGBROW_SSSE3
+void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n" // READYUVA444 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) // "+m" forces width into memory; presumably to save a register - confirm
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_SSSE3
+
void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2041,27 +2500,27 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- "sub %[u_buf],%[v_buf] \n"
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2087,16 +2546,16 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2116,21 +2575,21 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2151,16 +2610,46 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 12 bit YUV to ARGB
+void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2181,21 +2670,21 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $14,%%xmm5 \n"
- "psllw $4,%%xmm5 \n" // 2 alpha bits
- "pxor %%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
- "psrlw $6,%%xmm7 \n" // 1023 for max
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210
YUVTORGB16(yuvconstants)
STOREAR30
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2207,6 +2696,176 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
);
}
+// 12 bit YUV (4:2:2, read via READYUV212) to AR30, 8 pixels per loop pass
+void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV212
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // READYUV212 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// 10 bit YUV (4:4:4, read via READYUV410) to ARGB, 8 pixels per loop pass
+void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = 0xff bytes; presumably the opaque alpha for STOREARGB (not shown here)
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n" // READYUV410 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+#ifdef HAS_I210ALPHATOARGBROW_SSSE3
+// 10 bit YUVA (4:2:2 plus alpha plane, read via READYUVA210) to ARGB
+void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+
+ LABELALIGN "1: \n" READYUVA210
+ YUVTORGB(yuvconstants) STOREARGB
+ "subl $0x8,%[width] \n" // READYUVA210 consumes 8 Y and 8 alpha per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) // "+m" forces width into memory; presumably to save a register - confirm
+ [width] "+m"(width) // %[width]
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+}
+#endif
+
+#ifdef HAS_I410ALPHATOARGBROW_SSSE3
+// 10 bit YUVA (4:4:4 plus alpha plane, read via READYUVA410) to ARGB
+void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile(
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUVA410
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "subl $0x8,%[width] \n" // READYUVA410 consumes 8 Y and 8 alpha per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__) // "+m" forces width into memory; presumably to save a register - confirm
+ [width] "+m"(width) // %[width]
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+ // clang-format on
+}
+#endif
+
+// 10 bit YUV (4:4:4, read via READYUV410) to AR30, 8 pixels per loop pass
+void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n" // v_buf = v - u; V is then read at u_buf + v_buf
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READYUV410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // READYUV410 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
@@ -2218,15 +2877,15 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422
YUVTORGB(yuvconstants)
STOREARGB
- "subl $0x8,%[width] \n"
- "jg 1b \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2253,15 +2912,15 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2281,15 +2940,15 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2309,15 +2968,15 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2337,15 +2996,15 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY
YUVTORGB(yuvconstants)
STOREARGB
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2358,6 +3017,112 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
// clang-format on
}
+void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP(
+ yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = 0xff bytes; presumably the opaque alpha for STOREARGB (not shown here)
+
+ LABELALIGN "1: \n" READP210
+ YUVTORGB(yuvconstants) STOREARGB
+ "sub $0x8,%[width] \n" // READP210 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [uv_buf] "+r"(uv_buf), // %[uv_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+rm"(width) // %[width]
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+}
+
+void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP(
+ yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = 0xff bytes; presumably the opaque alpha for STOREARGB (not shown here)
+
+ LABELALIGN "1: \n" READP410
+ YUVTORGB(yuvconstants) STOREARGB
+ "sub $0x8,%[width] \n" // READP410 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [uv_buf] "+r"(uv_buf), // %[uv_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+rm"(width) // %[width]
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+ "xmm5");
+}
+
+void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // READP210 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n" // each word = 3
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+ "pxor %%xmm6,%%xmm6 \n" // 0 for min
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $6,%%xmm7 \n" // 1023 for max
+
+ LABELALIGN
+ "1: \n"
+ READP410
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n" // READP410 consumes 8 Y per pass
+ "jg 1b \n" // loop while width > 0
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@@ -2366,16 +3131,16 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422
YUVTORGB(yuvconstants)
STORERGBA
- "sub $0x8,%[width] \n"
- "jg 1b \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2391,12 +3156,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Read 16 UV from 444
#define READYUV444_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
"vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
@@ -2404,42 +3169,139 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
"vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
-// Read 8 UV from 210 10 bit, upsample to 16 UV
+#define READYUV422_AVX512BW \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
+ "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
+ "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
+ "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
+ "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
+ "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
+ "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
+ "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
#define READYUV210_AVX2 \
- "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
"vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x10(%[u_buf]),%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
- "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
+// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
+#define READYUVA210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 410
+#define READYUV410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 212 12 bit, upsample to 16 UV
+#define READYUV212_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from 410. With 16 Alpha.
+#define READYUVA410_AVX2 \
+ "vmovdqu (%[u_buf]),%%ymm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
+ "lea 0x20(%[u_buf]),%[u_buf] \n" \
+ "vpsraw $2,%%ymm3,%%ymm3 \n" \
+ "vpsraw $2,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
+ "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $6,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%ymm5 \n" \
+ "vpsraw $2,%%ymm5,%%ymm5 \n" \
+ "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
+ "lea 0x20(%[a_buf]),%[a_buf] \n"
+
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm3 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
+
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
- "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq (%[u_buf]),%%xmm3 \n" \
"vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x8(%[u_buf]),%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
@@ -2450,10 +3312,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
- "vmovdqu (%[uv_buf]),%%xmm0 \n" \
+ "vmovdqu (%[uv_buf]),%%xmm3 \n" \
"lea 0x10(%[uv_buf]),%[uv_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
@@ -2461,73 +3323,130 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Read 8 VU from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
- "vmovdqu (%[vu_buf]),%%xmm0 \n" \
+ "vmovdqu (%[vu_buf]),%%xmm3 \n" \
"lea 0x10(%[vu_buf]),%[vu_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \
"vmovdqu (%[y_buf]),%%xmm4 \n" \
"vpermq $0xd8,%%ymm4,%%ymm4 \n" \
"vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
"lea 0x10(%[y_buf]),%[y_buf] \n"
+// Read 8 UV from P210, upsample to 16 UV
+#define READP210_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
+// Read 16 UV from P410
+#define READP410_AVX2 \
+ "vmovdqu (%[uv_buf]),%%ymm3 \n" \
+ "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \
+ "lea 0x40(%[uv_buf]),%[uv_buf] \n" \
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
+
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
"vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
"vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+ "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
"lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
"vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
"vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
- "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
- "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
+ "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
"lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
+// TODO(fbarchard): Remove broadcastb
#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
- "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
- "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
- "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
- "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
- "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
- "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+ "vpbroadcastb %%xmm13,%%ymm13 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
+ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
+
+#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
+ "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" /* all ones */ \
+ "vmovdqa (%[yuvconstants]),%%xmm8 \n" /* VEX load, as in the AVX2 setup */ \
+ "vpbroadcastq %%xmm8, %%zmm8 \n" /* low 8 bytes to every qword */ \
+ "vpsllw $7,%%xmm13,%%xmm13 \n" /* words = 0xff80 */ \
+ "vpbroadcastb %%xmm13,%%zmm13 \n" /* zmm13 = 0x80 in every byte */ \
+ "vmovq 32(%[yuvconstants]),%%xmm9 \n" \
+ "vpbroadcastq %%xmm9,%%zmm9 \n" \
+ "vmovq 64(%[yuvconstants]),%%xmm10 \n" \
+ "vpbroadcastq %%xmm10,%%zmm10 \n" \
+ "vmovq 96(%[yuvconstants]),%%xmm11 \n" \
+ "vpbroadcastq %%xmm11,%%zmm11 \n" \
+ "vmovq 128(%[yuvconstants]),%%xmm12 \n" \
+ "vpbroadcastq %%xmm12,%%zmm12 \n" \
+ "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" /* qword permute tables */ \
+ "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
+ "vmovdqu8 (%[unperm]),%%zmm18 \n"
#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
- "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
- "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
- "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
- "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
- "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
- "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
+ "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
+ "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
+ "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
-#define YUVTORGB_REGS_AVX2 \
- "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+#define YUVTORGB16_AVX512BW(yuvconstants) \
+ "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" /* zmm3 -= 0x80 per byte */ \
+ "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" /* zmm4 = (zmm4 * zmm11) >> 16 */ \
+ "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" /* zmm8 (u8) x zmm3 (s8), pairs summed */ \
+ "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
+ "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
+ "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" /* zmm4 += zmm12 (offset 128 constants) */ \
+ "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" /* zmm0 = zmm0 + zmm4, saturated */ \
+ "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" /* zmm1 = zmm4 - zmm1, saturated */ \
+ "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n" /* zmm2 = zmm2 + zmm4, saturated */
+
+#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+#define YUVTORGB_REGS_AVX512BW \
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
#else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants) \
- "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
- "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
- "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
- "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
- "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
- "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
- "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
- "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
+ "vpsllw $7,%%xmm0,%%xmm0 \n" \
+ "vpbroadcastb %%xmm0,%%ymm0 \n" \
+ "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
+ "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
+
#define YUVTORGB_REGS_AVX2
#endif
@@ -2540,6 +3459,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+#define YUVTORGB_AVX512BW(yuvconstants) \
+ YUVTORGB16_AVX512BW(yuvconstants) /* 16-bit results in zmm0, zmm1, zmm2 */ \
+ "vpsraw $0x6,%%zmm0,%%zmm0 \n" /* arithmetic shift right by 6 */ \
+ "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
+ "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
+ "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" /* saturate words to unsigned bytes */ \
+ "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
+ "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
+
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
@@ -2550,7 +3478,19 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
"vmovdqu %%ymm1,(%[dst_argb]) \n" \
"vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
- "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+ "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+
+// Store 32 ARGB values (128 bytes) built from zmm0, zmm1, zmm2 and zmm5.
+#define STOREARGB_AVX512BW \
+ "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" /* interleave zmm0/zmm1 bytes */ \
+ "vpermq %%zmm0,%%zmm18,%%zmm0 \n" /* reorder qwords by zmm18 indices */ \
+ "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" /* interleave zmm2/zmm5 bytes */ \
+ "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
+ "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" /* 4-byte pixels, low halves */ \
+ "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" /* 4-byte pixels, high halves */ \
+ "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
+ "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
+ "lea 0x80(%[dst_argb]), %[dst_argb] \n" /* advance 128 bytes */
// Store 16 AR30 values.
#define STOREAR30_AVX2 \
@@ -2590,17 +3530,17 @@ void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV444_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2624,18 +3564,18 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2648,6 +3588,50 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
}
#endif // HAS_I422TOARGBROW_AVX2
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};  // %[quadsplitperm], loaded into zmm16
+static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};  // %[dquadsplitperm], loaded into zmm17
+static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};  // %[unperm], loaded into zmm18 for vpermq
+
+// 32 pixels per loop iteration.
+// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
+// bytes). The loop runs until width drops to 0 or below.
+void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX512BW(yuvconstants)  // constants -> zmm8-zmm13, permutes -> zmm16-zmm18
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+ "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"  // xmm5 = all ones
+ "vpbroadcastq %%xmm5,%%zmm5 \n"  // zmm5 = 0xff bytes, interleaved with zmm2 by STOREARGB_AVX512BW
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX512BW
+ YUVTORGB_AVX512BW(yuvconstants)
+ STOREARGB_AVX512BW
+ "sub $0x20,%[width] \n"  // 32 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
+ [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm]
+ [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm]
+ [unperm]"r"(kUnpermuteAVX512) // %[unperm]
+ : "memory", "cc", YUVTORGB_REGS_AVX512BW
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX512BW
+
#if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
@@ -2659,23 +3643,23 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2699,18 +3683,18 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2723,6 +3707,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
}
#endif // HAS_I210TOARGBROW_AVX2
+#if defined(HAS_I212TOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (presumably alpha for STOREARGB_AVX2)
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I212TOARGBROW_AVX2
+
#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
@@ -2734,23 +3753,23 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
- "vpsrlw $14,%%ymm5,%%ymm5 \n"
- "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
- "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
- "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
- "vpsrlw $6,%%ymm7,%%ymm7 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV210_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
- "vzeroupper \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2758,11 +3777,239 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I210TOAR30ROW_AVX2
+#if defined(HAS_I212TOAR30ROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"  // words = 0x0003
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits (words = 0x0030)
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"  // words = 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READYUV212_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I212TOAR30ROW_AVX2
+
+#if defined(HAS_I410TOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (presumably alpha for STOREARGB_AVX2)
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I410TOARGBROW_AVX2
+
+#if defined(HAS_I210ALPHATOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP_AVX2(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+
+ LABELALIGN "1: \n" READYUVA210_AVX2
+ YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+ "subl $0x10,%[width] \n"  // explicit size suffix: width may be a memory operand
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width] "+m"(width) // %[width], memory only (presumably i386 register pressure)
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+ "xmm4", "xmm5");
+}
+#endif // HAS_I210ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410ALPHATOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ const uint16_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP_AVX2(
+ yuvconstants) "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+
+ LABELALIGN "1: \n" READYUVA410_AVX2
+ YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
+ "subl $0x10,%[width] \n"  // explicit size suffix: width may be a memory operand
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+
+ : [y_buf] "+r"(y_buf), // %[y_buf]
+ [u_buf] "+r"(u_buf), // %[u_buf]
+ [v_buf] "+r"(v_buf), // %[v_buf]
+ [a_buf] "+r"(a_buf), // %[a_buf]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width] "+m"(width) // %[width], memory only (presumably i386 register pressure)
+#else
+ [width] "+rm"(width) // %[width]
+#endif
+ : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
+ "xmm4", "xmm5");
+}
+#endif // HAS_I410ALPHATOARGBROW_AVX2
+
+#if defined(HAS_I410TOAR30ROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"  // words = 0x0003
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits (words = 0x0030)
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"  // words = 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READYUV410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I410TOAR30ROW_AVX2
+
+#if defined(HAS_I444ALPHATOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV values with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+
+ LABELALIGN
+ "1: \n"
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "subl $0x10,%[width] \n"  // explicit size suffix: width may be a memory operand
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [a_buf]"+r"(a_buf), // %[a_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+#if defined(__i386__)
+ [width]"+m"(width) // %[width], memory only (presumably i386 register pressure)
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
@@ -2776,16 +4023,16 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
+ "sub %[u_buf],%[v_buf] \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUVA422_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "subl $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "subl $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
@@ -2815,11 +4062,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "sub %[u_buf],%[v_buf] \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(yuvconstants)
@@ -2859,16 +4106,16 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV12_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2892,16 +4139,16 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READNV21_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[vu_buf]"+r"(vu_buf), // %[vu_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
@@ -2925,16 +4172,16 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READYUY2_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2958,16 +4205,16 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
// clang-format off
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
- "1: \n"
+ "1: \n"
READUYVY_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
- "sub $0x10,%[width] \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
@@ -2981,18 +4228,156 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
}
#endif // HAS_UYVYTOARGBROW_AVX2
+#if defined(HAS_P210TOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (presumably alpha for STOREARGB_AVX2)
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P210TOARGBROW_AVX2
+
+#if defined(HAS_P410TOARGBROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // clang-format off
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (presumably alpha for STOREARGB_AVX2)
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ // clang-format on
+}
+#endif // HAS_P410TOARGBROW_AVX2
+
+#if defined(HAS_P210TOAR30ROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"  // words = 0x0003
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits (words = 0x0030)
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"  // words = 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READP210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P210TOAR30ROW_AVX2
+
+#if defined(HAS_P410TOAR30ROW_AVX2)
+// 16 pixels per loop iteration; the loop runs until width drops to 0 or below.
+// 16 UV values with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* uv_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)  // x86_64: preload ymm8-ymm13; otherwise empty
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"  // words = 0x0003
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits (words = 0x0030)
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"  // words = 0x03ff
+
+ LABELALIGN
+ "1: \n"
+ READP410_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"  // 16 pixels written
+ "jg 1b \n"  // loop while width > 0
+
+ "vzeroupper \n"  // zero the upper vector register halves
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_P410TOAR30ROW_AVX2
+
#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
+ "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
+ "movdqa 128(%3),%%xmm3 \n" // ygb = -1160 = 1.164 * -16 (added with paddsw)
+ "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
@@ -3001,8 +4386,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"lea 0x8(%0),%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "psraw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
@@ -3018,28 +4403,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"sub $0x8,%2 \n"
"jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 *
- // 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+ "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
+ "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
@@ -3049,8 +4432,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
@@ -3060,15 +4443,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "lea 0x40(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ : "r"(yuvconstants) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
@@ -3081,16 +4464,16 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %3,%%xmm5 \n"
+ "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu -0x10(%0,%2,1),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3108,13 +4491,13 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3125,37 +4508,136 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table that reverses the order of 8 two-byte UV pairs (U stays first).
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy used as an index register
+ asm volatile(
+
+ "movdqa %3,%%xmm5 \n"  // xmm5 = kShuffleMirrorUV
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,2),%%xmm0 \n"  // load 16 bytes ending at src_uv + 2 * temp_width
+ "pshufb %%xmm5,%%xmm0 \n"  // reverse the 8 pairs
+ "movdqu %%xmm0,(%1) \n"  // store to dst_uv
+ "lea 0x10(%1),%1 \n"  // dst_uv += 16
+ "sub $0x8,%2 \n"  // 8 pairs done
+ "jg 1b \n"  // loop while temp_width > 0
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ intptr_t temp_width = (intptr_t)(width);  // pointer-sized copy used as an index register
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n"  // kShuffleMirrorUV in both 128-bit lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"  // load 32 bytes ending at src_uv + 2 * temp_width
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // reverse the 8 pairs inside each lane
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"  // swap the two 128-bit lanes
+ "vmovdqu %%ymm0,(%1) \n"  // store to dst_uv
+ "lea 0x20(%1),%1 \n"  // dst_uv += 32
+ "sub $0x10,%2 \n"  // 16 pairs done
+ "jg 1b \n"  // loop while temp_width > 0
+ "vzeroupper \n"  // zero the upper vector register halves
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorUV) // %3
+ : "memory", "cc", "xmm0", "xmm5");
+}
+#endif // HAS_MIRRORUVROW_AVX2
+
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "movdqa %4,%%xmm1 \n"
- "lea -0x10(%0,%3,2),%0 \n"
- "sub %1,%2 \n"
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
+
+#ifdef HAS_RGB24MIRRORROW_SSSE3
+
+// Shuffle first 5 pixels to last 5 mirrored. first byte zero
+static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u,
+ 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u};
+
+// Shuffle last 5 pixels to first 5 mirrored. last byte zero
+static const uvec8 kShuffleMirrorRGB1 = {
+ 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u};
+
+// Mirror 16 RGB24 pixels (48 bytes) per loop, shuffling 5 pixels (15 bytes) at a time
+void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ src_rgb24 += width * 3 - 48;  // start at the last 48-byte (16 pixel) group of the row
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"  // xmm4 = kShuffleMirrorRGB0
+ "movdqa %4,%%xmm5 \n"  // xmm5 = kShuffleMirrorRGB1
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // first 5
+ "movdqu 15(%0),%%xmm1 \n" // next 5
+ "movdqu 30(%0),%%xmm2 \n" // next 5
+ "movdqu 32(%0),%%xmm3 \n" // last 1 special
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm3 \n"
+ "lea -0x30(%0),%0 \n"  // source walks backwards by 48 bytes
+ "movdqu %%xmm0,32(%1) \n" // last 5
+ "movdqu %%xmm1,17(%1) \n" // next 5
+ "movdqu %%xmm2,2(%1) \n" // next 5
+ "movlpd %%xmm3,0(%1) \n" // first 1
+ "lea 0x30(%1),%1 \n"  // destination walks forwards by 48 bytes
+ "sub $0x10,%2 \n"  // 16 pixels done; each later store above overwrites the zero filler byte of the previous one, so store order matters
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirrorRGB0), // %3
+ "m"(kShuffleMirrorRGB1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_RGB24MIRRORROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
@@ -3163,17 +4645,17 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "lea -0x10(%0,%2,4),%0 \n"
+ "lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea -0x10(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -3189,15 +4671,15 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
- "vmovdqu %3,%%ymm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3213,28 +4695,28 @@ void SplitUVRow_AVX2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@@ -3251,28 +4733,28 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -3282,6 +4764,63 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
}
#endif // HAS_SPLITUVROW_SSE2
+#ifdef HAS_DETILEROW_SSE2
+void DetileRow_SSE2(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {  // Copies 16 bytes per loop to contiguous dst, stepping src by src_tile_stride between copies.
+ asm volatile(
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"  // load 16 source bytes
+ "sub $0x10,%2 \n"  // width -= 16; sets the flags tested by jg below
+ "lea (%0,%3),%0 \n"  // src += src_tile_stride (lea leaves the flags untouched)
+ "movdqu %%xmm0,(%1) \n"  // store to dst (does not affect flags)
+ "lea 0x10(%1),%1 \n"  // dst += 16
+ "jg 1b \n"  // loop while width > 0, using the flags from the sub above
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "xmm0");
+}
+#endif // HAS_DETILEROW_SSE2
+
+#ifdef HAS_DETILESPLITUVROW_SSSE3
+// TODO(greenjustin): Look into generating these constants instead of loading
+// them since this can cause branch mispredicts for fPIC code on 32-bit
+// machines.
+static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,  // pshufb control: even bytes to the low 8 bytes,
+ 1, 3, 5, 7, 9, 11, 13, 15};  // odd bytes to the high 8 bytes
+
+// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
+// slow on older SSE2 processors.
+void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {  // Per loop: reads 16 interleaved bytes, writes 8 even bytes to dst_u and 8 odd bytes to dst_v; width is reduced by 16 each time.
+ asm volatile(
+ "movdqu %4,%%xmm1 \n"  // xmm1 = kDeinterlaceUV
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"  // load 16 interleaved bytes
+ "lea (%0, %5),%0 \n"  // src_uv += src_tile_stride
+ "pshufb %%xmm1,%%xmm0 \n"  // even bytes -> low half, odd bytes -> high half
+ "movq %%xmm0,(%1) \n"  // low 8 bytes to dst_u
+ "lea 0x8(%1),%1 \n"
+ "movhps %%xmm0,(%2) \n"  // high 8 bytes to dst_v
+ "lea 0x8(%2),%2 \n"
+ "sub $0x10,%3 \n"  // 16 source bytes consumed
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "m"(kDeinterlaceUV), // %4
+ "r"(src_tile_stride) // %5
+ : "cc", "memory", "xmm0", "xmm1");
+}
+#endif // HAS_DETILESPLITUVROW_SSSE3
+
#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
@@ -3289,22 +4828,22 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
"vextractf128 $0x0,%%ymm0,0x10(%2) \n"
"vextractf128 $0x1,%%ymm2,0x20(%2) \n"
"vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@@ -3322,21 +4861,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -3346,53 +4885,94 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
-// Use scale to convert lsb formats to msb, depending how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
int width) {
+ depth = 16 - depth;
// clang-format off
asm volatile (
- "vmovd %4,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %4,%%xmm3 \n"
+ "sub %0,%1 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu (%0,%1,1),%%ymm1 \n"
- "add $0x20,%0 \n"
-
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
- "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2,(%2) \n"
- "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
- "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
- "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
- "add $0x40,%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
- : "r"(scale) // %4
+ : "r"(depth) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
// clang-format on
}
#endif // HAS_MERGEUVROW_16_AVX2
+#ifdef HAS_SPLITUVROW_16_AVX2
+const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,  // pshufb control: even 16-bit words to the low 8 bytes,
+ 2, 3, 6, 7, 10, 11, 14, 15};  // odd 16-bit words to the high 8 bytes
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {  // Splits interleaved 16-bit samples into dst_u (even words) and dst_v (odd words), right-shifting each by 16 - depth.
+ depth = 16 - depth;  // from here on depth holds the right-shift count
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"  // xmm3 = shift count
+ "vbroadcastf128 %5,%%ymm4 \n"  // kSplitUVShuffle16 replicated into both 128-bit lanes
+ "sub %1,%2 \n"  // %2 = dst_v - dst_u, so only %1 has to be advanced
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+
+ "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"  // shift every 16-bit sample right by the count in xmm3
+ "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"  // per lane: even words low, odd words high
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // gather both lanes' even words into the low 128 bits, odd words into the high 128 bits
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vextractf128 $0x0,%%ymm0,(%1) \n"  // even words -> dst_u
+ "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
+ "vextractf128 $0x1,%%ymm0,(%1,%2) \n"  // odd words -> dst_v (addressed as dst_u + offset)
+ "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(depth), // %4
+ "m"(kSplitUVShuffle16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_SPLITUVROW_16_AVX2
+
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
@@ -3405,24 +4985,24 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm3 \n"
- "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "sub %0,%1 \n"
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
- // 16 pixels per loop.
+ // 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%0,%1) \n"
- "vmovdqu %%ymm1,0x20(%0,%1) \n"
- "add $0x40,%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3432,6 +5012,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
}
#endif // HAS_MULTIPLYROW_16_AVX2
+// Use scale to convert msb formats to lsb, depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// 65536 = 16 bits (NOTE: only the low 16 bits of scale reach vpmulhuw below, so 65536 acts as 0)
+#ifdef HAS_DIVIDEROW_16_AVX2
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {  // dst_y[i] = high 16 bits of src_y[i] * (low 16 bits of scale), 32 samples per loop.
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"  // xmm3 low dword = scale
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"  // duplicate the low 16-bit word of scale into a word pair
+ "vbroadcastss %%xmm3,%%ymm3 \n"  // broadcast that pair: every 16-bit lane = low word of scale
+ "sub %0,%1 \n"  // %1 = dst_y - src_y, so only %0 has to be advanced
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"  // unsigned 16x16 multiply, keep the high 16 bits
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"  // store to dst_y (addressed as src_y + offset)
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width), // %2
+ "+r"(scale) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_DIVIDEROW_16_AVX2
+
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
@@ -3443,23 +5063,23 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "add $0x20,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "add $0x10,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3475,25 +5095,25 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "add $0x40,%0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3514,25 +5134,25 @@ void Convert8To16Row_SSE2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
// 16 pixels per loop.
LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "add $0x10,%0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "add $0x20,%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3548,26 +5168,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
int width) {
// clang-format off
asm volatile (
- "vmovd %3,%%xmm2 \n"
- "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
// 32 pixels per loop.
LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "add $0x40,%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -3578,37 +5198,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
#endif // HAS_CONVERT8TO16ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
-
// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
- 2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u};
-
-static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
- 3u, 6u, 9u, 12u, 15u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 2u,
- 5u, 8u, 11u, 14u};
-
-static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
- 4u, 7u, 10u, 13u, 128u, 128u,
- 128u, 128u, 128u, 128u};
-static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 0u, 3u,
- 6u, 9u, 12u, 15u};
+static const uvec8 kSplitRGBShuffle[9] = {  // pshufb controls read by SplitRGBRow_SSSE3 at byte offsets 0, 16, ..., 128; 128u zeroes the output byte
+ {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},  // [0] dst_r bytes 0-5 from source bytes 0..15
+ {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
+ 128u, 128u},  // [1] dst_r bytes 6-10 from source bytes 16..31
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
+ 7u, 10u, 13u},  // [2] dst_r bytes 11-15 from source bytes 32..47
+ {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},  // [3] dst_g bytes 0-4 from source bytes 0..15
+ {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
+ 128u, 128u},  // [4] dst_g bytes 5-10 from source bytes 16..31
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
+ 8u, 11u, 14u},  // [5] dst_g bytes 11-15 from source bytes 32..47
+ {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u},  // [6] dst_b bytes 0-4 from source bytes 0..15
+ {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
+ 128u, 128u},  // [7] dst_b bytes 5-9 from source bytes 16..31
+ {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
+ 12u, 15u}};  // [8] dst_b bytes 10-15 from source bytes 32..47
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -3619,91 +5228,72 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "lea 0x30(%0),%0 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 0(%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 48(%5),%%xmm0 \n"
+ "pshufb 64(%5),%%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
- : "m"(kShuffleMaskRGBToR0), // %5
- "m"(kShuffleMaskRGBToR1), // %6
- "m"(kShuffleMaskRGBToR2), // %7
- "m"(kShuffleMaskRGBToG0), // %8
- "m"(kShuffleMaskRGBToG1), // %9
- "m"(kShuffleMaskRGBToG2), // %10
- "m"(kShuffleMaskRGBToB0), // %11
- "m"(kShuffleMaskRGBToB1), // %12
- "m"(kShuffleMaskRGBToB2) // %13
+ : "r"(&kSplitRGBShuffle[0]) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_SPLITRGBROW_SSSE3
#ifdef HAS_MERGERGBROW_SSSE3
-
-// Shuffle table for converting RGB to Planar.
-static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
- 2u, 128u, 128u, 3u, 128u, 128u,
- 4u, 128u, 128u, 5u};
-static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
- 128u, 2u, 128u, 128u, 3u, 128u,
- 128u, 4u, 128u, 128u};
-static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
- 128u, 128u, 2u, 128u, 128u, 3u,
- 128u, 128u, 4u, 128u};
-
-static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
- 7u, 128u, 128u, 8u, 128u, 128u,
- 9u, 128u, 128u, 10u};
-static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
- 128u, 7u, 128u, 128u, 8u, 128u,
- 128u, 9u, 128u, 128u};
-static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
- 128u, 128u, 8u, 128u, 128u, 9u,
- 128u, 128u, 10u, 128u};
-
-static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
- 12u, 128u, 128u, 13u, 128u, 128u,
- 14u, 128u, 128u, 15u};
-static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
- 128u, 13u, 128u, 128u, 14u, 128u,
- 128u, 15u, 128u, 128u};
-static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
- 128u, 128u, 13u, 128u, 128u, 14u,
- 128u, 128u, 15u, 128u};
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kMergeRGBShuffle[9] = {  // pshufb controls read by MergeRGBRow_SSSE3 at byte offsets 0, 16, ..., 128; 128u zeroes the output byte
+ {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
+ 128u, 5u},  // [0] applied to src_r for output bytes 0..15
+ {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
+ 128u, 128u},  // [1] applied to src_g for output bytes 0..15
+ {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
+ 4u, 128u},  // [2] applied to src_b for output bytes 0..15
+ {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
+ 10u, 128u},  // [3] applied to src_r for output bytes 16..31
+ {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
+ 128u, 10u},  // [4] applied to src_g for output bytes 16..31
+ {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
+ 128u, 128u},  // [5] applied to src_b for output bytes 16..31
+ {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
+ 15u, 128u, 128u},  // [6] applied to src_r for output bytes 32..47
+ {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
+ 128u, 15u, 128u},  // [7] applied to src_g for output bytes 32..47
+ {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
+ 128u, 128u, 15u}};  // [8] applied to src_b for output bytes 32..47
void MergeRGBRow_SSSE3(const uint8_t* src_r,
const uint8_t* src_g,
@@ -3714,92 +5304,858 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %5, %%xmm0 \n"
- "pshufb %6, %%xmm1 \n"
- "pshufb %7, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %8, %%xmm0 \n"
- "pshufb %9, %%xmm1 \n"
- "pshufb %10, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,16(%3) \n"
-
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%2),%%xmm2 \n"
- "pshufb %11, %%xmm0 \n"
- "pshufb %12, %%xmm1 \n"
- "pshufb %13, %%xmm2 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,32(%3) \n"
-
- "lea 0x10(%0),%0 \n"
- "lea 0x10(%1),%1 \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x30(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb (%5), %%xmm0 \n"
+ "pshufb 16(%5), %%xmm1 \n"
+ "pshufb 32(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 48(%5), %%xmm0 \n"
+ "pshufb 64(%5), %%xmm1 \n"
+ "pshufb 80(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb 96(%5), %%xmm0 \n"
+ "pshufb 112(%5), %%xmm1 \n"
+ "pshufb 128(%5), %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
- : "m"(kShuffleMaskRToRGB0), // %5
- "m"(kShuffleMaskGToRGB0), // %6
- "m"(kShuffleMaskBToRGB0), // %7
- "m"(kShuffleMaskRToRGB1), // %8
- "m"(kShuffleMaskGToRGB1), // %9
- "m"(kShuffleMaskBToRGB1), // %10
- "m"(kShuffleMaskRToRGB2), // %11
- "m"(kShuffleMaskGToRGB2), // %12
- "m"(kShuffleMaskBToRGB2) // %13
+ : "r"(&kMergeRGBShuffle[0]) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGERGBROW_SSSE3
+#ifdef HAS_MERGEARGBROW_SSE2
+void MergeARGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Interleaves four 8-bit planes into 4-byte pixels stored in memory as B,G,R,A; 8 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes an offset from src_r
+ "sub %0,%2 \n" // src_b becomes an offset from src_r
+ "sub %0,%3 \n" // src_a becomes an offset from src_r
+ // From here on only %0 is advanced; the other planes are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ // The body runs before the count is tested, so at least one group of 8 is always processed.
+ "movq (%0,%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%0,%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "movq (%0,%3),%%xmm1 \n" // A
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%4) \n" // pixels 0-3
+ "movdqu %%xmm1,16(%4) \n" // pixels 4-7
+ // Advance 8 source bytes per plane and 32 destination bytes, then loop while width > 0.
+ "lea 8(%0),%0 \n"
+ "lea 32(%4),%4 \n"
+ "sub $0x8,%5 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+#ifdef HAS_MERGEXRGBROW_SSE2
+void MergeXRGBRow_SSE2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Interleaves three 8-bit planes into 4-byte pixels stored as B,G,R,A with A forced to 0xff; 8 pixels per pass.
+ LABELALIGN
+ "1: \n"
+ // The body runs before the count is tested, so at least one group of 8 is always processed.
+ "movq (%2),%%xmm0 \n" // B
+ "movq (%0),%%xmm1 \n" // R
+ "movq (%1),%%xmm2 \n" // G
+ "punpcklbw %%xmm1,%%xmm0 \n" // BR
+ "pcmpeqd %%xmm1,%%xmm1 \n" // A(255): all bits set
+ "punpcklbw %%xmm1,%%xmm2 \n" // GA
+ "movdqa %%xmm0,%%xmm1 \n" // BR
+ "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
+ "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
+ "movdqu %%xmm0,(%3) \n" // pixels 0-3
+ "movdqu %%xmm1,16(%3) \n" // pixels 4-7
+ // Each plane pointer advances 8 bytes, the destination 32 bytes; loop while width > 0.
+ "lea 8(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 32(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_SSE2
+
+#ifdef HAS_MERGEARGBROW_AVX2
+void MergeARGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Interleaves four 8-bit planes into 4-byte pixels stored as B,G,R,A; 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes an offset from src_r
+ "sub %0,%2 \n" // src_b becomes an offset from src_r
+ "sub %0,%3 \n" // src_a becomes an offset from src_r
+ // From here on only %0 is advanced; the other planes are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ // ymm0 = B (low lane), R (high lane); ymm1 = G (low lane), A (high lane).
+ "vmovdqu (%0,%2),%%xmm0 \n" // B
+ "vmovdqu (%0,%1),%%xmm1 \n" // G
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // R
+ "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" // BG 8-15 (low lane), RA 8-15 (high lane)
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG 0-7 (low lane), RA 0-7 (high lane)
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" // RA 0-15
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" // BG 0-15
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // BGRA 4-7 (low lane), 12-15 (high lane)
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // BGRA 0-3 (low lane), 8-11 (high lane)
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" // BGRA 8-15
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" // BGRA 0-7
+ "vmovdqu %%ymm0,(%4) \n" // First 8
+ "vmovdqu %%ymm1,32(%4) \n" // Next 8
+ // Advance 16 source bytes per plane and 64 destination bytes, then loop while width > 0.
+ "lea 16(%0),%0 \n"
+ "lea 64(%4),%4 \n"
+ "sub $0x10,%5 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+#ifdef HAS_MERGEXRGBROW_AVX2
+void MergeXRGBRow_AVX2(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Interleaves three 8-bit planes into 4-byte pixels stored as B,G,R,A with A forced to 0xff; 16 pixels per pass.
+ LABELALIGN
+ "1: \n"
+ // ymm0 = B (low lane), R (high lane); ymm1 = G (low lane), 0xff alpha (high lane).
+ "vmovdqu (%2),%%xmm0 \n" // B
+ "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
+ "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // G
+ "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // R
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" // BG 8-15 (low lane), RA 8-15 (high lane)
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG 0-7 (low lane), RA 0-7 (high lane)
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" // RA 0-15
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" // BG 0-15
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // BGRA 4-7 (low lane), 12-15 (high lane)
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // BGRA 0-3 (low lane), 8-11 (high lane)
+ "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" // BGRA 8-15
+ "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" // BGRA 0-7
+ "vmovdqu %%ymm0,(%3) \n" // First 8
+ "vmovdqu %%ymm1,32(%3) \n" // Next 8
+ // Each plane pointer advances 16 bytes, the destination 64 bytes; loop while width > 0.
+ "lea 16(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 64(%3),%3 \n"
+ "subl $0x10,%4 \n" // explicit 32-bit size: %4 is "+rm" and may be a memory operand
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGEXRGBROW_AVX2
+
+#ifdef HAS_SPLITARGBROW_SSE2
+void SplitARGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,A in memory) into four 8-bit planes; 8 pixels per pass.
+ "sub %1,%2 \n" // dst_g becomes an offset from dst_r
+ "sub %1,%3 \n" // dst_b becomes an offset from dst_r
+ "sub %1,%4 \n" // dst_a becomes an offset from dst_r
+ // From here on only %0 and %1 are advanced; other planes are addressed as (%1,offset).
+ LABELALIGN
+ "1: \n"
+ // Hex digits in the comments below are source byte offsets within the 32 bytes loaded.
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+ // Advance 32 source bytes and 8 bytes per plane, then loop while width > 0.
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "subl $0x8,%5 \n" // explicit 32-bit size: %5 is "+rm" and may be a memory operand
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+rm"(width) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+#ifdef HAS_SPLITXRGBROW_SSE2
+void SplitXRGBRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,X in memory) into three 8-bit planes; the 4th byte is dropped. 8 pixels per pass.
+ LABELALIGN
+ "1: \n"
+ // Hex digits in the comments below are source byte offsets within the 32 bytes loaded.
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
+ "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ // The high half of xmm2 (4th channel) is not stored. Loop while width > 0.
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "subl $0x8,%4 \n" // explicit 32-bit size: %4 is "+rm" and may be a memory operand
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif
+
+static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, // pshufb mask: gathers byte 0 of four pixels, then byte 1,
+ 2, 6, 10, 14, 3, 7, 11, 15}; // then byte 2, then byte 3
+#ifdef HAS_SPLITARGBROW_SSSE3
+void SplitARGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,A in memory) into four 8-bit planes; 8 pixels per pass.
+ "movdqa %6,%%xmm3 \n" // shuffle mask; NOTE(review): movdqa needs a 16-byte aligned operand - confirm uvec8 guarantees it
+ "sub %1,%2 \n" // dst_g becomes an offset from dst_r
+ "sub %1,%3 \n" // dst_b becomes an offset from dst_r
+ "sub %1,%4 \n" // dst_a becomes an offset from dst_r
+ // From here on only %0 and %1 are advanced; other planes are addressed as (%1,offset).
+ LABELALIGN
+ "1: \n"
+ // Hex digits in the comments below are source byte offsets within the 32 bytes loaded.
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%1,%3) \n" // B
+ "movhps %%xmm0,(%1,%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ "movhps %%xmm2,(%1,%4) \n" // A
+ // Advance 32 source bytes and 8 bytes per plane, then loop while width > 0.
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "subl $0x8,%5 \n" // sized mnemonic because %5 may be a memory operand
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif
+
+#ifdef HAS_SPLITXRGBROW_SSSE3
+void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,X in memory) into three 8-bit planes; the 4th byte is dropped. 8 pixels per pass.
+ "movdqa %5,%%xmm3 \n" // shuffle mask; NOTE(review): movdqa needs a 16-byte aligned operand - confirm uvec8 guarantees it
+ // Hex digits in the comments below are source byte offsets within the 32 bytes loaded.
+ LABELALIGN
+ "1: \n"
+ // The body runs before the count is tested, so at least one group of 8 is always processed.
+ "movdqu (%0),%%xmm0 \n" // 00-0F
+ "movdqu 16(%0),%%xmm1 \n" // 10-1F
+ "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
+ "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
+ "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
+ "movlps %%xmm0,(%3) \n" // B
+ "movhps %%xmm0,(%2) \n" // G
+ "movlps %%xmm2,(%1) \n" // R
+ // The high half of xmm2 (4th channel) is not stored. Loop while width > 0.
+ "lea 32(%0),%0 \n"
+ "lea 8(%1),%1 \n"
+ "lea 8(%2),%2 \n"
+ "lea 8(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+#endif
+
+#ifdef HAS_SPLITARGBROW_AVX2
+static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; // vpermd indices: alternate dwords from the low and high lane
+void SplitARGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,A in memory) into four 8-bit planes; 16 pixels per pass.
+ "sub %1,%2 \n" // dst_g becomes an offset from dst_r
+ "sub %1,%3 \n" // dst_b becomes an offset from dst_r
+ "sub %1,%4 \n" // dst_a becomes an offset from dst_r
+ "vmovdqa %7,%%ymm3 \n" // permute indices; NOTE(review): vmovdqa needs a 32-byte aligned operand - confirm ulvec32 guarantees it
+ "vbroadcastf128 %6,%%ymm4 \n" // byte shuffle mask copied into both lanes
+ // From here on only %0 and %1 are advanced; other planes are addressed as (%1,offset).
+ LABELALIGN
+ "1: \n"
+ // Hex digits in the load comments are source byte offsets within the 64 bytes loaded.
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // group bytes by channel within each lane
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n" // low lane = B,B,G,G dwords; high lane = R,R,A,A dwords
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%1,%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%1,%2) \n" // G
+ "vextracti128 $1,%%ymm2,(%1,%4) \n" // A
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "subl $0x10,%5 \n" // sized mnemonic because %5 may be a memory operand
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(kShuffleMaskARGBSplit), // %6
+ "m"(kShuffleMaskARGBPermute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SPLITXRGBROW_AVX2
+void SplitXRGBRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ // De-interleaves 4-byte pixels (B,G,R,X in memory) into three 8-bit planes; the 4th byte is dropped. 16 pixels per pass.
+ "vmovdqa %6,%%ymm3 \n" // permute indices; NOTE(review): kShuffleMaskARGBPermute is declared under HAS_SPLITARGBROW_AVX2
+ "vbroadcastf128 %5,%%ymm4 \n" // byte shuffle mask copied into both lanes
+ // Hex digits in the load comments are source byte offsets within the 64 bytes loaded.
+ LABELALIGN
+ "1: \n"
+ // The body runs before the count is tested, so at least one group of 16 is always processed.
+ "vmovdqu (%0),%%xmm0 \n" // 00-0F
+ "vmovdqu 16(%0),%%xmm1 \n" // 10-1F
+ "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
+ "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // group bytes by channel within each lane
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermd %%ymm0,%%ymm3,%%ymm0 \n" // low lane = B,B,G,G dwords; high lane = R,R,X,X dwords
+ "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
+ "vmovdqu %%xmm0,(%3) \n" // B
+ "vextracti128 $1,%%ymm0,(%1) \n" // R
+ "vmovdqu %%xmm2,(%2) \n" // G
+ // The high lane of ymm2 (4th channel) is not stored. Loop while width > 0.
+ "lea 64(%0),%0 \n"
+ "lea 16(%1),%1 \n"
+ "lea 16(%2),%2 \n"
+ "lea 16(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskARGBSplit), // %5
+ "m"(kShuffleMaskARGBPermute) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_MERGEXR30ROW_AVX2
+void MergeXR30Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = depth - 10; // right shift that reduces each sample to 10 bits
+ asm volatile(
+ // Packs three 16-bit planes into 32-bit pixels: B in bits 0-9, G in 10-19, R in 20-29, bits 30-31 set. 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes a byte offset from src_r
+ "sub %0,%2 \n" // src_b becomes a byte offset from src_r
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $6,%%ymm6,%%ymm6 \n" // 0x03ff in every word: 10-bit maximum
+ "vmovd %5,%%xmm4 \n" // shift count
+ // From here on only %0 is advanced; G and B are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" // reduce to 10 bits
+ "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
+ "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" // clamp to 0x03ff
+ "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // swap middle qwords so the per-lane unpacks below yield pixels in order
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
+ "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
+ "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
+ "vpslld $0xa,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n"
+ "vmovdqu %%ymm0,(%3) \n" // pixels 0-7
+ "vmovdqu %%ymm3,0x20(%3) \n" // pixels 8-15
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+#if defined(__i386__)
+ : "m"(shift) // %5
+#else
+ : "rm"(shift) // %5
+#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); // xmm6 holds the 10-bit clamp constant, so it must be declared clobbered
+}
+#endif
+
+#ifdef HAS_MERGEAR64ROW_AVX2
+static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; // vpermd indices applied to each plane before the per-lane unpacks
+void MergeAR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth; // left shift that scales a depth-bit sample up to 16 bits
+ int mask = (1 << depth) - 1; // largest value representable in depth bits
+ mask = (mask << 16) + mask; // same limit in both 16-bit halves, so one 32-bit broadcast fills every word
+ asm volatile(
+ // Interleaves four 16-bit planes into 8-byte pixels stored as B,G,R,A words; 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes a byte offset from src_r
+ "sub %0,%2 \n" // src_b becomes a byte offset from src_r
+ "sub %0,%3 \n" // src_a becomes a byte offset from src_r
+ "vmovdqa %8,%%ymm5 \n" // permute indices; NOTE(review): vmovdqa needs a 32-byte aligned operand - confirm lvec32 guarantees it
+ "vmovd %6,%%xmm6 \n" // shift count
+ "vbroadcastss %7,%%ymm7 \n" // clamp limit in every word
+ // From here on only %0 is advanced; the other planes are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" // clamp to the depth-bit maximum
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" // scale up to 16 bits
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n" // pre-arrange dwords for the per-lane unpacks
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%4) \n" // pixels 0-3
+ "vmovdqu %%ymm2,0x20(%4) \n" // pixels 4-7
+ "vmovdqu %%ymm4,0x40(%4) \n" // pixels 8-11
+ "vmovdqu %%ymm1,0x60(%4) \n" // pixels 12-15
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%4),%4 \n"
+ "subl $0x10,%5 \n" // sized mnemonic because %5 may be a memory operand
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_ar64), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(mask), // %7
+ "m"(MergeAR64Permute) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_MERGEXR64ROW_AVX2
+void MergeXR64Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint16_t* dst_ar64,
+ int depth,
+ int width) {
+ int shift = 16 - depth; // left shift that scales a depth-bit sample up to 16 bits
+ int mask = (1 << depth) - 1; // largest value representable in depth bits
+ mask = (mask << 16) + mask; // same limit in both 16-bit halves, so one 32-bit broadcast fills every word
+ asm volatile(
+ // Interleaves three 16-bit planes into 8-byte pixels stored as B,G,R,A words with A = 0xffff; 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes a byte offset from src_r
+ "sub %0,%2 \n" // src_b becomes a byte offset from src_r
+ "vmovdqa %7,%%ymm5 \n" // permute indices; NOTE(review): MergeAR64Permute is declared under HAS_MERGEAR64ROW_AVX2
+ "vmovd %5,%%xmm6 \n" // shift count
+ "vbroadcastss %6,%%ymm7 \n" // clamp limit in every word
+ // From here on only %0 is advanced; G and B are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" // clamp to the depth-bit maximum
+ "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" // scale up to 16 bits
+ "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpermd %%ymm0,%%ymm5,%%ymm0 \n" // pre-arrange dwords for the per-lane unpacks
+ "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff)
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
+ "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
+ "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
+ "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
+ "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
+ "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
+ "vmovdqu %%ymm3,(%3) \n" // pixels 0-3
+ "vmovdqu %%ymm2,0x20(%3) \n" // pixels 4-7
+ "vmovdqu %%ymm4,0x40(%3) \n" // pixels 8-11
+ "vmovdqu %%ymm1,0x60(%3) \n" // pixels 12-15
+ "lea 0x20(%0),%0 \n"
+ "lea 0x80(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar64), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(mask), // %6
+ "m"(MergeAR64Permute) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_MERGEARGB16TO8ROW_AVX2
+static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, // pshufb mask: interleaves the low 8 bytes
+ 4, 12, 5, 13, 6, 14, 7, 15}; // of a lane with its high 8 bytes
+void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8; // right shift that reduces each sample to 8 bits
+ asm volatile(
+ // Narrows four 16-bit planes to 8 bits and interleaves them into 4-byte pixels stored as B,G,R,A; 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes a byte offset from src_r
+ "sub %0,%2 \n" // src_b becomes a byte offset from src_r
+ "sub %0,%3 \n" // src_a becomes a byte offset from src_r
+ "vbroadcastf128 %7,%%ymm5 \n" // interleave mask copied into both lanes
+ "vmovd %6,%%xmm6 \n" // shift count
+ // From here on only %0 is advanced; the other planes are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vmovdqu (%0,%3),%%ymm3 \n" // A
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" // reduce to 8 bits
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" // swap middle qwords so the per-lane unpacks below yield pixels in order
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%4) \n" // pixels 0-7
+ "vmovdqu %%ymm0,0x20(%4) \n" // pixels 8-15
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%4),%4 \n"
+ "subl $0x10,%5 \n" // sized mnemonic because %5 may be a memory operand
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+#if defined(__i386__)
+ "+m"(width) // %5
+#else
+ "+rm"(width) // %5
+#endif
+ : "m"(shift), // %6
+ "m"(MergeARGB16To8Shuffle) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
+void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = depth - 8; // right shift that reduces each sample to 8 bits
+ asm volatile(
+ // Narrows three 16-bit planes to 8 bits and interleaves them into 4-byte pixels stored as B,G,R,A with A = 0xff; 16 pixels per pass.
+ "sub %0,%1 \n" // src_g becomes a byte offset from src_r
+ "sub %0,%2 \n" // src_b becomes a byte offset from src_r
+ "vbroadcastf128 %6,%%ymm5 \n" // interleave mask; NOTE(review): MergeARGB16To8Shuffle is declared under HAS_MERGEARGB16TO8ROW_AVX2
+ "vmovd %5,%%xmm6 \n" // shift count
+ "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
+ // From here on only %0 is advanced; G and B are addressed as (%0,offset).
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // R
+ "vmovdqu (%0,%1),%%ymm1 \n" // G
+ "vmovdqu (%0,%2),%%ymm2 \n" // B
+ "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" // reduce to 8 bits
+ "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
+ "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" // swap middle qwords so the per-lane unpacks below yield pixels in order
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
+ "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
+ "vmovdqu %%ymm2,(%3) \n" // pixels 0-7
+ "vmovdqu %%ymm0,0x20(%3) \n" // pixels 8-15
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%3),%3 \n"
+ "subl $0x10,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "m"(shift), // %5
+ "m"(MergeARGB16To8Shuffle) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
LABELALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 9f \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
LABELALIGN
"2: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 2b \n"
-
- LABELALIGN "9: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3814,14 +6170,14 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3836,7 +6192,7 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep movsb \n"
+ "rep movsb \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3849,29 +6205,29 @@ void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
-
- LABELALIGN
- "1: \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -3884,21 +6240,21 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "vmovdqu 0x20(%0),%%ymm2 \n"
- "lea 0x40(%0),%0 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -3917,17 +6273,17 @@ void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "movdqu (%0), %%xmm0 \n"
- "movdqu 0x10(%0), %%xmm1 \n"
- "lea 0x20(%0), %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1), %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+rm"(width) // %2
@@ -3945,28 +6301,28 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile(
- "vmovdqa %3,%%ymm4 \n"
+ "vmovdqa %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0), %%ymm0 \n"
- "vmovdqu 0x20(%0), %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x40(%0), %%ymm2 \n"
- "vmovdqu 0x60(%0), %%ymm3 \n"
- "lea 0x80(%0), %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@@ -3981,31 +6337,31 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm2 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu (%1),%%xmm4 \n"
- "movdqu 0x10(%1),%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -4018,23 +6374,23 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
LABELALIGN
"1: \n"
- "vpmovzxbd (%0),%%ymm1 \n"
- "vpmovzxbd 0x8(%0),%%ymm2 \n"
- "lea 0x10(%0),%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1,(%1) \n"
- "vmovdqu %%ymm2,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -4050,7 +6406,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4061,7 +6417,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosb \n"
+ "rep stosb \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
@@ -4072,7 +6428,7 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
asm volatile(
- "rep stosl \n"
+ "rep stosl \n"
: "+D"(dst_argb), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -4083,21 +6439,21 @@ void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4111,32 +6467,32 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4150,28 +6506,28 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4185,16 +6541,16 @@ void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -4208,32 +6564,32 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4247,28 +6603,28 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x00(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -4281,22 +6637,22 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
#ifdef HAS_YUY2TOYROW_AVX2
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -4311,32 +6667,32 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4351,30 +6707,30 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@@ -4389,17 +6745,17 @@ void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -4413,32 +6769,32 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
- "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4453,30 +6809,30 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@@ -4493,78 +6849,78 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
"99: \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -4586,36 +6942,36 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "mov $0x807f807f,%%eax \n"
- "movd %%eax,%%xmm7 \n"
- "pshufd $0x0,%%xmm7,%%xmm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "mov $0x807f807f,%%eax \n"
+ "movd %%eax,%%xmm7 \n"
+ "pshufd $0x0,%%xmm7,%%xmm7 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%2),%%xmm0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm0 \n"
- "movq (%0,%2,1),%%xmm1 \n"
- "movq (%1,%2,1),%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "paddw %%xmm7,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%3,%2,1) \n"
- "lea 0x8(%2),%2 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%2),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm0 \n"
+ "movq (%0,%2,1),%%xmm1 \n"
+ "movq (%1,%2,1),%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm7,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%3,%2,1) \n"
+ "lea 0x8(%2),%2 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(alpha), // %2
@@ -4638,43 +6994,43 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
uint8_t* dst,
int width) {
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsllw $0x8,%%ymm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm6 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsllw $0x8,%%ymm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm6 \n"
"vbroadcastss %%xmm6,%%ymm6 \n"
- "mov $0x807f807f,%%eax \n"
- "vmovd %%eax,%%xmm7 \n"
+ "mov $0x807f807f,%%eax \n"
+ "vmovd %%eax,%%xmm7 \n"
"vbroadcastss %%xmm7,%%ymm7 \n"
- "sub %2,%0 \n"
- "sub %2,%1 \n"
- "sub %2,%3 \n"
+ "sub %2,%0 \n"
+ "sub %2,%1 \n"
+ "sub %2,%3 \n"
// 32 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%2),%%ymm0 \n"
- "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
- "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
- "vmovdqu (%0,%2,1),%%ymm1 \n"
- "vmovdqu (%1,%2,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
- "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
- "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%3,%2,1) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
+ "vmovdqu (%2),%%ymm0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
+ "vmovdqu (%0,%2,1),%%ymm1 \n"
+ "vmovdqu (%1,%2,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%3,%2,1) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@@ -4688,7 +7044,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
-// Shuffle table duplicating alpha
+// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
@@ -4698,35 +7054,35 @@ void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -4747,29 +7103,29 @@ void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4789,32 +7145,32 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movzb 0x03(%0),%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x07(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "movd 0x00(%4,%3,4),%%xmm3 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
@@ -4834,52 +7190,52 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
int width) {
uintptr_t alpha;
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
"vbroadcastf128 %5,%%ymm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
// replace VPGATHER
- "movzb 0x03(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x07(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x0b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x0f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "movzb 0x13(%0),%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- "vmovd 0x00(%4,%3,4),%%xmm0 \n"
- "movzb 0x17(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm1 \n"
- "movzb 0x1b(%0),%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- "vmovd 0x00(%4,%3,4),%%xmm2 \n"
- "movzb 0x1f(%0),%3 \n"
- "vmovd 0x00(%4,%3,4),%%xmm3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
// end of VPGATHER
- "vmovdqu (%0),%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4896,44 +7252,48 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm2 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psubb %%xmm5,%%xmm0 \n"
+ "psubb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm4,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "movdqu %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm6 \n"
+ "paddw %%xmm5,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm6,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm6,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "m"(kSub128) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBGRAYROW_SSSE3
@@ -4954,50 +7314,50 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
asm volatile(
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu (%0),%%xmm5 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "m"(kARGBToSepiaB), // %2
@@ -5015,54 +7375,54 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x10(%0),%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm6,0x10(%1) \n"
- "lea 0x20(%0),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5080,40 +7440,40 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu (%0),%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu (%0),%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -5131,27 +7491,27 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -5162,35 +7522,35 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -5201,50 +7561,45 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm1 \n"
- "lea 0x20(%0),%0 \n"
- "vmovdqu (%1),%%ymm3 \n"
- "lea 0x20(%1),%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__AVX2__)
- ,
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -5252,16 +7607,16 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -5272,7 +7627,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -5280,16 +7635,16 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpaddusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -5300,7 +7655,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -5308,16 +7663,16 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -5328,7 +7683,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -5336,16 +7691,16 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "lea 0x20(%0),%0 \n"
- "vpsubusb (%1),%%ymm0,%%ymm0 \n"
- "lea 0x20(%1),%1 \n"
- "vmovdqu %%ymm0,(%2) \n"
- "lea 0x20(%2),%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -5365,40 +7720,40 @@ void SobelXRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobelx,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "movq 0x02(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x00(%0,%2,1),%%xmm2 \n"
- "movq 0x02(%0,%2,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -5419,39 +7774,39 @@ void SobelYRow_SSE2(const uint8_t* src_y0,
uint8_t* dst_sobely,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x00(%0,%1,1),%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x01(%0,%1,1),%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x02(%0,%1,1),%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x00(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -5472,37 +7827,37 @@ void SobelRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "movdqu %%xmm3,0x20(%2) \n"
- "movdqu %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5519,21 +7874,21 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_y,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -5554,36 +7909,36 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6,(%2) \n"
- "movdqu %%xmm4,0x10(%2) \n"
- "movdqu %%xmm7,0x20(%2) \n"
- "movdqu %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -5602,67 +7957,67 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
const int32_t* previous_cumsum,
int width) {
asm volatile(
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu 0x10(%2),%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu 0x20(%2),%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu 0x30(%2),%%xmm5 \n"
- "lea 0x40(%2),%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2,(%1) \n"
- "movdqu %%xmm3,0x10(%1) \n"
- "movdqu %%xmm4,0x20(%1) \n"
- "movdqu %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop.
LABELALIGN
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu (%2),%%xmm2 \n"
- "lea 0x10(%2),%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(row), // %0
@@ -5682,119 +8037,119 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
uint8_t* dst,
int count) {
asm volatile(
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
-
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
// 4 pixel small loop.
LABELALIGN
"4: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "movdqu (%0),%%xmm0 \n"
- "psubd 0x00(%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd 0x00(%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
@@ -5817,70 +8172,70 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
- "movq (%3),%%xmm2 \n"
- "movq 0x08(%3),%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
-
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
// 4 pixel loop
LABELALIGN
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd 0x00(%0,%5,1),%%xmm6 \n"
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
// 1 pixel loop
LABELALIGN
"10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- "movd 0x00(%0,%1,1),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x04(%2),%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
"+r"(src_argb_stride_temp), // %1
@@ -5899,76 +8254,76 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
- int dst_width,
+ int width,
int source_y_fraction) {
asm volatile(
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm2,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x00(%1,%4,1),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu %%xmm0,0x00(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
- "+rm"(dst_width), // %2
+ "+rm"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
@@ -5980,71 +8335,73 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
void InterpolateRow_AVX2(uint8_t* dst_ptr,
const uint8_t* src_ptr,
ptrdiff_t src_stride,
- int dst_width,
+ int width,
int source_y_fraction) {
asm volatile(
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
-
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "vmovdqu (%1),%%ymm0 \n"
- "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "rep movsb \n"
- "jmp 999f \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 100b \n"
"99: \n"
"vzeroupper \n"
- "999: \n"
- : "+D"(dst_ptr), // %0
- "+S"(src_ptr), // %1
- "+cm"(dst_width), // %2
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
@@ -6059,20 +8416,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
- "movdqu (%3),%%xmm5 \n"
+ "movdqu (%3),%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6093,16 +8450,16 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -6120,24 +8477,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "add $0x10,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6156,24 +8513,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "movq (%1),%%xmm2 \n"
- "movq 0x00(%1,%2,1),%%xmm1 \n"
- "add $0x8,%1 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "add $0x10,%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6192,27 +8549,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6231,27 +8588,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
- "sub %1,%2 \n"
-
- LABELALIGN
- "1: \n"
- "vpmovzxbw (%1),%%ymm1 \n"
- "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
- "add $0x10,%1 \n"
- "vpsllw $0x8,%%ymm2,%%ymm2 \n"
- "vpor %%ymm1,%%ymm2,%%ymm2 \n"
- "vmovdqu (%0),%%ymm0 \n"
- "add $0x20,%0 \n"
- "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
- "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
- "vextractf128 $0x0,%%ymm1,(%3) \n"
- "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
- "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
- "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
- "lea 0x40(%3),%3 \n"
- "sub $0x20,%4 \n"
- "jg 1b \n"
- "vzeroupper \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6269,47 +8626,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
int width) {
asm volatile(
- "pxor %%xmm3,%%xmm3 \n"
+ "pxor %%xmm3,%%xmm3 \n"
// 2 pixel loop.
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps 0x10(%3),%%xmm0 \n"
- "mulps 0x10(%3),%%xmm4 \n"
- "addps (%3),%%xmm0 \n"
- "addps (%3),%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps 0x20(%3),%%xmm2 \n"
- "mulps 0x20(%3),%%xmm6 \n"
- "mulps 0x30(%3),%%xmm1 \n"
- "mulps 0x30(%3),%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -6405,27 +8762,27 @@ void HalfFloatRow_AVX2(const uint16_t* src,
int width) {
scale *= kScaleBias;
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
@@ -6434,7 +8791,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
#if defined(__x86_64__)
: "x"(scale) // %3
#else
- : "m"(scale) // %3
+ : "m"(scale) // %3
#endif
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
@@ -6446,8 +8803,8 @@ void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
asm volatile(
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
@@ -6472,7 +8829,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#if defined(__x86_64__)
: "x"(scale) // %3
#else
- : "m"(scale) // %3
+ : "m"(scale) // %3
#endif
: "memory", "cc", "xmm2", "xmm3", "xmm4");
}
@@ -6481,7 +8838,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#ifdef HAS_HALFFLOATROW_F16C
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
asm volatile(
- "sub %0,%1 \n"
+ "sub %0,%1 \n"
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -6515,21 +8872,21 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "movzb -0x1(%0),%1 \n"
- "movzb 0x03(%3,%1,4),%1 \n"
- "mov %b1,-0x1(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6548,18 +8905,18 @@ void RGBColorTableRow_X86(uint8_t* dst_argb,
// 1 pixel loop.
LABELALIGN
"1: \n"
- "movzb (%0),%1 \n"
- "lea 0x4(%0),%0 \n"
- "movzb 0x00(%3,%1,4),%1 \n"
- "mov %b1,-0x4(%0) \n"
- "movzb -0x3(%0),%1 \n"
- "movzb 0x01(%3,%1,4),%1 \n"
- "mov %b1,-0x3(%0) \n"
- "movzb -0x2(%0),%1 \n"
- "movzb 0x02(%3,%1,4),%1 \n"
- "mov %b1,-0x2(%0) \n"
- "dec %2 \n"
- "jg 1b \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
: "+r"(dst_argb), // %0
"=&d"(pixel_temp), // %1
"+r"(width) // %2
@@ -6578,86 +8935,86 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
uintptr_t pixel_temp;
uintptr_t table_temp;
asm volatile(
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%2),%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb (%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,(%3) \n"
- "movzb 0x1(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x1(%3) \n"
- "movzb 0x2(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x2(%3) \n"
- "movzb 0x3(%2),%0 \n"
- "mov %b0,0x3(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x4(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x4(%3) \n"
- "movzb 0x5(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x5(%3) \n"
- "movzb 0x6(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x6(%3) \n"
- "movzb 0x7(%2),%0 \n"
- "mov %b0,0x7(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
-
- "movzb 0x8(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x8(%3) \n"
- "movzb 0x9(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0x9(%3) \n"
- "movzb 0xa(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xa(%3) \n"
- "movzb 0xb(%2),%0 \n"
- "mov %b0,0xb(%3) \n"
-
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
-
- "movzb 0xc(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xc(%3) \n"
- "movzb 0xd(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xd(%3) \n"
- "movzb 0xe(%2),%0 \n"
- "movzb 0x00(%1,%0,1),%0 \n"
- "mov %b0,0xe(%3) \n"
- "movzb 0xf(%2),%0 \n"
- "mov %b0,0xf(%3) \n"
- "lea 0x10(%2),%2 \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
: "=&d"(pixel_temp), // %0
"=&a"(table_temp), // %1
"+r"(src_argb), // %2
@@ -6669,126 +9026,306 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#ifdef HAS_NV21TOYUV24ROW_AVX2
-
-// begin NV21ToYUV24Row_C avx2 constants
-static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
- 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
-
-static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
-
-static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
- 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
- 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
- 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
-
-static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
- 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
- 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
-
-static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
- 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
- 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
-
-static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
- 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
- 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
-
-static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
- 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
- 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
-
-static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
- 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
- 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
-
-static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
- 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
- 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
-
-// NV21ToYUV24Row_AVX2
+static const uvec8 kYUV24Shuffle[3] = {  // pshufb masks used by NV21ToYUV24Row_SSSE3/AVX2: indices 0..7 pick Y bytes, 8..15 pick VU bytes
+ {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},  // produces output bytes 0..15 of each 48 byte group
+ {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},  // produces output bytes 16..31
+ {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};  // produces output bytes 32..47
+
+// Convert biplanar NV21 to packed YUV24, 16 pixels (48 output bytes) per loop.
+// NV21 has VU in memory for chroma; each VU pair is shared by 2 Y values.
+// YUV24 is VUY in memory. The loop tests width after the stores, so it always runs at least once and handles whole groups of 16 pixels.
+void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n" // %1 = src_vu - src_y, so (%0,%1) addresses VU
+ "movdqa (%4),%%xmm4 \n" // 3 shuffler constants (aligned loads)
+ "movdqa 16(%4),%%xmm5 \n"
+ "movdqa 32(%4),%%xmm6 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // load 16 Y values
+ "movdqu (%0,%1),%%xmm3 \n" // load 8 VU pairs (16 bytes)
+ "lea 16(%0),%0 \n" // advances the Y and VU addresses together
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3
+ "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5
+ "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7
+ "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24
+ "pshufb %%xmm5, %%xmm1 \n"
+ "pshufb %%xmm6, %%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n" // store 48 bytes = 16 VUY pixels
+ "movdqu %%xmm1,16(%2) \n"
+ "movdqu %%xmm2,32(%2) \n"
+ "lea 48(%2),%2 \n"
+ "sub $16,%3 \n" // 16 pixels per loop
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+// Convert biplanar NV21 to packed YUV24, 32 pixels (96 output bytes) per loop.
+// NV21 has VU in memory for chroma.
+// YUV24 is VUY in memory. The loop tests width after the stores, so it always runs at least once and handles whole groups of 32 pixels.
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
- uint8_t* src_y_ptr;
- uint64_t src_offset = 0;
- uint64_t width64;
-
- width64 = width;
- src_y_ptr = (uint8_t*)src_y;
-
- asm volatile(
- "vmovdqu %5, %%ymm0 \n" // init blend value
- "vmovdqu %6, %%ymm1 \n" // init blend value
- "vmovdqu %7, %%ymm2 \n" // init blend value
- // "sub $0x20, %3 \n" //sub 32 from width for final loop
-
- LABELALIGN
- "1: \n" // label 1
- "vmovdqu (%0,%4), %%ymm3 \n" // src_y
- "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
- "vmovdqu (%1), %%ymm5 \n" // src_uv
- "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
- "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
- // shuf
- "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
- // shuf
- "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
- "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
- // shuf
- "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
- "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
- "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
- "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
- "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
- "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
- "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
- "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
- "add $0x20, %4 \n" // add to src buffer
- // ptr
- "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
- "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
- "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
- "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
- "add $0x60,%2 \n" // add to dst buffer
- // ptr
- // "cmp %3, %4 \n" //(width64 -
- // 32 bytes) and src_offset
- "sub $0x20,%3 \n" // 32 pixels per loop
- "jg 1b \n"
- "vzeroupper \n" // sse-avx2
- // transistions
-
- : "+r"(src_y), //%0
- "+r"(src_vu), //%1
- "+r"(dst_yuv24), //%2
- "+r"(width64), //%3
- "+r"(src_offset) //%4
- : "m"(kBLEND0), //%5
- "m"(kBLEND1), //%6
- "m"(kBLEND2), //%7
- "m"(kSHUF0), //%8
- "m"(kSHUF1), //%9
- "m"(kSHUF2), //%10
- "m"(kSHUF3), //%11
- "m"(kSHUF4), //%12
- "m"(kSHUF5) //%13
- : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
- "xmm13", "xmm14", "xmm15");
-}
-#endif // HAS_NV21TOYUV24ROW_AVX2
+ asm volatile(
+ "sub %0,%1 \n" // %1 = src_vu - src_y, so (%0,%1) addresses VU
+ "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants, copied to both lanes
+ "vbroadcastf128 16(%4),%%ymm5 \n"
+ "vbroadcastf128 32(%4),%%ymm6 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU pairs (32 bytes)
+ "lea 32(%0),%0 \n"
+ "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3 (within each 128 bit lane)
+ "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
+ "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n" // low lanes of ymm0, ymm1 -> output bytes 0..31
+ "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n" // low lane of ymm2, high lane of ymm0 -> bytes 32..63
+ "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n" // high lanes of ymm1, ymm2 -> bytes 64..95
+ "vmovdqu %%ymm3,(%2) \n"
+ "vmovdqu %%ymm0,32(%2) \n"
+ "vmovdqu %%ymm1,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_NV21ToYUV24ROW_AVX512  // NOTE(review): mixed-case macro name, unlike the other HAS_* guards here - confirm it is defined anywhere
+// The following VBMI VEX256 code tests okay with the intelsde emulator.
+static const lvec8 kYUV24Perm[3] = {  // vpermt2b indices: 0..31 select Y bytes, 32..63 select VU bytes
+ {32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
+ 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
+ {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
+ 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
+ {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
+ 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
+
+void NV21ToYUV24Row_AVX512(const uint8_t* src_y,  // NV21 (Y plane + VU plane) to packed VUY, 32 pixels per loop
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "sub %0,%1 \n" // %1 = src_vu - src_y, so (%0,%1) addresses VU
+ "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
+ "vmovdqa 32(%4),%%ymm5 \n"
+ "vmovdqa 64(%4),%%ymm6 \n" LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // load 32 Y values
+ "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU pairs (32 bytes)
+ "lea 32(%0),%0 \n"
+ "vmovdqa %%ymm2, %%ymm0 \n" // vpermt2b overwrites its first table, so copy Y
+ "vmovdqa %%ymm2, %%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n" // output bytes 0..31
+ "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n" // output bytes 32..63
+ "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n" // output bytes 64..95
+ "vmovdqu %%ymm0,(%2) \n"
+ "vmovdqu %%ymm1,32(%2) \n"
+ "vmovdqu %%ymm2,64(%2) \n"
+ "lea 96(%2),%2 \n"
+ "sub $32,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Perm[0]) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#endif // HAS_NV21ToYUV24ROW_AVX512
+
+#ifdef HAS_SWAPUVROW_SSSE3
+
+// Shuffle table that swaps each adjacent pair of bytes (UV -> VU).
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
+// Convert UV plane of NV12 to VU of NV21, 16 UV pairs (32 bytes) per loop.
+void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+
+ "movdqu %3,%%xmm5 \n" // xmm5 = byte-pair swap mask
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 UV pairs
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n" // swap U and V within each pair
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n" // store 16 VU pairs
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n" // 16 pairs per loop
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_SSSE3
+
+#ifdef HAS_SWAPUVROW_AVX2
+void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {  // UV -> VU, 32 pairs (64 bytes) per loop
+ asm volatile(
+
+ "vbroadcastf128 %3,%%ymm5 \n" // byte-pair swap mask in both 128 bit lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 UV pairs
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // swap U and V within each pair
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n" // store 32 VU pairs
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n" // 32 pairs per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleUVToVU) // %3 NOTE(review): table is defined only under HAS_SWAPUVROW_SSSE3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
+}
+#endif // HAS_SWAPUVROW_AVX2
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,  // Average 2x2 blocks of the U and V planes and store them interleaved as UV
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // all bits set
+ "psrlw $0xf,%%xmm4 \n" // words of 0x0001
+ "packuswb %%xmm4,%%xmm4 \n" // bytes of 0x01: pmaddubsw weights
+ "pxor %%xmm5,%%xmm5 \n" // zero, used by pavgw for rounding
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // load 16 U values
+ "movdqu (%1),%%xmm1 \n" // load 16 V values
+ "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
+ "movdqu 0(%1,%5,1),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // half size
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // add the two rows: sum of 4 samples
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n" // sum >> 1 ...
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n" // ... then (x + 1) >> 1, i.e. (sum + 2) >> 2
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // 8 averaged U bytes
+ "packuswb %%xmm1,%%xmm1 \n" // 8 averaged V bytes
+ "punpcklbw %%xmm1,%%xmm0 \n" // interleave as U,V
+ "movdqu %%xmm0,(%2) \n" // store 8 UV pixels
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n" // 16 src pixels per loop
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,  // Average 2x2 blocks of the U and V planes and store them interleaved as UV
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // all bits set
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n" // words of 0x0001
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" // bytes of 0x01: vpmaddubsw weights
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero, used by vpavgw for rounding
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // load 32 U values
+ "vmovdqu (%1),%%ymm1 \n" // load 32 V values
+ "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
+ "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // add the two rows: sum of 4 samples
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // sum >> 1 ...
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" // ... then (x + 1) >> 1, i.e. (sum + 2) >> 2
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" // 8 averaged U bytes per 128 bit lane
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" // 8 averaged V bytes per 128 bit lane
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // interleave as U,V within each lane
+ "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
+ "lea 0x20(%2),%2 \n"
+ "sub $0x20,%3 \n" // 32 src pixels per loop
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride_u)), // %4
+ "r"((intptr_t)(src_stride_v)) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+// Clamp floats to a minimum of 0.0f: dst_y[i] = max(src_x[i], 0.0f).
+// One float is handled per iteration. width is treated as a count of floats,
+// matching the element-count convention of the other row functions in this
+// file. The loop tests width after the store, so it always runs at least once.
+void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
+ asm volatile(
+ "pxor %%xmm1,%%xmm1 \n" // xmm1 = 0.0f
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n" // load float
+ "maxss %%xmm1, %%xmm0 \n" // clamp to zero
+ "add $0x4,%0 \n" // advance source by one float ($ = immediate)
+ "movd %%xmm0, (%1) \n" // store float
+ "add $0x4,%1 \n" // advance destination by one float
+ "sub $0x1,%2 \n" // 1 float per loop
+ "jg 1b \n"
+ : "+r"(src_x), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
+}
#endif // defined(__x86_64__) || defined(__i386__)
diff --git a/files/source/row_lasx.cc b/files/source/row_lasx.cc
new file mode 100644
index 00000000..7dd18f40
--- /dev/null
+++ b/files/source/row_lasx.cc
@@ -0,0 +1,2230 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// All-bits-set constant; presumably the opaque alpha fill value. It is not
+// referenced in the part of this file visible to this review - TODO confirm.
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors.
+// Replicates kUVToB[0], kUVToR[1], kUVToG[0], kUVToG[1] and kYToRgb[0] of
+// yuvconst into every halfword of ub, vr, ug, vg and yg respectively, and
+// kYBiasToRgb[0] into every word of yb. yuvconst is dereferenced with "->",
+// so it must be a pointer.
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { \
+ ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+// Load 32 YUV422 pixel data.
+// Reads 32 bytes from each of psrc_y, psrc_u and psrc_v. out_y receives the
+// Y bytes unchanged. const_0x80 is subtracted bytewise from the U and V
+// vectors, the results are widened to signed halfwords and interleaved into
+// uv_l and uv_h.
+// The macro refers to a vector named const_0x80 that must already exist in
+// the scope where it is expanded.
+#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
+ temp1 = __lasx_xvld(psrc_v, 0); \
+ temp0 = __lasx_xvsub_b(temp0, const_0x80); \
+ temp1 = __lasx_xvsub_b(temp1, const_0x80); \
+ temp0 = __lasx_vext2xv_h_b(temp0); \
+ temp1 = __lasx_vext2xv_h_b(temp1); \
+ uv_l = __lasx_xvilvl_h(temp0, temp1); \
+ uv_h = __lasx_xvilvh_h(temp0, temp1); \
+ }
+
+// Load 16 YUV422 pixel data.
+// out_y receives a full 32-byte load from psrc_y. One doubleword (8 bytes) is
+// loaded from each of psrc_u and psrc_v and replicated across the vector; the
+// two are interleaved bytewise, const_0x80 is subtracted and the result is
+// widened to signed halfwords in uv.
+// The macro refers to a vector named const_0x80 that must already exist in
+// the scope where it is expanded.
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
+ { \
+ __m256i temp0, temp1; \
+ \
+ out_y = __lasx_xvld(psrc_y, 0); \
+ temp0 = __lasx_xvldrepl_d(psrc_u, 0); \
+ temp1 = __lasx_xvldrepl_d(psrc_v, 0); \
+ uv = __lasx_xvilvl_b(temp0, temp1); \
+ uv = __lasx_xvsub_b(uv, const_0x80); \
+ uv = __lasx_vext2xv_h_b(uv); \
+ }
+
+// Convert 32 pixels of YUV to RGB (both the low and the high byte halves of
+// in_y are consumed, giving two 16-halfword result vectors per channel).
+//
+// Luma: every Y byte is duplicated into a halfword, multiplied by yg,
+// shifted right by 16 and offset by yb.
+// Chroma: in_uvl / in_uvh are multiplied against ubvr (even halfwords give
+// the term added for R, odd halfwords the term added for B) and dotted with
+// ugvg (the term subtracted for G).
+// Each channel sum is shifted right by 6 and clipped to 0..255, then packed
+// as halfwords into b_l/b_h, g_l/g_h and r_l/r_h.
+//
+// The block declares its own locals (u_l, u_h, v_l, v_h, yl_ev, yl_od, yh_ev,
+// yh_od, temp0..temp3), so arguments must not use those names.
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \
+ g_h, r_l, r_h) \
+ { \
+ __m256i u_l, u_h, v_l, v_h; \
+ __m256i yl_ev, yl_od, yh_ev, yh_od; \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ temp1 = __lasx_xvilvh_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \
+ yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \
+ DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
+ yl_ev, yl_od, yh_ev, yh_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ yh_ev = __lasx_xvadd_w(yh_ev, yb); \
+ yh_od = __lasx_xvadd_w(yh_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \
+ v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \
+ u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, u_h); \
+ temp3 = __lasx_xvadd_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ b_l = __lasx_xvpackev_h(temp1, temp0); \
+ b_h = __lasx_xvpackev_h(temp3, temp2); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ temp2 = __lasx_xvadd_w(yh_ev, v_h); \
+ temp3 = __lasx_xvadd_w(yh_od, v_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ r_l = __lasx_xvpackev_h(temp1, temp0); \
+ r_h = __lasx_xvpackev_h(temp3, temp2); \
+ DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ temp2 = __lasx_xvsub_w(yh_ev, u_h); \
+ temp3 = __lasx_xvsub_w(yh_od, u_h); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \
+ temp1, temp2, temp3); \
+ DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \
+ temp2, temp3); \
+ g_l = __lasx_xvpackev_h(temp1, temp0); \
+ g_h = __lasx_xvpackev_h(temp3, temp2); \
+ }
+
+// Convert 16 pixels of YUV to RGB (single-vector variant of YUVTORGB_D; only
+// the low-half interleave of in_y is used, giving one 16-halfword result
+// vector per channel).
+//
+// NOTE: in_y is overwritten - it is permuted in place (0xD8) before use, so
+// the argument must be a modifiable vector.
+// The luma and chroma arithmetic matches YUVTORGB_D: Y is scaled by yg,
+// shifted right by 16 and offset by yb; the ubvr products are added for B and
+// R, the ugvg dot product is subtracted for G; sums are shifted right by 6
+// and clipped to 0..255.
+// The block declares its own locals (u_l, v_l, yl_ev, yl_od, temp0, temp1),
+// so arguments must not use those names.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m256i u_l, v_l, yl_ev, yl_od; \
+ __m256i temp0, temp1; \
+ \
+ in_y = __lasx_xvpermi_d(in_y, 0xD8); \
+ temp0 = __lasx_xvilvl_b(in_y, in_y); \
+ yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
+ yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
+ DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \
+ yl_ev = __lasx_xvadd_w(yl_ev, yb); \
+ yl_od = __lasx_xvadd_w(yl_od, yb); \
+ v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \
+ u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \
+ temp0 = __lasx_xvadd_w(yl_ev, u_l); \
+ temp1 = __lasx_xvadd_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_b = __lasx_xvpackev_h(temp1, temp0); \
+ temp0 = __lasx_xvadd_w(yl_ev, v_l); \
+ temp1 = __lasx_xvadd_w(yl_od, v_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_r = __lasx_xvpackev_h(temp1, temp0); \
+ u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \
+ temp0 = __lasx_xvsub_w(yl_ev, u_l); \
+ temp1 = __lasx_xvsub_w(yl_od, u_l); \
+ DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
+ DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
+ out_g = __lasx_xvpackev_h(temp1, temp0); \
+ }
+
+// Pack and store 32 ARGB pixels (four 32-byte vectors, 128 bytes in total).
+// The even bytes of the b, g, r and a inputs are interleaved so that each
+// stored pixel is laid out as B, G, R, A in memory.
+// Side effects: pdst_argb is advanced by 128, and the r_l, r_h, g_l and g_h
+// arguments are overwritten (they are reused as scratch), so they must be
+// modifiable vectors whose values are no longer needed.
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(g_l, b_l); \
+ temp1 = __lasx_xvpackev_b(a_l, r_l); \
+ temp2 = __lasx_xvpackev_b(g_h, b_h); \
+ temp3 = __lasx_xvpackev_b(a_h, r_h); \
+ r_l = __lasx_xvilvl_h(temp1, temp0); \
+ r_h = __lasx_xvilvh_h(temp1, temp0); \
+ g_l = __lasx_xvilvl_h(temp3, temp2); \
+ g_h = __lasx_xvilvh_h(temp3, temp2); \
+ temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \
+ temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \
+ temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \
+ temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ __lasx_xvst(temp2, pdst_argb, 64); \
+ __lasx_xvst(temp3, pdst_argb, 96); \
+ pdst_argb += 128; \
+ }
+
+// Pack and store 16 ARGB pixels (two 32-byte vectors, 64 bytes in total).
+// The even bytes of in_b, in_g, in_r and in_a are interleaved so that each
+// stored pixel is laid out as B, G, R, A in memory.
+// Side effect: pdst_argb is advanced by 64. The inputs are not modified.
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m256i temp0, temp1, temp2, temp3; \
+ \
+ temp0 = __lasx_xvpackev_b(in_g, in_b); \
+ temp1 = __lasx_xvpackev_b(in_a, in_r); \
+ temp2 = __lasx_xvilvl_h(temp1, temp0); \
+ temp3 = __lasx_xvilvh_h(temp1, temp0); \
+ temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20); \
+ temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31); \
+ __lasx_xvst(temp0, pdst_argb, 0); \
+ __lasx_xvst(temp1, pdst_argb, 32); \
+ pdst_argb += 64; \
+ }
+
+// Computes the U and V halfword sums for a 2x2-subsampled block from two rows
+// of de-interleaved channel bytes (_tmpb/_tmpg/_tmpr for the current row,
+// _nexb/_nexg/_nexr for the next row).
+// For every channel the even bytes of both rows and the odd bytes of both
+// rows are summed into halfwords, and the two sums are averaged with
+// rounding. _tmpb, _tmpg and _tmpr are OVERWRITTEN with these averages. Then,
+// in halfword arithmetic:
+//   _reg0 = const_8080 + const_112 * b - const_74 * g - const_38 * r
+//   _reg1 = const_8080 + const_112 * r - const_94 * g - const_18 * b
+// The vectors const_8080, const_112, const_74, const_94, const_38 and
+// const_18 are not parameters; they must exist in the scope where the macro
+// is expanded.
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
+ { \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \
+ _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \
+ }
+
+// Reverses the byte order of a row. The row is handled in 64-byte chunks,
+// read backwards starting from the last 64 bytes of src and written forwards
+// into dst. width / 64 chunks are processed; a remainder of width % 64 bytes
+// is not handled here.
+void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+  // Byte selector that reverses the 16 bytes inside each 128-bit lane.
+  __m256i lane_reverse = {0x08090A0B0C0D0E0F, 0x0001020304050607,
+                          0x08090A0B0C0D0E0F, 0x0001020304050607};
+  int chunks = width / 64;
+
+  src += width - 64;
+  while (chunks-- > 0) {
+    __m256i front = __lasx_xvld(src, 0);
+    __m256i back = __lasx_xvld(src, 32);
+
+    front = __lasx_xvshuf_b(front, front, lane_reverse);
+    back = __lasx_xvshuf_b(back, back, lane_reverse);
+    // Swapping the two 128-bit lanes completes the 32-byte reversal.
+    front = __lasx_xvpermi_q(front, front, 0x01);
+    back = __lasx_xvpermi_q(back, back, 0x01);
+    // The two vectors trade places on the way out.
+    __lasx_xvst(back, dst, 0);
+    __lasx_xvst(front, dst, 32);
+    src -= 64;
+    dst += 64;
+  }
+}
+
+// Reverses the order of the 2-byte UV pairs of a row while keeping the bytes
+// inside each pair in place. Every iteration mirrors 16 pairs (32 bytes),
+// reading backwards from the end of src_uv and writing forwards into dst_uv.
+// width / 16 iterations are run; a remainder of width % 16 pairs is not
+// handled here.
+void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  // Halfword selector that reverses the 8 pairs inside each 128-bit lane.
+  __m256i pair_reverse = {0x0004000500060007, 0x0000000100020003,
+                          0x0004000500060007, 0x0000000100020003};
+  int groups = width / 16;
+
+  src_uv += (width - 16) << 1;
+  while (groups-- > 0) {
+    __m256i pairs = __lasx_xvld(src_uv, 0);
+
+    pairs = __lasx_xvshuf_h(pair_reverse, pairs, pairs);
+    // Swap the two 128-bit lanes to finish the reversal.
+    pairs = __lasx_xvpermi_q(pairs, pairs, 0x01);
+    __lasx_xvst(pairs, dst_uv, 0);
+    dst_uv += 32;
+    src_uv -= 32;
+  }
+}
+
+// Reverses the order of the 4-byte pixels of a row while keeping the bytes
+// inside each pixel in place. Every iteration mirrors 16 pixels (64 bytes),
+// reading backwards from the end of src and writing forwards into dst.
+// width / 16 iterations are run; a remainder of width % 16 pixels is not
+// handled here.
+void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
+  // Byte selector that reverses the four 4-byte pixels in each 128-bit lane.
+  __m256i pixel_reverse = {0x0B0A09080F0E0D0C, 0x0302010007060504,
+                           0x0B0A09080F0E0D0C, 0x0302010007060504};
+  int groups = width / 16;
+
+  src += (width * 4) - 64;
+  while (groups-- > 0) {
+    __m256i front = __lasx_xvld(src, 0);
+    __m256i back = __lasx_xvld(src, 32);
+    __m256i out_first, out_second;
+
+    front = __lasx_xvshuf_b(front, front, pixel_reverse);
+    back = __lasx_xvshuf_b(back, back, pixel_reverse);
+    // Swap 128-bit lanes; the later source vector becomes the first output.
+    out_second = __lasx_xvpermi_q(front, front, 0x01);
+    out_first = __lasx_xvpermi_q(back, back, 0x01);
+    __lasx_xvst(out_first, dst, 0);
+    __lasx_xvst(out_second, dst, 32);
+    src -= 64;
+    dst += 64;
+  }
+}
+
+// Packs planar Y, U and V rows into one interleaved row whose bytes are
+// ordered Y, U, Y, V. Every iteration consumes 32 Y bytes and 16 bytes each
+// of U and V, and writes 64 bytes. width / 32 iterations are run; a remainder
+// of width % 32 pixels is not handled here.
+void I422ToYUY2Row_LASX(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i u = __lasx_xvld(src_u, 0);
+    __m256i v = __lasx_xvld(src_v, 0);
+    __m256i y = __lasx_xvld(src_y, 0);
+    __m256i uv, packed_lo, packed_hi, out0, out1;
+
+    // Spread the first 16 chroma bytes across both 128-bit lanes so the
+    // per-lane interleaves below line up with the luma bytes.
+    u = __lasx_xvpermi_d(u, 0xD8);
+    v = __lasx_xvpermi_d(v, 0xD8);
+    uv = __lasx_xvilvl_b(v, u);
+    // Luma goes into the even bytes, chroma into the odd bytes.
+    packed_lo = __lasx_xvilvl_b(uv, y);
+    packed_hi = __lasx_xvilvh_b(uv, y);
+    out0 = __lasx_xvpermi_q(packed_hi, packed_lo, 0x20);
+    out1 = __lasx_xvpermi_q(packed_hi, packed_lo, 0x31);
+    __lasx_xvst(out0, dst_yuy2, 0);
+    __lasx_xvst(out1, dst_yuy2, 32);
+    src_y += 32;
+    src_u += 16;
+    src_v += 16;
+    dst_yuy2 += 64;
+  }
+}
+
+// Packs planar Y, U and V rows into one interleaved row whose bytes are
+// ordered U, Y, V, Y. Every iteration consumes 32 Y bytes and 16 bytes each
+// of U and V, and writes 64 bytes. width / 32 iterations are run; a remainder
+// of width % 32 pixels is not handled here.
+void I422ToUYVYRow_LASX(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i u = __lasx_xvld(src_u, 0);
+    __m256i v = __lasx_xvld(src_v, 0);
+    __m256i y = __lasx_xvld(src_y, 0);
+    __m256i uv, packed_lo, packed_hi, out0, out1;
+
+    // Spread the first 16 chroma bytes across both 128-bit lanes so the
+    // per-lane interleaves below line up with the luma bytes.
+    u = __lasx_xvpermi_d(u, 0xD8);
+    v = __lasx_xvpermi_d(v, 0xD8);
+    uv = __lasx_xvilvl_b(v, u);
+    // Chroma goes into the even bytes, luma into the odd bytes.
+    packed_lo = __lasx_xvilvl_b(y, uv);
+    packed_hi = __lasx_xvilvh_b(y, uv);
+    out0 = __lasx_xvpermi_q(packed_hi, packed_lo, 0x20);
+    out1 = __lasx_xvpermi_q(packed_hi, packed_lo, 0x31);
+    __lasx_xvst(out0, dst_uyvy, 0);
+    __lasx_xvst(out1, dst_uyvy, 32);
+    src_y += 32;
+    src_u += 16;
+    src_v += 16;
+    dst_uyvy += 64;
+  }
+}
+
+// Converts planar Y, U and V rows to ARGB using the coefficients in
+// yuvconstants. Every iteration consumes 32 Y bytes and 16 bytes each of U
+// and V, and writes 128 bytes (32 pixels) with the alpha byte set to 0xFF.
+// width / 32 iterations are run; a remainder of width % 32 pixels is not
+// handled here.
+void I422ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ // Interleave the coefficients to match the interleaved chroma halfwords.
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // STOREARGB_D also advances dst_argb by 128.
+ STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Converts planar Y, U and V rows to 4-byte pixels stored as A, B, G, R in
+// memory (alpha = 0xFF), using the coefficients in yuvconstants. Every
+// iteration consumes 32 Y bytes and 16 bytes each of U and V, and writes 128
+// bytes (32 pixels). width / 32 iterations are run; a remainder of
+// width % 32 pixels is not handled here.
+void I422ToRGBARow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ // Interleave the coefficients to match the interleaved chroma halfwords.
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // The channel arguments are rotated by one slot relative to the macro's
+ // parameter names, so the stored byte order becomes A, B, G, R. The macro
+ // also advances dst_argb by 128.
+ STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Converts planar Y, U and V rows plus a separate alpha row to ARGB using the
+// coefficients in yuvconstants. The main loop handles 32 pixels per
+// iteration (width / 32 iterations). If width is not a multiple of 32, one
+// additional 16-pixel group is converted after the loop, regardless of how
+// many pixels actually remain.
+// NOTE(review): the tail therefore appears to assume width % 32 == 16;
+// confirm against the caller.
+void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ int res = width & 31;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ __m256i zero = __lasx_xvldi(0);
+ // Referenced by name inside READYUV422_D / READYUV422; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
+
+ // y is used as a temporary for the alpha bytes before READYUV422_D
+ // overwrites it with luma.
+ y = __lasx_xvld(src_a, 0);
+ // Zero-extend alpha bytes to halfwords to match the colour vectors.
+ a_l = __lasx_xvilvl_b(zero, y);
+ a_h = __lasx_xvilvh_b(zero, y);
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // STOREARGB_D also advances dst_argb by 128.
+ STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ src_a += 32;
+ }
+ if (res) {
+ __m256i y, uv, r, g, b, a;
+ // Full 32-byte loads are issued from src_a here and from src_y inside
+ // READYUV422.
+ a = __lasx_xvld(src_a, 0);
+ a = __lasx_vext2xv_hu_bu(a);
+ READYUV422(src_y, src_u, src_v, y, uv);
+ YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
+ STOREARGB(a, r, g, b, dst_argb);
+ }
+}
+
+// Converts planar Y, U and V rows to 3-byte-per-pixel output using the
+// coefficients in yuvconstants. Every iteration consumes 32 Y bytes and 16
+// bytes each of U and V, and writes 96 bytes (32 pixels). width / 32
+// iterations are run; a remainder of width % 32 pixels is not handled here.
+void I422ToRGB24Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int32_t width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ // Byte selectors that merge the packed G/B halfwords with the R halfwords
+ // into 3-byte pixels.
+ __m256i shuffler0 = {0x0504120302100100, 0x0A18090816070614,
+ 0x0504120302100100, 0x0A18090816070614};
+ __m256i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B,
+ 0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i temp0, temp1, temp2, temp3;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // Pack B into the even bytes and G into the odd bytes of each halfword.
+ temp0 = __lasx_xvpackev_b(g_l, b_l);
+ temp1 = __lasx_xvpackev_b(g_h, b_h);
+ DUP4_ARG3(__lasx_xvshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1,
+ r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0,
+ temp1);
+
+ // b_l / b_h are reused as scratch while the shuffled pieces are stitched
+ // into three contiguous 32-byte vectors.
+ b_l = __lasx_xvilvl_d(temp1, temp2);
+ b_h = __lasx_xvilvh_d(temp3, temp1);
+ temp1 = __lasx_xvpermi_q(b_l, temp0, 0x20);
+ temp2 = __lasx_xvpermi_q(temp0, b_h, 0x30);
+ temp3 = __lasx_xvpermi_q(b_h, b_l, 0x31);
+ __lasx_xvst(temp1, dst_argb, 0);
+ __lasx_xvst(temp2, dst_argb, 32);
+ __lasx_xvst(temp3, dst_argb, 64);
+ dst_argb += 96;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Converts planar Y, U and V rows to 16-bit pixels laid out as
+// (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3), using the coefficients in
+// yuvconstants. Every iteration consumes 32 Y bytes and 16 bytes each of U
+// and V, and writes 64 bytes (32 pixels). width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // Keep the top 5 bits of B and R and the top 6 bits of G.
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 2);
+ g_h = __lasx_xvsrli_h(g_h, 2);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ // Move R and G to their bit positions and merge the three fields.
+ r_l = __lasx_xvslli_h(r_l, 11);
+ r_h = __lasx_xvslli_h(r_h, 11);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ // Regroup the 128-bit lanes of the two result vectors for the stores.
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_rgb565, 0);
+ __lasx_xvst(dst_h, dst_rgb565, 32);
+ dst_rgb565 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Converts planar Y, U and V rows to 16-bit pixels laid out as
+// 0xF000 | (R >> 4) << 8 | (G & 0xF0) | (B >> 4), using the coefficients in
+// yuvconstants. Every iteration consumes 32 Y bytes and 16 bytes each of U
+// and V, and writes 64 bytes (32 pixels). width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ // Top nibble of every halfword set: the alpha field.
+ __m256i alpha = {0xF000F000F000F000, 0xF000F000F000F000, 0xF000F000F000F000,
+ 0xF000F000F000F000};
+ // Selects bits 4..7 of every halfword: the G field, already in position.
+ __m256i mask = {0x00F000F000F000F0, 0x00F000F000F000F0, 0x00F000F000F000F0,
+ 0x00F000F000F000F0};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // Keep the top 4 bits of each channel.
+ b_l = __lasx_xvsrli_h(b_l, 4);
+ b_h = __lasx_xvsrli_h(b_h, 4);
+ r_l = __lasx_xvsrli_h(r_l, 4);
+ r_h = __lasx_xvsrli_h(r_h, 4);
+ g_l = __lasx_xvand_v(g_l, mask);
+ g_h = __lasx_xvand_v(g_h, mask);
+ r_l = __lasx_xvslli_h(r_l, 8);
+ r_h = __lasx_xvslli_h(r_h, 8);
+ // Merge A, R, G and B fields into one halfword per pixel.
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ // Regroup the 128-bit lanes of the two result vectors for the stores.
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb4444, 0);
+ __lasx_xvst(dst_h, dst_argb4444, 32);
+ dst_argb4444 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Converts planar Y, U and V rows to 16-bit pixels laid out as
+// 0x8000 | (R >> 3) << 10 | (G >> 3) << 5 | (B >> 3), using the coefficients
+// in yuvconstants. Every iteration consumes 32 Y bytes and 16 bytes each of U
+// and V, and writes 64 bytes (32 pixels). width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+void I422ToARGB1555Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 32;
+ __m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg;
+ // Referenced by name inside READYUV422_D; do not rename.
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ // Top bit of every halfword set: the 1-bit alpha field.
+ __m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000};
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ __m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
+ __m256i dst_l, dst_h;
+
+ READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
+ YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l,
+ g_h, r_l, r_h);
+ // Keep the top 5 bits of each channel and move G and R into position.
+ b_l = __lasx_xvsrli_h(b_l, 3);
+ b_h = __lasx_xvsrli_h(b_h, 3);
+ g_l = __lasx_xvsrli_h(g_l, 3);
+ g_h = __lasx_xvsrli_h(g_h, 3);
+ g_l = __lasx_xvslli_h(g_l, 5);
+ g_h = __lasx_xvslli_h(g_h, 5);
+ r_l = __lasx_xvsrli_h(r_l, 3);
+ r_h = __lasx_xvsrli_h(r_h, 3);
+ r_l = __lasx_xvslli_h(r_l, 10);
+ r_h = __lasx_xvslli_h(r_h, 10);
+ // Merge A, R, G and B fields into one halfword per pixel.
+ r_l = __lasx_xvor_v(r_l, alpha);
+ r_h = __lasx_xvor_v(r_h, alpha);
+ r_l = __lasx_xvor_v(r_l, g_l);
+ r_h = __lasx_xvor_v(r_h, g_h);
+ r_l = __lasx_xvor_v(r_l, b_l);
+ r_h = __lasx_xvor_v(r_h, b_h);
+ // Regroup the 128-bit lanes of the two result vectors for the stores.
+ dst_l = __lasx_xvpermi_q(r_h, r_l, 0x20);
+ dst_h = __lasx_xvpermi_q(r_h, r_l, 0x31);
+ __lasx_xvst(dst_l, dst_argb1555, 0);
+ __lasx_xvst(dst_h, dst_argb1555, 32);
+ dst_argb1555 += 64;
+ src_y += 32;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
+// Copies the even-indexed bytes of a packed YUY2 row to dst_y. Every
+// iteration reads 64 source bytes and writes 32 bytes. width / 32 iterations
+// are run; a remainder of width % 32 pixels is not handled here.
+void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i first = __lasx_xvld(src_yuy2, 0);
+    __m256i second = __lasx_xvld(src_yuy2, 32);
+    __m256i luma = __lasx_xvpickev_b(second, first);
+
+    // The pick works per 128-bit lane; 0xD8 puts the doublewords back into
+    // source order.
+    luma = __lasx_xvpermi_d(luma, 0xD8);
+    __lasx_xvst(luma, dst_y, 0);
+    dst_y += 32;
+    src_yuy2 += 64;
+  }
+}
+
+// Extracts chroma from two adjacent packed YUY2 rows (src_yuy2 and
+// src_yuy2 + src_stride_yuy2): the odd-indexed bytes of both rows are
+// averaged with rounding and then split into dst_u (even chroma bytes) and
+// dst_v (odd chroma bytes). Every iteration reads 64 bytes from each row and
+// writes 16 bytes to each destination. width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+void YUY2ToUVRow_LASX(const uint8_t* src_yuy2,
+                      int src_stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* next_row = src_yuy2 + src_stride_yuy2;
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i row0_a = __lasx_xvld(src_yuy2, 0);
+    __m256i row0_b = __lasx_xvld(src_yuy2, 32);
+    __m256i row1_a = __lasx_xvld(next_row, 0);
+    __m256i row1_b = __lasx_xvld(next_row, 32);
+    __m256i chroma0 = __lasx_xvpickod_b(row0_b, row0_a);
+    __m256i chroma1 = __lasx_xvpickod_b(row1_b, row1_a);
+    __m256i chroma = __lasx_xvavgr_bu(chroma1, chroma0);
+    __m256i u_bytes, v_bytes;
+
+    chroma = __lasx_xvpermi_d(chroma, 0xD8);
+    u_bytes = __lasx_xvpickev_b(chroma, chroma);
+    v_bytes = __lasx_xvpickod_b(chroma, chroma);
+    // Doublewords 0 and 2 hold the 16 valid bytes of each plane.
+    __lasx_xvstelm_d(u_bytes, dst_u, 0, 0);
+    __lasx_xvstelm_d(u_bytes, dst_u, 8, 2);
+    __lasx_xvstelm_d(v_bytes, dst_v, 0, 0);
+    __lasx_xvstelm_d(v_bytes, dst_v, 8, 2);
+    dst_u += 16;
+    dst_v += 16;
+    src_yuy2 += 64;
+    next_row += 64;
+  }
+}
+
+// Extracts chroma from a single packed YUY2 row: the odd-indexed bytes are
+// split into dst_u (even chroma bytes) and dst_v (odd chroma bytes). Every
+// iteration reads 64 source bytes and writes 16 bytes to each destination.
+// width / 32 iterations are run; a remainder of width % 32 pixels is not
+// handled here.
+void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i first = __lasx_xvld(src_yuy2, 0);
+    __m256i second = __lasx_xvld(src_yuy2, 32);
+    __m256i chroma = __lasx_xvpickod_b(second, first);
+    __m256i u_bytes, v_bytes;
+
+    chroma = __lasx_xvpermi_d(chroma, 0xD8);
+    u_bytes = __lasx_xvpickev_b(chroma, chroma);
+    v_bytes = __lasx_xvpickod_b(chroma, chroma);
+    // Doublewords 0 and 2 hold the 16 valid bytes of each plane.
+    __lasx_xvstelm_d(u_bytes, dst_u, 0, 0);
+    __lasx_xvstelm_d(u_bytes, dst_u, 8, 2);
+    __lasx_xvstelm_d(v_bytes, dst_v, 0, 0);
+    __lasx_xvstelm_d(v_bytes, dst_v, 8, 2);
+    dst_u += 16;
+    dst_v += 16;
+    src_yuy2 += 64;
+  }
+}
+
+// Copies the odd-indexed bytes of a packed UYVY row to dst_y. Every iteration
+// reads 64 source bytes and writes 32 bytes. width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i first = __lasx_xvld(src_uyvy, 0);
+    __m256i second = __lasx_xvld(src_uyvy, 32);
+    __m256i luma = __lasx_xvpickod_b(second, first);
+
+    // The pick works per 128-bit lane; 0xD8 puts the doublewords back into
+    // source order.
+    luma = __lasx_xvpermi_d(luma, 0xD8);
+    __lasx_xvst(luma, dst_y, 0);
+    dst_y += 32;
+    src_uyvy += 64;
+  }
+}
+
+// Extracts chroma from two adjacent packed UYVY rows (src_uyvy and
+// src_uyvy + src_stride_uyvy): the even-indexed bytes of both rows are
+// averaged with rounding and then split into dst_u (even chroma bytes) and
+// dst_v (odd chroma bytes). Every iteration reads 64 bytes from each row and
+// writes 16 bytes to each destination. width / 32 iterations are run; a
+// remainder of width % 32 pixels is not handled here.
+void UYVYToUVRow_LASX(const uint8_t* src_uyvy,
+                      int src_stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* next_row = src_uyvy + src_stride_uyvy;
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i row0_a = __lasx_xvld(src_uyvy, 0);
+    __m256i row0_b = __lasx_xvld(src_uyvy, 32);
+    __m256i row1_a = __lasx_xvld(next_row, 0);
+    __m256i row1_b = __lasx_xvld(next_row, 32);
+    __m256i chroma0 = __lasx_xvpickev_b(row0_b, row0_a);
+    __m256i chroma1 = __lasx_xvpickev_b(row1_b, row1_a);
+    __m256i chroma = __lasx_xvavgr_bu(chroma1, chroma0);
+    __m256i u_bytes, v_bytes;
+
+    chroma = __lasx_xvpermi_d(chroma, 0xD8);
+    u_bytes = __lasx_xvpickev_b(chroma, chroma);
+    v_bytes = __lasx_xvpickod_b(chroma, chroma);
+    // Doublewords 0 and 2 hold the 16 valid bytes of each plane.
+    __lasx_xvstelm_d(u_bytes, dst_u, 0, 0);
+    __lasx_xvstelm_d(u_bytes, dst_u, 8, 2);
+    __lasx_xvstelm_d(v_bytes, dst_v, 0, 0);
+    __lasx_xvstelm_d(v_bytes, dst_v, 8, 2);
+    dst_u += 16;
+    dst_v += 16;
+    src_uyvy += 64;
+    next_row += 64;
+  }
+}
+
+// Extracts chroma from a single packed UYVY row: the even-indexed bytes are
+// split into dst_u (even chroma bytes) and dst_v (odd chroma bytes). Every
+// iteration reads 64 source bytes and writes 16 bytes to each destination.
+// width / 32 iterations are run; a remainder of width % 32 pixels is not
+// handled here.
+void UYVYToUV422Row_LASX(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  int blocks = width / 32;
+
+  while (blocks-- > 0) {
+    __m256i first = __lasx_xvld(src_uyvy, 0);
+    __m256i second = __lasx_xvld(src_uyvy, 32);
+    __m256i chroma = __lasx_xvpickev_b(second, first);
+    __m256i u_bytes, v_bytes;
+
+    chroma = __lasx_xvpermi_d(chroma, 0xD8);
+    u_bytes = __lasx_xvpickev_b(chroma, chroma);
+    v_bytes = __lasx_xvpickod_b(chroma, chroma);
+    // Doublewords 0 and 2 hold the 16 valid bytes of each plane.
+    __lasx_xvstelm_d(u_bytes, dst_u, 0, 0);
+    __lasx_xvstelm_d(u_bytes, dst_u, 8, 2);
+    __lasx_xvstelm_d(v_bytes, dst_v, 0, 0);
+    __lasx_xvstelm_d(v_bytes, dst_v, 8, 2);
+    dst_u += 16;
+    dst_v += 16;
+    src_uyvy += 64;
+  }
+}
+
+// Computes one luma byte per 4-byte pixel as
+//   (0x1080 + 0x19 * byte0 + 0x81 * byte1 + 0x42 * byte2) >> 8
+// where byte0..byte2 are the first three bytes of the pixel (B, G, R in the
+// layout written by STOREARGB in this file); the fourth byte is ignored.
+// Every iteration reads 128 source bytes (32 pixels) and writes 32 bytes.
+// width / 32 iterations are run; a remainder of width % 32 pixels is not
+// handled here.
+void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ int x;
+ int len = width / 32;
+ __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, dst0;
+ __m256i const_19 = __lasx_xvldi(0x19);
+ __m256i const_42 = __lasx_xvldi(0x42);
+ __m256i const_81 = __lasx_xvldi(0x81);
+ // Bias added to every halfword accumulator before the coefficients.
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
+ 0x1080108010801080, 0x1080108010801080};
+ // Word permutation that restores pixel order after the per-lane picks.
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ // vec0/vec1: bytes 0 and 2 of every pixel; vec2/vec3: bytes 1 and 3.
+ vec0 = __lasx_xvpickev_b(src1, src0);
+ vec1 = __lasx_xvpickev_b(src3, src2);
+ vec2 = __lasx_xvpickod_b(src1, src0);
+ vec3 = __lasx_xvpickod_b(src3, src2);
+ tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19);
+ tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19);
+ tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81);
+ tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81);
+ tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42);
+ tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42);
+ // Narrow each halfword to its upper byte (shift right by 8).
+ dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ __lasx_xvst(dst0, dst_y, 0);
+ src_argb0 += 128;
+ dst_y += 32;
+ }
+}
+
+// Computes 2x2-subsampled U and V from two adjacent rows of 4-byte pixels
+// (src_argb0 and src_argb0 + src_stride_argb). Every iteration reads 128
+// bytes (32 pixels) from each row and writes 16 bytes to dst_u and 16 bytes
+// to dst_v. width / 32 iterations are run; a remainder of width % 32 pixels
+// is not handled here.
+//
+// Per channel, the values of a 2x2 block are combined as
+// (row0 + row1 for the even pixel, row0 + row1 for the odd pixel, rounded
+// average of the two sums), i.e. roughly twice the block mean. The
+// coefficient vectors therefore hold HALF of the value in their name
+// (const_0x70 holds 0x38, const_0x4A holds 0x25, and so on).
+void ARGBToUVRow_LASX(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;
+ const uint8_t* src_argb1 = src_argb0 + src_stride_argb;
+
+ __m256i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m256i vec0, vec1, vec2, vec3;
+ __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
+ __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
+ 0x0038003800380038, 0x0038003800380038};
+ __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
+ 0x0025002500250025, 0x0025002500250025};
+ __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
+ 0x0013001300130013, 0x0013001300130013};
+ __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
+ 0x002f002f002f002f, 0x002f002f002f002f};
+ __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
+ 0x0009000900090009, 0x0009000900090009};
+ // Word permutation that restores sample order after the per-lane picks.
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
+ 0x0000000700000003};
+ // Bias added to every halfword accumulator before the coefficients.
+ __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
+ src_argb0, 96, src0, src1, src2, src3);
+ DUP4_ARG2(__lasx_xvld, src_argb1, 0, src_argb1, 32, src_argb1, 64,
+ src_argb1, 96, src4, src5, src6, src7);
+ // Vertical sums of the even bytes (bytes 0 and 2 of every pixel).
+ vec0 = __lasx_xvaddwev_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwev_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwev_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwev_h_bu(src3, src7);
+ // tmp0/tmp1: byte-0 sums; tmp2/tmp3: byte-2 sums.
+ tmp0 = __lasx_xvpickev_h(vec1, vec0);
+ tmp1 = __lasx_xvpickev_h(vec3, vec2);
+ tmp2 = __lasx_xvpickod_h(vec1, vec0);
+ tmp3 = __lasx_xvpickod_h(vec3, vec2);
+ // Vertical sums of the odd bytes; only byte 1 of every pixel is kept.
+ vec0 = __lasx_xvaddwod_h_bu(src0, src4);
+ vec1 = __lasx_xvaddwod_h_bu(src1, src5);
+ vec2 = __lasx_xvaddwod_h_bu(src2, src6);
+ vec3 = __lasx_xvaddwod_h_bu(src3, src7);
+ tmp4 = __lasx_xvpickev_h(vec1, vec0);
+ tmp5 = __lasx_xvpickev_h(vec3, vec2);
+ // Horizontal step: rounded average of the even-pixel and odd-pixel sums.
+ // src0 = byte-0 channel, src1 = byte-2 channel, src2 = byte-1 channel.
+ vec0 = __lasx_xvpickev_h(tmp1, tmp0);
+ vec1 = __lasx_xvpickod_h(tmp1, tmp0);
+ src0 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp3, tmp2);
+ vec1 = __lasx_xvpickod_h(tmp3, tmp2);
+ src1 = __lasx_xvavgr_h(vec0, vec1);
+ vec0 = __lasx_xvpickev_h(tmp5, tmp4);
+ vec1 = __lasx_xvpickod_h(tmp5, tmp4);
+ src2 = __lasx_xvavgr_h(vec0, vec1);
+ // dst0 (U) = bias + 0x38 * byte0 - 0x25 * byte1 - 0x13 * byte2
+ dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+ dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
+ dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
+ // dst1 (V) = bias + 0x38 * byte2 - 0x2f * byte1 - 0x09 * byte0
+ dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
+ dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
+ dst0 = __lasx_xvperm_w(dst0, control);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ // Narrow each halfword to its upper byte (shift right by 8).
+ dst0 = __lasx_xvssrani_b_h(dst0, dst0, 8);
+ dst1 = __lasx_xvssrani_b_h(dst1, dst1, 8);
+ // Doublewords 0 and 2 hold the 16 valid bytes of each plane.
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst1, dst_v, 0, 0);
+ __lasx_xvstelm_d(dst1, dst_v, 8, 2);
+ src_argb0 += 128;
+ src_argb1 += 128;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+// Converts 4-byte pixels to 3-byte pixels by dropping the fourth byte of
+// every pixel (byte order of the remaining three is kept). Pixels are handled
+// in groups of 32 (128 bytes in, 96 bytes out).
+//
+// Inside the loop each of the four result vectors carries 24 valid bytes but
+// is written with a full 32-byte store at a 24-byte stride, so every store
+// spills 8 bytes that the next store overwrites - and the fourth store
+// reaches 8 bytes past the group's 96 bytes. For that reason the loop runs
+// only (width / 32) - 1 times and the final group is handled separately
+// below, writing its last 24 bytes with three 8-byte element stores.
+// The final group is processed unconditionally, so at least 32 pixels are
+// always read and written; a remainder of width % 32 pixels is not handled.
+void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ // Per 128-bit lane: keep bytes 0-2, 4-6, 8-10 and 12-14.
+ __m256i shuf = {0x0908060504020100, 0x000000000E0D0C0A, 0x0908060504020100,
+ 0x000000000E0D0C0A};
+ // Gathers the 12 valid bytes of each lane into the low 24 bytes.
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ // Overlapping stores: keep them in ascending order.
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
+ // Final 32-pixel group.
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ // Exactly 24 bytes for the last vector, so nothing is written past byte 96.
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+// Converts 4-byte pixels to 3-byte pixels by dropping the fourth byte of
+// every pixel and reversing the order of the remaining three (byte 2, byte 1,
+// byte 0). Pixels are handled in groups of 32 (128 bytes in, 96 bytes out).
+//
+// Inside the loop each of the four result vectors carries 24 valid bytes but
+// is written with a full 32-byte store at a 24-byte stride, so every store
+// spills 8 bytes that the next store overwrites - and the fourth store
+// reaches 8 bytes past the group's 96 bytes. For that reason the loop runs
+// only (width / 32) - 1 times and the final group is handled separately
+// below, writing its last 24 bytes with three 8-byte element stores.
+// The final group is processed unconditionally, so at least 32 pixels are
+// always read and written; a remainder of width % 32 pixels is not handled.
+void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ int len = (width / 32) - 1;
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ // Per 128-bit lane: bytes 2,1,0, 6,5,4, 10,9,8, 14,13,12.
+ __m256i shuf = {0x090A040506000102, 0x000000000C0D0E08, 0x090A040506000102,
+ 0x000000000C0D0E08};
+ // Gathers the 12 valid bytes of each lane into the low 24 bytes.
+ __m256i control = {0x0000000100000000, 0x0000000400000002, 0x0000000600000005,
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ // Overlapping stores: keep them in ascending order.
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ __lasx_xvst(tmp3, dst_rgb, 72);
+ dst_rgb += 96;
+ src_argb += 128;
+ }
+ // Final 32-pixel group.
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, 96,
+ src0, src1, src2, src3);
+ tmp0 = __lasx_xvshuf_b(src0, src0, shuf);
+ tmp1 = __lasx_xvshuf_b(src1, src1, shuf);
+ tmp2 = __lasx_xvshuf_b(src2, src2, shuf);
+ tmp3 = __lasx_xvshuf_b(src3, src3, shuf);
+ tmp0 = __lasx_xvperm_w(tmp0, control);
+ tmp1 = __lasx_xvperm_w(tmp1, control);
+ tmp2 = __lasx_xvperm_w(tmp2, control);
+ tmp3 = __lasx_xvperm_w(tmp3, control);
+ __lasx_xvst(tmp0, dst_rgb, 0);
+ __lasx_xvst(tmp1, dst_rgb, 24);
+ __lasx_xvst(tmp2, dst_rgb, 48);
+ dst_rgb += 72;
+ // Exactly 24 bytes for the last vector, so nothing is written past byte 96.
+ __lasx_xvstelm_d(tmp3, dst_rgb, 0, 0);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 8, 1);
+ __lasx_xvstelm_d(tmp3, dst_rgb, 16, 2);
+}
+
+void ARGBToRGB565Row_LASX(const uint8_t* src_argb,
+                          uint8_t* dst_rgb,
+                          int width) {
+  // Packs 16 ARGB pixels (64 bytes) into 16 two-byte RGB565 pixels (32 bytes)
+  // per iteration and runs width / 16 iterations.
+  __m256i all_zero = __lasx_xvldi(0);
+  // Per-byte left-shift counts: 0 for even bytes, 3 for odd bytes.
+  __m256i odd_byte_shl = {0x0300030003000300, 0x0300030003000300,
+                          0x0300030003000300, 0x0300030003000300};
+  __m256i pix_lo, pix_hi, even_bytes, odd_bytes, packed;
+  int remaining;
+
+  for (remaining = width / 16; remaining > 0; --remaining) {
+    pix_lo = __lasx_xvld(src_argb, 0);
+    pix_hi = __lasx_xvld(src_argb, 32);
+    src_argb += 64;
+    // Bytes 0 and 2 of every pixel: keep the top five bits of each and move
+    // the second one to the top of its 16-bit slot.
+    even_bytes = __lasx_xvpickev_b(pix_hi, pix_lo);
+    even_bytes = __lasx_xvsrli_b(even_bytes, 3);
+    even_bytes = __lasx_xvsll_b(even_bytes, odd_byte_shl);
+    // Byte 1 of every pixel: widen to 16 bits, keep six bits, place at bit 5.
+    odd_bytes = __lasx_xvpickod_b(pix_hi, pix_lo);
+    odd_bytes = __lasx_xvpackev_b(all_zero, odd_bytes);
+    odd_bytes = __lasx_xvsrli_h(odd_bytes, 2);
+    odd_bytes = __lasx_xvslli_h(odd_bytes, 5);
+    packed = __lasx_xvor_v(even_bytes, odd_bytes);
+    // Undo the lane interleave introduced by the pick operations.
+    packed = __lasx_xvpermi_d(packed, 0xD8);
+    __lasx_xvst(packed, dst_rgb, 0);
+    dst_rgb += 32;
+  }
+}
+
+void ARGBToARGB1555Row_LASX(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width) {
+  // Packs 16 ARGB pixels (64 bytes) into 16 two-byte ARGB1555 pixels
+  // (32 bytes) per iteration and runs width / 16 iterations.
+  __m256i all_zero = __lasx_xvldi(0);
+  // Right-shift counts for bytes 1 and 3 of each pixel: 3 and 7.
+  __m256i ga_shr = {0x0703070307030703, 0x0703070307030703,
+                    0x0703070307030703, 0x0703070307030703};
+  // Left-shift counts for bytes 0 and 2 of each pixel: 0 and 2.
+  __m256i br_shl = {0x0200020002000200, 0x0200020002000200,
+                    0x0200020002000200, 0x0200020002000200};
+  __m256i pix_lo, pix_hi, br, ga, mid_bits, top_bit, packed;
+  int remaining;
+
+  for (remaining = width / 16; remaining > 0; --remaining) {
+    pix_lo = __lasx_xvld(src_argb, 0);
+    pix_hi = __lasx_xvld(src_argb, 32);
+    src_argb += 64;
+    // Bytes 0 and 2: five bits each, landing at bit 0 and bit 10.
+    br = __lasx_xvpickev_b(pix_hi, pix_lo);
+    br = __lasx_xvsrli_b(br, 3);
+    br = __lasx_xvsll_b(br, br_shl);
+    // Bytes 1 and 3: five bits and one bit respectively.
+    ga = __lasx_xvpickod_b(pix_hi, pix_lo);
+    ga = __lasx_xvsrl_b(ga, ga_shr);
+    mid_bits = __lasx_xvpackev_b(all_zero, ga);
+    mid_bits = __lasx_xvslli_h(mid_bits, 5);
+    top_bit = __lasx_xvpackod_b(all_zero, ga);
+    top_bit = __lasx_xvslli_h(top_bit, 15);
+    packed = __lasx_xvor_v(br, mid_bits);
+    packed = __lasx_xvor_v(packed, top_bit);
+    // Undo the lane interleave introduced by the pick operations.
+    packed = __lasx_xvpermi_d(packed, 0xD8);
+    __lasx_xvst(packed, dst_rgb, 0);
+    dst_rgb += 32;
+  }
+}
+
+void ARGBToARGB4444Row_LASX(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width) {
+  // Packs 16 ARGB pixels (64 bytes) into 16 two-byte ARGB4444 pixels
+  // (32 bytes) per iteration and runs width / 16 iterations.
+  __m256i pix_lo, pix_hi, low_nibbles, high_nibbles, packed;
+  int remaining;
+
+  for (remaining = width / 16; remaining > 0; --remaining) {
+    pix_lo = __lasx_xvld(src_argb, 0);
+    pix_hi = __lasx_xvld(src_argb, 32);
+    src_argb += 64;
+    // Bytes 1 and 3 of each pixel keep their top nibble in place.
+    high_nibbles = __lasx_xvpickod_b(pix_hi, pix_lo);
+    high_nibbles = __lasx_xvandi_b(high_nibbles, 0xF0);
+    // Bytes 0 and 2 of each pixel contribute their top nibble as the low one.
+    low_nibbles = __lasx_xvpickev_b(pix_hi, pix_lo);
+    low_nibbles = __lasx_xvsrli_b(low_nibbles, 4);
+    packed = __lasx_xvor_v(high_nibbles, low_nibbles);
+    // Undo the lane interleave introduced by the pick operations.
+    packed = __lasx_xvpermi_d(packed, 0xD8);
+    __lasx_xvst(packed, dst_rgb, 0);
+    dst_rgb += 32;
+  }
+}
+
+void ARGBToUV444Row_LASX(const uint8_t* src_argb,  // Produces one U byte and one V byte per ARGB pixel; each iteration reads 128 bytes and writes 32 bytes to each output.
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1;
+ __m256i const_112 = __lasx_xvldi(112);
+ __m256i const_74 = __lasx_xvldi(74);
+ __m256i const_38 = __lasx_xvldi(38);
+ __m256i const_94 = __lasx_xvldi(94);
+ __m256i const_18 = __lasx_xvldi(18);
+ __m256i const_0x8080 = {0x8080808080808080, 0x8080808080808080,  // Accumulator seed 0x8080 per 16-bit lane: 128 << 8 plus 0x80, applied before the final >> 8.
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,  // Word order 0,4,1,5,2,6,3,7: interleaves words from the two 128-bit lanes after narrowing.
+ 0x0000000700000003};
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);
+ tmp0 = __lasx_xvpickev_h(src1, src0);  // Bytes 0,1 of each pixel.
+ tmp1 = __lasx_xvpickod_h(src1, src0);  // Bytes 2,3 of each pixel.
+ tmp2 = __lasx_xvpickev_h(src3, src2);
+ tmp3 = __lasx_xvpickod_h(src3, src2);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);  // U: 0x8080 + 112 * byte0.
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);  // 74 * byte1.
+ reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
+ reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);  // + 38 * byte2.
+ reg3 = __lasx_xvmaddwev_h_bu(reg3, tmp3, const_38);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);  // >> 8 and narrow to bytes.
+ dst0 = __lasx_xvperm_w(dst0, control);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);  // V: 0x8080 + 112 * byte2.
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);  // 18 * byte0.
+ reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
+ reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);  // + 94 * byte1.
+ reg3 = __lasx_xvmaddwod_h_bu(reg3, tmp2, const_94);
+ reg0 = __lasx_xvsub_h(reg0, reg2);
+ reg1 = __lasx_xvsub_h(reg1, reg3);
+ dst1 = __lasx_xvssrani_b_h(reg1, reg0, 8);
+ dst1 = __lasx_xvperm_w(dst1, control);
+ __lasx_xvst(dst0, dst_u, 0);
+ __lasx_xvst(dst1, dst_v, 0);
+ dst_u += 32;
+ dst_v += 32;
+ src_argb += 128;
+ }
+}
+
+void ARGBMultiplyRow_LASX(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  // Multiplies two ARGB rows byte by byte, 8 pixels (32 bytes) per iteration.
+  // Each product is formed as ((a * 257) * b) >> 16 in 16-bit lanes and then
+  // narrowed back to a byte.
+  __m256i all_zero = __lasx_xvldi(0);
+  __m256i pix_a, pix_b, a_lo, a_hi, b_lo, b_hi, prod_lo, prod_hi, result;
+  int remaining;
+
+  for (remaining = width / 8; remaining > 0; --remaining) {
+    pix_a = __lasx_xvld(src_argb0, 0);
+    pix_b = __lasx_xvld(src_argb1, 0);
+    // Low half of each 128-bit lane: first row duplicated, second row widened.
+    a_lo = __lasx_xvilvl_b(pix_a, pix_a);
+    b_lo = __lasx_xvilvl_b(all_zero, pix_b);
+    prod_lo = __lasx_xvmuh_hu(a_lo, b_lo);
+    // High half of each 128-bit lane, same treatment.
+    a_hi = __lasx_xvilvh_b(pix_a, pix_a);
+    b_hi = __lasx_xvilvh_b(all_zero, pix_b);
+    prod_hi = __lasx_xvmuh_hu(a_hi, b_hi);
+    result = __lasx_xvpickev_b(prod_hi, prod_lo);
+    __lasx_xvst(result, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBAddRow_LASX(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  // Unsigned saturating byte-wise sum of two ARGB rows, 8 pixels (32 bytes)
+  // per iteration; runs width / 8 iterations.
+  __m256i pix_a, pix_b, sum;
+  int remaining;
+
+  for (remaining = width / 8; remaining > 0; --remaining) {
+    pix_a = __lasx_xvld(src_argb0, 0);
+    pix_b = __lasx_xvld(src_argb1, 0);
+    sum = __lasx_xvsadd_bu(pix_a, pix_b);
+    __lasx_xvst(sum, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSubtractRow_LASX(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  // Unsigned saturating byte-wise difference (first row minus second row) of
+  // two ARGB rows, 8 pixels (32 bytes) per iteration; runs width / 8
+  // iterations.
+  __m256i minuend, subtrahend, diff;
+  int remaining;
+
+  for (remaining = width / 8; remaining > 0; --remaining) {
+    minuend = __lasx_xvld(src_argb0, 0);
+    subtrahend = __lasx_xvld(src_argb1, 0);
+    diff = __lasx_xvssub_bu(minuend, subtrahend);
+    __lasx_xvst(diff, dst_argb, 0);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBAttenuateRow_LASX(const uint8_t* src_argb,  // Scales bytes 0..2 of every ARGB pixel by the pixel's byte 3 (alpha) and copies alpha through; 16 pixels (64 bytes) per iteration.
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;  // Pixels beyond the last full group of 16 are not processed here.
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, reg2, reg3, reg4, reg5;
+ __m256i b, g, r, a, dst0, dst1;
+ __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,  // Halfword order 0,4,1,5,2,6,3,7 per lane: re-interleaves the even/odd products after narrowing.
+ 0x0007000300060002};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Bytes 0 and 2 of each pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // Bytes 1 and 3 of each pixel.
+ b = __lasx_xvpackev_b(tmp0, tmp0);  // Each channel byte is duplicated into both halves of a 16-bit lane (value * 257).
+ r = __lasx_xvpackod_b(tmp0, tmp0);
+ g = __lasx_xvpackev_b(tmp1, tmp1);
+ a = __lasx_xvpackod_b(tmp1, tmp1);
+ reg0 = __lasx_xvmulwev_w_hu(b, a);  // 32-bit products channel * alpha, even and odd lanes separately.
+ reg1 = __lasx_xvmulwod_w_hu(b, a);
+ reg2 = __lasx_xvmulwev_w_hu(r, a);
+ reg3 = __lasx_xvmulwod_w_hu(r, a);
+ reg4 = __lasx_xvmulwev_w_hu(g, a);
+ reg5 = __lasx_xvmulwod_w_hu(g, a);
+ reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);  // >> 24 and narrow to 16 bits.
+ reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
+ reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+ reg0 = __lasx_xvshuf_h(control, reg0, reg0);
+ reg2 = __lasx_xvshuf_h(control, reg2, reg2);
+ reg4 = __lasx_xvshuf_h(control, reg4, reg4);
+ tmp0 = __lasx_xvpackev_b(reg4, reg0);  // Scaled byte0 / byte1 pairs.
+ tmp1 = __lasx_xvpackev_b(a, reg2);  // Scaled byte2 paired with the unmodified alpha byte.
+ dst0 = __lasx_xvilvl_h(tmp1, tmp0);
+ dst1 = __lasx_xvilvh_h(tmp1, tmp0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ src_argb += 64;
+ }
+}
+
+void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb,  // ARGB to RGB565 with a 4-byte dither pattern added to each colour byte before truncation; 16 pixels per iteration.
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ int x;
+ int len = width / 16;  // Pixels beyond the last full group of 16 are not processed here.
+ __m256i src0, src1, tmp0, tmp1, dst0;
+ __m256i b, g, r;
+ __m256i zero = __lasx_xvldi(0);
+ __m256i vec_dither = __lasx_xvldrepl_w(&dither4, 0);  // Broadcast the 32-bit dither word to every word lane.
+
+ vec_dither = __lasx_xvilvl_b(zero, vec_dither);  // Widen the dither bytes to 16 bits so they can be added to the widened channels.
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Bytes 0 and 2 of each pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // Bytes 1 and 3 of each pixel.
+ b = __lasx_xvpackev_b(zero, tmp0);  // Zero-extend each channel to 16 bits; byte 3 (alpha) is not used.
+ r = __lasx_xvpackod_b(zero, tmp0);
+ g = __lasx_xvpackev_b(zero, tmp1);
+ b = __lasx_xvadd_h(b, vec_dither);
+ g = __lasx_xvadd_h(g, vec_dither);
+ r = __lasx_xvadd_h(r, vec_dither);
+ DUP2_ARG1(__lasx_xvclip255_h, b, g, b, g);  // NOTE(review): presumably clamps each lane to 0..255 — helper is defined outside this view.
+ r = __lasx_xvclip255_h(r);
+ b = __lasx_xvsrai_h(b, 3);  // Keep 5 bits.
+ g = __lasx_xvsrai_h(g, 2);  // Keep 6 bits.
+ r = __lasx_xvsrai_h(r, 3);  // Keep 5 bits.
+ g = __lasx_xvslli_h(g, 5);
+ r = __lasx_xvslli_h(r, 11);
+ dst0 = __lasx_xvor_v(b, g);
+ dst0 = __lasx_xvor_v(dst0, r);
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);  // Swap the middle 64-bit elements to undo the lane interleave from the pick operations.
+ __lasx_xvst(dst0, dst_rgb, 0);
+ src_argb += 64;
+ dst_rgb += 32;
+ }
+}
+
+void ARGBShuffleRow_LASX(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  // Reorders the four bytes of every pixel according to the first four
+  // bytes of shuffler; 16 pixels (64 bytes) per iteration, width / 16
+  // iterations.
+  //
+  // The selector starts as the byte offset of each pixel within its 128-bit
+  // lane (0, 4, 8, 12, each repeated four times); adding the broadcast
+  // shuffler word turns it into absolute byte indices.
+  __m256i selector = {0x0404040400000000, 0x0C0C0C0C08080808,
+                      0x0404040400000000, 0x0C0C0C0C08080808};
+  __m256i pattern = __lasx_xvldrepl_w(shuffler, 0);
+  __m256i in_lo, in_hi, out_lo, out_hi;
+  int remaining;
+
+  selector = __lasx_xvadd_b(selector, pattern);
+  for (remaining = width / 16; remaining > 0; --remaining) {
+    in_lo = __lasx_xvld(src_argb, 0);
+    in_hi = __lasx_xvld(src_argb, 32);
+    src_argb += 64;
+    out_lo = __lasx_xvshuf_b(in_lo, in_lo, selector);
+    out_hi = __lasx_xvshuf_b(in_hi, in_hi, selector);
+    __lasx_xvst(out_lo, dst_argb, 0);
+    __lasx_xvst(out_hi, dst_argb, 32);
+    dst_argb += 64;
+  }
+}
+
+void ARGBShadeRow_LASX(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  // Multiplies every byte of each ARGB pixel by the matching byte of value;
+  // 8 pixels (32 bytes) per iteration, width / 8 iterations. Both operands
+  // are byte-duplicated into 16-bit lanes, multiplied keeping the high
+  // 16 bits, and the high byte of that result is stored.
+  __m256i scale = __lasx_xvreplgr2vr_w(value);
+  __m256i pixels, lo, hi, shaded;
+  int remaining;
+
+  scale = __lasx_xvilvl_b(scale, scale);
+  for (remaining = width / 8; remaining > 0; --remaining) {
+    pixels = __lasx_xvld(src_argb, 0);
+    src_argb += 32;
+    lo = __lasx_xvilvl_b(pixels, pixels);
+    lo = __lasx_xvmuh_hu(lo, scale);
+    hi = __lasx_xvilvh_b(pixels, pixels);
+    hi = __lasx_xvmuh_hu(hi, scale);
+    shaded = __lasx_xvpickod_b(hi, lo);
+    __lasx_xvst(shaded, dst_argb, 0);
+    dst_argb += 32;
+  }
+}
+
+void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  // Replaces bytes 0..2 of every ARGB pixel with one gray value and keeps
+  // byte 3; 16 pixels (64 bytes) per iteration, width / 16 iterations.
+  // gray = (29 * byte0 + 150 * byte1 + 77 * byte2 + 128) >> 8.
+  __m256i round_128 = __lasx_xvldi(0x480);
+  __m256i weight_150 = __lasx_xvldi(0x96);
+  __m256i weight_29_77 = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
+                          0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+  __m256i in_lo, in_hi, even_bytes, odd_bytes;
+  __m256i sum_even, sum_odd, luma, gray_gray, gray_alpha, out_lo, out_hi;
+  int remaining;
+
+  for (remaining = width / 16; remaining > 0; --remaining) {
+    in_lo = __lasx_xvld(src_argb, 0);
+    in_hi = __lasx_xvld(src_argb, 32);
+    src_argb += 64;
+    even_bytes = __lasx_xvpickev_b(in_hi, in_lo);
+    odd_bytes = __lasx_xvpickod_b(in_hi, in_lo);
+    sum_even = __lasx_xvdp2_h_bu(even_bytes, weight_29_77);
+    sum_odd = __lasx_xvmaddwev_h_bu(round_128, odd_bytes, weight_150);
+    luma = __lasx_xvadd_h(sum_even, sum_odd);
+    // The high byte of each 16-bit sum is the gray value.
+    gray_gray = __lasx_xvpackod_b(luma, luma);
+    gray_alpha = __lasx_xvpackod_b(odd_bytes, luma);
+    out_lo = __lasx_xvilvl_h(gray_alpha, gray_gray);
+    out_hi = __lasx_xvilvh_h(gray_alpha, gray_gray);
+    __lasx_xvst(out_lo, dst_argb, 0);
+    __lasx_xvst(out_hi, dst_argb, 32);
+    dst_argb += 64;
+  }
+}
+
+void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width) {  // Applies fixed sepia weights to an ARGB row in place (dst_argb is both read and written); 16 pixels per iteration.
+ int x;
+ int len = width / 16;  // Pixels beyond the last full group of 16 are not processed here.
+ __m256i src0, src1, tmp0, tmp1;
+ __m256i reg0, reg1, spb, spg, spr;
+ __m256i dst0, dst1;
+ __m256i spb_g = __lasx_xvldi(68);  // Byte1 weights for the three outputs: 68, 88, 98.
+ __m256i spg_g = __lasx_xvldi(88);
+ __m256i spr_g = __lasx_xvldi(98);
+ __m256i spb_br = {0x2311231123112311, 0x2311231123112311, 0x2311231123112311,  // Output 0: weights 17 (byte0) and 35 (byte2).
+ 0x2311231123112311};
+ __m256i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16, 0x2D162D162D162D16,  // Output 1: weights 22 (byte0) and 45 (byte2).
+ 0x2D162D162D162D16};
+ __m256i spr_br = {0x3218321832183218, 0x3218321832183218, 0x3218321832183218,  // Output 2: weights 24 (byte0) and 50 (byte2).
+ 0x3218321832183218};
+ __m256i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908, 0x1706150413021100,  // Alternates even bytes of the second shuffle operand with odd bytes of the first.
+ 0x1F0E1D0C1B0A1908};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lasx_xvld, dst_argb, 0, dst_argb, 32, src0, src1);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Bytes 0 and 2 of each pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // Bytes 1 and 3 of each pixel.
+ DUP2_ARG2(__lasx_xvdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg);
+ spr = __lasx_xvdp2_h_bu(tmp0, spr_br);
+ spb = __lasx_xvmaddwev_h_bu(spb, tmp1, spb_g);  // Add the byte1 contribution.
+ spg = __lasx_xvmaddwev_h_bu(spg, tmp1, spg_g);
+ spr = __lasx_xvmaddwev_h_bu(spr, tmp1, spr_g);
+ spb = __lasx_xvsrli_h(spb, 7);  // Weighted sums are scaled down by 128.
+ spg = __lasx_xvsrli_h(spg, 7);
+ spr = __lasx_xvsrli_h(spr, 7);
+ spg = __lasx_xvsat_hu(spg, 7);  // Saturate to 8 bits; output 0 is not saturated because its weights sum to 120 (< 128).
+ spr = __lasx_xvsat_hu(spr, 7);
+ reg0 = __lasx_xvpackev_b(spg, spb);
+ reg1 = __lasx_xvshuf_b(tmp1, spr, shuff);  // New byte2 paired with the original byte3 (alpha).
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ dst_argb += 64;
+ }
+}
+
+void ARGB4444ToARGBRow_LASX(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
+                            int width) {
+  // Expands 32 two-byte ARGB4444 pixels (64 bytes) into 32 four-byte ARGB
+  // pixels (128 bytes) per iteration; runs width / 32 iterations. Every
+  // 4-bit field is replicated into both nibbles of its output byte.
+  __m256i in0, in1;
+  __m256i lo_nib0, hi_nib0, lo_nib1, hi_nib1;
+  __m256i pair0_lo, pair0_hi, pair1_lo, pair1_hi;
+  __m256i out0, out1, out2, out3;
+  int remaining;
+
+  for (remaining = width / 32; remaining > 0; --remaining) {
+    in0 = __lasx_xvld(src_argb4444, 0);
+    in1 = __lasx_xvld(src_argb4444, 32);
+    src_argb4444 += 64;
+
+    // Low nibble of each byte: n | (n << 4).
+    lo_nib0 = __lasx_xvandi_b(in0, 0x0F);
+    lo_nib1 = __lasx_xvandi_b(in1, 0x0F);
+    lo_nib0 = __lasx_xvor_v(lo_nib0, __lasx_xvslli_b(lo_nib0, 4));
+    lo_nib1 = __lasx_xvor_v(lo_nib1, __lasx_xvslli_b(lo_nib1, 4));
+
+    // High nibble of each byte: n | (n >> 4).
+    hi_nib0 = __lasx_xvandi_b(in0, 0xF0);
+    hi_nib1 = __lasx_xvandi_b(in1, 0xF0);
+    hi_nib0 = __lasx_xvor_v(hi_nib0, __lasx_xvsrli_b(hi_nib0, 4));
+    hi_nib1 = __lasx_xvor_v(hi_nib1, __lasx_xvsrli_b(hi_nib1, 4));
+
+    // Interleave the expanded nibbles back into byte order.
+    pair0_lo = __lasx_xvilvl_b(hi_nib0, lo_nib0);
+    pair0_hi = __lasx_xvilvh_b(hi_nib0, lo_nib0);
+    pair1_lo = __lasx_xvilvl_b(hi_nib1, lo_nib1);
+    pair1_hi = __lasx_xvilvh_b(hi_nib1, lo_nib1);
+
+    // Recombine 128-bit halves so the output is in pixel order.
+    out0 = __lasx_xvpermi_q(pair0_hi, pair0_lo, 0x20);
+    out1 = __lasx_xvpermi_q(pair0_hi, pair0_lo, 0x31);
+    out2 = __lasx_xvpermi_q(pair1_hi, pair1_lo, 0x20);
+    out3 = __lasx_xvpermi_q(pair1_hi, pair1_lo, 0x31);
+    __lasx_xvst(out0, dst_argb, 0);
+    __lasx_xvst(out1, dst_argb, 32);
+    __lasx_xvst(out2, dst_argb, 64);
+    __lasx_xvst(out3, dst_argb, 96);
+    dst_argb += 128;
+  }
+}
+
+void ARGB1555ToARGBRow_LASX(const uint8_t* src_argb1555,  // Expands 32 two-byte ARGB1555 pixels (64 bytes) into 32 four-byte ARGB pixels (128 bytes) per iteration.
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr, tmpa;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Low byte of each 16-bit pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // High byte of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // Field 0: low 5 bits of the low byte.
+ tmpg = __lasx_xvsrli_b(tmp0, 5);  // Field 1: top 3 bits of the low byte ...
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);  // ... joined with the low 2 bits of the high byte.
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);  // Field 2: bits 2..6 of the high byte.
+ tmpa = __lasx_xvsrli_b(tmp1, 7);  // Alpha bit ...
+ tmpa = __lasx_xvneg_b(tmpa);  // ... turned into 0x00 or 0xFF.
+ reg0 = __lasx_xvslli_b(tmpb, 3);  // Expand 5 bits to 8: (v << 3) | (v >> 2).
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, tmpa, tmpr, reg0, reg1);  // Interleave into 4-byte pixels: bytes, then halfwords.
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, tmpa, tmpr, reg2, reg3);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ dst2 = __lasx_xvilvl_h(reg3, reg2);
+ dst3 = __lasx_xvilvh_h(reg3, reg2);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,  // Recombine 128-bit halves before storing.
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_argb1555 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB565ToARGBRow_LASX(const uint8_t* src_rgb565,  // Expands 32 two-byte RGB565 pixels (64 bytes) into 32 four-byte ARGB pixels (128 bytes) per iteration, with byte 3 set to 0xFF.
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, reg3, dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Low byte of each 16-bit pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // High byte of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // 5-bit field from the low byte.
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);  // 5-bit field already sitting in the top bits of the high byte.
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);  // 6-bit field straddling both bytes.
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);  // 5 -> 8 bits: (v << 3) | (v >> 2).
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);  // 6 -> 8 bits: (v << 2) | (v >> 4).
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);  // Top-aligned 5 bits -> 8 bits: v | (v >> 5).
+ DUP2_ARG2(__lasx_xvilvl_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst0 = __lasx_xvilvl_h(reg1, reg0);
+ dst1 = __lasx_xvilvh_h(reg1, reg0);
+ DUP2_ARG2(__lasx_xvilvh_b, tmpg, tmpb, alpha, tmpr, reg0, reg1);
+ dst2 = __lasx_xvilvl_h(reg1, reg0);
+ dst3 = __lasx_xvilvh_h(reg1, reg0);
+ DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst1, dst0, 0x31, dst3, dst2,  // Recombine 128-bit halves before storing.
+ 0x20, dst3, dst2, 0x31, reg0, reg1, reg2, reg3);
+ __lasx_xvst(reg0, dst_argb, 0);
+ __lasx_xvst(reg1, dst_argb, 32);
+ __lasx_xvst(reg2, dst_argb, 64);
+ __lasx_xvst(reg3, dst_argb, 96);
+ src_rgb565 += 64;
+ dst_argb += 128;
+ }
+}
+
+void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,  // Expands 32 three-byte pixels (96 bytes) into 32 four-byte pixels (128 bytes) per iteration, appending 0xFF after each triplet.
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i reg0, reg1, reg2, reg3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,  // shuf0..shuf2 slide a 16-byte window to the start of each successive 12-byte group.
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,  // Keeps each triplet's byte order and takes every fourth output byte from the alpha operand (index 0x10).
+ 0x100B0A0910080706};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_rgb24, 0);
+ reg1 = __lasx_xvld(src_rgb24, 32);
+ reg2 = __lasx_xvld(src_rgb24, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);  // Regroup 128-bit halves so each lane holds 48 consecutive source bytes across src0..src2.
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,  // Put the lanes back into pixel order for the four stores.
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_rgb24 += 96;
+ dst_argb += 128;
+ }
+}
+
+void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {  // Expands 32 three-byte pixels (96 bytes) into 32 four-byte pixels (128 bytes) per iteration, reversing each triplet and appending 0xFF.
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
+ __m256i dst0, dst1, dst2, dst3;
+ __m256i alpha = __lasx_xvldi(0xFF);
+ __m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,  // shuf0..shuf2 slide a 16-byte window to the start of each successive 12-byte group.
+ 0x1B1A191817161514};
+ __m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
+ 0x0706050403020100};
+ __m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
+ 0x131211100F0E0D0C};
+ __m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,  // Emits bytes 2,1,0 of each triplet and takes every fourth output byte from the alpha operand (index 0x10).
+ 0x10090A0B10060708};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_raw, 0);
+ reg1 = __lasx_xvld(src_raw, 32);
+ reg2 = __lasx_xvld(src_raw, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);  // Regroup 128-bit halves so each lane holds 48 consecutive source bytes across src0..src2.
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
+ tmp1);
+ tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
+ DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
+ tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x20, reg3, reg2, 0x20, reg1, reg0,  // Put the lanes back into pixel order for the four stores.
+ 0x31, reg3, reg2, 0x31, dst0, dst1, dst2, dst3);
+ __lasx_xvst(dst0, dst_argb, 0);
+ __lasx_xvst(dst1, dst_argb, 32);
+ __lasx_xvst(dst2, dst_argb, 64);
+ __lasx_xvst(dst3, dst_argb, 96);
+ src_raw += 96;
+ dst_argb += 128;
+ }
+}
+
+void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,  // Computes one luma byte per two-byte ARGB1555 pixel; 32 pixels (64 bytes in, 32 bytes out) per iteration.
+ uint8_t* dst_y,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, reg2, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,  // Accumulator seed 0x1080 per 16-bit lane: 16 << 8 plus 0x80, applied before taking the high byte.
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_argb1555, 0);
+ src1 = __lasx_xvld(src_argb1555, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Low byte of each 16-bit pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // High byte of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // Three 5-bit fields are extracted; the top (alpha) bit is ignored.
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);  // Expand 5 bits to 8: (v << 3) | (v >> 2).
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);  // 0x1080 + 25 * tmpb + 129 * tmpg + 66 * tmpr, even and odd bytes handled separately.
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);  // Keep the high byte of each 16-bit sum (>> 8) and re-interleave even/odd results.
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);  // Swap the middle 64-bit elements to undo the lane interleave from the pick operations.
+ __lasx_xvst(dst0, dst_y, 0);
+ src_argb1555 += 64;
+ dst_y += 32;
+ }
+}
+
+void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,  // Reads 32 ARGB1555 pixels from each of two rows (src and src + stride) and writes 16 U and 16 V bytes per iteration.
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;  // Second input row.
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);  // NOTE(review): the const_* locals are not referenced by name below; presumably the RGBTOUV macro (defined outside this view) uses them — confirm before renaming or removing.
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
+ next_argb1555, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);  // Low bytes of each 16-bit pixel (first row, second row).
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);  // High bytes of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // Extract the three 5-bit fields for both rows; the top (alpha) bit is ignored.
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ tmpg = __lasx_xvsrli_b(tmp0, 5);
+ nexg = __lasx_xvsrli_b(tmp2, 5);
+ reg0 = __lasx_xvandi_b(tmp1, 0x03);
+ reg2 = __lasx_xvandi_b(tmp3, 0x03);
+ reg0 = __lasx_xvslli_b(reg0, 3);
+ reg2 = __lasx_xvslli_b(reg2, 3);
+ tmpg = __lasx_xvor_v(tmpg, reg0);
+ nexg = __lasx_xvor_v(nexg, reg2);
+ reg1 = __lasx_xvandi_b(tmp1, 0x7C);
+ reg3 = __lasx_xvandi_b(tmp3, 0x7C);
+ tmpr = __lasx_xvsrli_b(reg1, 2);
+ nexr = __lasx_xvsrli_b(reg3, 2);
+ reg0 = __lasx_xvslli_b(tmpb, 3);  // Expand 5 bits to 8 for the first row: (v << 3) | (v >> 2).
+ reg1 = __lasx_xvslli_b(tmpg, 3);
+ reg2 = __lasx_xvslli_b(tmpr, 3);
+ tmpb = __lasx_xvsrli_b(tmpb, 2);
+ tmpg = __lasx_xvsrli_b(tmpg, 2);
+ tmpr = __lasx_xvsrli_b(tmpr, 2);
+ tmpb = __lasx_xvor_v(reg0, tmpb);
+ tmpg = __lasx_xvor_v(reg1, tmpg);
+ tmpr = __lasx_xvor_v(reg2, tmpr);
+ reg0 = __lasx_xvslli_b(nexb, 3);  // Same expansion for the second row.
+ reg1 = __lasx_xvslli_b(nexg, 3);
+ reg2 = __lasx_xvslli_b(nexr, 3);
+ nexb = __lasx_xvsrli_b(nexb, 2);
+ nexg = __lasx_xvsrli_b(nexg, 2);
+ nexr = __lasx_xvsrli_b(nexr, 2);
+ nexb = __lasx_xvor_v(reg0, nexb);
+ nexg = __lasx_xvor_v(reg1, nexg);
+ nexr = __lasx_xvor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);  // Macro defined outside this view; its results arrive in reg0 and reg1.
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);  // Keep the high byte of each 16-bit result.
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);  // 64-bit elements 0 and 2 go to dst_u, elements 1 and 3 to dst_v.
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_argb1555 += 64;
+ next_argb1555 += 64;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {  // Computes one luma byte per two-byte RGB565 pixel; 32 pixels (64 bytes in, 32 bytes out) per iteration.
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1;
+ __m256i tmp0, tmp1, tmpb, tmpg, tmpr;
+ __m256i reg0, reg1, dst0;
+ __m256i const_66 = __lasx_xvldi(66);
+ __m256i const_129 = __lasx_xvldi(129);
+ __m256i const_25 = __lasx_xvldi(25);
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,  // Accumulator seed 0x1080 per 16-bit lane: 16 << 8 plus 0x80, applied before taking the high byte.
+ 0x1080108010801080, 0x1080108010801080};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lasx_xvld(src_rgb565, 0);
+ src1 = __lasx_xvld(src_rgb565, 32);
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // Low byte of each 16-bit pixel.
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // High byte of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // 5-bit field from the low byte.
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);  // 5-bit field already sitting in the top bits of the high byte.
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);  // 6-bit field straddling both bytes.
+ reg0 = __lasx_xvslli_b(tmpb, 3);
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);  // 5 -> 8 bits.
+ reg0 = __lasx_xvslli_b(tmpg, 2);
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);  // 6 -> 8 bits.
+ reg0 = __lasx_xvsrli_b(tmpr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);  // Top-aligned 5 bits -> 8 bits.
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmpb, const_25);  // 0x1080 + 25 * tmpb + 129 * tmpg + 66 * tmpr, even and odd bytes handled separately.
+ reg1 = __lasx_xvmaddwod_h_bu(const_1080, tmpb, const_25);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpg, const_129);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpg, const_129);
+ reg0 = __lasx_xvmaddwev_h_bu(reg0, tmpr, const_66);
+ reg1 = __lasx_xvmaddwod_h_bu(reg1, tmpr, const_66);
+ dst0 = __lasx_xvpackod_b(reg1, reg0);  // Keep the high byte of each 16-bit sum (>> 8) and re-interleave even/odd results.
+ dst0 = __lasx_xvpermi_d(dst0, 0xD8);  // Swap the middle 64-bit elements to undo the lane interleave from the pick operations.
+ __lasx_xvst(dst0, dst_y, 0);
+ dst_y += 32;
+ src_rgb565 += 64;
+ }
+}
+
+void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,  // Reads 32 RGB565 pixels from each of two rows (src and src + stride) and writes 16 U and 16 V bytes per iteration.
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;  // Second input row.
+ __m256i src0, src1, src2, src3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i reg0, reg1, reg2, reg3, dst0;
+ __m256i const_112 = __lasx_xvldi(0x438);  // NOTE(review): the const_* locals are not referenced by name below; presumably the RGBTOUV macro (defined outside this view) uses them — confirm before renaming or removing.
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
+ next_rgb565, 32, src0, src1, src2, src3);
+ DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);  // Low bytes of each 16-bit pixel (first row, second row).
+ DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);  // High bytes of each 16-bit pixel.
+ tmpb = __lasx_xvandi_b(tmp0, 0x1F);  // 5-bit fields from the low bytes and the top of the high bytes.
+ tmpr = __lasx_xvandi_b(tmp1, 0xF8);
+ nexb = __lasx_xvandi_b(tmp2, 0x1F);
+ nexr = __lasx_xvandi_b(tmp3, 0xF8);
+ reg1 = __lasx_xvandi_b(tmp1, 0x07);
+ reg3 = __lasx_xvandi_b(tmp3, 0x07);
+ reg0 = __lasx_xvsrli_b(tmp0, 5);
+ reg1 = __lasx_xvslli_b(reg1, 3);
+ reg2 = __lasx_xvsrli_b(tmp2, 5);
+ reg3 = __lasx_xvslli_b(reg3, 3);
+ tmpg = __lasx_xvor_v(reg1, reg0);  // 6-bit field straddling both bytes.
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpb, 3);  // 5 -> 8 bits: (v << 3) | (v >> 2).
+ reg1 = __lasx_xvsrli_b(tmpb, 2);
+ reg2 = __lasx_xvslli_b(nexb, 3);
+ reg3 = __lasx_xvsrli_b(nexb, 2);
+ tmpb = __lasx_xvor_v(reg1, reg0);
+ nexb = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvslli_b(tmpg, 2);  // 6 -> 8 bits: (v << 2) | (v >> 4).
+ reg1 = __lasx_xvsrli_b(tmpg, 4);
+ reg2 = __lasx_xvslli_b(nexg, 2);
+ reg3 = __lasx_xvsrli_b(nexg, 4);
+ tmpg = __lasx_xvor_v(reg1, reg0);
+ nexg = __lasx_xvor_v(reg2, reg3);
+ reg0 = __lasx_xvsrli_b(tmpr, 5);  // Top-aligned 5 bits -> 8 bits: v | (v >> 5).
+ reg2 = __lasx_xvsrli_b(nexr, 5);
+ tmpr = __lasx_xvor_v(tmpr, reg0);
+ nexr = __lasx_xvor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);  // Macro defined outside this view; its results arrive in reg0 and reg1.
+ reg0 = __lasx_xvpermi_d(reg0, 0xD8);
+ reg1 = __lasx_xvpermi_d(reg1, 0xD8);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);  // Keep the high byte of each 16-bit result.
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);  // 64-bit elements 0 and 2 go to dst_u, elements 1 and 3 to dst_v.
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ dst_u += 16;
+ dst_v += 16;
+ src_rgb565 += 64;
+ next_rgb565 += 64;
+ }
+}
+
+void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {  // Computes one luma byte per three-byte pixel; 32 pixels (96 bytes in, 32 bytes out) per iteration.
+ int x;
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1, src2;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, reg2, dst0;
+ __m256i const_129 = __lasx_xvldi(129);  // Weight for byte 1 of each triplet.
+ __m256i const_br = {0x4219421942194219, 0x4219421942194219,  // Weights 25 (byte 0) and 66 (byte 2) of each triplet.
+ 0x4219421942194219, 0x4219421942194219};
+ __m256i const_1080 = {0x1080108010801080, 0x1080108010801080,  // Accumulator seed 0x1080 per 16-bit lane: 16 << 8 plus 0x80, applied before taking the high byte.
+ 0x1080108010801080, 0x1080108010801080};
+ __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200,  // shuff0/shuff1 gather (byte 0, byte 2) pairs of each triplet.
+ 0x17151412110F0E0C};
+ __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18,
+ 0x0F0D0C0A09070604};
+ __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001,  // shuff2/shuff3 place byte 1 of each triplet in the even byte of a 16-bit lane.
+ 0x001600130010000D};
+ __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019,
+ 0x000E000B00080005};
+
+ for (x = 0; x < len; x++) {
+ reg0 = __lasx_xvld(src_rgb24, 0);
+ reg1 = __lasx_xvld(src_rgb24, 32);
+ reg2 = __lasx_xvld(src_rgb24, 64);
+ src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);  // Regroup 128-bit halves so each lane holds 48 consecutive source bytes across src0..src2.
+ src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
+ src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
+ tmp0 = __lasx_xvshuf_b(src1, src0, shuff0);
+ tmp1 = __lasx_xvshuf_b(src1, src2, shuff1);
+ tmp2 = __lasx_xvshuf_b(src1, src0, shuff2);
+ tmp3 = __lasx_xvshuf_b(src1, src2, shuff3);
+ reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129);  // 0x1080 + 129 * byte1.
+ reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129);
+ reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0);  // + 25 * byte0 + 66 * byte2.
+ reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1);
+ dst0 = __lasx_xvpickod_b(reg1, reg0);  // Keep the high byte of each 16-bit sum (>> 8).
+ __lasx_xvst(dst0, dst_y, 0);
+ dst_y += 32;
+ src_rgb24 += 96;
+ }
+}
+
+void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,  // Reads 32 three-byte pixels from each of two rows (src and src + stride) and writes 16 U and 16 V bytes per iteration.
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;  // Second input row.
+ int len = width / 32;  // Pixels beyond the last full group of 32 are not processed here.
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);  // NOTE(review): the const_* locals are not referenced by name below; presumably the RGBTOUV macro (defined outside this view) uses them — confirm before renaming or removing.
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,  // shuff0_*: first pass, gathers every third byte (offset 0, 1 or 2) from the first two registers of a lane.
+ 0x15120F0C09060300, 0x00000000001E1B18};
+ __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,  // shuff1_*: second pass, keeps the bytes gathered so far and appends the rest from the third register.
+ 0x0706050403020100, 0x1D1A1714110A0908};
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};
+ __m256i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};
+ __m256i shuff1_r = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
+ next_rgb24, 0, reg0, reg1, reg2, tmp0);
+ DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,  // Regroup 128-bit halves so each lane holds 48 consecutive source bytes (both rows).
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);  // Macro defined outside this view; its results arrive in reg0 and reg1.
+ dst0 = __lasx_xvpickod_b(reg1, reg0);  // Keep the high byte of each 16-bit result.
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);  // 64-bit elements 0 and 2 go to dst_u, elements 1 and 3 to dst_v.
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);
+ src_rgb24 += 96;
+ next_rgb24 += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+
+// Computes one luma byte per 3-byte RAW pixel with LASX.
+// Every pass reads 96 source bytes and writes 32 bytes to dst_y; width / 32
+// passes run, so a remainder that is not a multiple of 32 is left untouched.
+void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  // Byte weight applied to the middle byte of every pixel.
+  __m256i weight_mid = __lasx_xvldi(129);
+  // Halfword 0x1942: byte weights for the (first, third) byte pair of a pixel.
+  __m256i weight_pair = {0x1942194219421942, 0x1942194219421942,
+                         0x1942194219421942, 0x1942194219421942};
+  // Initial accumulator value for every halfword.
+  __m256i bias = {0x1080108010801080, 0x1080108010801080,
+                  0x1080108010801080, 0x1080108010801080};
+  // sel_pair_* gather bytes 0 and 2 of each pixel; sel_mid_* place byte 1 of
+  // each pixel in the even byte of every halfword.
+  __m256i sel_pair_lo = {0x0B09080605030200, 0x17151412110F0E0C,
+                         0x0B09080605030200, 0x17151412110F0E0C};
+  __m256i sel_pair_hi = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604,
+                         0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
+  __m256i sel_mid_lo = {0x000A000700040001, 0x001600130010000D,
+                        0x000A000700040001, 0x001600130010000D};
+  __m256i sel_mid_hi = {0x0002001F001C0019, 0x000E000B00080005,
+                        0x0002001F001C0019, 0x000E000B00080005};
+
+  for (int pass = width / 32; pass > 0; --pass) {
+    __m256i in0 = __lasx_xvld(src_raw, 0);
+    __m256i in1 = __lasx_xvld(src_raw, 32);
+    __m256i in2 = __lasx_xvld(src_raw, 64);
+
+    // Regroup the 128-bit halves so each lane sees the bytes it shuffles.
+    __m256i part_a = __lasx_xvpermi_q(in1, in0, 0x30);
+    __m256i part_b = __lasx_xvpermi_q(in2, in0, 0x21);
+    __m256i part_c = __lasx_xvpermi_q(in2, in1, 0x30);
+
+    __m256i pair_lo = __lasx_xvshuf_b(part_b, part_a, sel_pair_lo);
+    __m256i pair_hi = __lasx_xvshuf_b(part_b, part_c, sel_pair_hi);
+    __m256i mid_lo = __lasx_xvshuf_b(part_b, part_a, sel_mid_lo);
+    __m256i mid_hi = __lasx_xvshuf_b(part_b, part_c, sel_mid_hi);
+
+    // bias + mid * 129, then add the two-byte dot product with weight_pair.
+    __m256i acc_lo = __lasx_xvmaddwev_h_bu(bias, mid_lo, weight_mid);
+    __m256i acc_hi = __lasx_xvmaddwev_h_bu(bias, mid_hi, weight_mid);
+    acc_lo = __lasx_xvdp2add_h_bu(acc_lo, weight_pair, pair_lo);
+    acc_hi = __lasx_xvdp2add_h_bu(acc_hi, weight_pair, pair_hi);
+
+    // The high byte of every halfword is the output byte.
+    __m256i luma = __lasx_xvpickod_b(acc_hi, acc_lo);
+    __lasx_xvst(luma, dst_y, 0);
+    src_raw += 96;
+    dst_y += 32;
+  }
+}
+/* RAWToUVRow_LASX: LASX kernel that derives U and V bytes from two rows of
+ * 3-byte RAW pixels: src_raw and the row src_stride_raw bytes after it.
+ * Every loop pass loads 96 bytes from each row and stores 16 bytes to dst_u
+ * and 16 bytes to dst_v.  width / 32 passes run, so a remainder of width that
+ * is not a multiple of 32 is not processed here.
+ * In this function the *_r tables select byte offset 0 of each pixel and the
+ * *_b tables select byte offset 2.
+ * NOTE(review): the const_* locals are never referenced directly in this
+ * function; presumably the RGBTOUV macro (defined outside this view) reads
+ * them by name - confirm before renaming or removing any of them.
+ */
+void RAWToUVRow_LASX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;  // second input row
+ int len = width / 32;  // number of full passes
+ __m256i src0, src1, src2, reg0, reg1, reg2;
+ __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_112 = __lasx_xvldi(0x438);
+ __m256i const_74 = __lasx_xvldi(0x425);
+ __m256i const_38 = __lasx_xvldi(0x413);
+ __m256i const_94 = __lasx_xvldi(0x42F);
+ __m256i const_18 = __lasx_xvldi(0x409);
+ __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};
+ __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
+ 0x15120F0C09060300, 0x00000000001E1B18};  // selector bytes 0x00,0x03,...,0x1E: every third byte from offset 0
+ __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
+ 0x0706050403020100, 0x1D1A1714110A0908};  // keeps selectors 0x00..0x0A, then appends 0x11,0x14,...,0x1D
+ __m256i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19,
+ 0x1613100D0A070401, 0x00000000001F1C19};  // selector bytes 0x01,0x04,...,0x1F: every third byte from offset 1
+ __m256i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908,
+ 0x0706050403020100, 0x1E1B1815120A0908};  // keeps selectors 0x00..0x0A, then appends 0x12,0x15,...,0x1E
+ __m256i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A,
+ 0x1714110E0B080502, 0x0000000000001D1A};  // selector bytes 0x02,0x05,...,0x1D: every third byte from offset 2
+ __m256i shuff1_b = {0x0706050403020100, 0x1F1C191613100908,
+ 0x0706050403020100, 0x1F1C191613100908};  // keeps selectors 0x00..0x09, then appends 0x10,0x13,...,0x1F
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
+ reg0, reg1, reg2, tmp0);  // 96 bytes of the first row + first 32 of the second
+ DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);  // remaining 64 bytes of the second row
+ DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
+ 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);  // regroup 128-bit halves before the byte shuffles
+ DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);  // first gather step per channel, both rows
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);  // second gather step: append the bytes taken from src2 / nex2
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);  // macro defined outside this view; reg0/reg1 are consumed below
+ dst0 = __lasx_xvpickod_b(reg1, reg0);  // keep the odd (high) byte of every halfword
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);  // 64-bit element 0 -> dst_u + 0
+ __lasx_xvstelm_d(dst0, dst_v, 0, 1);  // 64-bit element 1 -> dst_v + 0
+ __lasx_xvstelm_d(dst0, dst_u, 8, 2);  // 64-bit element 2 -> dst_u + 8
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);  // 64-bit element 3 -> dst_v + 8
+ src_raw += 96;
+ next_raw += 96;
+ dst_u += 16;
+ dst_v += 16;
+ }
+}
+/* NV12ToARGBRow_LASX: LASX kernel that combines a row of luma bytes (src_y)
+ * with a row of chroma bytes (src_uv) and writes ARGB output through
+ * dst_argb, using the coefficients reached through yuvconstants.
+ * Each pass advances src_y and src_uv by 16 bytes; width / 16 passes run, so
+ * a remainder of width that is not a multiple of 16 is not processed here.
+ * The conversion and the store are performed by the YUVTORGB_SETUP, YUVTORGB
+ * and STOREARGB macros, which are defined outside this view.
+ */
+void NV12ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;  // number of full passes
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);  // interleave the vr and ub coefficient halfwords
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);  // interleave the vg and ug coefficient halfwords
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);  // remove the const_0x80 offset from every chroma byte
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);  // widen chroma bytes to halfwords
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);  // NOTE(review): dst_argb is not advanced in this loop; presumably STOREARGB advances it - confirm
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+/* NV12ToRGB565Row_LASX: LASX kernel that combines a row of luma bytes (src_y)
+ * with a row of chroma bytes (src_uv) and writes 16-bit packed pixels to
+ * dst_rgb565, using the coefficients reached through yuvconstants.
+ * Each pass advances src_y and src_uv by 16 bytes and dst_rgb565 by 32 bytes;
+ * width / 16 passes run, so a remainder of width that is not a multiple of 16
+ * is not processed here.
+ * The colour conversion is performed by the YUVTORGB_SETUP and YUVTORGB
+ * macros, which are defined outside this view.
+ */
+void NV12ToRGB565Row_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;  // number of full passes
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_vrub, vec_vgug, vec_y, vec_vu;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_vrub = __lasx_xvilvl_h(vec_vr, vec_ub);  // interleave the vr and ub coefficient halfwords
+ vec_vgug = __lasx_xvilvl_h(vec_vg, vec_ug);  // interleave the vg and ug coefficient halfwords
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_vu = __lasx_xvld(src_uv, 0);
+ vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);  // remove the const_0x80 offset from every chroma byte
+ vec_vu = __lasx_vext2xv_h_b(vec_vu);  // widen chroma bytes to halfwords
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ out_b = __lasx_xvsrli_h(out_b, 3);  // keep the top 5 bits of blue
+ out_g = __lasx_xvsrli_h(out_g, 2);  // keep the top 6 bits of green
+ out_r = __lasx_xvsrli_h(out_r, 3);  // keep the top 5 bits of red
+ out_g = __lasx_xvslli_h(out_g, 5);  // green occupies bits 5..10
+ out_r = __lasx_xvslli_h(out_r, 11);  // red occupies bits 11..15
+ out_r = __lasx_xvor_v(out_r, out_g);
+ out_r = __lasx_xvor_v(out_r, out_b);  // blue stays in bits 0..4
+ __lasx_xvst(out_r, dst_rgb565, 0);
+ src_y += 16;
+ src_uv += 16;
+ dst_rgb565 += 32;
+ }
+}
+/* NV21ToARGBRow_LASX: LASX kernel that combines a row of luma bytes (src_y)
+ * with a row of chroma bytes (src_uv) and writes ARGB output through
+ * dst_argb, using the coefficients reached through yuvconstants.
+ * Each pass advances src_y and src_uv by 16 bytes; width / 16 passes run, so
+ * a remainder of width that is not a multiple of 16 is not processed here.
+ * Compared with a vrub/vgug pairing, this function interleaves the
+ * coefficients as (ub, vr) and (ug, vg) and passes out_b/out_r to YUVTORGB in
+ * swapped positions.
+ * The conversion and the store are performed by the YUVTORGB_SETUP, YUVTORGB
+ * and STOREARGB macros, which are defined outside this view.
+ */
+void NV21ToARGBRow_LASX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;  // number of full passes
+ __m256i vec_yg, vec_yb, vec_ub, vec_vr, vec_ug, vec_vg;
+ __m256i vec_ubvr, vec_ugvg, vec_y, vec_uv;
+ __m256i out_b, out_g, out_r;
+ __m256i const_0x80 = __lasx_xvldi(0x80);
+ __m256i alpha = __lasx_xvldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
+ vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);  // interleave the ub and vr coefficient halfwords
+ vec_ugvg = __lasx_xvilvl_h(vec_ug, vec_vg);  // interleave the ug and vg coefficient halfwords
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lasx_xvld(src_y, 0);
+ vec_uv = __lasx_xvld(src_uv, 0);
+ vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);  // remove the const_0x80 offset from every chroma byte
+ vec_uv = __lasx_vext2xv_h_b(vec_uv);  // widen chroma bytes to halfwords
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);  // NOTE(review): dst_argb is not advanced in this loop; presumably STOREARGB advances it - confirm
+ src_y += 16;
+ src_uv += 16;
+ }
+}
+
+// Computes one luma byte per 4-byte ARGB pixel with LASX (YJ variant).
+// Every pass reads 128 source bytes and writes 32 bytes to dst_y; width / 32
+// passes run, so a remainder that is not a multiple of 32 is left untouched.
+void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  // Initial accumulator value for every halfword.
+  __m256i round_bias = __lasx_xvldi(0x480);
+  // Byte weight (0x96 = 150) applied to byte 1 of every pixel.
+  __m256i weight_mid = __lasx_xvldi(0x96);
+  // Halfword 0x4D1D: byte weights for bytes 0 and 2 of every pixel.
+  __m256i weight_pair = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
+                         0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+  // Word permutation applied to the packed result before it is stored.
+  __m256i word_order = {0x0000000400000000, 0x0000000500000001,
+                        0x0000000600000002, 0x0000000700000003};
+
+  for (int pass = width / 32; pass > 0; --pass) {
+    __m256i px0 = __lasx_xvld(src_argb, 0);
+    __m256i px1 = __lasx_xvld(src_argb, 32);
+    __m256i px2 = __lasx_xvld(src_argb, 64);
+    __m256i px3 = __lasx_xvld(src_argb, 96);
+
+    // Even bytes are bytes 0 and 2 of each pixel, odd bytes are 1 and 3.
+    __m256i even_lo = __lasx_xvpickev_b(px1, px0);
+    __m256i odd_lo = __lasx_xvpickod_b(px1, px0);
+    __m256i even_hi = __lasx_xvpickev_b(px3, px2);
+    __m256i odd_hi = __lasx_xvpickod_b(px3, px2);
+
+    __m256i acc_lo = __lasx_xvmaddwev_h_bu(round_bias, odd_lo, weight_mid);
+    __m256i acc_hi = __lasx_xvmaddwev_h_bu(round_bias, odd_hi, weight_mid);
+    acc_lo = __lasx_xvdp2add_h_bu(acc_lo, weight_pair, even_lo);
+    acc_hi = __lasx_xvdp2add_h_bu(acc_hi, weight_pair, even_hi);
+
+    // The high byte of every halfword is the output byte.
+    __m256i luma = __lasx_xvpickod_b(acc_hi, acc_lo);
+    luma = __lasx_xvperm_w(luma, word_order);
+    __lasx_xvst(luma, dst_y, 0);
+    src_argb += 128;
+    dst_y += 32;
+  }
+}
+/* ARGBToUVJRow_LASX: LASX kernel that derives U and V bytes from two rows of
+ * 4-byte ARGB pixels: src_argb and the row src_stride_argb bytes after it.
+ * Every loop pass loads 128 bytes from each row and stores 16 bytes to dst_u
+ * and 16 bytes to dst_v.  width / 32 passes run, so a remainder of width that
+ * is not a multiple of 32 is not processed here.
+ * tmp0..tmp3 and reg0/reg1 are reused for several unrelated intermediate
+ * values, so the statement order matters.
+ */
+void ARGBToUVJRow_LASX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;  // second input row
+ int len = width / 32;  // number of full passes
+ __m256i src0, src1, src2, src3;
+ __m256i nex0, nex1, nex2, nex3;
+ __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i reg0, reg1, dst0;
+ __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m256i const_63 = __lasx_xvldi(0x43F);
+ __m256i const_42 = __lasx_xvldi(0x42A);
+ __m256i const_21 = __lasx_xvldi(0x415);
+ __m256i const_53 = __lasx_xvldi(0x435);
+ __m256i const_10 = __lasx_xvldi(0x40A);
+ __m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
+ 0x8080808080808080, 0x8080808080808080};  // initial accumulator for both outputs
+ __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
+ 0x1F1D0F0D1B190B09};  // final byte reordering before the stores
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
+ 96, src0, src1, src2, src3);  // 128 bytes of the first row
+ DUP4_ARG2(__lasx_xvld, next_argb, 0, next_argb, 32, next_argb, 64,
+ next_argb, 96, nex0, nex1, nex2, nex3);  // 128 bytes of the second row
+ tmp0 = __lasx_xvpickev_b(src1, src0);  // even bytes: bytes 0 and 2 of each pixel
+ tmp1 = __lasx_xvpickod_b(src1, src0);  // odd bytes: bytes 1 and 3 of each pixel
+ tmp2 = __lasx_xvpickev_b(src3, src2);
+ tmp3 = __lasx_xvpickod_b(src3, src2);
+ tmpr = __lasx_xvpickod_b(tmp2, tmp0);  // byte 2 of each pixel
+ tmpb = __lasx_xvpickev_b(tmp2, tmp0);  // byte 0 of each pixel
+ tmpg = __lasx_xvpickev_b(tmp3, tmp1);  // byte 1 of each pixel
+ tmp0 = __lasx_xvpickev_b(nex1, nex0);  // same split for the second row
+ tmp1 = __lasx_xvpickod_b(nex1, nex0);
+ tmp2 = __lasx_xvpickev_b(nex3, nex2);
+ tmp3 = __lasx_xvpickod_b(nex3, nex2);
+ nexr = __lasx_xvpickod_b(tmp2, tmp0);
+ nexb = __lasx_xvpickev_b(tmp2, tmp0);
+ nexg = __lasx_xvpickev_b(tmp3, tmp1);
+ tmp0 = __lasx_xvaddwev_h_bu(tmpb, nexb);  // widening row sums of the even bytes
+ tmp1 = __lasx_xvaddwod_h_bu(tmpb, nexb);  // widening row sums of the odd bytes
+ tmp2 = __lasx_xvaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
+ reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
+ reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
+ tmpb = __lasx_xvavgr_hu(tmp0, tmp1);  // rounded average of the even and odd row sums
+ tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
+ tmpr = __lasx_xvavgr_hu(reg0, reg1);
+ reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);  // reg0 = const_8080 + const_63 * b
+ reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);  // reg1 = const_8080 + const_63 * r
+ reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);  // reg0 -= const_42 * g
+ reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);  // reg1 -= const_53 * g
+ reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);  // reg0 -= const_21 * r
+ reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);  // reg1 -= const_10 * b
+ dst0 = __lasx_xvpackod_b(reg1, reg0);  // interleave the high bytes of reg0 and reg1
+ tmp0 = __lasx_xvpermi_d(dst0, 0x44);
+ tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
+ dst0 = __lasx_xvshuf_b(tmp1, tmp0, shuff);
+ __lasx_xvstelm_d(dst0, dst_u, 0, 0);  // 64-bit element 0 -> dst_u + 0
+ __lasx_xvstelm_d(dst0, dst_v, 0, 2);  // 64-bit element 2 -> dst_v + 0
+ __lasx_xvstelm_d(dst0, dst_u, 8, 1);  // 64-bit element 1 -> dst_u + 8
+ __lasx_xvstelm_d(dst0, dst_v, 8, 3);  // 64-bit element 3 -> dst_v + 8
+ dst_u += 16;
+ dst_v += 16;
+ src_argb += 128;
+ next_argb += 128;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
diff --git a/files/source/row_lsx.cc b/files/source/row_lsx.cc
new file mode 100644
index 00000000..3e8b901a
--- /dev/null
+++ b/files/source/row_lsx.cc
@@ -0,0 +1,1829 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+/* Fill YUV -> RGB conversion constants into vectors.
+ * Reads kUVToB[0], kUVToR[1], kUVToG[0], kUVToG[1], kYToRgb[0] and
+ * kYBiasToRgb[0] through the yuvconst pointer and replicates each value
+ * across a vector: ub, vr, ug, vg and yg as halfwords, yb as words.
+ * The parameter list order is (vr, ub, vg, ug, yg, yb); arguments bind by
+ * position, so callers must pass their vectors in exactly that order.
+ * yuvconst is used unparenthesized, so pass a plain pointer expression.
+ */
+#define YUVTORGB_SETUP(yuvconst, vr, ub, vg, ug, yg, yb) \
+ { \
+ ub = __lsx_vreplgr2vr_h(yuvconst->kUVToB[0]); \
+ vr = __lsx_vreplgr2vr_h(yuvconst->kUVToR[1]); \
+ ug = __lsx_vreplgr2vr_h(yuvconst->kUVToG[0]); \
+ vg = __lsx_vreplgr2vr_h(yuvconst->kUVToG[1]); \
+ yg = __lsx_vreplgr2vr_h(yuvconst->kYToRgb[0]); \
+ yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
+ }
+
+/* Convert 8 pixels of YUV420 to RGB.
+ * in_y supplies the luma bytes and in_vu the chroma bytes.  in_vu is
+ * overwritten: it is widened by interleaving with `zero` and then has
+ * `const_80` subtracted from every halfword.
+ * vrub and vgug are the interleaved coefficient vectors; yg and yb are the
+ * luma scale and bias.  Luma is scaled with a >> 16, the sums are shifted
+ * right by 6 and clipped with __lsx_vclip255_w, and the results are written
+ * to out_b, out_g and out_r as halfwords.
+ * The expansion references `zero` and `const_80`, which are not parameters:
+ * both must be declared in the enclosing scope at the point of use.
+ */
+#define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \
+ { \
+ __m128i y_ev, y_od, u_l, v_l; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ tmp0 = __lsx_vilvl_b(in_y, in_y); \
+ y_ev = __lsx_vmulwev_w_hu_h(tmp0, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(tmp0, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_vu = __lsx_vilvl_b(zero, in_vu); \
+ in_vu = __lsx_vsub_h(in_vu, const_80); \
+ u_l = __lsx_vmulwev_w_h(in_vu, vrub); \
+ v_l = __lsx_vmulwod_w_h(in_vu, vrub); \
+ tmp0 = __lsx_vadd_w(y_ev, u_l); \
+ tmp1 = __lsx_vadd_w(y_od, u_l); \
+ tmp2 = __lsx_vadd_w(y_ev, v_l); \
+ tmp3 = __lsx_vadd_w(y_od, v_l); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ tmp0 = __lsx_vdp2_w_h(in_vu, vgug); \
+ tmp1 = __lsx_vsub_w(y_ev, tmp0); \
+ tmp2 = __lsx_vsub_w(y_od, tmp0); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ out_g = __lsx_vpackev_h(tmp2, tmp1); \
+ }
+
+/* Convert 8 pixels of I444 to RGB.
+ * in_yy holds the luma as halfwords; in_u and in_v hold one chroma halfword
+ * per pixel.  in_u and in_v are overwritten: `const_80` is subtracted from
+ * every halfword of each.
+ * ub and vr are the coefficient vectors for the b and r outputs, ugvg the
+ * interleaved pair used for g; yg and yb are the luma scale and bias.  Luma
+ * is scaled with a >> 16, the sums are shifted right by 6 and clipped with
+ * __lsx_vclip255_w, and the results are written to out_b, out_g and out_r as
+ * halfwords.
+ * The expansion references `const_80`, which is not a parameter: it must be
+ * declared in the enclosing scope at the point of use.
+ */
+#define I444TORGB(in_yy, in_u, in_v, ub, vr, ugvg, yg, yb, out_b, out_g, \
+ out_r) \
+ { \
+ __m128i y_ev, y_od, u_ev, v_ev, u_od, v_od; \
+ __m128i tmp0, tmp1, tmp2, tmp3; \
+ \
+ y_ev = __lsx_vmulwev_w_hu_h(in_yy, yg); \
+ y_od = __lsx_vmulwod_w_hu_h(in_yy, yg); \
+ y_ev = __lsx_vsrai_w(y_ev, 16); \
+ y_od = __lsx_vsrai_w(y_od, 16); \
+ y_ev = __lsx_vadd_w(y_ev, yb); \
+ y_od = __lsx_vadd_w(y_od, yb); \
+ in_u = __lsx_vsub_h(in_u, const_80); \
+ in_v = __lsx_vsub_h(in_v, const_80); \
+ u_ev = __lsx_vmulwev_w_h(in_u, ub); \
+ u_od = __lsx_vmulwod_w_h(in_u, ub); \
+ v_ev = __lsx_vmulwev_w_h(in_v, vr); \
+ v_od = __lsx_vmulwod_w_h(in_v, vr); \
+ tmp0 = __lsx_vadd_w(y_ev, u_ev); \
+ tmp1 = __lsx_vadd_w(y_od, u_od); \
+ tmp2 = __lsx_vadd_w(y_ev, v_ev); \
+ tmp3 = __lsx_vadd_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp2 = __lsx_vsrai_w(tmp2, 6); \
+ tmp3 = __lsx_vsrai_w(tmp3, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ tmp2 = __lsx_vclip255_w(tmp2); \
+ tmp3 = __lsx_vclip255_w(tmp3); \
+ out_b = __lsx_vpackev_h(tmp1, tmp0); \
+ out_r = __lsx_vpackev_h(tmp3, tmp2); \
+ u_ev = __lsx_vpackev_h(in_u, in_v); \
+ u_od = __lsx_vpackod_h(in_u, in_v); \
+ v_ev = __lsx_vdp2_w_h(u_ev, ugvg); \
+ v_od = __lsx_vdp2_w_h(u_od, ugvg); \
+ tmp0 = __lsx_vsub_w(y_ev, v_ev); \
+ tmp1 = __lsx_vsub_w(y_od, v_od); \
+ tmp0 = __lsx_vsrai_w(tmp0, 6); \
+ tmp1 = __lsx_vsrai_w(tmp1, 6); \
+ tmp0 = __lsx_vclip255_w(tmp0); \
+ tmp1 = __lsx_vclip255_w(tmp1); \
+ out_g = __lsx_vpackev_h(tmp1, tmp0); \
+ }
+
+/* Pack and store 8 ARGB values.
+ * Takes the even byte of every halfword of in_b, in_g, in_r and in_a,
+ * interleaves them in the byte order b, g, r, a and writes the result as two
+ * 16-byte stores at pdst_argb + 0 and pdst_argb + 16.
+ * pdst_argb is then advanced by 32 bytes, so the argument must be a
+ * modifiable pointer lvalue and the caller must not advance it again.
+ */
+#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
+ { \
+ __m128i temp0, temp1; \
+ __m128i dst0, dst1; \
+ \
+ temp0 = __lsx_vpackev_b(in_g, in_b); \
+ temp1 = __lsx_vpackev_b(in_a, in_r); \
+ dst0 = __lsx_vilvl_h(temp1, temp0); \
+ dst1 = __lsx_vilvh_h(temp1, temp0); \
+ __lsx_vst(dst0, pdst_argb, 0); \
+ __lsx_vst(dst1, pdst_argb, 16); \
+ pdst_argb += 32; \
+ }
+/* Combine two rows of per-channel bytes into packed U/V output bytes.
+ * _tmpb/_tmpg/_tmpr hold the channel bytes of one row and _nexb/_nexg/_nexr
+ * those of the other row.  Each channel is summed across the two rows
+ * (widening to halfwords), and the even-column and odd-column sums are then
+ * combined with a rounding average.  _tmpb, _tmpg and _tmpr are overwritten
+ * with those halfword values.
+ * Two accumulators are then formed, both starting from const_8080:
+ *   _reg0 = const_8080 + const_112 * b - const_74 * g - const_38 * r
+ *   _reg1 = const_8080 + const_112 * r - const_94 * g - const_18 * b
+ * and _dst0 receives the high byte of every halfword of _reg0 followed by
+ * those of _reg1.
+ * The expansion references const_8080, const_112, const_74, const_94,
+ * const_38 and const_18, which are not parameters: all six must be declared
+ * in the enclosing scope at the point of use.
+ */
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
+ __m128i _reg0, _reg1; \
+ _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
+ _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
+ _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \
+ _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
+ _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
+ _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
+ _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
+ _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
+ _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
+ _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
+ _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
+ _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
+ _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \
+ _dst0 = __lsx_vpickod_b(_reg1, _reg0); \
+ }
+
+// Expands a row of 2-byte ARGB4444 pixels to 4-byte ARGB with LSX.
+// Every 4-bit field n becomes the byte (n << 4) | n.  Each pass reads 32
+// source bytes and writes 64 bytes; width / 16 passes run, so a remainder
+// that is not a multiple of 16 is left untouched.
+void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width) {
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in_a = __lsx_vld(src_argb4444, 0);
+    __m128i in_b = __lsx_vld(src_argb4444, 16);
+
+    // Split every byte into its low and high nibble (each kept in place).
+    __m128i low_a = __lsx_vandi_b(in_a, 0x0F);
+    __m128i high_a = __lsx_vandi_b(in_a, 0xF0);
+    __m128i low_b = __lsx_vandi_b(in_b, 0x0F);
+    __m128i high_b = __lsx_vandi_b(in_b, 0xF0);
+
+    // Duplicate each nibble into the other half of its byte.
+    __m128i low_a_up = __lsx_vslli_b(low_a, 4);
+    __m128i low_b_up = __lsx_vslli_b(low_b, 4);
+    __m128i high_a_dn = __lsx_vsrli_b(high_a, 4);
+    __m128i high_b_dn = __lsx_vsrli_b(high_b, 4);
+    low_a = __lsx_vor_v(low_a, low_a_up);
+    high_a = __lsx_vor_v(high_a, high_a_dn);
+    low_b = __lsx_vor_v(low_b, low_b_up);
+    high_b = __lsx_vor_v(high_b, high_b_dn);
+
+    // Interleave so each source byte yields (low-nibble byte, high-nibble
+    // byte) in the output.
+    __m128i out0 = __lsx_vilvl_b(high_a, low_a);
+    __m128i out1 = __lsx_vilvh_b(high_a, low_a);
+    __m128i out2 = __lsx_vilvl_b(high_b, low_b);
+    __m128i out3 = __lsx_vilvh_b(high_b, low_b);
+    __lsx_vst(out0, dst_argb, 0);
+    __lsx_vst(out1, dst_argb, 16);
+    __lsx_vst(out2, dst_argb, 32);
+    __lsx_vst(out3, dst_argb, 48);
+    src_argb4444 += 32;
+    dst_argb += 64;
+  }
+}
+
+// Expands a row of 2-byte ARGB1555 pixels to 4-byte ARGB with LSX.
+// Each 5-bit colour c becomes (c << 3) | (c >> 2); the top bit of the high
+// byte becomes an alpha byte of 0x00 or 0xFF.  Each pass reads 32 source
+// bytes and writes 64 bytes; width / 16 passes run, so a remainder that is
+// not a multiple of 16 is left untouched.
+void ARGB1555ToARGBRow_LSX(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width) {
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_argb1555, 0);
+    __m128i in1 = __lsx_vld(src_argb1555, 16);
+    // Separate the low and high byte of every 16-bit pixel.
+    __m128i low = __lsx_vpickev_b(in1, in0);
+    __m128i high = __lsx_vpickod_b(in1, in0);
+
+    // 5-bit fields: blue from the low byte, green straddling both bytes,
+    // red from the high byte.
+    __m128i blue = __lsx_vandi_b(low, 0x1F);
+    __m128i green = __lsx_vsrli_b(low, 5);
+    __m128i green_top = __lsx_vandi_b(high, 0x03);
+    green_top = __lsx_vslli_b(green_top, 3);
+    green = __lsx_vor_v(green, green_top);
+    __m128i red = __lsx_vandi_b(high, 0x7C);
+    red = __lsx_vsrli_b(red, 2);
+
+    // Alpha: 0 or 1 from the top bit, negated to 0x00 or 0xFF.
+    __m128i alpha = __lsx_vsrli_b(high, 7);
+    alpha = __lsx_vneg_b(alpha);
+
+    // Widen 5 bits to 8 bits.
+    __m128i blue_up = __lsx_vslli_b(blue, 3);
+    __m128i green_up = __lsx_vslli_b(green, 3);
+    __m128i red_up = __lsx_vslli_b(red, 3);
+    blue = __lsx_vsrli_b(blue, 2);
+    green = __lsx_vsrli_b(green, 2);
+    red = __lsx_vsrli_b(red, 2);
+    blue = __lsx_vor_v(blue_up, blue);
+    green = __lsx_vor_v(green_up, green);
+    red = __lsx_vor_v(red_up, red);
+
+    // Interleave to b, g, r, a byte order: first 8 pixels, then last 8.
+    __m128i bg = __lsx_vilvl_b(green, blue);
+    __m128i ra = __lsx_vilvl_b(alpha, red);
+    __m128i out0 = __lsx_vilvl_h(ra, bg);
+    __m128i out1 = __lsx_vilvh_h(ra, bg);
+    bg = __lsx_vilvh_b(green, blue);
+    ra = __lsx_vilvh_b(alpha, red);
+    __m128i out2 = __lsx_vilvl_h(ra, bg);
+    __m128i out3 = __lsx_vilvh_h(ra, bg);
+
+    __lsx_vst(out0, dst_argb, 0);
+    __lsx_vst(out1, dst_argb, 16);
+    __lsx_vst(out2, dst_argb, 32);
+    __lsx_vst(out3, dst_argb, 48);
+    src_argb1555 += 32;
+    dst_argb += 64;
+  }
+}
+
+// Expands a row of 2-byte RGB565 pixels to 4-byte ARGB with LSX, using 0xFF
+// for every alpha byte.  Each pass reads 32 source bytes and writes 64
+// bytes; width / 16 passes run, so a remainder that is not a multiple of 16
+// is left untouched.
+void RGB565ToARGBRow_LSX(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width) {
+  __m128i opaque = __lsx_vldi(0xFF);
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_rgb565, 0);
+    __m128i in1 = __lsx_vld(src_rgb565, 16);
+    // Separate the low and high byte of every 16-bit pixel.
+    __m128i low = __lsx_vpickev_b(in1, in0);
+    __m128i high = __lsx_vpickod_b(in1, in0);
+
+    // Extract the raw fields.  Red is kept in the top 5 bits of its byte.
+    __m128i blue = __lsx_vandi_b(low, 0x1F);
+    __m128i red = __lsx_vandi_b(high, 0xF8);
+    __m128i green_top = __lsx_vandi_b(high, 0x07);
+    __m128i green_bot = __lsx_vsrli_b(low, 5);
+    green_top = __lsx_vslli_b(green_top, 3);
+    __m128i green = __lsx_vor_v(green_top, green_bot);
+
+    // Blue: 5 -> 8 bits as (b << 3) | (b >> 2).
+    __m128i blue_up = __lsx_vslli_b(blue, 3);
+    __m128i blue_dn = __lsx_vsrli_b(blue, 2);
+    blue = __lsx_vor_v(blue_dn, blue_up);
+
+    // Green: 6 -> 8 bits as (g << 2) | (g >> 4).
+    __m128i green_up = __lsx_vslli_b(green, 2);
+    __m128i green_dn = __lsx_vsrli_b(green, 4);
+    green = __lsx_vor_v(green_dn, green_up);
+
+    // Red: fill the low 3 bits from the top bits of the same byte.
+    __m128i red_dn = __lsx_vsrli_b(red, 5);
+    red = __lsx_vor_v(red, red_dn);
+
+    // Interleave to b, g, r, a byte order: first 8 pixels, then last 8.
+    __m128i bg = __lsx_vilvl_b(green, blue);
+    __m128i ra = __lsx_vilvl_b(opaque, red);
+    __m128i out0 = __lsx_vilvl_h(ra, bg);
+    __m128i out1 = __lsx_vilvh_h(ra, bg);
+    bg = __lsx_vilvh_b(green, blue);
+    ra = __lsx_vilvh_b(opaque, red);
+    __m128i out2 = __lsx_vilvl_h(ra, bg);
+    __m128i out3 = __lsx_vilvh_h(ra, bg);
+
+    __lsx_vst(out0, dst_argb, 0);
+    __lsx_vst(out1, dst_argb, 16);
+    __lsx_vst(out2, dst_argb, 32);
+    __lsx_vst(out3, dst_argb, 48);
+    src_rgb565 += 32;
+    dst_argb += 64;
+  }
+}
+
+// Expands a row of 3-byte RGB24 pixels to 4-byte ARGB with LSX, inserting a
+// 0xFF byte after every three source bytes.  Each pass reads 48 source bytes
+// and writes 64 bytes; width / 16 passes run, so a remainder that is not a
+// multiple of 16 is left untouched.
+void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24,
+                        uint8_t* dst_argb,
+                        int width) {
+  __m128i opaque = __lsx_vldi(0xFF);
+  // Selectors that bring the 2nd, 3rd and 4th 12-byte group of the 48 input
+  // bytes to the front of a vector.
+  __m128i sel_group1 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+  __m128i sel_group2 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+  __m128i sel_group3 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+  // Copies three bytes in order, then takes index 0x10 (from the alpha
+  // operand), four times over.
+  __m128i sel_expand = {0x1005040310020100, 0x100B0A0910080706};
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_rgb24, 0);
+    __m128i in1 = __lsx_vld(src_rgb24, 16);
+    __m128i in2 = __lsx_vld(src_rgb24, 32);
+
+    __m128i group1 = __lsx_vshuf_b(in1, in0, sel_group1);
+    __m128i group2 = __lsx_vshuf_b(in1, in2, sel_group2);
+    __m128i group3 = __lsx_vshuf_b(in1, in2, sel_group3);
+
+    __m128i out0 = __lsx_vshuf_b(opaque, in0, sel_expand);
+    __m128i out1 = __lsx_vshuf_b(opaque, group1, sel_expand);
+    __m128i out2 = __lsx_vshuf_b(opaque, group2, sel_expand);
+    __m128i out3 = __lsx_vshuf_b(opaque, group3, sel_expand);
+
+    __lsx_vst(out0, dst_argb, 0);
+    __lsx_vst(out1, dst_argb, 16);
+    __lsx_vst(out2, dst_argb, 32);
+    __lsx_vst(out3, dst_argb, 48);
+    src_rgb24 += 48;
+    dst_argb += 64;
+  }
+}
+
+// Expands a row of 3-byte RAW pixels to 4-byte ARGB with LSX, reversing the
+// three bytes of every pixel and appending a 0xFF byte.  Each pass reads 48
+// source bytes and writes 64 bytes; width / 16 passes run, so a remainder
+// that is not a multiple of 16 is left untouched.
+void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  __m128i opaque = __lsx_vldi(0xFF);
+  // Selectors that bring the 2nd, 3rd and 4th 12-byte group of the 48 input
+  // bytes to the front of a vector.
+  __m128i sel_group1 = {0x131211100F0E0D0C, 0x1B1A191817161514};
+  __m128i sel_group2 = {0x1F1E1D1C1B1A1918, 0x0706050403020100};
+  __m128i sel_group3 = {0x0B0A090807060504, 0x131211100F0E0D0C};
+  // Copies three bytes in reverse order, then takes index 0x10 (from the
+  // alpha operand), four times over.
+  __m128i sel_expand = {0x1003040510000102, 0x10090A0B10060708};
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_raw, 0);
+    __m128i in1 = __lsx_vld(src_raw, 16);
+    __m128i in2 = __lsx_vld(src_raw, 32);
+
+    __m128i group1 = __lsx_vshuf_b(in1, in0, sel_group1);
+    __m128i group2 = __lsx_vshuf_b(in1, in2, sel_group2);
+    __m128i group3 = __lsx_vshuf_b(in1, in2, sel_group3);
+
+    __m128i out0 = __lsx_vshuf_b(opaque, in0, sel_expand);
+    __m128i out1 = __lsx_vshuf_b(opaque, group1, sel_expand);
+    __m128i out2 = __lsx_vshuf_b(opaque, group2, sel_expand);
+    __m128i out3 = __lsx_vshuf_b(opaque, group3, sel_expand);
+
+    __lsx_vst(out0, dst_argb, 0);
+    __lsx_vst(out1, dst_argb, 16);
+    __lsx_vst(out2, dst_argb, 32);
+    __lsx_vst(out3, dst_argb, 48);
+    src_raw += 48;
+    dst_argb += 64;
+  }
+}
+
+// Computes one luma byte per 2-byte ARGB1555 pixel with LSX.
+// Each 5-bit colour is widened to 8 bits, then the high byte of
+// 0x1080 + 25 * b + 129 * g + 66 * r is stored.  Each pass reads 32 source
+// bytes and writes 16 bytes to dst_y; width / 16 passes run, so a remainder
+// that is not a multiple of 16 is left untouched.
+void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555,
+                        uint8_t* dst_y,
+                        int width) {
+  __m128i weight_r = __lsx_vldi(66);
+  __m128i weight_g = __lsx_vldi(129);
+  __m128i weight_b = __lsx_vldi(25);
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_argb1555, 0);
+    __m128i in1 = __lsx_vld(src_argb1555, 16);
+    // Separate the low and high byte of every 16-bit pixel.
+    __m128i low = __lsx_vpickev_b(in1, in0);
+    __m128i high = __lsx_vpickod_b(in1, in0);
+
+    // 5-bit fields: blue from the low byte, green straddling both bytes,
+    // red from the high byte.
+    __m128i blue = __lsx_vandi_b(low, 0x1F);
+    __m128i green = __lsx_vsrli_b(low, 5);
+    __m128i green_top = __lsx_vandi_b(high, 0x03);
+    green_top = __lsx_vslli_b(green_top, 3);
+    green = __lsx_vor_v(green, green_top);
+    __m128i red = __lsx_vandi_b(high, 0x7C);
+    red = __lsx_vsrli_b(red, 2);
+
+    // Widen 5 bits to 8 bits as (c << 3) | (c >> 2).
+    __m128i blue_up = __lsx_vslli_b(blue, 3);
+    __m128i green_up = __lsx_vslli_b(green, 3);
+    __m128i red_up = __lsx_vslli_b(red, 3);
+    blue = __lsx_vsrli_b(blue, 2);
+    green = __lsx_vsrli_b(green, 2);
+    red = __lsx_vsrli_b(red, 2);
+    blue = __lsx_vor_v(blue_up, blue);
+    green = __lsx_vor_v(green_up, green);
+    red = __lsx_vor_v(red_up, red);
+
+    // Weighted sums, kept separately for even and odd pixels.
+    __m128i acc_even = __lsx_vmaddwev_h_bu(bias, blue, weight_b);
+    __m128i acc_odd = __lsx_vmaddwod_h_bu(bias, blue, weight_b);
+    acc_even = __lsx_vmaddwev_h_bu(acc_even, green, weight_g);
+    acc_odd = __lsx_vmaddwod_h_bu(acc_odd, green, weight_g);
+    acc_even = __lsx_vmaddwev_h_bu(acc_even, red, weight_r);
+    acc_odd = __lsx_vmaddwod_h_bu(acc_odd, red, weight_r);
+
+    // Re-interleave the high bytes of the even and odd sums.
+    __m128i luma = __lsx_vpackod_b(acc_odd, acc_even);
+    __lsx_vst(luma, dst_y, 0);
+    src_argb1555 += 32;
+    dst_y += 16;
+  }
+}
+/* ARGB1555ToUVRow_LSX: LSX kernel that derives U and V bytes from two rows of
+ * 2-byte ARGB1555 pixels: src_argb1555 and the row src_stride_argb1555 bytes
+ * after it.
+ * Every loop pass loads 32 bytes from each row and stores 8 bytes to dst_u
+ * and 8 bytes to dst_v.  width / 16 passes run, so a remainder of width that
+ * is not a multiple of 16 is not processed here.
+ * The const_* locals are read by name inside the RGBTOUV macro, which also
+ * overwrites tmpb, tmpg and tmpr; do not rename or remove them.
+ */
+void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;  // number of full passes
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;  // second input row
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
+ next_argb1555, 16, src0, src1, src2, src3);  // src0/src1: first row, src2/src3: second row
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);  // low byte of every pixel
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);  // high byte of every pixel
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);  // blue: low 5 bits of the low byte
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ tmpg = __lsx_vsrli_b(tmp0, 5);  // green, low 3 bits
+ nexg = __lsx_vsrli_b(tmp2, 5);
+ reg0 = __lsx_vandi_b(tmp1, 0x03);  // green, high 2 bits
+ reg2 = __lsx_vandi_b(tmp3, 0x03);
+ reg0 = __lsx_vslli_b(reg0, 3);
+ reg2 = __lsx_vslli_b(reg2, 3);
+ tmpg = __lsx_vor_v(tmpg, reg0);
+ nexg = __lsx_vor_v(nexg, reg2);
+ reg1 = __lsx_vandi_b(tmp1, 0x7C);  // red: bits 2..6 of the high byte
+ reg3 = __lsx_vandi_b(tmp3, 0x7C);
+ tmpr = __lsx_vsrli_b(reg1, 2);
+ nexr = __lsx_vsrli_b(reg3, 2);
+ reg0 = __lsx_vslli_b(tmpb, 3);  // widen 5 bits to 8 bits: (c << 3) | (c >> 2), first row
+ reg1 = __lsx_vslli_b(tmpg, 3);
+ reg2 = __lsx_vslli_b(tmpr, 3);
+ tmpb = __lsx_vsrli_b(tmpb, 2);
+ tmpg = __lsx_vsrli_b(tmpg, 2);
+ tmpr = __lsx_vsrli_b(tmpr, 2);
+ tmpb = __lsx_vor_v(reg0, tmpb);
+ tmpg = __lsx_vor_v(reg1, tmpg);
+ tmpr = __lsx_vor_v(reg2, tmpr);
+ reg0 = __lsx_vslli_b(nexb, 3);  // same widening for the second row
+ reg1 = __lsx_vslli_b(nexg, 3);
+ reg2 = __lsx_vslli_b(nexr, 3);
+ nexb = __lsx_vsrli_b(nexb, 2);
+ nexg = __lsx_vsrli_b(nexg, 2);
+ nexr = __lsx_vsrli_b(nexr, 2);
+ nexb = __lsx_vor_v(reg0, nexb);
+ nexg = __lsx_vor_v(reg1, nexg);
+ nexr = __lsx_vor_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);  // 64-bit element 0 -> dst_u
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);  // 64-bit element 1 -> dst_v
+ dst_u += 8;
+ dst_v += 8;
+ src_argb1555 += 32;
+ next_argb1555 += 32;
+ }
+}
+
+// Computes one luma byte per 2-byte RGB565 pixel with LSX.
+// Each colour field is widened to 8 bits, then the high byte of
+// 0x1080 + 25 * b + 129 * g + 66 * r is stored.  Each pass reads 32 source
+// bytes and writes 16 bytes to dst_y; width / 16 passes run, so a remainder
+// that is not a multiple of 16 is left untouched.
+void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  __m128i weight_r = __lsx_vldi(66);
+  __m128i weight_g = __lsx_vldi(129);
+  __m128i weight_b = __lsx_vldi(25);
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_rgb565, 0);
+    __m128i in1 = __lsx_vld(src_rgb565, 16);
+    // Separate the low and high byte of every 16-bit pixel.
+    __m128i low = __lsx_vpickev_b(in1, in0);
+    __m128i high = __lsx_vpickod_b(in1, in0);
+
+    // Extract the raw fields.  Red is kept in the top 5 bits of its byte.
+    __m128i blue = __lsx_vandi_b(low, 0x1F);
+    __m128i red = __lsx_vandi_b(high, 0xF8);
+    __m128i green_top = __lsx_vandi_b(high, 0x07);
+    __m128i green_bot = __lsx_vsrli_b(low, 5);
+    green_top = __lsx_vslli_b(green_top, 3);
+    __m128i green = __lsx_vor_v(green_top, green_bot);
+
+    // Blue: 5 -> 8 bits as (b << 3) | (b >> 2).
+    __m128i blue_up = __lsx_vslli_b(blue, 3);
+    __m128i blue_dn = __lsx_vsrli_b(blue, 2);
+    blue = __lsx_vor_v(blue_dn, blue_up);
+
+    // Green: 6 -> 8 bits as (g << 2) | (g >> 4).
+    __m128i green_up = __lsx_vslli_b(green, 2);
+    __m128i green_dn = __lsx_vsrli_b(green, 4);
+    green = __lsx_vor_v(green_dn, green_up);
+
+    // Red: fill the low 3 bits from the top bits of the same byte.
+    __m128i red_dn = __lsx_vsrli_b(red, 5);
+    red = __lsx_vor_v(red, red_dn);
+
+    // Weighted sums, kept separately for even and odd pixels.
+    __m128i acc_even = __lsx_vmaddwev_h_bu(bias, blue, weight_b);
+    __m128i acc_odd = __lsx_vmaddwod_h_bu(bias, blue, weight_b);
+    acc_even = __lsx_vmaddwev_h_bu(acc_even, green, weight_g);
+    acc_odd = __lsx_vmaddwod_h_bu(acc_odd, green, weight_g);
+    acc_even = __lsx_vmaddwev_h_bu(acc_even, red, weight_r);
+    acc_odd = __lsx_vmaddwod_h_bu(acc_odd, red, weight_r);
+
+    // Re-interleave the high bytes of the even and odd sums.
+    __m128i luma = __lsx_vpackod_b(acc_odd, acc_even);
+    __lsx_vst(luma, dst_y, 0);
+    src_rgb565 += 32;
+    dst_y += 16;
+  }
+}
+/* RGB565ToUVRow_LSX: LSX kernel that derives U and V bytes from two rows of
+ * 2-byte RGB565 pixels: src_rgb565 and the row src_stride_rgb565 bytes after
+ * it.
+ * Every loop pass loads 32 bytes from each row and stores 8 bytes to dst_u
+ * and 8 bytes to dst_v.  width / 16 passes run, so a remainder of width that
+ * is not a multiple of 16 is not processed here.
+ * The const_* locals are read by name inside the RGBTOUV macro, which also
+ * overwrites tmpb, tmpg and tmpr; do not rename or remove them.
+ */
+void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 16;  // number of full passes
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;  // second input row
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
+ next_rgb565, 16, src0, src1, src2, src3);  // src0/src1: first row, src2/src3: second row
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);  // low byte of every pixel
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);  // high byte of every pixel
+ tmpb = __lsx_vandi_b(tmp0, 0x1F);  // blue: low 5 bits of the low byte
+ tmpr = __lsx_vandi_b(tmp1, 0xF8);  // red: top 5 bits of the high byte, kept in place
+ nexb = __lsx_vandi_b(tmp2, 0x1F);
+ nexr = __lsx_vandi_b(tmp3, 0xF8);
+ reg1 = __lsx_vandi_b(tmp1, 0x07);  // green, high 3 bits
+ reg3 = __lsx_vandi_b(tmp3, 0x07);
+ reg0 = __lsx_vsrli_b(tmp0, 5);  // green, low 3 bits
+ reg1 = __lsx_vslli_b(reg1, 3);
+ reg2 = __lsx_vsrli_b(tmp2, 5);
+ reg3 = __lsx_vslli_b(reg3, 3);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpb, 3);  // blue 5 -> 8 bits: (b << 3) | (b >> 2)
+ reg1 = __lsx_vsrli_b(tmpb, 2);
+ reg2 = __lsx_vslli_b(nexb, 3);
+ reg3 = __lsx_vsrli_b(nexb, 2);
+ tmpb = __lsx_vor_v(reg1, reg0);
+ nexb = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vslli_b(tmpg, 2);  // green 6 -> 8 bits: (g << 2) | (g >> 4)
+ reg1 = __lsx_vsrli_b(tmpg, 4);
+ reg2 = __lsx_vslli_b(nexg, 2);
+ reg3 = __lsx_vsrli_b(nexg, 4);
+ tmpg = __lsx_vor_v(reg1, reg0);
+ nexg = __lsx_vor_v(reg2, reg3);
+ reg0 = __lsx_vsrli_b(tmpr, 5);  // red: fill the low 3 bits from the top bits
+ reg2 = __lsx_vsrli_b(nexr, 5);
+ tmpr = __lsx_vor_v(tmpr, reg0);
+ nexr = __lsx_vor_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);  // 64-bit element 0 -> dst_u
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);  // 64-bit element 1 -> dst_v
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb565 += 32;
+ next_rgb565 += 32;
+ }
+}
+
+// Computes one luma byte per 3-byte RGB24 pixel with LSX.
+// Each pass reads 48 source bytes and writes 16 bytes to dst_y; width / 16
+// passes run, so a remainder that is not a multiple of 16 is left untouched.
+void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  // Byte weight applied to the middle byte of every pixel.
+  __m128i weight_mid = __lsx_vldi(129);
+  // Halfword 0x4219: byte weights for the (first, third) byte pair of a pixel.
+  __m128i weight_pair = {0x4219421942194219, 0x4219421942194219};
+  // Initial accumulator value for every halfword.
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+  // sel_pair_* gather bytes 0 and 2 of each pixel; sel_mid_* place byte 1 of
+  // each pixel in the even byte of every halfword.
+  __m128i sel_pair_lo = {0x0B09080605030200, 0x17151412110F0E0C};
+  __m128i sel_pair_hi = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
+  __m128i sel_mid_lo = {0x000A000700040001, 0x001600130010000D};
+  __m128i sel_mid_hi = {0x0002001F001C0019, 0x000E000B00080005};
+
+  for (int pass = width / 16; pass > 0; --pass) {
+    __m128i in0 = __lsx_vld(src_rgb24, 0);
+    __m128i in1 = __lsx_vld(src_rgb24, 16);
+    __m128i in2 = __lsx_vld(src_rgb24, 32);
+
+    __m128i pair_lo = __lsx_vshuf_b(in1, in0, sel_pair_lo);
+    __m128i pair_hi = __lsx_vshuf_b(in1, in2, sel_pair_hi);
+    __m128i mid_lo = __lsx_vshuf_b(in1, in0, sel_mid_lo);
+    __m128i mid_hi = __lsx_vshuf_b(in1, in2, sel_mid_hi);
+
+    // bias + mid * 129, then add the two-byte dot product with weight_pair.
+    __m128i acc_lo = __lsx_vmaddwev_h_bu(bias, mid_lo, weight_mid);
+    __m128i acc_hi = __lsx_vmaddwev_h_bu(bias, mid_hi, weight_mid);
+    acc_lo = __lsx_vdp2add_h_bu(acc_lo, weight_pair, pair_lo);
+    acc_hi = __lsx_vdp2add_h_bu(acc_hi, weight_pair, pair_hi);
+
+    // The high byte of every halfword is the output byte.
+    __m128i luma = __lsx_vpickod_b(acc_hi, acc_lo);
+    __lsx_vst(luma, dst_y, 0);
+    src_rgb24 += 48;
+    dst_y += 16;
+  }
+}
+
// Produces U and V rows from two RGB24 rows: src_rgb24 and the row located
// src_stride_rgb24 bytes after it.
// Each iteration reads 48 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// NOTE(review): RGBTOUV is a macro defined outside this chunk. The const_*
// locals are never referenced directly below, so presumably the macro picks
// them up by name - confirm before renaming or removing any of them.
+void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgb24 = src_rgb24 + src_stride_rgb24;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
// shuff0_* select every third byte (starting at offset 0, 1 or 2) from the
// first 32 source bytes; shuff1_* keep those results and append the matching
// bytes from the third 16-byte vector.
+ __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_r = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_r = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_rgb24, 0);
+ src1 = __lsx_vld(src_rgb24, 16);
+ src2 = __lsx_vld(src_rgb24, 32);
+ nex0 = __lsx_vld(next_rgb24, 0);
+ nex1 = __lsx_vld(next_rgb24, 16);
+ nex2 = __lsx_vld(next_rgb24, 32);
// First pass: partial per-channel vectors from the first two source vectors.
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
// Second pass: complete each channel vector with bytes from the third vector.
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
// 64-bit element 0 of dst0 goes to dst_u, element 1 to dst_v.
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgb24 += 48;
+ next_rgb24 += 48;
+ }
+}
+
// Computes a row of Y bytes from a row of RAW (3 bytes per pixel) data.
// Each iteration consumes 48 source bytes and stores 16 bytes to dst_y. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  __m128i mid_coeff = __lsx_vldi(129);
+  __m128i pair_coeff = {0x1942194219421942, 0x1942194219421942};
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
// Shuffle masks: the "pair" masks gather bytes 0 and 2 of every 3-byte group,
// the "mid" masks place byte 1 of every group in the even byte of a 16-bit
// lane (the odd byte is a don't-care for the even-lane multiply below).
+  __m128i pair_mask_lo = {0x0B09080605030200, 0x17151412110F0E0C};
+  __m128i pair_mask_hi = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
+  __m128i mid_mask_lo = {0x000A000700040001, 0x001600130010000D};
+  __m128i mid_mask_hi = {0x0002001F001C0019, 0x000E000B00080005};
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_raw, 0);
+    __m128i in1 = __lsx_vld(src_raw, 16);
+    __m128i in2 = __lsx_vld(src_raw, 32);
+    __m128i pairs_lo = __lsx_vshuf_b(in1, in0, pair_mask_lo);
+    __m128i pairs_hi = __lsx_vshuf_b(in1, in2, pair_mask_hi);
+    __m128i mids_lo = __lsx_vshuf_b(in1, in0, mid_mask_lo);
+    __m128i mids_hi = __lsx_vshuf_b(in1, in2, mid_mask_hi);
// bias + mid * mid_coeff, then add the two-byte dot product with pair_coeff.
+    __m128i sum_lo = __lsx_vmaddwev_h_bu(bias, mids_lo, mid_coeff);
+    __m128i sum_hi = __lsx_vmaddwev_h_bu(bias, mids_hi, mid_coeff);
+    sum_lo = __lsx_vdp2add_h_bu(sum_lo, pair_coeff, pairs_lo);
+    sum_hi = __lsx_vdp2add_h_bu(sum_hi, pair_coeff, pairs_hi);
// Shift each 16-bit sum right by 8 and narrow to bytes.
+    __lsx_vst(__lsx_vsrlni_b_h(sum_hi, sum_lo, 8), dst_y, 0);
+    src_raw += 48;
+    dst_y += 16;
+  }
+}
+
// Produces U and V rows from two RAW rows: src_raw and the row located
// src_stride_raw bytes after it.
// Each iteration reads 48 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// NOTE(review): RGBTOUV is a macro defined outside this chunk. The const_*
// locals are never referenced directly below, so presumably the macro picks
// them up by name - confirm before renaming or removing any of them.
+void RAWToUVRow_LSX(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_raw = src_raw + src_stride_raw;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i nex0, nex1, nex2, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
// Same mask values as the RGB24 variant, but the _r masks start at byte
// offset 0 and the _b masks at byte offset 2 of each 3-byte group.
+ __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
+ __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
+ __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
+ __m128i shuff1_g = {0x0706050403020100, 0x1E1B1815120A0908};
+ __m128i shuff0_b = {0x1714110E0B080502, 0x0000000000001D1A};
+ __m128i shuff1_b = {0x0706050403020100, 0x1F1C191613100908};
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_raw, 0);
+ src1 = __lsx_vld(src_raw, 16);
+ src2 = __lsx_vld(src_raw, 32);
+ nex0 = __lsx_vld(next_raw, 0);
+ nex1 = __lsx_vld(next_raw, 16);
+ nex2 = __lsx_vld(next_raw, 32);
// First pass: partial per-channel vectors from the first two source vectors.
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
+ nexr);
// Second pass: complete each channel vector with bytes from the third vector.
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
+ nexb);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
+ nexg);
+ DUP2_ARG3(__lsx_vshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
+ nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
// 64-bit element 0 of dst0 goes to dst_u, element 1 to dst_v.
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_raw += 48;
+ next_raw += 48;
+ }
+}
+
// Converts a Y row (src_y) plus an interleaved chroma row (src_uv) to ARGB,
// using the coefficients supplied in yuvconstants.
// The loop runs width / 8 times and advances src_y and src_uv by 8 bytes per
// iteration; a width % 8 tail is not handled here.
// NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are macros defined
// outside this chunk. const_80 and zero are never referenced directly below,
// so presumably the macros use them by name; dst_argb is never advanced in
// this function, so presumably STOREARGB advances it. Confirm both.
// NOTE(review): each __lsx_vld fetches a full vector while the pointers move
// by only 8 bytes, so the loads appear to read beyond the bytes consumed -
// confirm callers tolerate that.
+void NV12ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
// Interleave the per-channel coefficients into the pairs passed to YUVTORGB.
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_uv += 8;
+ }
+}
+
// Converts a Y row (src_y) plus an interleaved chroma row (src_uv) to RGB565,
// using the coefficients supplied in yuvconstants.
// The loop runs width / 8 times; per iteration src_y and src_uv advance by 8
// bytes and 16 bytes are stored to dst_rgb565. A width % 8 tail is not
// handled here.
// NOTE(review): YUVTORGB_SETUP and YUVTORGB are macros defined outside this
// chunk. const_80 and zero are never referenced directly below, so presumably
// the macros use them by name - confirm before removing them.
+void NV12ToRGB565Row_LSX(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_vu = __lsx_vld(src_uv, 0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
// Drop the low bits of each channel (3 for b and r, 2 for g), then pack each
// 16-bit lane as r << 11 | g << 5 | b.
+ out_b = __lsx_vsrli_h(out_b, 3);
+ out_g = __lsx_vsrli_h(out_g, 2);
+ out_r = __lsx_vsrli_h(out_r, 3);
+ out_g = __lsx_vslli_h(out_g, 5);
+ out_r = __lsx_vslli_h(out_r, 11);
+ out_r = __lsx_vor_v(out_r, out_g);
+ out_r = __lsx_vor_v(out_r, out_b);
+ __lsx_vst(out_r, dst_rgb565, 0);
+ src_y += 8;
+ src_uv += 8;
+ dst_rgb565 += 16;
+ }
+}
+
// Converts a Y row (src_y) plus an interleaved chroma row (src_vu) to ARGB,
// using the coefficients supplied in yuvconstants.
// Compared with the NV12 variant, the coefficient pairs are interleaved in
// the opposite order (ub/vr, ug/vg) and the first and last YUVTORGB outputs
// are swapped (out_r first, out_b last).
// The loop runs width / 8 times and advances src_y and src_vu by 8 bytes per
// iteration; a width % 8 tail is not handled here.
// NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are macros defined
// outside this chunk. const_80 and zero are never referenced directly below,
// so presumably the macros use them by name; dst_argb is never advanced in
// this function, so presumably STOREARGB advances it. Confirm both.
+void NV21ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i vec_y, vec_uv;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_ubvr, vec_ugvg;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_uv = __lsx_vld(src_vu, 0);
+ YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_r, out_g,
+ out_b);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 8;
+ src_vu += 8;
+ }
+}
+
// Combines src_sobelx and src_sobely with a saturating unsigned byte add and
// expands every resulting byte s into the four output bytes s, s, s, 0xFF.
// Each iteration consumes 16 bytes from each source and stores 64 bytes to
// dst_argb. The loop runs width / 16 times, so a width % 16 tail is not
// handled here.
+void SobelRow_LSX(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, tmp0;
+ __m128i out0, out1, out2, out3;
+ __m128i alpha = __lsx_vldi(0xFF);
// shuff0 repeats sum bytes 0..3 three times each, followed by an index >= 0x10
// that selects from the alpha vector. Adding 4 to every index yields the masks
// for the next four sum bytes (the alpha index stays >= 0x10).
+ __m128i shuff0 = {0x1001010110000000, 0x1003030310020202};
+ __m128i shuff1 = __lsx_vaddi_bu(shuff0, 0x04);
+ __m128i shuff2 = __lsx_vaddi_bu(shuff1, 0x04);
+ __m128i shuff3 = __lsx_vaddi_bu(shuff2, 0x04);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_sobelx, 0);
+ src1 = __lsx_vld(src_sobely, 0);
+ tmp0 = __lsx_vsadd_bu(src0, src1);
+ DUP4_ARG3(__lsx_vshuf_b, alpha, tmp0, shuff0, alpha, tmp0, shuff1, alpha,
+ tmp0, shuff2, alpha, tmp0, shuff3, out0, out1, out2, out3);
+ __lsx_vst(out0, dst_argb, 0);
+ __lsx_vst(out1, dst_argb, 16);
+ __lsx_vst(out2, dst_argb, 32);
+ __lsx_vst(out3, dst_argb, 48);
+ src_sobelx += 16;
+ src_sobely += 16;
+ dst_argb += 64;
+ }
+}
+
// Stores the saturating unsigned byte sum of src_sobelx and src_sobely to
// dst_y. Handles 32 bytes per iteration for width / 32 iterations, so a
// width % 32 tail is not handled here.
+void SobelToPlaneRow_LSX(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width) {
+  int blocks = width / 32;
+
+  for (int i = 0; i < blocks; ++i) {
// All four loads are issued before either store.
+    __m128i gx_lo = __lsx_vld(src_sobelx, 0);
+    __m128i gx_hi = __lsx_vld(src_sobelx, 16);
+    __m128i gy_lo = __lsx_vld(src_sobely, 0);
+    __m128i gy_hi = __lsx_vld(src_sobely, 16);
+    __m128i sum_lo = __lsx_vsadd_bu(gx_lo, gy_lo);
+    __m128i sum_hi = __lsx_vsadd_bu(gx_hi, gy_hi);
+    __lsx_vst(sum_lo, dst_y, 0);
+    __lsx_vst(sum_hi, dst_y, 16);
+    dst_y += 32;
+    src_sobelx += 32;
+    src_sobely += 32;
+  }
+}
+
// Builds 4-byte output pixels from the two Sobel rows. In memory order each
// pixel is: the src_sobely byte, the saturating sum of both inputs, the
// src_sobelx byte, then 0xFF.
// Each iteration consumes 16 bytes from each source and stores 64 bytes to
// dst_argb. The loop runs width / 16 times, so a width % 16 tail is not
// handled here.
+void SobelXYRow_LSX(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width) {
+  __m128i opaque = __lsx_vldi(0xFF);
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i gx = __lsx_vld(src_sobelx, 0);
+    __m128i gy = __lsx_vld(src_sobely, 0);
+    __m128i both = __lsx_vsadd_bu(gx, gy);
// Byte interleave: (gy, sum) pairs and (gx, 0xFF) pairs.
+    __m128i pair01_lo = __lsx_vilvl_b(both, gy);
+    __m128i pair01_hi = __lsx_vilvh_b(both, gy);
+    __m128i pair23_lo = __lsx_vilvl_b(opaque, gx);
+    __m128i pair23_hi = __lsx_vilvh_b(opaque, gx);
// Halfword interleave joins the pairs into full 4-byte pixels.
+    __lsx_vst(__lsx_vilvl_h(pair23_lo, pair01_lo), dst_argb, 0);
+    __lsx_vst(__lsx_vilvh_h(pair23_lo, pair01_lo), dst_argb, 16);
+    __lsx_vst(__lsx_vilvl_h(pair23_hi, pair01_hi), dst_argb, 32);
+    __lsx_vst(__lsx_vilvh_h(pair23_hi, pair01_hi), dst_argb, 48);
+    dst_argb += 64;
+    src_sobelx += 16;
+    src_sobely += 16;
+  }
+}
+
// Computes a row of Y bytes from 4-byte-per-pixel input (the "J" variant).
// Each iteration consumes 64 source bytes and stores 16 bytes to dst_y. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  __m128i rounding = __lsx_vldi(0x480);
+  __m128i odd_coeff = __lsx_vldi(0x96);
+  __m128i even_coeff = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_argb, 0);
+    __m128i in1 = __lsx_vld(src_argb, 16);
+    __m128i in2 = __lsx_vld(src_argb, 32);
+    __m128i in3 = __lsx_vld(src_argb, 48);
// Split every pixel into its even-offset bytes (0, 2) and odd-offset bytes
// (1, 3).
+    __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    __m128i odd_hi = __lsx_vpickod_b(in3, in2);
// rounding + byte1 * odd_coeff, then add the dot product of bytes 0 and 2
// with even_coeff.
+    __m128i sum_lo = __lsx_vmaddwev_h_bu(rounding, odd_lo, odd_coeff);
+    __m128i sum_hi = __lsx_vmaddwev_h_bu(rounding, odd_hi, odd_coeff);
+    sum_lo = __lsx_vdp2add_h_bu(sum_lo, even_coeff, even_lo);
+    sum_hi = __lsx_vdp2add_h_bu(sum_hi, even_coeff, even_hi);
// Keep the odd (upper) byte of each 16-bit sum.
+    __lsx_vst(__lsx_vpickod_b(sum_hi, sum_lo), dst_y, 0);
+    src_argb += 64;
+    dst_y += 16;
+  }
+}
+
// Computes a row of Y bytes from 4-byte-per-pixel BGRA input.
// Each iteration consumes 64 source bytes and stores 16 bytes to dst_y. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  __m128i single_coeff = __lsx_vldi(0x81);
+  __m128i pair_coeff = {0x1942194219421942, 0x1942194219421942};
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_bgra, 0);
+    __m128i in1 = __lsx_vld(src_bgra, 16);
+    __m128i in2 = __lsx_vld(src_bgra, 32);
+    __m128i in3 = __lsx_vld(src_bgra, 48);
// Odd-offset bytes (1, 3) feed the two-term dot product; of the even-offset
// bytes (0, 2) only byte 2 is used, through the odd-lane multiply-add.
+    __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    __m128i odd_hi = __lsx_vpickod_b(in3, in2);
+    __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    __m128i sum_lo = __lsx_vmaddwod_h_bu(bias, even_lo, single_coeff);
+    __m128i sum_hi = __lsx_vmaddwod_h_bu(bias, even_hi, single_coeff);
+    sum_lo = __lsx_vdp2add_h_bu(sum_lo, pair_coeff, odd_lo);
+    sum_hi = __lsx_vdp2add_h_bu(sum_hi, pair_coeff, odd_hi);
// Shift each 16-bit sum right by 8 and narrow to bytes.
+    __lsx_vst(__lsx_vsrlni_b_h(sum_hi, sum_lo, 8), dst_y, 0);
+    src_bgra += 64;
+    dst_y += 16;
+  }
+}
+
// Produces U and V rows from two BGRA rows: src_bgra and the row located
// src_stride_bgra bytes after it.
// Each iteration reads 64 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// NOTE(review): RGBTOUV is a macro defined outside this chunk. The const_*
// locals are never referenced directly below, so presumably the macro picks
// them up by name - confirm before renaming or removing any of them.
+void BGRAToUVRow_LSX(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_bgra = src_bgra + src_stride_bgra;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_bgra, 0, next_bgra, 16, next_bgra, 32, next_bgra,
+ 48, nex0, nex1, nex2, nex3);
// Two rounds of even/odd byte picking de-interleave the 4-byte pixels:
// r = byte 1, g = byte 2, b = byte 3 of each pixel (byte 0 is unused).
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
// Same de-interleave for the second row.
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
// 64-bit element 0 of dst0 goes to dst_u, element 1 to dst_v.
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_bgra += 64;
+ next_bgra += 64;
+ }
+}
+
// Computes a row of Y bytes from 4-byte-per-pixel ABGR input.
// Each iteration consumes 64 source bytes and stores 16 bytes to dst_y. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  __m128i single_coeff = __lsx_vldi(0x81);
+  __m128i pair_coeff = {0x1942194219421942, 0x1942194219421942};
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_abgr, 0);
+    __m128i in1 = __lsx_vld(src_abgr, 16);
+    __m128i in2 = __lsx_vld(src_abgr, 32);
+    __m128i in3 = __lsx_vld(src_abgr, 48);
// Even-offset bytes (0, 2) feed the two-term dot product; of the odd-offset
// bytes (1, 3) only byte 1 is used, through the even-lane multiply-add.
+    __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    __m128i odd_hi = __lsx_vpickod_b(in3, in2);
+    __m128i sum_lo = __lsx_vmaddwev_h_bu(bias, odd_lo, single_coeff);
+    __m128i sum_hi = __lsx_vmaddwev_h_bu(bias, odd_hi, single_coeff);
+    sum_lo = __lsx_vdp2add_h_bu(sum_lo, pair_coeff, even_lo);
+    sum_hi = __lsx_vdp2add_h_bu(sum_hi, pair_coeff, even_hi);
// Shift each 16-bit sum right by 8 and narrow to bytes.
+    __lsx_vst(__lsx_vsrlni_b_h(sum_hi, sum_lo, 8), dst_y, 0);
+    src_abgr += 64;
+    dst_y += 16;
+  }
+}
+
// Produces U and V rows from two ABGR rows: src_abgr and the row located
// src_stride_abgr bytes after it.
// Each iteration reads 64 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// NOTE(review): RGBTOUV is a macro defined outside this chunk. The const_*
// locals are never referenced directly below, so presumably the macro picks
// them up by name - confirm before renaming or removing any of them.
+void ABGRToUVRow_LSX(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_abgr = src_abgr + src_stride_abgr;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_abgr, 0, next_abgr, 16, next_abgr, 32, next_abgr,
+ 48, nex0, nex1, nex2, nex3);
// Two rounds of even/odd byte picking de-interleave the 4-byte pixels:
// r = byte 0, g = byte 1, b = byte 2 of each pixel (byte 3 is unused).
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpb = __lsx_vpickod_b(tmp2, tmp0);
+ tmpr = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
// Same de-interleave for the second row.
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexb = __lsx_vpickod_b(tmp2, tmp0);
+ nexr = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
// 64-bit element 0 of dst0 goes to dst_u, element 1 to dst_v.
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_abgr += 64;
+ next_abgr += 64;
+ }
+}
+
// Computes a row of Y bytes from 4-byte-per-pixel RGBA input.
// Each iteration consumes 64 source bytes and stores 16 bytes to dst_y. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  __m128i single_coeff = __lsx_vldi(0x81);
+  __m128i pair_coeff = {0x4219421942194219, 0x4219421942194219};
+  __m128i bias = {0x1080108010801080, 0x1080108010801080};
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_rgba, 0);
+    __m128i in1 = __lsx_vld(src_rgba, 16);
+    __m128i in2 = __lsx_vld(src_rgba, 32);
+    __m128i in3 = __lsx_vld(src_rgba, 48);
// Odd-offset bytes (1, 3) feed the two-term dot product; of the even-offset
// bytes (0, 2) only byte 2 is used, through the odd-lane multiply-add.
+    __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    __m128i odd_hi = __lsx_vpickod_b(in3, in2);
+    __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    __m128i sum_lo = __lsx_vmaddwod_h_bu(bias, even_lo, single_coeff);
+    __m128i sum_hi = __lsx_vmaddwod_h_bu(bias, even_hi, single_coeff);
+    sum_lo = __lsx_vdp2add_h_bu(sum_lo, pair_coeff, odd_lo);
+    sum_hi = __lsx_vdp2add_h_bu(sum_hi, pair_coeff, odd_hi);
// Shift each 16-bit sum right by 8 and narrow to bytes.
+    __lsx_vst(__lsx_vsrlni_b_h(sum_hi, sum_lo, 8), dst_y, 0);
+    src_rgba += 64;
+    dst_y += 16;
+  }
+}
+
// Produces U and V rows from two RGBA rows: src_rgba and the row located
// src_stride_rgba bytes after it.
// Each iteration reads 64 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// NOTE(review): RGBTOUV is a macro defined outside this chunk. The const_*
// locals are never referenced directly below, so presumably the macro picks
// them up by name - confirm before renaming or removing any of them.
+void RGBAToUVRow_LSX(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_rgba = src_rgba + src_stride_rgba;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ __m128i const_112 = __lsx_vldi(0x438);
+ __m128i const_74 = __lsx_vldi(0x425);
+ __m128i const_38 = __lsx_vldi(0x413);
+ __m128i const_94 = __lsx_vldi(0x42F);
+ __m128i const_18 = __lsx_vldi(0x409);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_rgba, 0, next_rgba, 16, next_rgba, 32, next_rgba,
+ 48, nex0, nex1, nex2, nex3);
// Two rounds of even/odd byte picking de-interleave the 4-byte pixels:
// b = byte 1, g = byte 2, r = byte 3 of each pixel (byte 0 is unused).
+ tmp0 = __lsx_vpickod_b(src1, src0);
+ tmp1 = __lsx_vpickev_b(src1, src0);
+ tmp2 = __lsx_vpickod_b(src3, src2);
+ tmp3 = __lsx_vpickev_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickod_b(tmp3, tmp1);
// Same de-interleave for the second row.
+ tmp0 = __lsx_vpickod_b(nex1, nex0);
+ tmp1 = __lsx_vpickev_b(nex1, nex0);
+ tmp2 = __lsx_vpickod_b(nex3, nex2);
+ tmp3 = __lsx_vpickev_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickod_b(tmp3, tmp1);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst0);
// 64-bit element 0 of dst0 goes to dst_u, element 1 to dst_v.
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_rgba += 64;
+ next_rgba += 64;
+ }
+}
+
// Produces U and V rows (the "J" variant) from two ARGB rows: src_argb and
// the row located src_stride_argb bytes after it.
// Each iteration reads 64 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The loop runs width / 16 times, so a width % 16 tail is
// not handled here.
// Unlike the other UV rows in this chunk, the arithmetic is written out
// inline rather than through the RGBTOUV macro.
+void ARGBToUVJRow_LSX(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ const uint8_t* next_argb = src_argb + src_stride_argb;
+ int len = width / 16;
+ __m128i src0, src1, src2, src3;
+ __m128i nex0, nex1, nex2, nex3;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i reg0, reg1, dst0;
+ __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
// NOTE(review): the const_NN names presumably state the 16-bit value each
// __lsx_vldi immediate expands to - confirm against the vldi encoding.
+ __m128i const_63 = __lsx_vldi(0x43F);
+ __m128i const_42 = __lsx_vldi(0x42A);
+ __m128i const_21 = __lsx_vldi(0x415);
+ __m128i const_53 = __lsx_vldi(0x435);
+ __m128i const_10 = __lsx_vldi(0x40A);
+ __m128i const_8080 = {0x8080808080808080, 0x8080808080808080};
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, next_argb, 0, next_argb, 16, next_argb, 32, next_argb,
+ 48, nex0, nex1, nex2, nex3);
// De-interleave the 4-byte pixels: b = byte 0, g = byte 1, r = byte 2 of
// each pixel (byte 3 is unused).
+ tmp0 = __lsx_vpickev_b(src1, src0);
+ tmp1 = __lsx_vpickod_b(src1, src0);
+ tmp2 = __lsx_vpickev_b(src3, src2);
+ tmp3 = __lsx_vpickod_b(src3, src2);
+ tmpr = __lsx_vpickod_b(tmp2, tmp0);
+ tmpb = __lsx_vpickev_b(tmp2, tmp0);
+ tmpg = __lsx_vpickev_b(tmp3, tmp1);
// Same de-interleave for the second row.
+ tmp0 = __lsx_vpickev_b(nex1, nex0);
+ tmp1 = __lsx_vpickod_b(nex1, nex0);
+ tmp2 = __lsx_vpickev_b(nex3, nex2);
+ tmp3 = __lsx_vpickod_b(nex3, nex2);
+ nexr = __lsx_vpickod_b(tmp2, tmp0);
+ nexb = __lsx_vpickev_b(tmp2, tmp0);
+ nexg = __lsx_vpickev_b(tmp3, tmp1);
// Vertical step: widening adds of the two rows, split into even-indexed and
// odd-indexed pixels.
+ tmp0 = __lsx_vaddwev_h_bu(tmpb, nexb);
+ tmp1 = __lsx_vaddwod_h_bu(tmpb, nexb);
+ tmp2 = __lsx_vaddwev_h_bu(tmpg, nexg);
+ tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
+ reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
+ reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
// Horizontal step: rounded average of the even and odd column sums, giving
// one 16-bit value per 2x2 block and channel.
+ tmpb = __lsx_vavgr_hu(tmp0, tmp1);
+ tmpg = __lsx_vavgr_hu(tmp2, tmp3);
+ tmpr = __lsx_vavgr_hu(reg0, reg1);
// reg0 (U) = const_8080 + const_63 * b - const_42 * g - const_21 * r
// reg1 (V) = const_8080 + const_63 * r - const_53 * g - const_10 * b
+ reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
+ reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
+ reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
+ reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
+ reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
+ reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
// Keep the odd (upper) byte of each 16-bit result: U in the low half of
// dst0, V in the high half.
+ dst0 = __lsx_vpickod_b(reg1, reg0);
+ __lsx_vstelm_d(dst0, dst_u, 0, 0);
+ __lsx_vstelm_d(dst0, dst_v, 0, 1);
+ dst_u += 8;
+ dst_v += 8;
+ src_argb += 64;
+ next_argb += 64;
+ }
+}
+
// Converts separate Y, U and V rows with one chroma sample per pixel to ARGB,
// using the coefficients supplied in yuvconstants.
// Each iteration consumes 16 bytes from each plane and converts them in two
// halves (low 8 bytes, then high 8 bytes). The loop runs width / 16 times, so
// a width % 16 tail is not handled here.
// NOTE(review): YUVTORGB_SETUP, I444TORGB and STOREARGB are macros defined
// outside this chunk. const_80 is never referenced directly below, so
// presumably a macro uses it by name; dst_argb is never advanced in this
// function and STOREARGB is invoked twice per iteration with the same
// pointer, so presumably STOREARGB advances it. Confirm both.
+void I444ToARGBRow_LSX(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_u, vec_v, out_b, out_g, out_r;
+ __m128i vec_yl, vec_yh, vec_ul, vec_vl, vec_uh, vec_vh;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb, vec_ugvg;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i zero = __lsx_vldi(0);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
+ vec_u = __lsx_vld(src_u, 0);
+ vec_v = __lsx_vld(src_v, 0);
// Low 8 pixels: Y bytes are duplicated into both halves of a 16-bit lane,
// U and V bytes are zero-extended to 16 bits.
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
+ vec_ul = __lsx_vilvl_b(zero, vec_u);
+ vec_vl = __lsx_vilvl_b(zero, vec_v);
+ I444TORGB(vec_yl, vec_ul, vec_vl, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
// High 8 pixels, same treatment.
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ vec_uh = __lsx_vilvh_b(zero, vec_u);
+ vec_vh = __lsx_vilvh_b(zero, vec_v);
+ I444TORGB(vec_yh, vec_uh, vec_vh, vec_ub, vec_vr, vec_ugvg, vec_yg, vec_yb,
+ out_b, out_g, out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_y += 16;
+ src_u += 16;
+ src_v += 16;
+ }
+}
+
// Converts a Y-only row to ARGB: each Y byte is scaled with
// yuvconstants->kYToRgb[0], offset by yuvconstants->kYBiasToRgb[0], clipped
// to 0..255 and written to three color bytes followed by a 0xFF alpha byte.
// Each iteration consumes 16 Y bytes and stores 64 bytes to dst_argb. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void I400ToARGBRow_LSX(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 16;
+ __m128i vec_y, vec_yl, vec_yh, out0;
+ __m128i y_ev, y_od, dst0, dst1, dst2, dst3;
+ __m128i temp0, temp1;
+ __m128i alpha = __lsx_vldi(0xFF);
+ __m128i vec_yg = __lsx_vreplgr2vr_h(yuvconstants->kYToRgb[0]);
+ __m128i vec_yb = __lsx_vreplgr2vr_w(yuvconstants->kYBiasToRgb[0]);
+
+ for (x = 0; x < len; x++) {
+ vec_y = __lsx_vld(src_y, 0);
// Low 8 pixels. Interleaving vec_y with itself duplicates each Y byte into
// both halves of a 16-bit lane.
+ vec_yl = __lsx_vilvl_b(vec_y, vec_y);
// Widening multiplies of the even and odd 16-bit lanes by vec_yg, then
// (product >> 16) + vec_yb, >> 6, clip to 0..255.
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yl, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yl, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
// Re-interleave even/odd results into pixel order, then build (y, y) and
// (y, alpha) byte pairs and join them into 4-byte pixels.
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst0 = __lsx_vilvl_h(temp1, temp0);
+ dst1 = __lsx_vilvh_h(temp1, temp0);
// High 8 pixels, same sequence.
+ vec_yh = __lsx_vilvh_b(vec_y, vec_y);
+ y_ev = __lsx_vmulwev_w_hu_h(vec_yh, vec_yg);
+ y_od = __lsx_vmulwod_w_hu_h(vec_yh, vec_yg);
+ y_ev = __lsx_vsrai_w(y_ev, 16);
+ y_od = __lsx_vsrai_w(y_od, 16);
+ y_ev = __lsx_vadd_w(y_ev, vec_yb);
+ y_od = __lsx_vadd_w(y_od, vec_yb);
+ y_ev = __lsx_vsrai_w(y_ev, 6);
+ y_od = __lsx_vsrai_w(y_od, 6);
+ y_ev = __lsx_vclip255_w(y_ev);
+ y_od = __lsx_vclip255_w(y_od);
+ out0 = __lsx_vpackev_h(y_od, y_ev);
+ temp0 = __lsx_vpackev_b(out0, out0);
+ temp1 = __lsx_vpackev_b(alpha, out0);
+ dst2 = __lsx_vilvl_h(temp1, temp0);
+ dst3 = __lsx_vilvh_h(temp1, temp0);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ src_y += 16;
+ }
+}
+
// Expands each Y byte y into the four output bytes y, y, y, 0xFF with no
// scaling applied.
// Each iteration consumes 16 Y bytes and stores 64 bytes to dst_argb. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  __m128i opaque = __lsx_vldi(0xFF);
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i luma = __lsx_vld(src_y, 0);
// Byte interleave: (y, y) pairs and (y, 0xFF) pairs.
+    __m128i yy_lo = __lsx_vilvl_b(luma, luma);
+    __m128i yy_hi = __lsx_vilvh_b(luma, luma);
+    __m128i ya_lo = __lsx_vilvl_b(opaque, luma);
+    __m128i ya_hi = __lsx_vilvh_b(opaque, luma);
// Halfword interleave joins the pairs into full 4-byte pixels.
+    __lsx_vst(__lsx_vilvl_h(ya_lo, yy_lo), dst_argb, 0);
+    __lsx_vst(__lsx_vilvh_h(ya_lo, yy_lo), dst_argb, 16);
+    __lsx_vst(__lsx_vilvl_h(ya_hi, yy_hi), dst_argb, 32);
+    __lsx_vst(__lsx_vilvh_h(ya_hi, yy_hi), dst_argb, 48);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
// Converts a packed YUY2 row to ARGB using the coefficients supplied in
// yuvconstants. The even-indexed source bytes are taken as Y and the
// odd-indexed bytes as chroma.
// Each iteration consumes 16 source bytes. The loop runs width / 8 times, so
// a width % 8 tail is not handled here.
// NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are macros defined
// outside this chunk. const_80 and zero are never referenced directly below,
// so presumably the macros use them by name; dst_argb is never advanced in
// this function, so presumably STOREARGB advances it. Confirm both.
+void YUY2ToARGBRow_LSX(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_yuy2, 0);
// Even bytes -> Y, odd bytes -> chroma.
+ vec_y = __lsx_vpickev_b(src0, src0);
+ vec_vu = __lsx_vpickod_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_yuy2 += 16;
+ }
+}
+
// Converts a packed UYVY row to ARGB using the coefficients supplied in
// yuvconstants. The odd-indexed source bytes are taken as Y and the
// even-indexed bytes as chroma (the mirror image of the YUY2 variant).
// Each iteration consumes 16 source bytes. The loop runs width / 8 times, so
// a width % 8 tail is not handled here.
// NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are macros defined
// outside this chunk. const_80 and zero are never referenced directly below,
// so presumably the macros use them by name; dst_argb is never advanced in
// this function, so presumably STOREARGB advances it. Confirm both.
+void UYVYToARGBRow_LSX(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int len = width / 8;
+ __m128i src0, vec_y, vec_vu;
+ __m128i vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb;
+ __m128i vec_vrub, vec_vgug;
+ __m128i out_b, out_g, out_r;
+ __m128i const_80 = __lsx_vldi(0x480);
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);
+
+ YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb);
+ vec_vrub = __lsx_vilvl_h(vec_vr, vec_ub);
+ vec_vgug = __lsx_vilvl_h(vec_vg, vec_ug);
+
+ for (x = 0; x < len; x++) {
+ src0 = __lsx_vld(src_uyvy, 0);
// Odd bytes -> Y, even bytes -> chroma.
+ vec_y = __lsx_vpickod_b(src0, src0);
+ vec_vu = __lsx_vpickev_b(src0, src0);
+ YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_b, out_g,
+ out_r);
+ STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
+ src_uyvy += 16;
+ }
+}
+
// Blends the row at src_ptr with the row src_stride bytes after it and writes
// the result to dst_ptr. source_y_fraction is the weight of the second row
// out of 256; the first row gets 256 - source_y_fraction.
// Three paths:
//   - fraction 0:   plain copy of the first row;
//   - fraction 128: rounded byte average of the two rows;
//   - otherwise:    (row0 * y0_fraction + row1 * y1_fraction + 128) >> 8.
// Every path handles 32 bytes per iteration for width / 32 iterations, so a
// width % 32 tail is not handled here.
+void InterpolateRow_LSX(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int32_t source_y_fraction) {
+ int x;
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8_t* nex_ptr = src_ptr + src_stride;
+ uint16_t y_fractions;
+ int len = width / 32;
+ __m128i src0, src1, nex0, nex1;
+ __m128i dst0, dst1, y_frac;
+ __m128i tmp0, tmp1, tmp2, tmp3;
// Rounding term added before the final >> 8.
// NOTE(review): presumably 128 in every 16-bit lane, per the name - confirm
// against the vldi encoding.
+ __m128i const_128 = __lsx_vldi(0x480);
+
+ if (y1_fraction == 0) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ __lsx_vst(src0, dst_ptr, 0);
+ __lsx_vst(src1, dst_ptr, 16);
+ src_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
+ if (y1_fraction == 128) {
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
+ dst0 = __lsx_vavgr_bu(src0, nex0);
+ dst1 = __lsx_vavgr_bu(src1, nex1);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+ return;
+ }
+
// Pack both weights into one 16-bit value: y0_fraction in the low byte,
// y1_fraction in the high byte, matching the (row0, row1) byte pairs produced
// by the interleaves below.
+ y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
+ y_frac = __lsx_vreplgr2vr_h(y_fractions);
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, nex_ptr, 0, nex_ptr, 16, nex0, nex1);
// Interleave so each 16-bit lane holds (row0 byte, row1 byte).
+ tmp0 = __lsx_vilvl_b(nex0, src0);
+ tmp1 = __lsx_vilvh_b(nex0, src0);
+ tmp2 = __lsx_vilvl_b(nex1, src1);
+ tmp3 = __lsx_vilvh_b(nex1, src1);
// Per lane: const_128 + row0 * y0_fraction + row1 * y1_fraction.
+ tmp0 = __lsx_vdp2add_h_bu(const_128, tmp0, y_frac);
+ tmp1 = __lsx_vdp2add_h_bu(const_128, tmp1, y_frac);
+ tmp2 = __lsx_vdp2add_h_bu(const_128, tmp2, y_frac);
+ tmp3 = __lsx_vdp2add_h_bu(const_128, tmp3, y_frac);
// Shift right by 8 and narrow back to bytes.
+ dst0 = __lsx_vsrlni_b_h(tmp1, tmp0, 8);
+ dst1 = __lsx_vsrlni_b_h(tmp3, tmp2, 8);
+ __lsx_vst(dst0, dst_ptr, 0);
+ __lsx_vst(dst1, dst_ptr, 16);
+ src_ptr += 32;
+ nex_ptr += 32;
+ dst_ptr += 32;
+ }
+}
+
// Fills dst_argb with the 32-bit value v32, 16 bytes (four copies of v32) per
// store. The loop runs width / 4 times, so a width % 4 tail is not handled
// here.
+void ARGBSetRow_LSX(uint8_t* dst_argb, uint32_t v32, int width) {
+  __m128i fill = __lsx_vreplgr2vr_w(v32);
+
+  for (int remaining = width / 4; remaining > 0; --remaining) {
+    __lsx_vst(fill, dst_argb, 0);
+    dst_argb += 16;
+  }
+}
+
// Reverses the byte order inside every 3-byte pixel of src_raw and writes the
// result to dst_rgb24.
// Each iteration consumes 48 source bytes and stores 48 bytes. The loop runs
// width / 16 times, so a width % 16 tail is not handled here.
+void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ int x;
+ int len = width / 16;
+ __m128i src0, src1, src2;
+ __m128i dst0, dst1, dst2;
// shuf0 and shuf1 index into the 32 bytes of src1:src0; shuf2 indexes into
// src1:src2 (indices >= 0x10 select from src1).
+ __m128i shuf0 = {0x0708030405000102, 0x110C0D0E090A0B06};
+ __m128i shuf1 = {0x1516171213140F10, 0x1F1E1B1C1D18191A};
+ __m128i shuf2 = {0x090405060102031E, 0x0D0E0F0A0B0C0708};
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_raw, 0, src_raw, 16, src0, src1);
+ src2 = __lsx_vld(src_raw, 32);
+ DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuf0, src1, src0, shuf1, dst0, dst1);
+ dst2 = __lsx_vshuf_b(src1, src2, shuf2);
// Output byte 30 (byte 14 of dst1) must come from source byte 32, which lives
// in src2 and is out of reach of the src1:src0 shuffle, so it is patched in
// from memory.
+ dst1 = __lsx_vinsgr2vr_b(dst1, src_raw[32], 0x0E);
+ __lsx_vst(dst0, dst_rgb24, 0);
+ __lsx_vst(dst1, dst_rgb24, 16);
+ __lsx_vst(dst2, dst_rgb24, 32);
+ dst_rgb24 += 48;
+ src_raw += 48;
+ }
+}
+
// Interleaves src_u and src_v bytes into dst_uv as u0, v0, u1, v1, ...
// Each iteration consumes 16 bytes from each source and stores 32 bytes. The
// loop runs width / 16 times, so a width % 16 tail is not handled here.
+void MergeUVRow_LSX(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  int blocks = width / 16;
+
+  for (int i = 0; i < blocks; ++i) {
+    __m128i u_bytes = __lsx_vld(src_u, 0);
+    __m128i v_bytes = __lsx_vld(src_v, 0);
+    __m128i uv_lo = __lsx_vilvl_b(v_bytes, u_bytes);
+    __m128i uv_hi = __lsx_vilvh_b(v_bytes, u_bytes);
+    __lsx_vst(uv_lo, dst_uv, 0);
+    __lsx_vst(uv_hi, dst_uv, 16);
+    dst_uv += 32;
+    src_u += 16;
+    src_v += 16;
+  }
+}
+
// Copies byte 3 of every 4-byte source pixel to dst_a.
// Each iteration consumes 64 source bytes and stores 16 bytes. The loop runs
// width / 16 times, so a width % 16 tail is not handled here.
+void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  int blocks = width / 16;
+
+  while (blocks-- > 0) {
+    __m128i in0 = __lsx_vld(src_argb, 0);
+    __m128i in1 = __lsx_vld(src_argb, 16);
+    __m128i in2 = __lsx_vld(src_argb, 32);
+    __m128i in3 = __lsx_vld(src_argb, 48);
// Picking odd bytes twice keeps byte 3 of every 4-byte group.
+    __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    __m128i odd_hi = __lsx_vpickod_b(in3, in2);
+    __lsx_vst(__lsx_vpickod_b(odd_hi, odd_lo), dst_a, 0);
+    dst_a += 16;
+    src_argb += 64;
+  }
+}
+
+void ARGBBlendRow_LSX(const uint8_t* src_argb,  // Per byte: dst = saturate(src_argb + ((src_argb1 * (256 - a)) >> 8)), where a is byte 3 of the src_argb pixel; byte 3 of dst is forced to 0xFF.
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ int len = width / 8;  // 8 pixels = 32 bytes per source per iteration; a width % 8 remainder is not processed here.
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, dst0, dst1;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i a0, a1, a2, a3;
+ __m128i const_256 = __lsx_vldi(0x500);  // 256 in every 16-bit lane.
+ __m128i zero = __lsx_vldi(0);
+ __m128i alpha = __lsx_vldi(0xFF);  // 0xFF in every byte.
+ __m128i control = {0xFF000000FF000000, 0xFF000000FF000000};  // Bit mask covering byte 3 of every 4-byte pixel.
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb1, 0, src_argb1, 16,
+ src0, src1, src2, src3);
+ tmp0 = __lsx_vshuf4i_b(src0, 0xFF);  // Broadcast byte 3 of each pixel to all four bytes of that pixel.
+ tmp1 = __lsx_vshuf4i_b(src1, 0xFF);
+ a0 = __lsx_vilvl_b(zero, tmp0);  // Zero-extend the broadcast bytes to 16 bits.
+ a1 = __lsx_vilvh_b(zero, tmp0);
+ a2 = __lsx_vilvl_b(zero, tmp1);
+ a3 = __lsx_vilvh_b(zero, tmp1);
+ reg0 = __lsx_vilvl_b(zero, src2);  // Zero-extend the src_argb1 bytes to 16 bits.
+ reg1 = __lsx_vilvh_b(zero, src2);
+ reg2 = __lsx_vilvl_b(zero, src3);
+ reg3 = __lsx_vilvh_b(zero, src3);
+ DUP4_ARG2(__lsx_vsub_h, const_256, a0, const_256, a1, const_256, a2,
+ const_256, a3, a0, a1, a2, a3);  // a = 256 - a.
+ DUP4_ARG2(__lsx_vmul_h, a0, reg0, a1, reg1, a2, reg2, a3, reg3, reg0, reg1,
+ reg2, reg3);  // 16-bit products of src_argb1 bytes and (256 - a).
+ DUP2_ARG3(__lsx_vsrani_b_h, reg1, reg0, 8, reg3, reg2, 8, dst0, dst1);  // Shift right by 8 and narrow back to bytes.
+ dst0 = __lsx_vsadd_bu(dst0, src0);  // Unsigned saturating add of the src_argb bytes.
+ dst1 = __lsx_vsadd_bu(dst1, src1);
+ dst0 = __lsx_vbitsel_v(dst0, alpha, control);  // Overwrite byte 3 of every pixel with 0xFF.
+ dst1 = __lsx_vbitsel_v(dst1, alpha, control);
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBQuantizeRow_LSX(uint8_t* dst_argb,  // In place, per byte: v = ((v * scale) >> 16) * interval_size + interval_offset (byte arithmetic); byte 3 of each pixel is left unchanged.
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ int x;
+ int len = width / 16;  // 16 pixels = 64 bytes per iteration; a width % 16 remainder is not processed here.
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec_size = __lsx_vreplgr2vr_b(interval_size);  // Replicated per byte lane.
+ __m128i vec_offset = __lsx_vreplgr2vr_b(interval_offset);  // Replicated per byte lane.
+ __m128i vec_scale = __lsx_vreplgr2vr_w(scale);  // scale in every 32-bit lane.
+ __m128i zero = __lsx_vldi(0);
+ __m128i control = {0xFF000000FF000000, 0xFF000000FF000000};  // Bit mask covering byte 3 of every 4-byte pixel.
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, dst_argb, 32, dst_argb, 48,
+ src0, src1, src2, src3);
+ reg0 = __lsx_vilvl_b(zero, src0);  // Zero-extend bytes to 16 bits (reg0..reg7 cover all 64 bytes).
+ reg1 = __lsx_vilvh_b(zero, src0);
+ reg2 = __lsx_vilvl_b(zero, src1);
+ reg3 = __lsx_vilvh_b(zero, src1);
+ reg4 = __lsx_vilvl_b(zero, src2);
+ reg5 = __lsx_vilvh_b(zero, src2);
+ reg6 = __lsx_vilvl_b(zero, src3);
+ reg7 = __lsx_vilvh_b(zero, src3);
+ tmp0 = __lsx_vilvl_h(zero, reg0);  // Zero-extend again to 32 bits; this group handles src0 and src1.
+ tmp1 = __lsx_vilvh_h(zero, reg0);
+ tmp2 = __lsx_vilvl_h(zero, reg1);
+ tmp3 = __lsx_vilvh_h(zero, reg1);
+ tmp4 = __lsx_vilvl_h(zero, reg2);
+ tmp5 = __lsx_vilvh_h(zero, reg2);
+ tmp6 = __lsx_vilvl_h(zero, reg3);
+ tmp7 = __lsx_vilvh_h(zero, reg3);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);  // (v * scale) >> 16, narrowed to 16 bits; reg0..reg3 are reused as outputs.
+ dst0 = __lsx_vpickev_b(reg1, reg0);  // Keep the low byte of each 16-bit lane.
+ dst1 = __lsx_vpickev_b(reg3, reg2);
+ tmp0 = __lsx_vilvl_h(zero, reg4);  // Same widen/multiply/shift sequence for src2 and src3.
+ tmp1 = __lsx_vilvh_h(zero, reg4);
+ tmp2 = __lsx_vilvl_h(zero, reg5);
+ tmp3 = __lsx_vilvh_h(zero, reg5);
+ tmp4 = __lsx_vilvl_h(zero, reg6);
+ tmp5 = __lsx_vilvh_h(zero, reg6);
+ tmp6 = __lsx_vilvl_h(zero, reg7);
+ tmp7 = __lsx_vilvh_h(zero, reg7);
+ DUP4_ARG2(__lsx_vmul_w, tmp0, vec_scale, tmp1, vec_scale, tmp2, vec_scale,
+ tmp3, vec_scale, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vmul_w, tmp4, vec_scale, tmp5, vec_scale, tmp6, vec_scale,
+ tmp7, vec_scale, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG3(__lsx_vsrani_h_w, tmp1, tmp0, 16, tmp3, tmp2, 16, tmp5, tmp4, 16,
+ tmp7, tmp6, 16, reg0, reg1, reg2, reg3);
+ dst2 = __lsx_vpickev_b(reg1, reg0);
+ dst3 = __lsx_vpickev_b(reg3, reg2);
+ DUP4_ARG2(__lsx_vmul_b, dst0, vec_size, dst1, vec_size, dst2, vec_size,
+ dst3, vec_size, dst0, dst1, dst2, dst3);  // Byte-wise multiply by interval_size.
+ DUP4_ARG2(__lsx_vadd_b, dst0, vec_offset, dst1, vec_offset, dst2,
+ vec_offset, dst3, vec_offset, dst0, dst1, dst2, dst3);  // Byte-wise add of interval_offset.
+ DUP4_ARG3(__lsx_vbitsel_v, dst0, src0, control, dst1, src1, control, dst2,
+ src2, control, dst3, src3, control, dst0, dst1, dst2, dst3);  // Restore the original byte 3 of every pixel.
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ __lsx_vst(dst2, dst_argb, 32);
+ __lsx_vst(dst3, dst_argb, 48);
+ dst_argb += 64;
+ }
+}
+
+void ARGBColorMatrixRow_LSX(const uint8_t* src_argb,  // Each output byte k of a pixel = clamp((sum of the pixel's 4 input bytes times the 4 signed coefficients at matrix_argb[4k..4k+3]) >> 6, 0, 255).
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ int x;
+ int len = width / 8;  // 8 pixels = 32 bytes per iteration; a width % 8 remainder is not processed here.
+ __m128i src0, src1, tmp0, tmp1, dst0, dst1;
+ __m128i tmp_b, tmp_g, tmp_r, tmp_a;
+ __m128i reg_b, reg_g, reg_r, reg_a;
+ __m128i matrix_b = __lsx_vldrepl_w(matrix_argb, 0);  // 4 coefficients for output byte 0, replicated to every 32-bit lane.
+ __m128i matrix_g = __lsx_vldrepl_w(matrix_argb, 4);  // Coefficients for output byte 1.
+ __m128i matrix_r = __lsx_vldrepl_w(matrix_argb, 8);  // Coefficients for output byte 2.
+ __m128i matrix_a = __lsx_vldrepl_w(matrix_argb, 12);  // Coefficients for output byte 3.
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src0, matrix_b, src0, matrix_g, src0, matrix_r,
+ src0, matrix_a, tmp_b, tmp_g, tmp_r, tmp_a);  // Pairwise unsigned-byte x signed-byte products summed into 16-bit lanes (pixels 0-3).
+ DUP4_ARG2(__lsx_vdp2_h_bu_b, src1, matrix_b, src1, matrix_g, src1, matrix_r,
+ src1, matrix_a, reg_b, reg_g, reg_r, reg_a);  // Same for pixels 4-7.
+ DUP4_ARG2(__lsx_vhaddw_w_h, tmp_b, tmp_b, tmp_g, tmp_g, tmp_r, tmp_r, tmp_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);  // Add adjacent 16-bit partial sums into one 32-bit sum per pixel.
+ DUP4_ARG2(__lsx_vhaddw_w_h, reg_b, reg_b, reg_g, reg_g, reg_r, reg_r, reg_a,
+ reg_a, reg_b, reg_g, reg_r, reg_a);
+ DUP4_ARG2(__lsx_vsrai_w, tmp_b, 6, tmp_g, 6, tmp_r, 6, tmp_a, 6, tmp_b,
+ tmp_g, tmp_r, tmp_a);  // Arithmetic shift right by 6.
+ DUP4_ARG2(__lsx_vsrai_w, reg_b, 6, reg_g, 6, reg_r, 6, reg_a, 6, reg_b,
+ reg_g, reg_r, reg_a);
+ DUP4_ARG1(__lsx_vclip255_w, tmp_b, tmp_g, tmp_r, tmp_a, tmp_b, tmp_g, tmp_r,
+ tmp_a)  // Clamp to 0..255. NOTE(review): no trailing ';' - only valid if DUP4_ARG1 expands to a complete statement/block; confirm.
+ DUP4_ARG1(__lsx_vclip255_w, reg_b, reg_g, reg_r, reg_a, reg_b, reg_g, reg_r,
+ reg_a)  // NOTE(review): same missing ';' as above.
+ DUP4_ARG2(__lsx_vpickev_h, reg_b, tmp_b, reg_g, tmp_g, reg_r, tmp_r, reg_a,
+ tmp_a, tmp_b, tmp_g, tmp_r, tmp_a);  // Narrow 32-bit results to 16 bits; pixels 0-3 in the low half, 4-7 in the high half.
+ tmp0 = __lsx_vpackev_b(tmp_g, tmp_b);  // Pair output bytes 0 and 1 of each pixel.
+ tmp1 = __lsx_vpackev_b(tmp_a, tmp_r);  // Pair output bytes 2 and 3 of each pixel.
+ dst0 = __lsx_vilvl_h(tmp1, tmp0);  // Reassemble 4-byte pixels 0-3.
+ dst1 = __lsx_vilvh_h(tmp1, tmp0);  // Reassemble 4-byte pixels 4-7.
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void SplitUVRow_LSX(const uint8_t* src_uv,  // De-interleaves src_uv: even-indexed bytes go to dst_u, odd-indexed bytes go to dst_v.
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;  // 32 byte pairs = 64 input bytes per iteration; a width % 32 remainder is not processed here.
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, dst0, dst1);  // Even bytes.
+ DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst2, dst3);  // Odd bytes.
+ __lsx_vst(dst0, dst_u, 0);
+ __lsx_vst(dst1, dst_u, 16);
+ __lsx_vst(dst2, dst_v, 0);
+ __lsx_vst(dst3, dst_v, 16);
+ src_uv += 64;
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void SetRow_LSX(uint8_t* dst, uint8_t v8, int width) {  // Fills dst with the byte value v8.
+ int x;
+ int len = width / 16;  // 16 bytes per store; a width % 16 remainder is not written here.
+ __m128i dst0 = __lsx_vreplgr2vr_b(v8);  // v8 replicated into every byte lane.
+
+ for (x = 0; x < len; x++) {
+ __lsx_vst(dst0, dst, 0);
+ dst += 16;
+ }
+}
+
+void MirrorSplitUVRow_LSX(const uint8_t* src_uv,  // Reads src_uv from its end backwards, writing even-indexed bytes to dst_u and odd-indexed bytes to dst_v in reversed order.
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ int len = width / 32;  // 32 byte pairs = 64 input bytes per iteration; a width % 32 remainder is not processed here.
+ __m128i src0, src1, src2, src3;
+ __m128i dst0, dst1, dst2, dst3;
+ __m128i shuff0 = {0x10121416181A1C1E, 0x00020406080A0C0E};  // Even byte indices 30, 28, ..., 0 (descending).
+ __m128i shuff1 = {0x11131517191B1D1F, 0x01030507090B0D0F};  // Odd byte indices 31, 29, ..., 1 (descending).
+
+ src_uv += (width << 1);  // Point one past the last byte of the row (2 bytes per pair).
+ for (x = 0; x < len; x++) {
+ src_uv -= 64;  // Step back before loading, so the row is consumed from the end.
+ DUP4_ARG2(__lsx_vld, src_uv, 0, src_uv, 16, src_uv, 32, src_uv, 48, src2,
+ src3, src0, src1);  // src0/src1 receive the higher-address 32 bytes, src2/src3 the lower 32.
+ DUP4_ARG3(__lsx_vshuf_b, src1, src0, shuff1, src3, src2, shuff1, src1, src0,
+ shuff0, src3, src2, shuff0, dst0, dst1, dst2, dst3);  // dst0/dst1: reversed odd bytes; dst2/dst3: reversed even bytes.
+ __lsx_vst(dst0, dst_v, 0);
+ __lsx_vst(dst1, dst_v, 16);
+ __lsx_vst(dst2, dst_u, 0);
+ __lsx_vst(dst3, dst_u, 16);
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void HalfFloatRow_LSX(const uint16_t* src,  // Converts each uint16_t in src to float, multiplies by (1.9259299444e-34f * scale), and stores bits 13..28 of the float's bit pattern as the uint16_t result.
+ uint16_t* dst,
+ float scale,
+ int width) {
+ int x;
+ int len = width / 32;  // 32 elements = 64 bytes per iteration; a width % 32 remainder is not processed here.
+ float mult = 1.9259299444e-34f * scale;  // 1.9259299444e-34 is 2^-112; presumably rebiases the float exponent for a 16-bit half-float encoding - confirm against the C reference.
+ __m128i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128 vec_mult = (__m128)__lsx_vldrepl_w(&mult, 0);  // mult replicated into every 32-bit lane.
+ __m128i zero = __lsx_vldi(0);
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+ src3);
+ DUP4_ARG2(__lsx_vilvl_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp0, tmp2, tmp4, tmp6);  // Zero-extend the low four 16-bit values of each vector to 32 bits.
+ DUP4_ARG2(__lsx_vilvh_h, zero, src0, zero, src1, zero, src2, zero, src3,
+ tmp1, tmp3, tmp5, tmp7);  // Zero-extend the high four.
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp0, tmp2, tmp4, tmp6, reg0, reg2, reg4,
+ reg6);  // Unsigned 32-bit integer to single-precision float.
+ DUP4_ARG1(__lsx_vffint_s_wu, tmp1, tmp3, tmp5, tmp7, reg1, reg3, reg5,
+ reg7);
+ DUP4_ARG2(__lsx_vfmul_s, reg0, vec_mult, reg1, vec_mult, reg2, vec_mult,
+ reg3, vec_mult, reg0, reg1, reg2, reg3);
+ DUP4_ARG2(__lsx_vfmul_s, reg4, vec_mult, reg5, vec_mult, reg6, vec_mult,
+ reg7, vec_mult, reg4, reg5, reg6, reg7);
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg0, 13, (v4u32)reg1, 13, (v4u32)reg2, 13,
+ (v4u32)reg3, 13, tmp0, tmp1, tmp2, tmp3);  // Logical shift of the raw float bits right by 13.
+ DUP4_ARG2(__lsx_vsrli_w, (v4u32)reg4, 13, (v4u32)reg5, 13, (v4u32)reg6, 13,
+ (v4u32)reg7, 13, tmp4, tmp5, tmp6, tmp7);
+ DUP4_ARG2(__lsx_vpickev_h, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ dst0, dst1, dst2, dst3);  // Keep the low 16 bits of each 32-bit lane.
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ __lsx_vst(dst2, dst, 32);
+ __lsx_vst(dst3, dst, 48);
+ src += 32;  // Pointer is uint16_t*, so this advances 64 bytes.
+ dst += 32;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc
index d8726d09..362fd1cf 100644
--- a/files/source/row_mmi.cc
+++ b/files/source/row_mmi.cc
@@ -21,6 +21,8 @@ extern "C" {
// This module is for Mips MMI.
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+// clang-format off
+
void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
@@ -603,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
: "memory");
}
-void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -611,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -624,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -637,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -650,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -669,35 +671,38 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
+void ARGBToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -707,15 +712,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -725,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -741,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src1], %[src0] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -752,15 +759,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -770,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -786,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src1], %[src0] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -797,15 +806,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -815,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -831,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src1], %[src0] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -842,15 +853,16 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -860,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -888,7 +901,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -898,16 +911,17 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -915,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -928,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -941,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -954,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -973,35 +987,38 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
+void BGRAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1011,15 +1028,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1029,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1045,8 +1064,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src0], %[src1] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1056,15 +1075,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1074,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1090,8 +1111,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src0], %[src1] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1101,15 +1122,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1119,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1135,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src0], %[src1] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1146,15 +1169,16 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1164,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsrl %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
@@ -1192,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -1202,16 +1227,17 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -1219,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1232,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1245,8 +1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1258,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1277,35 +1303,38 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
+void ABGRToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002F00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1315,15 +1344,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
"dsll %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1333,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1349,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src0], %[src1] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1360,15 +1391,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
"dsll %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1378,7 +1410,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1394,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src0], %[src1] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1405,15 +1438,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
"dsll %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1423,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1439,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src0], %[src1] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1450,15 +1485,16 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
"dsll %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1468,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1496,7 +1533,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -1506,16 +1543,17 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -1523,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1536,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1549,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x17(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1562,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1581,35 +1619,38 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
+void RGBAToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1619,15 +1660,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
"dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1637,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1653,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src1], %[src0] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1664,15 +1707,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
"dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1682,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1698,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src1], %[src0] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1709,15 +1754,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
"dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1727,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1743,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src1], %[src0] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1754,15 +1801,16 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
"dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1772,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"paddh %[src0], %[src0], %[src_lo] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
"dsrl %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
@@ -1800,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -1810,16 +1859,17 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -1827,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1841,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1855,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1869,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -1889,35 +1939,38 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x0026004a00700002;
- const uint64_t mask_v = 0x00020070005e0012;
+ const uint64_t mask_u = 0x0013002500380002;
+ const uint64_t mask_v = 0x00020038002f0009;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1929,15 +1982,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
"pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1949,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -1965,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src1], %[src0] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1978,15 +2033,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
"pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -1998,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2014,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src1], %[src0] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2027,15 +2084,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
"pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2047,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2063,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src1], %[src0] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2076,15 +2135,16 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
"pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2096,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
"pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
@@ -2124,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -2134,16 +2195,17 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest0, dest1, dest2, dest3;
const uint64_t value = 0x1080;
@@ -2151,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
__asm__ volatile(
"1: \n\t"
- "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -2165,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest0], %[dest0], %[src] \n\t"
"psrlw %[dest0], %[dest0], %[eight] \n\t"
- "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -2179,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest1], %[dest1], %[src] \n\t"
"psrlw %[dest1], %[dest1], %[eight] \n\t"
- "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -2193,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"paddw %[dest2], %[dest2], %[src] \n\t"
"psrlw %[dest2], %[dest2], %[eight] \n\t"
- "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
- "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
"punpcklbh %[src_lo], %[src], %[zero] \n\t"
"pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
"pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
@@ -2213,35 +2275,38 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
"gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
"gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
- "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
"daddiu %[dst_y], %[dst_y], 0x08 \n\t"
"daddi %[width], %[width], -0x08 \n\t"
"bnez %[width], 1b \n\t"
: [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
[dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
[dest3] "=&f"(dest3)
- : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width),
[mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
[zero] "f"(0x00)
: "memory");
}
-void RAWToUVRow_MMI(const uint8_t* src_rgb0,
+void RAWToUVRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
uint64_t src_rgb1;
- uint64_t ftmp[12];
+ uint64_t ftmp[13];
+ uint64_t tmp[1];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x00020070004a0026;
- const uint64_t mask_v = 0x0012005e00700002;
+ const uint64_t mask_u = 0x0002003800250013;
+ const uint64_t mask_v = 0x0009002f00380002;
__asm__ volatile(
+ "dli %[tmp0], 0x0001000100010001 \n\t"
+ "dmtc1 %[tmp0], %[ftmp12] \n\t"
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2253,15 +2318,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
"dsll %[dest0_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2273,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2289,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src0], %[src1] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2302,15 +2369,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
"dsll %[dest1_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2322,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2338,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src0], %[src1] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2351,15 +2420,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
"dsll %[dest2_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2371,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2387,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src0], %[src1] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2400,15 +2471,16 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
"dsll %[dest3_v], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
@@ -2420,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"dsll %[src1], %[src1], %[eight] \n\t"
"punpckhbh %[src_hi], %[src1], %[zero] \n\t"
"paddh %[src0], %[src0], %[src_hi] \n\t"
- "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[src0], %[src0], %[ftmp12] \n\t"
+ "psrlh %[src0], %[src0], %[one] \n\t"
"pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
"dsll %[src_hi], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
@@ -2448,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -2458,23 +2531,24 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0,
[dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
- [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]),
+ [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0])
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),
[sixteen] "f"(0x10)
: "memory");
}
-void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
uint64_t src, src_hi, src_lo;
uint64_t dest, dest0, dest1, dest2, dest3;
uint64_t tmp0, tmp1;
- const uint64_t shift = 0x07;
- const uint64_t value = 0x0040;
+ const uint64_t shift = 0x08;
+ const uint64_t value = 0x80;
const uint64_t mask0 = 0x0;
- const uint64_t mask1 = 0x00010026004B000FULL;
+ const uint64_t mask1 = 0x0001004D0096001DULL;
__asm__ volatile(
"1: \n\t"
@@ -2544,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
[src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
[dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
[tmp1] "=&f"(tmp1)
- : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
[mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
[width] "r"(width)
: "memory");
}
-void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2558,22 +2632,22 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
uint64_t src_rgb1;
uint64_t ftmp[12];
const uint64_t value = 0x4040;
- const uint64_t mask_u = 0x002b0054007f0002;
- const uint64_t mask_v = 0x0002007f006b0014;
+ const uint64_t mask_u = 0x0015002a003f0002;
+ const uint64_t mask_v = 0x0002003f0035000a;
__asm__ volatile(
"1: \n\t"
- "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
- "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest0_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
@@ -2581,16 +2655,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2607,16 +2681,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest0_v], %[src1], %[src0] \n\t"
"psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest1_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
@@ -2624,16 +2698,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
"pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2650,16 +2724,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest1_v], %[src1], %[src0] \n\t"
"psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest2_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
@@ -2667,16 +2741,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
"pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2693,16 +2767,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"psubw %[dest2_v], %[src1], %[src0] \n\t"
"psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
- "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[dest3_u], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
@@ -2710,16 +2784,16 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
"pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
- "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
- "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t"
"gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
"gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
"punpcklbh %[src_lo], %[src0], %[zero] \n\t"
"punpckhbh %[src_hi], %[src0], %[zero] \n\t"
"punpcklbh %[src0], %[src1], %[zero] \n\t"
"punpckhbh %[src1], %[src1], %[zero] \n\t"
- "pavgh %[src0], %[src_lo], %[src0] \n\t"
- "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "paddh %[src0], %[src_lo], %[src0] \n\t"
+ "paddh %[src1], %[src_hi], %[src1] \n\t"
"pavgh %[src0], %[src0], %[src1] \n\t"
"dsll %[src_lo], %[src0], %[sixteen] \n\t"
"pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
@@ -2748,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
"gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
"gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
- "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t"
"daddiu %[dst_u], %[dst_u], 0x08 \n\t"
"daddiu %[dst_v], %[dst_v], 0x08 \n\t"
"daddi %[width], %[width], -0x10 \n\t"
@@ -2759,10 +2833,10 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
[dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
[dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
[dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
- : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),
[dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
- [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [zero] "f"(0x00), [eight] "f"(0x08),
[sixteen] "f"(0x10)
: "memory");
}
@@ -4052,10 +4126,10 @@ void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
uint64_t tmp0, tmp1;
const uint64_t mask0 = 0x0;
const uint64_t mask1 = 0x01;
- const uint64_t mask2 = 0x00400026004B000FULL;
+ const uint64_t mask2 = 0x0080004D0096001DULL;
const uint64_t mask3 = 0xFF000000FF000000ULL;
const uint64_t mask4 = ~mask3;
- const uint64_t shift = 0x07;
+ const uint64_t shift = 0x08;
__asm__ volatile(
"1: \n\t"
@@ -4312,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb,
: "memory");
}
-void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -4348,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
[src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
[dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
[src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
[dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
: "memory");
}
-void ARGBAddRow_MMI(const uint8_t* src_argb0,
+void ARGBAddRow_MMI(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -4375,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0,
"daddi %[width], %[width], -0x02 \n\t"
"bnez %[width], 1b \n\t"
: [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
[dst_ptr] "r"(dst_argb), [width] "r"(width)
: "memory");
}
-void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+void ARGBSubtractRow_MMI(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -4402,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
"daddi %[width], %[width], -0x02 \n\t"
"bnez %[width], 1b \n\t"
: [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
[dst_ptr] "r"(dst_argb), [width] "r"(width)
: "memory");
}
@@ -4778,7 +4852,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
: "memory");
}
-void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+// TODO: respect YuvConstants; the argument is currently ignored.
+void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf,
+ const struct YuvConstants*, int width) {
uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
const uint64_t mask0 = 0x0;
const uint64_t mask1 = 0x55;
@@ -4912,10 +4988,10 @@ void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
: "memory");
}
-void MirrorUVRow_MMI(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
uint64_t src0, src1, dest0, dest1;
const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
const uint64_t mask1 = 0x1b;
@@ -5476,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
: "memory");
}
-// Blend src_argb0 over src_argb1 and store to dst_argb.
-// dst_argb may be src_argb0 or src_argb1.
+// Blend src_argb over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+void ARGBBlendRow_MMI(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -5532,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0,
[dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
[src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
[dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
- : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1),
[dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
[mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
[shift] "f"(shift), [width] "r"(width)
@@ -6034,6 +6110,1730 @@ void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
: "memory");
}
+void I444ToARGBRow_MMI(const uint8_t* src_y,  // Converts one row of Y, U, V bytes (one U and one V per pixel) into 4-byte pixels in rgb_buf using MMI inline asm.
+ const uint8_t* src_u,  // 4 bytes consumed per loop iteration
+ const uint8_t* src_v,  // 4 bytes consumed per loop iteration
+ uint8_t* rgb_buf,  // 16 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];  // [0] holds the channel result, [1] is scratch (g_vec[1] also carries the upper two output pixels)
+ uint64_t mask = 0xff00ff00ff00ff00ULL;  // high byte of every 16-bit lane
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+ __asm__ volatile (
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"// yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"// bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"// ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"// set the high byte of each lane (original note: must sign extension)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"// bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"// ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"// widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"// selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"// vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"// selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"// br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"// vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"// selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"// set the high byte of each lane (original note: sign extension)
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"// unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"// unaligned 4-byte load of U
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"// unaligned 4-byte load of V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"// each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"// y1 = high half of the unsigned product with yg
+
+ "punpcklbh %[u], %[u], %[zero] \n\t"// u widened to 16-bit lanes
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"// b = y1 + bb (saturating)
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"// u*ub
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"// b -= u*ub
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"// b >>= 6 (arithmetic)
+
+ "punpcklbh %[v], %[v], %[zero] \n\t"// v widened to 16-bit lanes
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"// g = y1 + bg
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"// u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"// g -= u*ug
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"// v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"// g -= v*vg
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"// g >>= 6
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"// r = y1 + br
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"// v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"// r -= v*vr
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"// r >>= 6
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"// rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"// g packed to bytes in the low half; the high half is replaced on the next line
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"// ffffgggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"// gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"// frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"// frgbfrgb: pixels 0 and 1
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"// frgbfrgb: pixels 2 and 3
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"// unaligned 8-byte store, bytes 0..7
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"// unaligned 8-byte store, bytes 8..15
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"// advance every source by 4 samples
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"// 4 pixels * 4 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"// repeat until width reaches exactly 0
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),  // six: shift count; five: pshufh selector 0x55
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
+
+// Converts one row of Y bytes plus half-width U and V bytes into 4-byte pixels with MMI inline asm. Also used for 420.
+void I422ToARGBRow_MMI(const uint8_t* src_y,  // 4 bytes consumed per loop iteration
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ uint8_t* rgb_buf,  // 16 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];  // [0] holds the channel result, [1] is scratch (g_vec[1] also carries the upper two output pixels)
+ uint64_t mask = 0xff00ff00ff00ff00ULL;  // high byte of every 16-bit lane
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"// yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"// bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"// ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"// set the high byte of each lane (original note: must sign extension)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"// bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"// ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"// widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"// selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"// vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"// selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"// br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"// vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"// selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"// set the high byte of each lane (original note: sign extension)
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"// unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"// NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"// same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"// each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"// y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"// duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"// widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"// b = y1 + bb (saturating)
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"// u*ub
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"// b -= u*ub
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"// b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"// duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"// widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"// g = y1 + bg
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"// u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"// g -= u*ug
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"// v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"// g -= v*vg
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"// g >>= 6
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"// r = y1 + br
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"// v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"// r -= v*vr
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"// r >>= 6
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"// rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"// g packed to bytes in the low half; the high half is replaced on the next line
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"// ffffgggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"// gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"// frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"// frgbfrgb: pixels 0 and 1
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"// frgbfrgb: pixels 2 and 3
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"// unaligned 8-byte store, bytes 0..7
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"// unaligned 8-byte store, bytes 8..15
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"// 4 Y samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"// 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"// 2 V samples
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"// 4 pixels * 4 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"// repeat until width reaches exactly 0
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),  // six: shift count; five: pshufh selector 0x55
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
+
+// 10 bit YUV to ARGB: 16-bit Y samples plus half-width 16-bit U and V samples in, 4-byte pixels out, using MMI inline asm.
+void I210ToARGBRow_MMI(const uint16_t* src_y,  // 8 bytes (4 samples) consumed per loop iteration
+ const uint16_t* src_u,  // advanced 4 bytes (2 samples) per iteration; each U sample is shared by two pixels
+ const uint16_t* src_v,  // advanced 4 bytes (2 samples) per iteration; each V sample is shared by two pixels
+ uint8_t* rgb_buf,  // 16 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];  // [0] holds the channel result, [1] is scratch (g_vec[1] also carries the upper two output pixels)
+ uint64_t mask = 0xff00ff00ff00ff00ULL;  // high byte of every 16-bit lane
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t"  // unaligned 8-byte load: 4 Y samples
+ "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // unaligned 4-byte load: 2 U samples
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // unaligned 4-byte load: 2 V samples
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "psllh %[y], %[y], %[six] \n\t"  // y <<= 6 in each 16-bit lane
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // y1 = high half of the unsigned product with yg
+
+ "punpcklhw %[u], %[u], %[u] \n\t"  // duplicate each U sample: u1|u1|u0|u0
+ "psrah %[u], %[u], %[two] \n\t"  // u >>= 2
+ "punpcklhw %[v], %[v], %[v] \n\t"  // duplicate each V sample: v1|v1|v0|v0
+ "psrah %[v], %[v], %[two] \n\t"  // v >>= 2
+ "pminsh %[u], %[u], %[mask1] \n\t"  // clamp u to at most 0x00ff
+ "pminsh %[v], %[v], %[mask1] \n\t"  // clamp v to at most 0x00ff
+
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"  // b -= u*ub
+
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= u*ug
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= v*vg
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"  // r -= v*vr
+
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"  // all three channels >>= 6 (arithmetic)
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"  // rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"  // g packed to bytes in the low half; the high half is replaced on the next line
+ "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t"  // ffffgggg
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"  // frfrfrfr
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"  // frgbfrgb: pixels 0 and 1
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"  // frgbfrgb: pixels 2 and 3
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store, bytes 0..7
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store, bytes 8..15
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t"  // 4 Y samples * 2 bytes
+ "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t"  // 2 U samples * 2 bytes
+ "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t"  // 2 V samples * 2 bytes
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"  // 4 pixels * 4 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y),
+ [u]"=&f"(u), [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [alpha]"f"(-1),
+ [six]"f"(0x6), [five]"f"(0x55),  // six: shift count; five: pshufh selector 0x55
+ [mask]"f"(mask), [two]"f"(0x02),
+ [mask1]"f"(0x00ff00ff00ff00ff)  // per-lane upper bound used by pminsh
+ : "memory"
+ );
+}
+
+void I422AlphaToARGBRow_MMI(const uint8_t* src_y,  // Converts one row of Y bytes, half-width U/V bytes and per-pixel alpha bytes into 4-byte pixels using MMI inline asm.
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ const uint8_t* src_a,  // 4 alpha bytes consumed per loop iteration, copied into the output unchanged
+ uint8_t* rgb_buf,  // 16 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y,u,v,a;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];  // [0] holds the channel result, [1] is scratch (g_vec[1] also carries the upper two output pixels)
+ uint64_t mask = 0xff00ff00ff00ff00ULL;  // high byte of every 16-bit lane
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+ "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t"  // unaligned 4-byte load of alpha
+ "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"// each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"// y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"// duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"  // b -= u*ub
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"  // b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"  // duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= u*ug
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= v*vg
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"  // g >>= 6
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"  // r -= v*vr
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"  // r >>= 6
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[a] \n\t"  // g packed to bytes in the low half; the high half is replaced on the next line
+ "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg: the 4 loaded alpha bytes go in unmodified
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"  // arararar
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"  // argbargb: pixels 0 and 1
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"  // argbargb: pixels 2 and 3
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store, bytes 0..7
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store, bytes 8..15
+ "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"  // 4 Y samples
+ "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t"  // 4 alpha samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"  // 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"  // 2 V samples
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"  // 4 pixels * 4 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v), [a]"=&f"(a),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [a_ptr]"r"(src_a), [zero]"f"(0x00),
+ [six]"f"(0x6), [five]"f"(0x55),  // six: shift count; five: pshufh selector 0x55
+ [mask]"f"(mask)
+ : "memory"
+ );
+}
+
+void I422ToRGB24Row_MMI(const uint8_t* src_y,  // Converts one row of Y bytes plus half-width U/V bytes into packed 3-byte pixels using MMI inline asm.
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ uint8_t* rgb_buf,  // 12 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y,u,v;
+ uint64_t b_vec[2],g_vec[2],r_vec[2];  // [0] holds the channel result, [1] is scratch; all are reused while repacking to 3 bytes per pixel
+ uint64_t mask = 0xff00ff00ff00ff00ULL;  // high byte of every 16-bit lane
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"// each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"// y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"// duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec0], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[b_vec1], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t"  // b -= u*ub
+ "psrah %[b_vec0], %[b_vec0], %[six] \n\t"  // b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"  // duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec0], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[g_vec1], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= u*ug
+ "pmullh %[g_vec1], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t"  // g -= v*vg
+ "psrah %[g_vec0], %[g_vec0], %[six] \n\t"  // g >>= 6
+
+ "paddsh %[r_vec0], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[r_vec1], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t"  // r -= v*vr
+ "psrah %[r_vec0], %[r_vec0], %[six] \n\t"  // r >>= 6
+
+ "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"  // rrrrbbbb
+ "packushb %[g_vec0], %[g_vec0], %[zero] \n\t"  // g packed to bytes in the low half, high half taken from the zero operand
+ "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"  // 0r0r0r0r
+ "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"  // 0rgb0rgb: pixels 0 and 1, 4 bytes each
+ "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"  // 0rgb0rgb: pixels 2 and 3, 4 bytes each
+
+ "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t"  // r_vec0 = pixel 1 in both 32-bit words
+ "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t"  // pixel 1 << 24: its low byte moves to the top byte of the word
+ "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t"  // output byte 3 = low byte of pixel 1, filling the padding byte of pixel 0
+ "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t"  // pixel 1 >> 8
+ "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t"  // take 16-bit lane 0: the middle two bytes of pixel 1
+ "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"  // output bytes 4..5
+ "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t"  // low two bytes of pixel 2
+ "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t"  // output bytes 6..7
+ "pextrh %[r_vec1], %[g_vec1], %[one] \n\t"  // 16-bit lane 1 of g_vec1: third byte of pixel 2 plus its padding byte
+ "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t"  // g_vec1 = pixel 3 in both 32-bit words
+ "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t"  // pixel 3 << 8 (rmove1 is reused here as a left-shift count)
+ "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t"  // output bytes 8..11: last byte of pixel 2 then pixel 3
+ "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store, bytes 0..7
+ "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t"  // unaligned 4-byte store, bytes 8..11
+ "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t"
+
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"  // 4 Y samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"  // 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"  // 2 V samples
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t"  // 4 pixels * 3 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]),
+ [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]),
+ [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),  // NOTE(review): the high half of zero reaches the padding byte of pixel 2, which is ORed into output byte 9 - confirm the upper 32 bits of this int operand are always 0
+ [six]"f"(0x6), [mask]"f"(mask),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),  // shift counts: 24 and 8 bits
+ [one]"f"(0x1)  // lane index for pextrh
+ : "memory"
+ );
+}
+
+void I422ToARGB4444Row_MMI(const uint8_t* src_y,  // Converts one row of Y bytes plus half-width U/V bytes into 2-byte pixels with 4 bits per channel using MMI inline asm.
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ uint8_t* dst_argb4444,  // 8 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;  // channel accumulators plus scratch; all are reused during packing
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"// each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"// y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"// duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[temp], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"  // b -= u*ub
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"  // b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"  // duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[temp], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= u*ug
+ "pmullh %[temp], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= v*vg
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"  // g >>= 6
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[temp], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"  // r -= v*vr
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"  // r >>= 6
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // rrrrbbbb
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // g packed to bytes in the low half
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"  // ffffgggg
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"  // frfrfrfr
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"  // frgbfrgb: pixels 0 and 1, 8 bits per channel
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"  // frgbfrgb: pixels 2 and 3
+
+ "and %[g_vec], %[g_vec], %[mask1] \n\t"  // keep the top 4 bits of every byte
+ "psrlw %[g_vec], %[g_vec], %[four] \n\t"  // move them to the low nibble of each byte
+ "psrlw %[r_vec], %[g_vec], %[four] \n\t"  // second copy shifted a further 4 bits
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"  // bytes 0 and 2 of each word now hold two adjacent channel nibbles
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"  // build the per-lane mask 0x00ff
+ "and %[g_vec], %[g_vec], %[r_vec] \n\t"  // keep only those combined bytes
+
+ "and %[b_vec], %[b_vec], %[mask1] \n\t"  // same nibble packing for pixels 2 and 3
+ "psrlw %[b_vec], %[b_vec], %[four] \n\t"
+ "psrlw %[r_vec], %[b_vec], %[four] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "punpcklbh %[r_vec], %[alpha], %[zero] \n\t"
+ "and %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "packushb %[g_vec], %[g_vec], %[b_vec] \n\t"  // narrow the 16-bit lanes to bytes: 2 bytes per pixel, 4 pixels
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t"  // unaligned 8-byte store
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"  // 4 Y samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"  // 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"  // 2 V samples
+ "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t"  // 4 pixels * 2 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00),  // mask: high byte of every 16-bit lane
+ [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0),  // mask1: high nibble of every byte
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+void I422ToARGB1555Row_MMI(const uint8_t* src_y,  // Converts one row of Y bytes plus half-width U/V bytes into 2-byte pixels (5 bits per colour channel, top bit set) using MMI inline asm.
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ uint8_t* dst_argb1555,  // 8 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;  // channel accumulators plus scratch; all are reused during packing
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask1] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask1] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"  // duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[temp], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"  // b -= u*ub
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"  // b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"  // duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[temp], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= u*ug
+ "pmullh %[temp], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= v*vg
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"  // g >>= 6
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[temp], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"  // r -= v*vr
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"  // r >>= 6
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // rrrrbbbb
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // g packed to bytes in the low half, high half taken from the zero operand
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"  // 0r0r0r0r
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"  // 0rgb0rgb: pixels 0 and 1, one per 32-bit word
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"  // 0rgb0rgb: pixels 2 and 3
+
+ "psrlw %[temp], %[g_vec], %[three] \n\t"  // word >> 3: top 5 bits of the lowest byte land in bits 0..4
+ "and %[g_vec], %[temp], %[mask2] \n\t"  // first 5-bit field
+ "psrlw %[temp], %[temp], %[eight] \n\t"  // next byte down
+ "and %[r_vec], %[temp], %[mask2] \n\t"  // second 5-bit field
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"  // place at bits 5..9
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"  // next byte down
+ "and %[r_vec], %[temp], %[mask2] \n\t"  // third 5-bit field
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"  // two shifts of 5: place at bits 10..14
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "or %[g_vec], %[g_vec], %[mask3] \n\t"  // set bit 15 of each pixel
+
+ "psrlw %[temp], %[b_vec], %[three] \n\t"  // same packing for pixels 2 and 3
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "psrlw %[temp], %[temp], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "or %[b_vec], %[b_vec], %[mask3] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"  // gather the low 16-bit lane of every word...
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"  // ...so g_vec holds pixels 0,1,2,3 as four 16-bit lanes
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t"  // unaligned 8-byte store
+ "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"  // 4 Y samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"  // 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"  // 2 V samples
+ "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t"  // 4 pixels * 2 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),  // mask1: high byte of every 16-bit lane
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),  // mask2: low 5 bits of each 32-bit word
+ [eight]"f"(0x8), [mask3]"f"(0x800000008000),  // mask3: bit 15 of each 32-bit word
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
+
+void I422ToRGB565Row_MMI(const uint8_t* src_y,  // Converts one row of Y bytes plus half-width U/V bytes into 2-byte pixels (5, 6 and 5 bits per channel) using MMI inline asm.
+ const uint8_t* src_u,  // advanced 2 bytes per iteration; each U byte is shared by two pixels
+ const uint8_t* src_v,  // advanced 2 bytes per iteration; each V byte is shared by two pixels
+ uint8_t* dst_rgb565,  // 8 bytes written per loop iteration
+ const struct YuvConstants* yuvconstants,  // read as raw 8-byte vectors at the fixed byte offsets used below
+ int width) {  // pixel count; the loop subtracts 4 and exits only on exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;  // channel accumulators plus scratch; all are reused during packing
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // yg: multiplier applied to Y
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"  // bb: term added to Y for blue
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"  // ub: multiplier applied to U for blue
+ "or %[ub], %[ub], %[mask1] \n\t"  // set the high byte of each 16-bit lane
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"  // bg: term added to Y for green
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"  // ug: multiplier applied to U for green
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00: replicate lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"  // vg: same 8 bytes as ug, different lane picked below
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"  // br: term added to Y for red
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"  // vr: multiplier applied to V for red
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"  // selector 0x55: replicate lane 1
+ "or %[vr], %[vr], %[mask1] \n\t"  // set the high byte of each 16-bit lane
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of Y
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 U bytes but only the low 2 are used and u_ptr advances by 2, so each iteration reads 2 bytes past the data it consumes - confirm the buffers allow this
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // same 4-byte load / 2-byte advance pattern for V
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each Y byte duplicated into a 16-bit lane: y*0x0101
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // y1 = high half of the unsigned product with yg
+
+ //u3|u2|u1|u0 --> u1|u1|u0|u0
+ "punpcklbh %[u], %[u], %[u] \n\t"  // duplicate each U byte
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = y1 + bb (saturating)
+ "pmullh %[temp], %[u], %[ub] \n\t"  // u*ub
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"  // b -= u*ub
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"  // b >>= 6 (arithmetic)
+
+ //v3|v2|v1|v0 --> v1|v1|v0|v0
+ "punpcklbh %[v], %[v], %[v] \n\t"  // duplicate each V byte
+ "punpcklbh %[v], %[v], %[zero] \n\t"  // widen the low 4 bytes to 16-bit lanes
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = y1 + bg
+ "pmullh %[temp], %[u], %[ug] \n\t"  // u*ug
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= u*ug
+ "pmullh %[temp], %[v], %[vg] \n\t"  // v*vg
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"  // g -= v*vg
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"  // g >>= 6
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = y1 + br
+ "pmullh %[temp], %[v], %[vr] \n\t"  // v*vr
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"  // r -= v*vr
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"  // r >>= 6
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // rrrrbbbb
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // g packed to bytes in the low half, high half taken from the zero operand
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // gbgbgbgb
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"  // 0r0r0r0r
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"  // 0rgb0rgb: pixels 0 and 1, one per 32-bit word
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"  // 0rgb0rgb: pixels 2 and 3
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"  // each 16-bit lane >> 3
+ "and %[g_vec], %[temp], %[mask2] \n\t"  // 5-bit field from the lowest byte, bits 0..4
+ "psrlw %[temp], %[temp], %[seven] \n\t"  // word >> 7 brings the middle byte's top 6 bits to bits 0..5
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"  // derive the mask 0x00ff00ff from mask1
+ "and %[r_vec], %[temp], %[r_vec] \n\t"  // 6-bit field
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"  // place at bits 5..10
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"  // shift count 3 + 6 = 9
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"  // word >> 9 brings the third byte's field to bits 0..4
+ "and %[r_vec], %[temp], %[mask2] \n\t"  // 5-bit field
+ "paddb %[temp], %[three], %[eight] \n\t"  // shift count 3 + 8 = 11
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"  // place at bits 11..15
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"  // same packing for pixels 2 and 3
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psllw %[r_vec], %[r_vec], %[lmove5] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"  // gather the low 16-bit lane of every word...
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"  // ...so g_vec holds pixels 0,1,2,3 as four 16-bit lanes
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"  // unaligned 8-byte store
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"  // 4 Y samples
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"  // 2 U samples
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"  // 2 V samples
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"  // 4 pixels * 2 bytes
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"  // repeat until width reaches exactly 0
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): the pointers and width are input-only operands but the asm modifies them; GCC expects read-write (+r) operands for that - confirm
+ [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565),
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),  // mask1: high byte of every 16-bit lane
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),  // mask2: low 5 bits of each 32-bit word
+ [eight]"f"(0x8), [seven]"f"(0x7),
+ [lmove5]"f"(0x5)
+ : "memory"
+ );
+}
+
+void NV12ToARGBRow_MMI(const uint8_t* src_y,  // NV12 -> ARGB row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_uv,  // interleaved chroma; 4 bytes are read per pass (even bytes feed u, odd bytes feed v)
+ uint8_t* rgb_buf,  // output; 16 bytes (4 pixels x 4 bytes) are written per pass, alpha bytes come from the all-ones constant
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"  // unaligned 4-byte load of interleaved chroma
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen chroma bytes to 16-bit lanes
+ "pshufh %[v], %[u], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 (odd bytes -> v)
+ "pshufh %[u], %[u], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 (even bytes -> u)
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to unsigned bytes
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"  // pair the green bytes with the all-ones alpha word
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble the 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 8..15
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),  // NOTE(review): y_ptr, uv_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): switching them to "+r" needs care; GCC caps an asm at 30 operands and counts "+" operands twice
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+void NV21ToARGBRow_MMI(const uint8_t* src_y,  // NV21 -> ARGB row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_vu,  // interleaved chroma; 4 bytes are read per pass (even bytes feed v, odd bytes feed u)
+ uint8_t* rgb_buf,  // output; 16 bytes (4 pixels x 4 bytes) are written per pass, alpha bytes come from the all-ones constant
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"  // unaligned 4-byte load of interleaved chroma
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen chroma bytes to 16-bit lanes
+ "pshufh %[v], %[u], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 (even bytes -> v)
+ "pshufh %[u], %[u], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 (odd bytes -> u)
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to unsigned bytes
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"  // pair the green bytes with the all-ones alpha word
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble the 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 8..15
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),  // NOTE(review): y_ptr, vu_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): switching them to "+r" needs care; GCC caps an asm at 30 operands and counts "+" operands twice
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB24Row_MMI(const uint8_t* src_y,  // NV12 -> RGB24 row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_uv,  // interleaved chroma; 4 bytes are read per pass (even bytes feed u, odd bytes feed v)
+ uint8_t* rgb_buf,  // output; 12 bytes (4 pixels x 3 bytes) are written per pass
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"  // unaligned 4-byte load of interleaved chroma
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen chroma bytes to 16-bit lanes
+ "pshufh %[v], %[u], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 (odd bytes -> v)
+ "pshufh %[u], %[u], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 (even bytes -> u)
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to bytes; the other half is zero (no alpha here)
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble four 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"  // from here: squeeze the four 4-byte pixels into 12 contiguous bytes
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"  // shift left by 24 bits within each 32-bit word
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"  // shift right by 8 bits within each 32-bit word
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"  // left shift by 8; the count operand happens to be named rmove1
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"  // unaligned 4-byte store: output bytes 8..11
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),  // NOTE(review): y_ptr, uv_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): switching them to "+r" needs care; GCC caps an asm at 30 operands and counts "+" operands twice
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [lmove1]"f"(0x18),  // NOTE(review): alpha is never referenced by the template in this function
+ [one]"f"(0x1), [rmove1]"f"(0x8)
+ : "memory"
+ );
+}
+
+void NV21ToRGB24Row_MMI(const uint8_t* src_y,  // NV21 -> RGB24 row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_vu,  // interleaved chroma; 4 bytes are read per pass (even bytes feed v, odd bytes feed u)
+ uint8_t* rgb_buf,  // output; 12 bytes (4 pixels x 3 bytes) are written per pass
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t"  // unaligned 4-byte load of interleaved chroma
+ "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen chroma bytes to 16-bit lanes
+ "pshufh %[v], %[u], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 (even bytes -> v)
+ "pshufh %[u], %[u], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 (odd bytes -> u)
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to bytes; the other half is zero (no alpha here)
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble four 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t"  // from here: squeeze the four 4-byte pixels into 12 contiguous bytes
+ "psllw %[temp], %[r_vec], %[lmove1] \n\t"  // shift left by 24 bits within each 32-bit word
+ "or %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrlw %[temp], %[r_vec], %[rmove1] \n\t"  // shift right by 8 bits within each 32-bit word
+ "pextrh %[temp], %[temp], %[zero] \n\t"
+ "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[zero] \n\t"
+ "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t"
+ "pextrh %[temp], %[b_vec], %[one] \n\t"
+ "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t"
+ "psllw %[b_vec], %[b_vec], %[rmove1] \n\t"  // left shift by 8; the count operand happens to be named rmove1
+ "or %[b_vec], %[b_vec], %[temp] \n\t"
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t"  // unaligned 4-byte store: output bytes 8..11
+ "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu),  // NOTE(review): y_ptr, vu_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): switching them to "+r" needs care; GCC caps an asm at 30 operands and counts "+" operands twice
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [lmove1]"f"(0x18), [rmove1]"f"(0x8),
+ [one]"f"(0x1)
+ : "memory"
+ );
+}
+
+void NV12ToRGB565Row_MMI(const uint8_t* src_y,  // NV12 -> RGB565 row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_uv,  // interleaved chroma; 4 bytes are read per pass (even bytes feed u, odd bytes feed v)
+ uint8_t* dst_rgb565,  // output; 8 bytes (4 pixels x 2 bytes) are written per pass
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t"  // unaligned 4-byte load of interleaved chroma
+ "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t"
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen chroma bytes to 16-bit lanes
+ "pshufh %[v], %[u], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 (odd bytes -> v)
+ "pshufh %[u], %[u], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 (even bytes -> u)
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to bytes; the other half is zero
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble four 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"  // g_vec now holds pixels 0-1, b_vec pixels 2-3
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[g_vec], %[three] \n\t"  // pixels 0-1: drop the low 3 bits of every 16-bit lane
+ "and %[g_vec], %[temp], %[mask2] \n\t"  // keep a 5-bit field in bits 4..0 of each 32-bit word
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"  // r_vec is reused as a scratch mask derived from mask1
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"  // y is reused as scratch: 8 - 3 = 5, the next shift count
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"  // 3 + 6 = 9, used as a shift count
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"  // 3 + 8 = 11, used as a shift count
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[g_vec], %[g_vec], %[r_vec] \n\t"
+
+ "psrlh %[temp], %[b_vec], %[three] \n\t"  // pixels 2-3: same packing sequence as above
+ "and %[b_vec], %[temp], %[mask2] \n\t"
+ "psrlw %[temp], %[temp], %[seven] \n\t"
+ "psrlw %[r_vec], %[mask1], %[eight] \n\t"
+ "and %[r_vec], %[temp], %[r_vec] \n\t"
+ "psubb %[y], %[eight], %[three] \n\t"  // 8 - 3 = 5
+ "psllw %[r_vec], %[r_vec], %[y] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+ "paddb %[r_vec], %[three], %[six] \n\t"  // 3 + 6 = 9
+ "psrlw %[temp], %[temp], %[r_vec] \n\t"
+ "and %[r_vec], %[temp], %[mask2] \n\t"
+ "paddb %[temp], %[three], %[eight] \n\t"  // 3 + 8 = 11
+ "psllw %[r_vec], %[r_vec], %[temp] \n\t"
+ "or %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t"  // collect the low 16-bit lane of each of the four 32-bit pixels
+ "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t"
+ "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t"  // unaligned 8-byte store of the four packed pixels
+ "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t"
+ "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv),  // NOTE(review): y_ptr, uv_ptr, dst_rgb565 and width are modified above although declared input-only
+ [dst_rgb565]"r"(dst_rgb565),  // NOTE(review): this asm already has 30 operands, GCC's maximum, so "+r" cannot simply be substituted
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [three]"f"(0x3), [mask2]"f"(0x1f0000001f),
+ [eight]"f"(0x8), [seven]"f"(0x7)
+ : "memory"
+ );
+}
+
+void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2,  // YUY2 -> ARGB row using Loongson MMI asm; 8 source bytes (4 pixels) are read per loop pass
+ uint8_t* rgb_buf,  // output; 16 bytes (4 pixels x 4 bytes) are written per pass, alpha bytes come from the all-ones constant
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t"  // unaligned 8-byte load (left/right pair)
+ "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t"
+ "psrlh %[temp], %[y], %[eight] \n\t"  // odd source bytes (chroma) moved to the low byte of each lane
+ "pshufh %[u], %[temp], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 -> u
+ "pshufh %[v], %[temp], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 -> v
+
+ "psrlh %[temp], %[mask1], %[eight] \n\t"  // 0x00ff in every lane
+ "and %[y], %[y], %[temp] \n\t"  // even source bytes (luma)
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"  // luma byte replicated into both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to unsigned bytes
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"  // pair the green bytes with the all-ones alpha word
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble the 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 8..15
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): yuy2_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
+void UYVYToARGBRow_MMI(const uint8_t* src_uyvy,  // UYVY -> ARGB row using Loongson MMI asm; 8 source bytes (4 pixels) are read per loop pass
+ uint8_t* rgb_buf,  // output; 16 bytes (4 pixels x 4 bytes) are written per pass, alpha bytes come from the all-ones constant
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t"  // unaligned 8-byte load (left/right pair)
+ "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t"
+ "psrlh %[temp], %[mask1], %[eight] \n\t"  // 0x00ff in every lane
+ "and %[temp], %[y], %[temp] \n\t"  // even source bytes (chroma)
+ "pshufh %[u], %[temp], %[ushu] \n\t"  // 0xA0 picks lanes 0,0,2,2 -> u
+ "pshufh %[v], %[temp], %[vshu] \n\t"  // 0xf5 picks lanes 1,1,3,3 -> v
+
+ "psrlh %[y], %[y], %[eight] \n\t"  // odd source bytes (luma) moved to the low byte of each lane
+ "psllh %[temp], %[y], %[eight] \n\t"
+ "or %[y], %[y], %[temp] \n\t"  // luma byte replicated into both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to unsigned bytes
+ "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t"  // pair the green bytes with the all-ones alpha word
+ "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t"  // byte/halfword interleaves assemble the 4-byte pixels
+ "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 8..15
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): uyvy_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [ushu]"f"(0xA0), [vshu]"f"(0xf5),
+ [alpha]"f"(-1), [eight]"f"(0x8)
+ : "memory"
+ );
+}
+
+void I422ToRGBARow_MMI(const uint8_t* src_y,  // I422 -> RGBA row using Loongson MMI asm, 4 pixels per loop pass; 4 luma bytes are read per pass
+ const uint8_t* src_u,  // U plane; the pointer advances 2 bytes per pass
+ const uint8_t* src_v,  // V plane; the pointer advances 2 bytes per pass
+ uint8_t* rgb_buf,  // output; 16 bytes (4 pixels x 4 bytes) are written per pass, alpha bytes come from the all-ones constant
+ const struct YuvConstants* yuvconstants,  // coefficients, read once with ldc1 at byte offsets 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0
+ int width) {  // pixel count; the loop subtracts 4 and exits only at exactly 0, so a positive multiple of 4 is required
+ uint64_t y, u, v;
+ uint64_t b_vec, g_vec, r_vec, temp;
+ uint64_t ub,ug,vg,vr,bb,bg,br,yg;
+
+ __asm__ volatile(
+ "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"  // all coefficients are loaded once, before the loop
+ "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"
+ "or %[ub], %[ub], %[mask1] \n\t"  // set bits 15..8 of every 16-bit lane (mask1 is 0xff00 per lane)
+ "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[ug], %[ug], %[zero] \n\t"  // widen the low bytes to 16-bit lanes
+ "pshufh %[ug], %[ug], %[zero] \n\t"  // selector 0x00 broadcasts lane 0
+ "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vg], %[vg], %[zero] \n\t"
+ "pshufh %[vg], %[vg], %[five] \n\t"  // selector 0x55 broadcasts lane 1
+ "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"
+ "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"
+ "punpcklbh %[vr], %[vr], %[zero] \n\t"
+ "pshufh %[vr], %[vr], %[five] \n\t"
+ "or %[vr], %[vr], %[mask1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t"  // unaligned 4-byte load of luma (left/right pair)
+ "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t"
+ "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t"  // NOTE(review): loads 4 bytes although only 2 are consumed and u_ptr advances by 2
+ "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t"
+ "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t"  // NOTE(review): same for v; the final pass reads 2 bytes past the chroma it uses
+ "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t"
+
+ "punpcklbh %[y], %[y], %[y] \n\t"  // each luma byte fills both halves of its lane (y * 0x0101)
+ "pmulhuh %[y], %[y], %[yg] \n\t"  // keep the high 16 bits of the unsigned product with yg
+
+ "punpcklbh %[u], %[u], %[u] \n\t"  // duplicate each chroma byte so one sample covers two pixels
+ "punpcklbh %[u], %[u], %[zero] \n\t"  // widen to 16-bit lanes u0,u0,u1,u1
+ "paddsh %[b_vec], %[y], %[bb] \n\t"  // b = (y + bb - u * ub) >> 6, saturating 16-bit add/sub
+ "pmullh %[temp], %[u], %[ub] \n\t"
+ "psubsh %[b_vec], %[b_vec], %[temp] \n\t"
+ "psrah %[b_vec], %[b_vec], %[six] \n\t"
+
+ "punpcklbh %[v], %[v], %[v] \n\t"  // same duplication and widening for v
+ "punpcklbh %[v], %[v], %[zero] \n\t"
+ "paddsh %[g_vec], %[y], %[bg] \n\t"  // g = (y + bg - u * ug - v * vg) >> 6
+ "pmullh %[temp], %[u], %[ug] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "pmullh %[temp], %[v], %[vg] \n\t"
+ "psubsh %[g_vec], %[g_vec], %[temp] \n\t"
+ "psrah %[g_vec], %[g_vec], %[six] \n\t"
+
+ "paddsh %[r_vec], %[y], %[br] \n\t"  // r = (y + br - v * vr) >> 6
+ "pmullh %[temp], %[v], %[vr] \n\t"
+ "psubsh %[r_vec], %[r_vec], %[temp] \n\t"
+ "psrah %[r_vec], %[r_vec], %[six] \n\t"
+
+ "packushb %[r_vec], %[b_vec], %[r_vec] \n\t"  // saturate b and r lanes to unsigned bytes in one register
+ "packushb %[g_vec], %[g_vec], %[zero] \n\t"  // saturate g lanes to unsigned bytes
+ "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t"  // alpha word first, then green: operand order is reversed relative to the ARGB rows
+ "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t"  // interleaves also take g_vec first, so alpha leads each pixel
+ "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t"
+ "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t"
+ "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t"
+
+ "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 0..7
+ "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t"
+ "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t"  // unaligned 8-byte store: output bytes 8..15
+ "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t"
+
+ "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t"
+ "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t"
+ "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t"
+ "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"  // 4 pixels consumed
+ "bnez %[width], 1b \n\t"  // repeats until width is exactly zero
+
+ : [y]"=&f"(y), [u]"=&f"(u),
+ [v]"=&f"(v),
+ [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec),
+ [r_vec]"=&f"(r_vec), [temp]"=&f"(temp),
+ [ub]"=&f"(ub), [ug]"=&f"(ug),
+ [vg]"=&f"(vg), [vr]"=&f"(vr),
+ [bb]"=&f"(bb), [bg]"=&f"(bg),
+ [br]"=&f"(br), [yg]"=&f"(yg)
+ : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u),  // NOTE(review): y_ptr, u_ptr, v_ptr, rgbbuf_ptr and width are modified above although declared input-only
+ [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf),  // NOTE(review): switching them to "+r" needs care; GCC caps an asm at 30 operands and counts "+" operands twice
+ [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width),
+ [zero]"f"(0x00), [five]"f"(0x55),
+ [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00),
+ [alpha]"f"(-1)
+ : "memory"
+ );
+}
+
+// Fills a row with one constant 32-bit pixel value using Loongson MMI stores.
+//
+// dst_argb: destination; 16 bytes (4 pixels) are written per loop iteration.
+// v32: pixel value replicated into every destination pixel.
+// width: pixel count. The loop subtracts 4 per iteration and exits only when
+// the count is exactly zero, so it must be a positive multiple of 4.
+void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) {
+ __asm__ volatile (
+ // Copy the low 32 bits into both halves so one register holds two pixels.
+ "punpcklwd %[v32], %[v32], %[v32] \n\t"
+ "1: \n\t"
+ // Two unaligned 8-byte stores (left/right pairs) cover 4 pixels.
+ "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t"
+
+ "daddi %[width], %[width], -0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "bnez %[width], 1b \n\t"
+ // dst_ptr and width are advanced by the loop, so they are read-write
+ // operands: GCC assumes input-only operands are unchanged by the asm.
+ : [v32]"+&f"(v32), [dst_ptr]"+r"(dst_argb), [width]"+r"(width)
+ :
+ : "memory"
+ );
+}
+// clang-format on
+
+// 10 bit YUV to ARGB
#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
#ifdef __cplusplus
diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc
index 5c0239a3..b7d5bb5e 100644
--- a/files/source/row_msa.cc
+++ b/files/source/row_msa.cc
@@ -24,16 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
-#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
- { \
- ub = __msa_fill_w(yuvconst->kUVToB[0]); \
- vr = __msa_fill_w(yuvconst->kUVToR[1]); \
- ug = __msa_fill_w(yuvconst->kUVToG[0]); \
- vg = __msa_fill_w(yuvconst->kUVToG[1]); \
- bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
- bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
- br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
- yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
+ { /* each output argument is assigned a vector filled with one scalar read through yuvconst */ \
+ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
+ vr = __msa_fill_w(yuvconst->kUVToR[1]); /* note: element 1, not 0 */ \
+ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
+ vg = __msa_fill_w(yuvconst->kUVToG[1]); /* note: element 1, not 0 */ \
+ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
+ yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); /* added to the scaled luma by YUVTORGB */ \
}
// Load YUV 422 pixel data
@@ -70,54 +68,52 @@ extern "C" {
}
// Convert 8 pixels of YUV 420 to RGB.
-#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
- { \
- v8i16 vec0_m, vec1_m; \
- v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
- v4i32 reg5_m, reg6_m, reg7_m; \
- v16i8 zero_m = {0}; \
- \
- vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
- vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
- reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
- reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
- reg0_m *= yg; \
- reg1_m *= yg; \
- reg2_m *= ubvr; \
- reg3_m *= ubvr; \
- reg0_m = __msa_srai_w(reg0_m, 16); \
- reg1_m = __msa_srai_w(reg1_m, 16); \
- reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
- reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
- reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
- reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
- reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
- reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
- reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
- reg5_m = reg0_m - reg5_m; \
- reg6_m = reg1_m - reg6_m; \
- reg2_m = reg0_m - reg2_m; \
- reg3_m = reg1_m - reg3_m; \
- reg7_m = reg0_m - reg7_m; \
- reg4_m = reg1_m - reg4_m; \
- reg5_m += bb; \
- reg6_m += bb; \
- reg7_m += bg; \
- reg4_m += bg; \
- reg2_m += br; \
- reg3_m += br; \
- reg5_m = __msa_srai_w(reg5_m, 6); \
- reg6_m = __msa_srai_w(reg6_m, 6); \
- reg7_m = __msa_srai_w(reg7_m, 6); \
- reg4_m = __msa_srai_w(reg4_m, 6); \
- reg2_m = __msa_srai_w(reg2_m, 6); \
- reg3_m = __msa_srai_w(reg3_m, 6); \
- CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
- out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
- out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
- out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
+ { /* NOTE: uses a variable named const_0x80 that must be in scope at the expansion site; it is not a macro parameter */ \
+ v8i16 vec0_m, vec1_m; \
+ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+ v4i32 reg5_m, reg6_m, reg7_m; \
+ v16i8 temp_m, zero_m = {0}; \
+ \
+ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); /* each Y byte duplicated into a 16-bit lane */ \
+ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); /* U/V bytes zero-extended to 16-bit lanes */ \
+ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); /* Y lanes zero-extended to 32 bits */ \
+ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
+ vec1_m = (v8i16)__msa_subv_h(vec1_m, const_0x80); /* subtract const_0x80 from every U/V lane */ \
+ temp_m = (v16i8)__msa_clti_s_h(vec1_m, 0); /* all-ones in lanes that are negative */ \
+ reg2_m = (v4i32)__msa_ilvr_h((v8i16)temp_m, (v8i16)vec1_m); /* interleaving with temp_m sign-extends U/V to 32 bits */ \
+ reg3_m = (v4i32)__msa_ilvl_h((v8i16)temp_m, (v8i16)vec1_m); \
+ reg0_m *= yg; \
+ reg1_m *= yg; \
+ reg2_m *= ubvr; \
+ reg3_m *= ubvr; \
+ reg0_m = __msa_srai_w(reg0_m, 16); /* scaled luma: (Y * yg) >> 16 */ \
+ reg1_m = __msa_srai_w(reg1_m, 16); \
+ reg0_m += yb; /* add the luma bias */ \
+ reg1_m += yb; \
+ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); /* per-pair dot product of U/V with ugvg */ \
+ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); /* even words duplicated; these feed out_b */ \
+ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); /* green terms duplicated; these feed out_g */ \
+ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); /* odd words duplicated; these feed out_r */ \
+ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+ reg5_m = reg0_m + reg5_m; /* blue and red add their chroma term */ \
+ reg6_m = reg1_m + reg6_m; \
+ reg2_m = reg0_m + reg2_m; \
+ reg3_m = reg1_m + reg3_m; \
+ reg7_m = reg0_m - reg7_m; /* green subtracts its chroma term */ \
+ reg4_m = reg1_m - reg4_m; \
+ reg5_m = __msa_srai_w(reg5_m, 6); /* arithmetic shift right by 6 on every channel */ \
+ reg6_m = __msa_srai_w(reg6_m, 6); \
+ reg7_m = __msa_srai_w(reg7_m, 6); \
+ reg4_m = __msa_srai_w(reg4_m, 6); \
+ reg2_m = __msa_srai_w(reg2_m, 6); \
+ reg3_m = __msa_srai_w(reg3_m, 6); \
+ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
+ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); /* pack the 32-bit results into 16-bit lanes */ \
+ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
}
// Pack and Store 8 ARGB values.
@@ -155,11 +151,10 @@ extern "C" {
}
// Loads current and next row of ARGB input and averages it to calculate U and V
-#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
{ \
v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v16u8 vec8_m, vec9_m; \
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
@@ -195,81 +190,81 @@ extern "C" {
reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
- vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
- vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
- vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
- vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
- vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
- vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
- vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
- vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
- reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
- reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
- reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
- reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
- reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
- reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
- reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
- reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
- reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
- reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
- reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
- reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
- reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
- reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
- reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
- reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
- argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
- argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
+ reg8_m += const_0x0101; \
+ reg9_m += const_0x0101; \
+ reg0_m += const_0x0101; \
+ reg1_m += const_0x0101; \
+ argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \
+ argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \
+ argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \
+ argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \
}
-// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
- shf0, shf1, shf2, shf3, v_out, u_out) \
+ shf0, shf1, shf2, shf3, shift, u_out, v_out) \
{ \
- v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
\
- vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
- vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
- vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
- vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
- vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
- vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
- vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
- vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
- reg0_m = __msa_dotp_u_h(vec0_m, const1); \
- reg1_m = __msa_dotp_u_h(vec1_m, const1); \
- reg2_m = __msa_dotp_u_h(vec4_m, const1); \
- reg3_m = __msa_dotp_u_h(vec5_m, const1); \
- reg0_m += const3; \
- reg1_m += const3; \
- reg2_m += const3; \
- reg3_m += const3; \
- reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
- reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
- reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
- reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
- v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
- u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
+ vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \
+ vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \
+ vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const0); \
+ reg1_m = __msa_dotp_u_w(vec1_m, const0); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const0); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const0); \
+ reg0_m += const1; \
+ reg1_m += const1; \
+ reg2_m += const1; \
+ reg3_m += const1; \
+ reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2); \
+ reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2); \
+ reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3); \
+ reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3); \
+ reg0_m = __msa_srl_w(reg0_m, shift); \
+ reg1_m = __msa_srl_w(reg1_m, shift); \
+ reg2_m = __msa_srl_w(reg2_m, shift); \
+ reg3_m = __msa_srl_w(reg3_m, shift); \
+ u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
+ }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+ shf0, shf1, shf2, shf3, v_out, u_out) \
+ { /* Outputs are listed v_out first, then u_out; const3 is added as a bias, const1/const0/const2 are multipliers. */ \
+ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \
+ \
+ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); /* pick halfwords from the argb0/argb1 pair as indexed by shf0 */ \
+ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); /* same selection from the argb2/argb3 pair */ \
+ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \
+ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \
+ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \
+ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \
+ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \
+ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \
+ reg0_m = __msa_dotp_u_w(vec0_m, const1); /* unsigned sum of products over each adjacent halfword pair, 32-bit result */ \
+ reg1_m = __msa_dotp_u_w(vec1_m, const1); \
+ reg2_m = __msa_dotp_u_w(vec4_m, const1); \
+ reg3_m = __msa_dotp_u_w(vec5_m, const1); \
+ reg0_m += (v4u32)const3; /* the same bias is added to all four accumulators */ \
+ reg1_m += (v4u32)const3; \
+ reg2_m += (v4u32)const3; \
+ reg3_m += (v4u32)const3; \
+ reg0_m -= __msa_dotp_u_w(vec2_m, const0); /* reg0_m and reg1_m (the v_out path) subtract the shf1 selection weighted by const0 */ \
+ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \
+ reg2_m -= __msa_dotp_u_w(vec6_m, const2); /* reg2_m and reg3_m (the u_out path) subtract the shf3 selection weighted by const2 */ \
+ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \
+ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); /* keep the low 16 bits of every 32-bit lane */ \
+ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
+ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); /* keep the high byte of each halfword; the 8 result bytes appear in both halves */ \
+ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \
}
// Load I444 pixel data
@@ -285,6 +280,34 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
+#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
+ { /* const_8080, const_112, const_74, const_94, const_38 and const_18 are not parameters: they must be in scope where the macro expands. */ \
+ v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
+ v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
+ _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); /* even-indexed bytes of the two B inputs, interleaved */ \
+ _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); /* odd-indexed bytes of the two B inputs, interleaved */ \
+ _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
+ _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
+ _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
+ _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
+ _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); /* add each adjacent byte pair: _tmpb + _nexb at the even positions */ \
+ _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); /* _tmpb + _nexb at the odd positions */ \
+ _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
+ _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
+ _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
+ _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
+ _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); /* rounded average of the even-position and odd-position sums (B) */ \
+ _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); /* same for G */ \
+ _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); /* same for R */ \
+ _reg1 = const_8080 + const_112 * _reg0; /* first output: bias plus weighted B */ \
+ _reg3 = const_8080 + const_112 * _reg4; /* second output: bias plus weighted R */ \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); /* subtract weighted G from the first output */ \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); /* subtract weighted G from the second output */ \
+ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); /* subtract weighted R from the first output */ \
+ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); /* subtract weighted B from the second output */ \
+ _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); /* high byte of every halfword: _reg1 fills bytes 0-7, _reg3 fills bytes 8-15 */ \
+ }
+
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -302,6 +325,20 @@ void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
}
}
+void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) {  // Copies 16-bit elements from src_uv to dst_uv in reverse order.
+ int x;
+ v8u16 src, dst;
+ v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0};  // Halfword indices in descending order: reverses the 8 elements of a vector.
+ src_uv += (width - 8) << 1;  // Start at the last group of 8 two-byte elements.
+ for (x = 0; x < width; x += 8) {  // NOTE(review): steps by 8 with no tail handling; presumably width is a multiple of 8 - confirm against callers.
+ src = LD_UH(src_uv);  // LD_UH is defined elsewhere; presumably a 16-byte vector load.
+ dst = __msa_vshf_h(shuffler, src, src);  // Both sources are src, so the result is src with its halfwords reversed.
+ ST_UH(dst, dst_uv);  // ST_UH is defined elsewhere; presumably a 16-byte vector store.
+ src_uv -= 16;  // Walk the source backwards one vector at a time...
+ dst_uv += 16;  // ...while the destination advances.
+ }
+}
+
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -376,20 +413,19 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_y += 8;
src_u += 4;
@@ -407,20 +443,19 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
src_y += 8;
src_u += 4;
@@ -440,12 +475,12 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
int64_t data_a;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v4i32 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -454,8 +489,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
STOREARGB(vec0, vec1, vec2, src3, dst_argb);
src_y += 8;
@@ -476,17 +510,17 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
int64_t data_u, data_v;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 reg0, reg1, reg2, reg3;
v2i64 zero = {0};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
11, 29, 12, 13, 30, 14, 15, 31};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -499,10 +533,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec3, vec4, vec5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5);
reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -529,24 +561,23 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec2, vec1);
- vec0 = __msa_srai_h(vec0, 3);
- vec1 = __msa_srai_h(vec1, 3);
- vec2 = __msa_srai_h(vec2, 2);
- vec1 = __msa_slli_h(vec1, 11);
- vec2 = __msa_slli_h(vec2, 5);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ vec0 = __msa_srli_h(vec0, 3);
+ vec1 = __msa_srli_h(vec1, 2);
+ vec2 = __msa_srli_h(vec2, 3);
+ vec2 = __msa_slli_h(vec2, 11);
+ vec1 = __msa_slli_h(vec1, 5);
vec0 |= vec1;
dst0 = (v16u8)(vec2 | vec0);
ST_UB(dst0, dst_rgb565);
@@ -568,25 +599,24 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+ v8u16 mask = (v8u16)__msa_fill_h(0x00F0);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 4);
- reg1 = (v8u16)__msa_srai_h(vec1, 4);
- reg2 = (v8u16)__msa_srai_h(vec2, 4);
- reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
- reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 4);
+ reg2 = (v8u16)__msa_srli_h(vec2, 4);
+ reg1 = (v8u16)__msa_and_v(vec1, mask);
+ reg2 = (v8u16)__msa_slli_h(reg2, 8);
reg1 |= const_0xF000;
reg0 |= reg2;
dst0 = (v16u8)(reg1 | reg0);
@@ -608,23 +638,22 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
- reg0 = (v8u16)__msa_srai_h(vec0, 3);
- reg1 = (v8u16)__msa_srai_h(vec1, 3);
- reg2 = (v8u16)__msa_srai_h(vec2, 3);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
+ reg0 = (v8u16)__msa_srli_h(vec0, 3);
+ reg1 = (v8u16)__msa_srli_h(vec1, 3);
+ reg2 = (v8u16)__msa_srli_h(vec2, 3);
reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
reg1 |= const_0x8000;
@@ -768,7 +797,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
}
}
-void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -779,10 +808,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
@@ -809,38 +838,39 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
+ const uint8_t* src_argb_next = src_argb + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v16u8 dst0, dst1;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -861,14 +891,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 = __msa_hadd_u_h(vec5, vec5);
reg4 = __msa_hadd_u_h(vec0, vec0);
reg5 = __msa_hadd_u_h(vec1, vec1);
- src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
- src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
- src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
- src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
- src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
- src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
- src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
- src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+ src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0);
+ src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16);
+ src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32);
+ src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48);
+ src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64);
+ src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80);
+ src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96);
+ src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@@ -889,12 +919,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 += __msa_hadd_u_h(vec5, vec5);
reg4 += __msa_hadd_u_h(vec0, vec0);
reg5 += __msa_hadd_u_h(vec1, vec1);
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
- reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
- reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
- reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg4 += const_0x0001;
+ reg5 += const_0x0001;
+ reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1);
+ reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1);
+ reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1);
+ reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1);
+ reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1);
+ reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1);
reg6 = reg0 * const_0x70;
reg7 = reg1 * const_0x70;
reg8 = reg2 * const_0x4A;
@@ -925,8 +961,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
- src_argb0 += 128;
- src_argb0_next += 128;
+ src_argb += 128;
+ src_argb_next += 128;
dst_u += 16;
dst_v += 16;
}
@@ -1153,7 +1189,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
}
}
-void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1164,7 +1200,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
@@ -1186,13 +1222,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_argb);
- src_argb0 += 16;
+ src_argb += 16;
src_argb1 += 16;
dst_argb += 16;
}
}
-void ARGBAddRow_MSA(const uint8_t* src_argb0,
+void ARGBAddRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1200,20 +1236,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
}
-void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+void ARGBSubtractRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@@ -1221,14 +1257,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_subs_u_b(src0, src2);
dst1 = __msa_subs_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -1412,17 +1448,17 @@ void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
- v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+ v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
- reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
- reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
- reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+ reg0 = __msa_dotp_u_h(vec0, const_0x961D);
+ reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D);
+ reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8);
vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
@@ -1656,56 +1692,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v16u8 dst0;
- v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
- v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
- v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, reg2, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080);
+ v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
- reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
- reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
- reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
- reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
- reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
- reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
- reg0 *= const_0x19;
- reg1 *= const_0x19;
- reg2 *= const_0x81;
- reg3 *= const_0x81;
- reg4 *= const_0x42;
- reg5 *= const_0x42;
- reg0 += reg2;
- reg1 += reg3;
- reg0 += reg4;
- reg1 += reg5;
- reg0 += const_0x1080;
- reg1 += const_0x1080;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_argb1555 += 32;
dst_y += 16;
}
@@ -1713,68 +1744,55 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
- v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
- v4u32 res0, res1, res2, res3;
- v16u8 dst0;
- v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
- v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
- v8i16 const_0x1080 = __msa_fill_h(0x1080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
- v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+ v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
+ v16u8 reg0, reg1, dst;
+ v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
+ v8i16 res0, res1;
+ v8i16 const_66 = (v8i16)__msa_ldi_h(66);
+ v8i16 const_129 = (v8i16)__msa_ldi_h(129);
+ v8i16 const_25 = (v8i16)__msa_ldi_h(25);
+ v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
+ v16u8 zero = __msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
- src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src0 & const_0x7E0;
- vec2 = src0 & const_0xF800;
- vec3 = src1 & const_0x1F;
- vec4 = src1 & const_0x7E0;
- vec5 = src1 & const_0xF800;
- reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
- reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
- reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
- reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
- reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
- reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
- reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
- reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
- reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
- reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
- reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
- reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
- vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
- vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
- vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
- vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
- vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
- vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
- vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
- vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
- res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
- res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
- res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
- res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
- res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
- res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
- res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
- res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
- res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
- res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
- res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
- res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
- vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
- vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
- dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- ST_UB(dst0, dst_y);
+ src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
+ tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
+ tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
+ tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
+ tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
+ res0 = const_1080 + const_25 * tmpb_r;
+ res1 = const_1080 + const_25 * tmpb_l;
+ res0 += const_129 * tmpg_r;
+ res1 += const_129 * tmpg_l;
+ res0 += const_66 * tmpr_r;
+ res1 += const_66 * tmpr_l;
+ dst = (v16u8)__msa_pckod_b(res1, res0);
+ ST_UB(dst, dst_y);
src_rgb565 += 32;
dst_y += 16;
}
}
-void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1789,9 +1807,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1810,12 +1828,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
-void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1830,9 +1848,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1851,7 +1869,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
- src_argb0 += 48;
+ src_argb += 48;
dst_y += 16;
}
}
@@ -1865,69 +1883,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
const uint16_t* s = (const uint16_t*)src_argb1555;
const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)s, 0);
src1 = (v8u16)__msa_ld_b((void*)s, 16);
src2 = (v8u16)__msa_ld_b((void*)t, 0);
src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x1F;
- vec3 = src1 & const_0x1F;
- vec2 += src2 & const_0x1F;
- vec3 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
- vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
- vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
- reg0 = vec6 * const_0x70;
- reg1 = vec0 * const_0x4A;
- reg2 = vec2 * const_0x70;
- reg3 = vec0 * const_0x5E;
- reg0 += const_0x8080;
- reg1 += vec2 * const_0x26;
- reg2 += const_0x8080;
- reg3 += vec6 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ tmpg = (v16u8)__msa_srli_b(tmp0, 5);
+ nexg = (v16u8)__msa_srli_b(tmp2, 5);
+ reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
+ reg2 = (v16u8)__msa_andi_b(tmp3, 0x03);
+ reg0 = (v16u8)__msa_slli_b(reg0, 3);
+ reg2 = (v16u8)__msa_slli_b(reg2, 3);
+ tmpg = (v16u8)__msa_or_v(tmpg, reg0);
+ nexg = (v16u8)__msa_or_v(nexg, reg2);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C);
+ tmpr = (v16u8)__msa_srli_b(reg1, 2);
+ nexr = (v16u8)__msa_srli_b(reg3, 2);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_slli_b(tmpg, 3);
+ reg2 = (v16u8)__msa_slli_b(tmpr, 3);
+ tmpb = (v16u8)__msa_srli_b(tmpb, 2);
+ tmpg = (v16u8)__msa_srli_b(tmpg, 2);
+ tmpr = (v16u8)__msa_srli_b(tmpr, 2);
+ tmpb = (v16u8)__msa_or_v(reg0, tmpb);
+ tmpg = (v16u8)__msa_or_v(reg1, tmpg);
+ tmpr = (v16u8)__msa_or_v(reg2, tmpr);
+ reg0 = (v16u8)__msa_slli_b(nexb, 3);
+ reg1 = (v16u8)__msa_slli_b(nexg, 3);
+ reg2 = (v16u8)__msa_slli_b(nexr, 3);
+ nexb = (v16u8)__msa_srli_b(nexb, 2);
+ nexg = (v16u8)__msa_srli_b(nexg, 2);
+ nexr = (v16u8)__msa_srli_b(nexr, 2);
+ nexb = (v16u8)__msa_or_v(reg0, nexb);
+ nexg = (v16u8)__msa_or_v(reg1, nexg);
+ nexr = (v16u8)__msa_or_v(reg2, nexr);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
@@ -1946,68 +1956,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
const uint16_t* s = (const uint16_t*)src_rgb565;
const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
int64_t res0, res1;
- v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
- v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
- v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
- v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
- v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+ v16u8 src0, src1, src2, src3, dst;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 reg0, reg1, reg2, reg3;
+ v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
+ v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
+ v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
+ v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
+ v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
+ v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
+ v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((void*)s, 0);
- src1 = (v8u16)__msa_ld_b((void*)s, 16);
- src2 = (v8u16)__msa_ld_b((void*)t, 0);
- src3 = (v8u16)__msa_ld_b((void*)t, 16);
- vec0 = src0 & const_0x1F;
- vec1 = src1 & const_0x1F;
- vec0 += src2 & const_0x1F;
- vec1 += src3 & const_0x1F;
- vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
- vec2 = src0 & const_0x3F;
- vec3 = src1 & const_0x3F;
- vec2 += src2 & const_0x3F;
- vec3 += src3 & const_0x3F;
- vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
- src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
- src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
- src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
- src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
- vec4 = src0 & const_0x1F;
- vec5 = src1 & const_0x1F;
- vec4 += src2 & const_0x1F;
- vec5 += src3 & const_0x1F;
- vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
- vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
- vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
- vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
- vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
- vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
- vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
- vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
- reg0 = vec3 * const_0x70;
- reg1 = vec1 * const_0x4A;
- reg2 = vec4 * const_0x70;
- reg3 = vec1 * const_0x5E;
- reg0 += const_32896;
- reg1 += vec4 * const_0x26;
- reg2 += const_32896;
- reg3 += vec3 * const_0x12;
- reg0 -= reg1;
- reg2 -= reg3;
- reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
- reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
- dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
- res0 = __msa_copy_u_d((v2i64)dst0, 0);
- res1 = __msa_copy_u_d((v2i64)dst0, 1);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)t, 0);
+ src3 = (v16u8)__msa_ld_b((void*)t, 16);
+ tmp0 = (v16u8)__msa_pckev_b(src1, src0);
+ tmp1 = (v16u8)__msa_pckod_b(src1, src0);
+ tmp2 = (v16u8)__msa_pckev_b(src3, src2);
+ tmp3 = (v16u8)__msa_pckod_b(src3, src2);
+ tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
+ tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
+ nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
+ nexr = (v16u8)__msa_andi_b(tmp3, 0xF8);
+ reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
+ reg3 = (v16u8)__msa_andi_b(tmp3, 0x07);
+ reg0 = (v16u8)__msa_srli_b(tmp0, 5);
+ reg1 = (v16u8)__msa_slli_b(reg1, 3);
+ reg2 = (v16u8)__msa_srli_b(tmp2, 5);
+ reg3 = (v16u8)__msa_slli_b(reg3, 3);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpb, 3);
+ reg1 = (v16u8)__msa_srli_b(tmpb, 2);
+ reg2 = (v16u8)__msa_slli_b(nexb, 3);
+ reg3 = (v16u8)__msa_srli_b(nexb, 2);
+ tmpb = (v16u8)__msa_or_v(reg1, reg0);
+ nexb = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_slli_b(tmpg, 2);
+ reg1 = (v16u8)__msa_srli_b(tmpg, 4);
+ reg2 = (v16u8)__msa_slli_b(nexg, 2);
+ reg3 = (v16u8)__msa_srli_b(nexg, 4);
+ tmpg = (v16u8)__msa_or_v(reg1, reg0);
+ nexg = (v16u8)__msa_or_v(reg2, reg3);
+ reg0 = (v16u8)__msa_srli_b(tmpr, 5);
+ reg2 = (v16u8)__msa_srli_b(nexr, 5);
+ tmpr = (v16u8)__msa_or_v(tmpr, reg0);
+ nexr = (v16u8)__msa_or_v(nexr, reg2);
+ RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
+ res0 = __msa_copy_u_d((v2i64)dst, 0);
+ res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
@@ -2017,26 +2016,27 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
}
}
-void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2085,10 +2085,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h((v8i16)reg0, 2);
- reg1 = __msa_srai_h((v8i16)reg1, 2);
- reg2 = __msa_srai_h((v8i16)reg2, 2);
- reg3 = __msa_srai_h((v8i16)reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h((v8i16)reg0, 1);
+ reg1 = __msa_srai_h((v8i16)reg1, 1);
+ reg2 = __msa_srai_h((v8i16)reg2, 1);
+ reg3 = __msa_srai_h((v8i16)reg3, 1);
vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
@@ -2122,26 +2126,27 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
}
}
-void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 reg0, reg1, reg2, reg3;
v16u8 dst0;
- v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
- v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
- v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
- v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
- v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+ v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38);
+ v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25);
+ v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13);
+ v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f);
+ v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
v16i8 zero = {0};
@@ -2190,10 +2195,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0,
reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
- reg0 = __msa_srai_h(reg0, 2);
- reg1 = __msa_srai_h(reg1, 2);
- reg2 = __msa_srai_h(reg2, 2);
- reg3 = __msa_srai_h(reg3, 2);
+ reg0 += const_0x0001;
+ reg1 += const_0x0001;
+ reg2 += const_0x0001;
+ reg3 += const_0x0001;
+ reg0 = __msa_srai_h(reg0, 1);
+ reg1 = __msa_srai_h(reg1, 1);
+ reg2 = __msa_srai_h(reg2, 1);
+ reg3 = __msa_srai_h(reg3, 1);
vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
@@ -2236,13 +2245,13 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 zero = {0};
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2251,8 +2260,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2273,12 +2281,12 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 zero = {0};
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2287,8 +2295,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
vec0 = vec0 >> 3;
vec1 = (vec1 >> 2) << 5;
vec2 = (vec2 >> 3) << 11;
@@ -2309,14 +2316,14 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v16u8 zero = {0};
v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2326,8 +2333,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
- YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2416,27 +2422,27 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
}
}
-void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
- v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
- v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
- v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+ v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
+ v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D);
+ v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
- ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
+ ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
@@ -2444,19 +2450,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
@@ -2464,19 +2470,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
@@ -2484,81 +2490,143 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
- src_argb0 += 64;
+ src_argb += 64;
dst_y += 16;
}
}
-void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 vec0, vec1, vec2, vec3;
- v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
- v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
- v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 dst0, dst1, dst2, dst3;
+ v16u8 zero = {0};
+ v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f);
+ v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080);
+ v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a);
+ v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a);
+ v4i32 shift = __msa_fill_w(0x00000008);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((void*)s, 0);
- src1 = (v16u8)__msa_ld_b((void*)s, 16);
- src2 = (v16u8)__msa_ld_b((void*)s, 32);
- src3 = (v16u8)__msa_ld_b((void*)s, 48);
- src4 = (v16u8)__msa_ld_b((void*)t, 0);
- src5 = (v16u8)__msa_ld_b((void*)t, 16);
- src6 = (v16u8)__msa_ld_b((void*)t, 32);
- src7 = (v16u8)__msa_ld_b((void*)t, 48);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec0 = __msa_aver_u_b(src4, src6);
- vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((void*)s, 64);
- src1 = (v16u8)__msa_ld_b((void*)s, 80);
- src2 = (v16u8)__msa_ld_b((void*)s, 96);
- src3 = (v16u8)__msa_ld_b((void*)s, 112);
- src4 = (v16u8)__msa_ld_b((void*)t, 64);
- src5 = (v16u8)__msa_ld_b((void*)t, 80);
- src6 = (v16u8)__msa_ld_b((void*)t, 96);
- src7 = (v16u8)__msa_ld_b((void*)t, 112);
- src0 = __msa_aver_u_b(src0, src4);
- src1 = __msa_aver_u_b(src1, src5);
- src2 = __msa_aver_u_b(src2, src6);
- src3 = __msa_aver_u_b(src3, src7);
- src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
- src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
- src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
- src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
- vec2 = __msa_aver_u_b(src4, src6);
- vec3 = __msa_aver_u_b(src5, src7);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
+ src1 = __msa_ld_b((void*)s, 0);
+ src3 = __msa_ld_b((void*)s, 16);
+ src5 = __msa_ld_b((void*)t, 0);
+ src7 = __msa_ld_b((void*)t, 16);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 32);
+ src3 = __msa_ld_b((void*)s, 48);
+ src5 = __msa_ld_b((void*)t, 32);
+ src7 = __msa_ld_b((void*)t, 48);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst0, dst1);
+
+ src1 = __msa_ld_b((void*)s, 64);
+ src3 = __msa_ld_b((void*)s, 80);
+ src5 = __msa_ld_b((void*)t, 64);
+ src7 = __msa_ld_b((void*)t, 80);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec0 = __msa_aver_u_h(src4, src5);
+ vec1 = __msa_aver_u_h(src6, src7);
+
+ src1 = __msa_ld_b((void*)s, 96);
+ src3 = __msa_ld_b((void*)s, 112);
+ src5 = __msa_ld_b((void*)t, 96);
+ src7 = __msa_ld_b((void*)t, 112);
+ src0 = __msa_ilvr_b(zero, src1);
+ src1 = __msa_ilvl_b(zero, src1);
+ src2 = __msa_ilvr_b(zero, src3);
+ src3 = __msa_ilvl_b(zero, src3);
+ src4 = __msa_ilvr_b(zero, src5);
+ src5 = __msa_ilvl_b(zero, src5);
+ src6 = __msa_ilvr_b(zero, src7);
+ src7 = __msa_ilvl_b(zero, src7);
+ src0 += src4;
+ src1 += src5;
+ src2 += src6;
+ src3 += src7;
+ src4 = __msa_ilvev_d(src1, src0);
+ src5 = __msa_ilvod_d(src1, src0);
+ src6 = __msa_ilvev_d(src3, src2);
+ src7 = __msa_ilvod_d(src3, src2);
+ vec2 = __msa_aver_u_h(src4, src5);
+ vec3 = __msa_aver_u_h(src6, src7);
+ ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080,
+ const_0x0015002a, const_0x0035000a, shuffler0, shuffler1,
+ shuffler2, shuffler3, shift, dst2, dst3);
+
+ dst0 = (v8u16)__msa_pckev_b(dst2, dst0);
+ dst1 = (v8u16)__msa_pckev_b(dst3, dst1);
+ ST_UB(dst0, dst_u);
+ ST_UB(dst1, dst_v);
s += 128;
t += 128;
dst_v += 16;
@@ -2566,103 +2634,108 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
}
}
-void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15};
+ v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_v);
- ST_UB(dst1, dst_u);
- s += 128;
- t += 128;
- dst_v += 16;
- dst_u += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
-void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 src0, src1, src2, src3;
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
- v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
- v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused};
+ v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14};
+ v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused};
+ v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, src0, src1, src2, src3);
- ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
- const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
-void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
- const uint8_t* s = src_rgb0;
- const uint8_t* t = src_rgb0 + src_stride_rgb;
- v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
- v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
- v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
- 18, 19, 22, 23, 26, 27, 30, 31};
- v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
- v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
- v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
- v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
- v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
- v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+ const uint8_t* s = src_rgb;
+ const uint8_t* t = src_rgb + src_stride_rgb;
+ const uint8_t unused = 0xf;
+ v8u16 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+ v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused};
+ v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13};
+ v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused};
+ v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14};
+ v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f);
+ v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038);
+ v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013);
+ v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080);
+ v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
- for (x = 0; x < width; x += 32) {
- READ_ARGB(s, t, vec0, vec1, vec2, vec3);
- ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
- const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
- dst1);
- ST_UB(dst0, dst_u);
- ST_UB(dst1, dst_v);
- s += 128;
- t += 128;
- dst_u += 16;
- dst_v += 16;
+ for (x = 0; x < width; x += 16) {
+ READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001);
+ ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038,
+ const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2,
+ shuffler3, dst0, dst1);
+ *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0);
+ *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0);
+ s += 64;
+ t += 64;
+ dst_u += 8;
+ dst_v += 8;
}
}
@@ -2674,54 +2747,57 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
int width) {
int x;
v16u8 src0, src1, src2, dst0, dst1;
- v8u16 vec0, vec1, vec2;
+ v8i16 vec0, vec1, vec2;
v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 zero = {0};
+ v4i32 const_0x80 = __msa_fill_w(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
for (x = 0; x < width; x += 8) {
READI444(src_y, src_u, src_v, src0, src1, src2);
- vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg0 *= vec_yg;
reg1 *= vec_yg;
reg0 = __msa_srai_w(reg0, 16);
reg1 = __msa_srai_w(reg1, 16);
- reg4 = reg0 + vec_br;
- reg5 = reg1 + vec_br;
- reg2 = reg0 + vec_bg;
- reg3 = reg1 + vec_bg;
- reg0 += vec_bb;
- reg1 += vec_bb;
+ reg0 += vec_yb;
+ reg1 += vec_yb;
vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
- reg0 -= reg6 * vec_ub;
- reg1 -= reg7 * vec_ub;
- reg2 -= reg6 * vec_ug;
- reg3 -= reg7 * vec_ug;
- reg4 -= reg8 * vec_vr;
- reg5 -= reg9 * vec_vr;
- reg2 -= reg8 * vec_vg;
- reg3 -= reg9 * vec_vg;
- reg0 = __msa_srai_w(reg0, 6);
- reg1 = __msa_srai_w(reg1, 6);
- reg2 = __msa_srai_w(reg2, 6);
- reg3 = __msa_srai_w(reg3, 6);
- reg4 = __msa_srai_w(reg4, 6);
- reg5 = __msa_srai_w(reg5, 6);
+ reg6 -= const_0x80;
+ reg7 -= const_0x80;
+ reg8 -= const_0x80;
+ reg9 -= const_0x80;
+ tmp0 = reg0 + reg6 * vec_ub;
+ tmp1 = reg1 + reg7 * vec_ub;
+ tmp2 = reg0 + reg8 * vec_vr;
+ tmp3 = reg1 + reg9 * vec_vr;
+ tmp4 = reg6 * vec_ug;
+ tmp5 = reg7 * vec_ug;
+ tmp4 += reg8 * vec_vg;
+ tmp5 += reg9 * vec_vg;
+ tmp4 = reg0 - tmp4;
+ tmp5 = reg1 - tmp5;
+ reg0 = __msa_srai_w(tmp0, 6);
+ reg1 = __msa_srai_w(tmp1, 6);
+ reg2 = __msa_srai_w(tmp2, 6);
+ reg3 = __msa_srai_w(tmp3, 6);
+ reg4 = __msa_srai_w(tmp4, 6);
+ reg5 = __msa_srai_w(tmp5, 6);
CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
- vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
- vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec1 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+ vec2 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
@@ -2734,13 +2810,24 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
}
}
-void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+// TODO - respect YuvConstants
+void I400ToARGBRow_MSA(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
int x;
+#if defined(__aarch64__) || defined(__arm__)
+ int ygb = yuvconstants->kUVBiasBGR[3];
+ int yg = yuvconstants->kYToRgb[1];
+#else
+ int ygb = yuvconstants->kYBiasToRgb[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
v4i32 reg0, reg1, reg2, reg3;
- v4i32 vec_yg = __msa_fill_w(0x4A35);
- v8i16 vec_ygb = __msa_fill_h(0xFB78);
+ v4i32 vec_yg = __msa_fill_w(yg);
+ v8i16 vec_ygb = __msa_fill_h(ygb);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 max = __msa_ldi_h(0xFF);
v8i16 zero = {0};
@@ -2814,12 +2901,12 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2827,8 +2914,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_yuy2 += 16;
dst_argb += 32;
@@ -2842,12 +2928,12 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
- v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+ v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
+ v8i16 const_0x80 = __msa_ldi_h(0x80);
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
- YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
- vec_br, vec_yg);
+ YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2855,8 +2941,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
- YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
- vec0, vec1, vec2);
+ YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_uyvy += 16;
dst_argb += 32;
@@ -3001,12 +3086,12 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
}
}
-void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+void ARGBBlendRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
- v16u8 src0, src1, src2, src3, dst0, dst1;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
v8u16 const_256 = (v8u16)__msa_ldi_h(256);
@@ -3015,8 +3100,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
@@ -3051,16 +3136,16 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
- vec0 += vec8;
- vec1 += vec9;
- vec2 += vec10;
- vec3 += vec11;
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst0 = (v16u8)__msa_adds_u_b(dst0, dst2);
+ dst1 = (v16u8)__msa_adds_u_b(dst1, dst3);
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
- src_argb0 += 32;
+ src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@@ -3082,7 +3167,7 @@ void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
v16i8 zero = {0};
- for (x = 0; x < width; x += 8) {
+ for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
@@ -3315,10 +3400,10 @@ void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
}
}
-void MirrorUVRow_MSA(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index a12fa790..804ff839 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -10,8 +10,6 @@
#include "libyuv/row.h"
-#include <stdio.h>
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -21,90 +19,118 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
+// d8-d15, r4-r11, r14(lr) need to be preserved if used. r13(sp), r15(pc) are
+// reserved.
+
+// q0: Y uint16x8_t
+// d2: U uint8x8_t
+// d3: V uint8x8_t
+
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.32 {d2[0]}, [%1]! \n" \
- "vld1.32 {d2[1]}, [%2]! \n"
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.32 {d2[0]}, [%[src_u]]! \n" \
+ "vld1.32 {d2[1]}, [%[src_v]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmovl.u8 q1, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 q1, q1, #8 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_u]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vld1.8 {d3}, [%[src_v]]! \n" \
+ "vsli.u16 q0, q0, #8 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vmov.u8 q1, #128 \n" \
+ "vmovl.u8 q0, d0 \n" \
+ "vsli.u16 q0, q0, #8 \n"
// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+#define READNV12 \
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_uv]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \
+ "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */
// Read 8 Y and 4 VU from NV21
#define READNV21 \
- "vld1.8 {d0}, [%0]! \n" \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
+ "vld1.8 {d0}, [%[src_y]]! \n" \
+ "vld1.8 {d2}, [%[src_vu]]! \n" \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d0, d1 \n" \
+ "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \
+ "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */
// Read 8 YUY2
#define READYUY2 \
- "vld2.8 {d0, d2}, [%0]! \n" \
+ "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \
+ "vmovl.u8 q0, d0 \n" \
"vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
// Read 8 UYVY
#define READUYVY \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
+ "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \
+ "vmovl.u8 q0, d3 \n" \
"vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUVTORGB_SETUP \
- "vld1.8 {d24}, [%[kUVToRB]] \n" \
- "vld1.8 {d25}, [%[kUVToG]] \n" \
- "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
- "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
- "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
-
-#define YUVTORGB \
- "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
- "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
- "vmovl.u8 q0, d0 \n" /* Y */ \
- "vmovl.s16 q10, d1 \n" \
- "vmovl.s16 q0, d0 \n" \
- "vmul.s32 q10, q10, q15 \n" \
- "vmul.s32 q0, q0, q15 \n" \
- "vqshrun.s32 d0, q0, #16 \n" \
- "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
- "vadd.s16 d18, d19 \n" \
- "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
- "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
- "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \
- "vaddw.u16 q1, q1, d16 \n" \
- "vaddw.u16 q10, q10, d17 \n" \
- "vaddw.u16 q3, q3, d18 \n" \
- "vqadd.s16 q8, q0, q13 \n" /* B */ \
- "vqadd.s16 q9, q0, q14 \n" /* R */ \
- "vqadd.s16 q0, q0, q4 \n" /* G */ \
- "vqadd.s16 q8, q8, q1 \n" /* B */ \
- "vqadd.s16 q9, q9, q10 \n" /* R */ \
- "vqsub.s16 q0, q0, q3 \n" /* G */ \
- "vqshrun.s16 d20, q8, #6 \n" /* B */ \
- "vqshrun.s16 d22, q9, #6 \n" /* R */ \
- "vqshrun.s16 d21, q0, #6 \n" /* G */
+ "vsli.u16 q0, q0, #8 \n" \
+ "vsli.u16 d2, d2, #8 \n" \
+ "vsri.u16 d3, d3, #8 \n"
+
+#define YUVTORGB_SETUP \
+ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
+ "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \
+ "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \
+ "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \
+ "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n"
+
+// q0: B uint16x8_t
+// q1: G uint16x8_t
+// q2: R uint16x8_t
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "vmull.u16 q2, d1, d31 \n" \
+ "vmull.u8 q8, d3, d29 \n" /* DGV */ \
+ "vmull.u16 q0, d0, d31 \n" \
+ "vmlal.u8 q8, d2, d28 \n" /* DG */ \
+ "vqshrn.u32 d0, q0, #16 \n" \
+ "vqshrn.u32 d1, q2, #16 \n" /* Y */ \
+ "vmull.u8 q9, d2, d26 \n" /* DB */ \
+ "vmull.u8 q2, d3, d27 \n" /* DR */ \
+ "vadd.u16 q4, q0, q11 \n" /* G */ \
+ "vadd.u16 q2, q0, q2 \n" /* R */ \
+ "vadd.u16 q0, q0, q9 \n" /* B */ \
+ "vqsub.u16 q1, q4, q8 \n" /* G */ \
+ "vqsub.u16 q0, q0, q10 \n" /* B */ \
+ "vqsub.u16 q2, q2, q12 \n" /* R */
+
+// Convert from 2.14 fixed point RGB to 8 bit RGB
+#define RGBTORGB8 \
+ "vqshrn.u16 d4, q2, #6 \n" /* R */ \
+ "vqshrn.u16 d2, q1, #6 \n" /* G */ \
+ "vqshrn.u16 d0, q0, #6 \n" /* B */
+
+#define YUVTORGB_REGS \
+ "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"
+
+#define STORERGBA \
+ "vmov.u8 d1, d0 \n" \
+ "vmov.u8 d3, d4 \n" \
+ "vmov.u8 d0, d6 \n" \
+ "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n"
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -114,22 +140,20 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV444 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void I422ToARGBRow_NEON(const uint8_t* src_y,
@@ -140,22 +164,46 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
@@ -168,22 +216,20 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %5, %5, #8 \n"
- "vld1.8 {d23}, [%3]! \n"
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "vld1.8 {d6}, [%[src_a]]! \n"
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void I422ToRGBARow_NEON(const uint8_t* src_y,
@@ -194,22 +240,18 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void I422ToRGB24Row_NEON(const uint8_t* src_y,
@@ -220,29 +262,28 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
#define ARGBTORGB565 \
- "vshll.u8 q0, d22, #8 \n" /* R */ \
- "vshll.u8 q8, d21, #8 \n" /* G */ \
- "vshll.u8 q9, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #5 \n" /* RG */ \
- "vsri.16 q0, q9, #11 \n" /* RGB */
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q2, q1, #5 \n" /* RG */ \
+ "vsri.16 q2, q0, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -252,31 +293,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
#define ARGBTOARGB1555 \
- "vshll.u8 q0, d23, #8 \n" /* A */ \
- "vshll.u8 q8, d22, #8 \n" /* R */ \
- "vshll.u8 q9, d21, #8 \n" /* G */ \
- "vshll.u8 q10, d20, #8 \n" /* B */ \
- "vsri.16 q0, q8, #1 \n" /* AR */ \
- "vsri.16 q0, q9, #6 \n" /* ARG */ \
- "vsri.16 q0, q10, #11 \n" /* ARGB */
+ "vshll.u8 q3, d6, #8 \n" /* A */ \
+ "vshll.u8 q2, d4, #8 \n" /* R */ \
+ "vshll.u8 q1, d2, #8 \n" /* G */ \
+ "vshll.u8 q0, d0, #8 \n" /* B */ \
+ "vsri.16 q3, q2, #1 \n" /* AR */ \
+ "vsri.16 q3, q1, #6 \n" /* ARG */ \
+ "vsri.16 q3, q0, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -287,30 +326,28 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB1555
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vmov.u8 d6, #0xff \n" ARGBTOARGB1555
+ "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
}
#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
+ "vshr.u8 d0, d0, #4 \n" /* B */ \
+ "vbic.32 d2, d2, d7 \n" /* G */ \
+ "vshr.u8 d4, d4, #4 \n" /* R */ \
+ "vbic.32 d6, d6, d7 \n" /* A */ \
+ "vorr d0, d0, d2 \n" /* BG */ \
+ "vorr d1, d4, d6 \n" /* RA */ \
"vzip.u8 d0, d1 \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
@@ -321,56 +358,53 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // vbic bits to clear
- "1: \n"
-
- READYUV422 YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n" ARGBTOARGB4444
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ "vmov.u8 d6, #255 \n"
+ "vmov.u8 d7, #0x0f \n" // vbic bits to clear
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTOARGB4444
+ "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "q3");
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile(
YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d6, #255 \n"
"1: \n" READYUV400 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
- [kUVToG] "r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d23, #255 \n"
+ "vmov.u8 d23, #255 \n"
"1: \n"
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -383,22 +417,20 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void NV21ToARGBRow_NEON(const uint8_t* src_y,
@@ -406,22 +438,20 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
@@ -430,25 +460,19 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
-
YUVTORGB_SETUP
-
- "1: \n"
-
- READNV12 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
@@ -457,25 +481,19 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
-
YUVTORGB_SETUP
-
- "1: \n"
-
- READNV21 YUVTORGB
- "subs %3, %3, #8 \n"
- "vst3.8 {d20, d21, d22}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV21 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_vu] "+r"(src_vu), // %[src_vu]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
@@ -485,62 +503,56 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB
- "subs %3, %3, #8 \n" ARGBTORGB565
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15");
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n" ARGBTORGB565
+ "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READYUY2 YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile(YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n" READUYVY YUVTORGB
- "subs %2, %2, #8 \n"
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15");
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d6, #255 \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %[width], %[width], #8 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
+ "bgt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "d6");
}
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
@@ -550,11 +562,11 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store U
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -564,6 +576,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
);
}
+// Reads 16 byte Y's from tile and writes out 16 Y's.
+// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
+// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
+// width measured in bytes so 8 UV = 16.
+void DetileRow_NEON(const uint8_t* src,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "pld [%0, 1792] \n"
+ "vst1.16 {q0}, [%1]! \n" // store 16 bytes
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(src_tile_stride) // %3
+ : "cc", "memory", "q0" // Clobber List
+ );
+}
+
+// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+ ptrdiff_t src_tile_stride,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], %4 \n"
+ "subs %3, %3, #16 \n"
+ "pld [%0, 1792] \n"
+ "vst1.8 {d0}, [%1]! \n"
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(src_tile_stride) // %4
+ : "cc", "memory", "d0", "d1" // Clobber List
+ );
+}
+
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
@@ -571,11 +629,11 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load U
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -593,13 +651,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
- "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst1.8 {q0}, [%1]! \n" // store R
- "vst1.8 {q1}, [%2]! \n" // store G
- "vst1.8 {q2}, [%3]! \n" // store B
- "bgt 1b \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -618,13 +676,13 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load R
- "vld1.8 {q1}, [%1]! \n" // load G
- "vld1.8 {q2}, [%2]! \n" // load B
- "subs %4, %4, #16 \n" // 16 processed per loop
- "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
- "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -635,14 +693,341 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "vst1.8 {q3}, [%4]! \n" // store A
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(dst_a), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
+void MergeARGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "vld1.8 {q3}, [%3]! \n" // load A
+ "subs %5, %5, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
+void SplitXRGBRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%3]! \n" // store B
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%1]! \n" // store R
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB (A = 255)
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 q3, #255 \n" // load A(255)
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q0}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB
+ "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void MergeXR30Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int depth,
+ int width) {
+ int shift = 10 - depth;
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "vdup.32 q15, %5 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vshl.u32 q2, q2, q15 \n" // 000B
+ "vshl.u32 q1, q1, q15 \n"
+ "vshl.u32 q0, q0, q15 \n"
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "q14", "q15");
+}
+
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_ar30,
+ int /* depth */,
+ int width) {
+ asm volatile(
+ "vmov.u32 q14, #1023 \n"
+ "1: \n"
+ "vld1.16 {d4}, [%2]! \n" // B
+ "vld1.16 {d2}, [%1]! \n" // G
+ "vld1.16 {d0}, [%0]! \n" // R
+ "vmovl.u16 q2, d4 \n" // 000B
+ "vmovl.u16 q1, d2 \n" // G
+ "vmovl.u16 q0, d0 \n" // R
+ "vmin.u32 q2, q2, q14 \n"
+ "vmin.u32 q1, q1, q14 \n"
+ "vmin.u32 q0, q0, q14 \n"
+ "vsli.u32 q2, q1, #10 \n" // 00GB
+ "vsli.u32 q2, q0, #20 \n" // 0RGB
+ "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
+ "subs %4, %4, #4 \n"
+ "vst1.8 {q2}, [%3]! \n"
+ "bgt 1b \n"
+ "3: \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_ar30), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q14");
+}
+
+// Clamps planar 16-bit R, G, B, A to depth bits, MSB-aligns, packs as AR64.
+void MergeAR64Row_NEON(const uint16_t* src_r,
+                       const uint16_t* src_g,
+                       const uint16_t* src_b,
+                       const uint16_t* src_a,
+                       uint16_t* dst_ar64,
+                       int depth,
+                       int width) {
+  int shift = 16 - depth;       // left shift that MSB-aligns depth-bit samples
+  int mask = (1 << depth) - 1;  // largest value that fits in depth bits
+  asm volatile(
+      "vdup.u16 q15, %6 \n"  // shift count in every lane
+      "vdup.u16 q14, %7 \n"  // mask in every lane; q14 is clobbered below
+      "1: \n"
+      "vld1.16 {q2}, [%0]! \n"  // R
+      "vld1.16 {q1}, [%1]! \n"  // G
+      "vld1.16 {q0}, [%2]! \n"  // B
+      "vld1.16 {q3}, [%3]! \n"  // A
+      "vmin.u16 q2, q2, q14 \n"  // clamp each channel to mask
+      "vmin.u16 q1, q1, q14 \n"
+      "vmin.u16 q0, q0, q14 \n"
+      "vmin.u16 q3, q3, q14 \n"
+      "vshl.u16 q2, q2, q15 \n"  // shift each channel left by 16 - depth
+      "vshl.u16 q1, q1, q15 \n"
+      "vshl.u16 q0, q0, q15 \n"
+      "vshl.u16 q3, q3, q15 \n"
+      "subs %5, %5, #8 \n"  // 8 pixels per loop
+      "vst4.16 {d0, d2, d4, d6}, [%4]! \n"  // store 4 pixels as B, G, R, A
+      "vst4.16 {d1, d3, d5, d7}, [%4]! \n"  // next 4 pixels
+      "bgt 1b \n"
+      : "+r"(src_r),     // %0
+        "+r"(src_g),     // %1
+        "+r"(src_b),     // %2
+        "+r"(src_a),     // %3
+        "+r"(dst_ar64),  // %4
+        "+r"(width)      // %5
+      : "r"(shift),      // %6
+        "r"(mask)        // %7
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q14", "q15");
+}
+
+// Clamps planar 16-bit R, G, B to depth bits, MSB-aligns, packs with A=0xffff.
+void MergeXR64Row_NEON(const uint16_t* src_r,
+                       const uint16_t* src_g,
+                       const uint16_t* src_b,
+                       uint16_t* dst_ar64,
+                       int depth,
+                       int width) {
+  int shift = 16 - depth;       // left shift that MSB-aligns depth-bit samples
+  int mask = (1 << depth) - 1;  // largest value that fits in depth bits
+  asm volatile(
+      "vmov.u8 q3, #0xff \n"  // A (0xffff)
+      "vdup.u16 q15, %5 \n"  // shift count in every lane
+      "vdup.u16 q14, %6 \n"  // mask in every lane; q14 is clobbered below
+      "1: \n"
+      "vld1.16 {q2}, [%0]! \n"  // R
+      "vld1.16 {q1}, [%1]! \n"  // G
+      "vld1.16 {q0}, [%2]! \n"  // B
+      "vmin.u16 q2, q2, q14 \n"  // clamp each channel to mask
+      "vmin.u16 q1, q1, q14 \n"
+      "vmin.u16 q0, q0, q14 \n"
+      "vshl.u16 q2, q2, q15 \n"  // shift each channel left by 16 - depth
+      "vshl.u16 q1, q1, q15 \n"
+      "vshl.u16 q0, q0, q15 \n"
+      "subs %4, %4, #8 \n"  // 8 pixels per loop
+      "vst4.16 {d0, d2, d4, d6}, [%3]! \n"  // store 4 pixels as B, G, R, A
+      "vst4.16 {d1, d3, d5, d7}, [%3]! \n"  // next 4 pixels
+      "bgt 1b \n"
+      : "+r"(src_r),     // %0
+        "+r"(src_g),     // %1
+        "+r"(src_b),     // %2
+        "+r"(dst_ar64),  // %3
+        "+r"(width)      // %4
+      : "r"(shift),      // %5
+        "r"(mask)        // %6
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q14", "q15");
+}
+
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ const uint16_t* src_a,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %6 \n"
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vld1.16 {q3}, [%3]! \n" // A
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vshl.u16 q3, q3, q15 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d1, q1 \n"
+ "vqmovn.u16 d2, q2 \n"
+ "vqmovn.u16 d3, q3 \n"
+ "subs %5, %5, #8 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : "r"(shift) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
+}
+
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+ const uint16_t* src_g,
+ const uint16_t* src_b,
+ uint8_t* dst_argb,
+ int depth,
+ int width) {
+ int shift = 8 - depth;
+ asm volatile(
+
+ "vdup.16 q15, %5 \n"
+ "vmov.u8 d6, #0xff \n" // A (0xff)
+ "1: \n"
+ "vld1.16 {q2}, [%0]! \n" // R
+ "vld1.16 {q1}, [%1]! \n" // G
+ "vld1.16 {q0}, [%2]! \n" // B
+ "vshl.u16 q2, q2, q15 \n"
+ "vshl.u16 q1, q1, q15 \n"
+ "vshl.u16 q0, q0, q15 \n"
+ "vqmovn.u16 d5, q2 \n"
+ "vqmovn.u16 d4, q1 \n"
+ "vqmovn.u16 d3, q0 \n"
+ "subs %4, %4, #8 \n"
+ "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(shift) // %5
+ : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
+}
+
// Copy multiple of 32. vld1.8 allows unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -654,11 +1039,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -668,11 +1053,11 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
@@ -682,41 +1067,62 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #32 \n" // 32 bytes per loop
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32
+ "subs %2, #32 \n" // 32 pixels per loop.
+ "vrev64.8 q0, q2 \n"
+ "vrev64.8 q1, q1 \n"
+ "vswp d0, d1 \n"
+ "vswp d2, d3 \n"
+ "vst1.8 {q0, q1}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %2, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst2.8 {d0, d1}, [%1]! \n" // dst += 16
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
:
- : "cc", "memory", "r3", "q0");
+ : "cc", "memory", "r12", "q0");
}
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
"1: \n"
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -725,37 +1131,57 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
: "cc", "memory", "r12", "q0");
}
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #32 \n"
"1: \n"
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0");
+ "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vrev64.8 d3, d3 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(-32) // %3
+ : "cc", "memory", "d0", "d1", "d2", "d3");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ src_rgb24 += width * 3 - 24;
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
+ "subs %2, #8 \n" // 8 pixels per loop.
+ "vrev64.8 d0, d0 \n"
+ "vrev64.8 d1, d1 \n"
+ "vrev64.8 d2, d2 \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"(-24) // %3
+ : "cc", "memory", "d0", "d1", "d2");
}
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -766,13 +1192,13 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d4, #255 \n" // Alpha
+ "vmov.u8 d4, #255 \n" // Alpha
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -781,15 +1207,31 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
);
}
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "vmov.u8 d0, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
// RGB24.
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -814,13 +1256,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -860,13 +1302,13 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -889,13 +1331,13 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // Alpha
+ "vmov.u8 d3, #255 \n" // Alpha
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -909,27 +1351,28 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
- // RGB24.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels.
+ "vst3.8 {d1, d3, d5}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -941,10 +1384,10 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -956,10 +1399,10 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -974,11 +1417,11 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -994,11 +1437,11 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1014,16 +1457,16 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
"+r"(dst_u), // %2
@@ -1041,16 +1484,16 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
"+r"(dst_u), // %2
@@ -1068,14 +1511,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1091,12 +1534,12 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1113,12 +1556,12 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1133,16 +1576,16 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+ : "cc", "memory", "q0", "q1", "q2", "d6");
}
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
@@ -1150,21 +1593,21 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "vdup.32 d2, %2 \n" // dither4
+ "vdup.32 d7, %2 \n" // dither4
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n" // add for dither
+ "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d7 \n"
+ "vqadd.u8 d2, d2, d7 \n"
+ "vqadd.u8 d4, d4, d7 \n" // add for dither
ARGBTORGB565
- "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
- "bgt 1b \n"
+ "vst1.8 {q2}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
- : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
@@ -1172,58 +1615,35 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
- "bgt 1b \n"
+ "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "vmov.u8 d4, #0x0f \n" // bits to clear with
+ "vmov.u8 d7, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
- "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
- "bgt 1b \n"
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
-}
-
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
@@ -1231,11 +1651,11 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
"+r"(width) // %2
@@ -1244,59 +1664,36 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
);
}
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
-}
-
// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
- "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
// coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
-
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
-
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
-
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+
+ "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
+ "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1312,13 +1709,11 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+ "vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \
+ "vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
@@ -1328,34 +1723,34 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1374,34 +1769,34 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
@@ -1413,40 +1808,132 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
);
}
+// Writes 8 U and 8 V bytes per 16 RGB24 pixels read from each of two rows (src_rgb24 and src_rgb24 + stride). TODO(fbarchard): Subsample match C code.
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24: %1 becomes the second-row pointer.
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient (halved; inputs below are 2x averages)
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5 in 8.8 fixed point (bias plus rounding)
+ "1: \n" // Loop head; the body always runs at least once.
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels of the first row.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels of the first row.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts (sum of horizontal pairs).
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 RGB24 pixels of the second row.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load next 8 RGB24 pixels of the second row.
+ "vpadal.u8 q0, q4 \n" // B: add second-row pair sums -> 2x2 block sums.
+ "vpadal.u8 q1, q5 \n" // G: add second-row pair sums.
+ "vpadal.u8 q2, q6 \n" // R: add second-row pair sums.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average: 4-pixel sum / 2 with rounding.
+ "vrshr.u16 q1, q1, #1 \n" // same for G.
+ "vrshr.u16 q2, q2, #1 \n" // same for R.
+
+ "subs %4, %4, #16 \n" // 16 processed per loop; sets the flags tested by bgt below.
+ RGBTOUV(q0, q1, q2) // arguments are (B, G, R); leaves U in d0 and V in d1.
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while width > 0.
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Writes 8 U and 8 V bytes per 16 RAW pixels read from each of two rows (src_raw and src_raw + stride). TODO(fbarchard): Subsample match C code.
+void RAWToUVJRow_NEON(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw: %1 becomes the second-row pointer.
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient (halved; inputs below are 2x averages)
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5 in 8.8 fixed point (bias plus rounding)
+ "1: \n" // Loop head; the body always runs at least once.
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels of the first row.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels of the first row.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts (q0 is passed as R to RGBTOUV below).
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts (q2 is passed as B to RGBTOUV below).
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 RAW pixels of the second row.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load next 8 RAW pixels of the second row.
+ "vpadal.u8 q0, q4 \n" // R: add second-row pair sums -> 2x2 block sums.
+ "vpadal.u8 q1, q5 \n" // G: add second-row pair sums.
+ "vpadal.u8 q2, q6 \n" // B: add second-row pair sums.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average: 4-pixel sum / 2 with rounding.
+ "vrshr.u16 q1, q1, #1 \n" // same for G.
+ "vrshr.u16 q2, q2, #1 \n" // same for B.
+
+ "subs %4, %4, #16 \n" // 16 processed per loop; sets the flags tested by bgt below.
+ RGBTOUV(q2, q1, q0) // arguments are (B, G, R): RAW has R first, so q2 = B and q0 = R.
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while width > 0.
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2
@@ -1464,34 +1951,34 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
@@ -1509,34 +1996,34 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
@@ -1554,34 +2041,34 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
@@ -1599,34 +2086,34 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_v,
int width) {
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
-
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
-
- "subs %4, %4, #16 \n" // 32 processed per loop.
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
@@ -1645,55 +2132,55 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_stride_rgb565), // %1
"+r"(dst_u), // %2
@@ -1711,55 +2198,55 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_stride_argb1555), // %1
"+r"(dst_u), // %2
@@ -1777,55 +2264,46 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_v,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
-
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
-
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q0, q4, #1 \n" // 2x average
+ "vrshr.u16 q1, q5, #1 \n"
+ "vrshr.u16 q2, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_stride_argb4444), // %1
"+r"(dst_u), // %2
@@ -1838,21 +2316,21 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1864,21 +2342,21 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1890,21 +2368,21 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1912,119 +2390,276 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
: "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vmov.u8 q1, q0 \n"
+ "vmov.u8 q3, q2 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ar64), // %1
"+r"(width) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
-void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
+ "vld1.8 {q4}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q2}, [%0]! \n"
+ "vtbl.8 d2, {d0, d1}, d8 \n"
+ "vtbl.8 d3, {d0, d1}, d9 \n"
+ "vtbl.8 d6, {d4, d5}, d8 \n"
+ "vtbl.8 d7, {d4, d5}, d9 \n"
+ "vmov.u8 q0, q1 \n"
+ "vmov.u8 q2, q3 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
+ "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vshrn.u16 d0, q0, #8 \n"
+ "vshrn.u16 d1, q1, #8 \n"
+ "vshrn.u16 d4, q2, #8 \n"
+ "vshrn.u16 d5, q3, #8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vld1.8 {d8}, [%3] \n" // shuffler
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n"
+ "vld1.16 {q1}, [%0]! \n"
+ "vld1.16 {q2}, [%0]! \n"
+ "vld1.16 {q3}, [%0]! \n"
+ "vtbl.8 d0, {d0, d1}, d8 \n"
+ "vtbl.8 d1, {d2, d3}, d8 \n"
+ "vtbl.8 d4, {d4, d5}, d8 \n"
+ "vtbl.8 d5, {d6, d7}, d8 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 4 pixels
+ "vst1.8 {q2}, [%1]! \n" // store 4 pixels
+ "bgt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPeg coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
+
+// ARGB expects first 3 values to contain RGB and 4th value is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d0, d20 \n" // B
+ "vmull.u8 q9, d1, d20 \n"
+ "vmlal.u8 q8, d2, d21 \n" // G
+ "vmlal.u8 q9, d3, d21 \n"
+ "vmlal.u8 q8, d4, d22 \n" // R
+ "vmlal.u8 q9, d5, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
+// Same code as ARGB, except the LD4
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
+ asm volatile(
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+ "vld1.8 {d0}, [%3] \n" // load rgbconstants
+ "vdup.u8 d20, d0[0] \n"
+ "vdup.u8 d21, d0[1] \n"
+ "vdup.u8 d22, d0[2] \n"
+ "vdup.u16 q12, d0[2] \n"
+ "1: \n"
+ "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
+ // RGB24.
+ "vld3.8 {d3, d5, d7}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vmull.u8 q8, d2, d20 \n" // B
+ "vmull.u8 q9, d3, d20 \n"
+ "vmlal.u8 q8, d4, d21 \n" // G
+ "vmlal.u8 q9, d5, d21 \n"
+ "vmlal.u8 q8, d6, d22 \n" // R
+ "vmlal.u8 q9, d7, d22 \n"
+ "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
+ "vaddhn.u16 d1, q9, q12 \n"
+ "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
+ "q12");
+}
+
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
}
// Bilinear filter 16x2 -> 16x1
@@ -2035,46 +2670,46 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2086,59 +2721,119 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
+
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+
+ "vdup.16 d17, %4 \n"
+ "vdup.16 d16, %5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vmull.u16 q2, d0, d16 \n"
+ "vmull.u16 q3, d1, d16 \n"
+ "vmlal.u16 q2, d2, d17 \n"
+ "vmlal.u16 q3, d3, d17 \n"
+ "vrshrn.u32 d0, q2, #8 \n"
+ "vrshrn.u32 d1, q3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "vld1.16 {q1}, [%2]! \n"
+ "subs %3, %3, #8 \n"
+ "vrhadd.u16 q0, q1 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.16 {q0}, [%1]! \n"
+ "subs %3, %3, #8 \n"
+ "vst1.16 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8");
+}
+
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, #8 \n"
+ "blt 89f \n"
// Blend 8 pixels.
"8: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
// Blend 1 pixels.
"1: \n"
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
-
- "99: \n"
-
- : "+r"(src_argb0), // %0
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2153,16 +2848,16 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2178,32 +2873,32 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
"vqdmulh.s16 q1, q1, q8 \n" // g
"vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2220,28 +2915,28 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2251,23 +2946,23 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
+ "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
+ "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2281,32 +2976,32 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- "1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2322,51 +3017,51 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
-
- "1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2376,27 +3071,27 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2405,21 +3100,21 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
+void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2428,21 +3123,21 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb0), // %0
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2460,17 +3155,17 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2487,12 +3182,12 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2511,15 +3206,15 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "vmov.u8 d3, #255 \n" // alpha
+ "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2539,23 +3234,23 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%5 \n" // top
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2577,23 +3272,23 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "vld1.8 {d0}, [%0],%4 \n" // left
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vld1.8 {d2}, [%0],%5 \n" // right
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2615,18 +3310,18 @@ void HalfFloat1Row_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2641,18 +3336,18 @@ void HalfFloatRow_NEON(const uint16_t* src,
asm volatile(
"1: \n"
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // adjust exponent
- "vmul.f32 q3, q3, %y3 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2667,17 +3362,17 @@ void ByteToFloatRow_NEON(const uint8_t* src,
asm volatile(
"1: \n"
- "vld1.8 {d2}, [%0]! \n" // load 8 bytes
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u8 q1, d2 \n" // 8 shorts
- "vmovl.u16 q2, d2 \n" // 8 ints
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, %y3 \n" // scale
- "vmul.f32 q3, q3, %y3 \n"
- "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
- "bgt 1b \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2694,26 +3389,26 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "vmov.u16 d6, #4 \n" // constant 4
- "vmov.u16 d7, #6 \n" // constant 6
-
- "1: \n"
- "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
- "vld1.16 {q2}, [%4]! \n"
- "vaddl.u16 q0, d2, d4 \n" // * 1
- "vaddl.u16 q1, d3, d5 \n" // * 1
- "vld1.16 {q2}, [%1]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "vld1.16 {q2}, [%2]! \n"
- "vmlal.u16 q0, d4, d7 \n" // * 6
- "vmlal.u16 q1, d5, d7 \n" // * 6
- "vld1.16 {q2}, [%3]! \n"
- "vmlal.u16 q0, d4, d6 \n" // * 4
- "vmlal.u16 q1, d5, d6 \n" // * 4
- "subs %6, %6, #8 \n" // 8 processed per loop
- "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
- "bgt 1b \n"
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2731,8 +3426,8 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "vmov.u32 q10, #4 \n" // constant 4
- "vmov.u32 q11, #6 \n" // constant 6
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
@@ -2769,17 +3464,17 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
uint8_t* dst_yuv24,
int width) {
asm volatile(
- "1: \n"
- "vld1.8 {q2}, [%0]! \n" // load 16 Y values
- "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
- "vmov d1, d0 \n"
- "vzip.u8 d0, d1 \n" // VV
- "vmov d3, d2 \n"
- "vzip.u8 d2, d3 \n" // UU
- "subs %3, %3, #16 \n" // 16 pixels per loop
- "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
- "vst3.8 {d1, d3, d5}, [%2]! \n"
- "bgt 1b \n"
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -2793,24 +3488,24 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_uv,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // V 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // U 16 bytes -> 8 shorts.
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d0, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_uv), // %2
@@ -2824,24 +3519,24 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
uint8_t* dst_vu,
int width) {
asm volatile(
- "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
- "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
// pixels.
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
// pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // V 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // U 16 bytes -> 8 shorts.
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d1, q1, #2 \n"
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
- "bgt 1b \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_vu), // %2
@@ -2854,12 +3549,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
// Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
- "bgt 1b \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2867,16 +3562,16 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "q0", "q1", "q2", "q3");
}
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
- "1: \n"
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
- "vld2.8 {d1, d3}, [%0]! \n"
- "vorr.u8 q2, q0, q0 \n" // move U after V
- "subs %2, %2, #16 \n" // 16 pixels per loop
- "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
- "bgt 1b \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
@@ -2884,6 +3579,170 @@ void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
: "cc", "memory", "q0", "q1", "q2");
}
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) { // Averages 2x2 blocks of U and of V, then stores them interleaved as UV.
+ const uint8_t* src_u_1 = src_u + src_stride_u; // second U row
+ const uint8_t* src_v_1 = src_v + src_stride_v; // second V row
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 U values
+ "vld1.8 {q1}, [%2]! \n" // load 16 V values
+ "vld1.8 {q2}, [%1]! \n" // load 16 U values from the second row
+ "vld1.8 {q3}, [%3]! \n" // load 16 V values from the second row
+ "vpaddl.u8 q0, q0 \n" // half size: add horizontal pairs, 16 bytes -> 8 shorts
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // accumulate horizontal pairs of the second row
+ "vpadal.u8 q1, q3 \n"
+ "vqrshrn.u16 d0, q0, #2 \n" // rounded sum of 4 / 4, narrowed to 8 bits (U)
+ "vqrshrn.u16 d1, q1, #2 \n" // same for V
+ "subs %5, %5, #16 \n" // 16 src pixels per loop
+ "vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+void SplitUVRow_16_NEON(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) { // Deinterleaves 16 bit UV into separate U and V rows, shifting each value by (depth - 16).
+ int shift = depth - 16; // Negative for right shift.
+ asm volatile(
+ "vdup.16 q2, %4 \n" // broadcast shift to all 16 bit lanes
+ "1: \n"
+ "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV, deinterleaved: q0 = U, q1 = V
+ "vshl.u16 q0, q0, q2 \n" // shift U; a negative count shifts right
+ "vshl.u16 q1, q1, q2 \n" // shift V
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst1.16 {q0}, [%1]! \n" // store 8 U pixels
+ "vst1.16 {q1}, [%2]! \n" // store 8 V pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); // NOTE(review): q3 and q4 are not referenced by the asm above
+}
+
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int depth,
+ int width) { // Interleaves 16 bit U and V rows into UV, shifting each value left by (16 - depth).
+ int shift = 16 - depth; // left shift count applied to every sample
+ asm volatile(
+ "vdup.16 q2, %4 \n" // broadcast shift to all 16 bit lanes
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // load 8 U
+ "vld1.16 {q1}, [%1]! \n" // load 8 V
+ "vshl.u16 q0, q0, q2 \n" // shift U
+ "vshl.u16 q1, q1, q2 \n" // shift V
+ "subs %3, %3, #8 \n" // 8 src pixels per loop
+ "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void MultiplyRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) { // Multiplies each 16 bit sample by scale, 16 samples per loop.
+ asm volatile(
+ "vdup.16 q2, %3 \n" // broadcast scale to all 16 bit lanes
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // load 8 pixels
+ "vld1.16 {q1}, [%0]! \n" // load next 8 pixels
+ "vmul.u16 q0, q0, q2 \n" // multiply by scale (16 bit result per lane)
+ "vmul.u16 q1, q1, q2 \n"
+ "vst1.16 {q0}, [%1]! \n" // store 8 pixels
+ "vst1.16 {q1}, [%1]! \n" // store next 8 pixels
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void DivideRow_16_NEON(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) { // Widens 16 samples to 32 bits, shifts them by a count taken from scale, and narrows back.
+ asm volatile(
+ "vdup.16 q0, %3 \n" // broadcast scale to all 16 bit lanes
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 pixels
+ "vld1.16 {q2}, [%0]! \n" // load next 8 pixels
+ "vmovl.u16 q3, d2 \n" // widen pixels 0-3 to 32 bits
+ "vmovl.u16 q1, d3 \n" // widen pixels 4-7 (in place over q1)
+ "vmovl.u16 q4, d4 \n" // widen pixels 8-11
+ "vmovl.u16 q2, d5 \n" // widen pixels 12-15 (in place over q2)
+ "vshl.u32 q3, q3, q0 \n" // NOTE(review): vshl reads a per-lane shift count from q0, so scale acts as a shift amount here - confirm intended
+ "vshl.u32 q4, q4, q0 \n"
+ "vshl.u32 q1, q1, q0 \n"
+ "vshl.u32 q2, q2, q0 \n"
+ "vmovn.u32 d2, q3 \n" // narrow pixels 0-3 back to 16 bits
+ "vmovn.u32 d3, q1 \n" // NOTE(review): d2 is the low half of q1 and was just overwritten - verify pixels 4-5
+ "vmovn.u32 d4, q4 \n" // narrow pixels 8-11
+ "vmovn.u32 d5, q2 \n" // NOTE(review): d4 is the low half of q2 and was just overwritten - verify pixels 12-13
+ "vst1.16 {q1}, [%1]! \n" // store 8 pixels
+ "vst1.16 {q2}, [%1]! \n" // store next 8 pixels
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+ asm volatile(
+ "vdup.16 q2, %3 \n" // broadcast shift to all 16 bit lanes
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // load 8 pixels
+ "vld1.16 {q1}, [%0]! \n" // load next 8 pixels
+ "vshl.u16 q0, q0, q2 \n" // shr = q2 is negative
+ "vshl.u16 q1, q1, q2 \n"
+ "vqmovn.u16 d0, q0 \n" // saturating narrow to 8 bits
+ "vqmovn.u16 d1, q1 \n"
+ "subs %2, %2, #16 \n" // 16 src pixels per loop
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
index f5cbb470..0f120373 100644
--- a/files/source/row_neon64.cc
+++ b/files/source/row_neon64.cc
@@ -15,102 +15,108 @@ namespace libyuv {
extern "C" {
#endif
+// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
+// STn over ZIP1+ST1
+// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
+
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+// v0.8h: Y
+// v1.16b: 8U, 8V
+
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.s}[0], [%1], #4 \n" \
- "ld1 {v1.s}[1], [%2], #4 \n"
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.s}[0], [%[src_u]], #4 \n" \
+ "ld1 {v1.s}[1], [%[src_v]], #4 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v1.16b, v1.16b, v1.16b \n" \
+ "prfm pldl1keep, [%[src_u], 128] \n" \
+ "prfm pldl1keep, [%[src_v], 128] \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v1.d}[0], [%1], #8 \n" \
- "ld1 {v1.d}[1], [%2], #8 \n" \
- "uaddlp v1.8h, v1.16b \n" \
- "rshrn v1.8b, v1.8h, #1 \n"
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ld1 {v1.d}[0], [%[src_u]], #8 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "ld1 {v1.d}[1], [%[src_v]], #8 \n" \
+ "prfm pldl1keep, [%[src_u], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_v], 448] \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "movi v1.8b , #128 \n"
+ "ldr d0, [%[src_y]], #8 \n" \
+ "movi v1.16b, #128 \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n"
-// Read 8 Y and 4 UV from NV12
+static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
+ 1, 1, 3, 3, 5, 5, 7, 7};
+static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
+ 0, 0, 2, 2, 4, 4, 6, 6};
+
+// Read 8 Y and 4 UV from NV12 or NV21
#define READNV12 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- "ld1 {v0.8b}, [%0], #8 \n" \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v3.8b, v2.8b, v2.8b \n" \
- "uzp2 v1.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
+ "ldr d0, [%[src_y]], #8 \n" \
+ "ldr d1, [%[src_uv]], #8 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_y], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n" \
+ "prfm pldl1keep, [%[src_uv], 448] \n"
// Read 8 YUY2
-#define READYUY2 \
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
- "uzp2 v3.8b, v1.8b, v1.8b \n" \
- "uzp1 v1.8b, v1.8b, v1.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
+#define READYUY2 \
+ "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
+ "zip1 v0.16b, v0.16b, v0.16b \n" \
+ "prfm pldl1keep, [%[src_yuy2], 448] \n" \
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
// Read 8 UYVY
-#define READUYVY \
- "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
- "orr v0.8b, v3.8b, v3.8b \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
-
-#define YUVTORGB_SETUP \
- "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
- "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
- "ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
-
-#define YUVTORGB(vR, vG, vB) \
- "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
- "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
- "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
- "ushll v0.4s, v0.4h, #0 \n" \
- "mul v3.4s, v3.4s, v31.4s \n" \
- "mul v0.4s, v0.4s, v31.4s \n" \
- "sqshrun v0.4h, v0.4s, #16 \n" \
- "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
- "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
- "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
- "uxtl v2.8h, v2.8b \n" \
- "uxtl v1.8h, v1.8b \n" /* Extract U */ \
- "mul v3.8h, v1.8h, v27.8h \n" \
- "mul v5.8h, v1.8h, v29.8h \n" \
- "mul v6.8h, v2.8h, v30.8h \n" \
- "mul v7.8h, v2.8h, v28.8h \n" \
- "sqadd v6.8h, v6.8h, v5.8h \n" \
- "sqadd " #vB \
- ".8h, v24.8h, v0.8h \n" /* B */ \
- "sqadd " #vG \
- ".8h, v25.8h, v0.8h \n" /* G */ \
- "sqadd " #vR \
- ".8h, v26.8h, v0.8h \n" /* R */ \
- "sqadd " #vB ".8h, " #vB \
- ".8h, v3.8h \n" /* B */ \
- "sqsub " #vG ".8h, " #vG \
- ".8h, v6.8h \n" /* G */ \
- "sqadd " #vR ".8h, " #vR \
- ".8h, v7.8h \n" /* R */ \
- "sqshrun " #vB ".8b, " #vB \
- ".8h, #6 \n" /* B */ \
- "sqshrun " #vG ".8b, " #vG \
- ".8h, #6 \n" /* G */ \
- "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
+#define READUYVY \
+ "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
+ "zip1 v0.16b, v4.16b, v4.16b \n" \
+ "prfm pldl1keep, [%[src_uyvy], 448] \n" \
+ "tbl v1.16b, {v3.16b}, v2.16b \n"
+
+// UB VR UG VG
+// YG BB BG BR
+#define YUVTORGB_SETUP \
+ "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
+ "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"
+
+// v16.8h: B
+// v17.8h: G
+// v18.8h: R
+
+// Convert from YUV to 2.14 fixed point RGB
+#define YUVTORGB \
+ "umull2 v3.4s, v0.8h, v24.8h \n" \
+ "umull v6.8h, v1.8b, v30.8b \n" \
+ "umull v0.4s, v0.4h, v24.4h \n" \
+ "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
+ "uqshrn v0.4h, v0.4s, #16 \n" \
+ "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
+ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
+ "add v17.8h, v0.8h, v26.8h \n" /* G */ \
+ "add v16.8h, v0.8h, v4.8h \n" /* B */ \
+ "add v18.8h, v0.8h, v5.8h \n" /* R */ \
+ "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
+ "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
+ "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
+
+// Convert from 2.14 fixed point RGB To 8 bit RGB
+#define RGBTORGB8 \
+ "uqshrn v17.8b, v17.8h, #6 \n" \
+ "uqshrn v16.8b, v16.8h, #6 \n" \
+ "uqshrn v18.8b, v18.8h, #6 \n"
+
+#define YUVTORGB_REGS \
+ "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
+ "v26", "v27", "v28", "v29", "v30", "v31"
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -118,27 +124,22 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV444
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV444 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422ToARGBRow_NEON(const uint8_t* src_y,
@@ -147,27 +148,48 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) { // Converts 8 pixels per loop from Y, U, V and alpha rows to interleaved ARGB.
+ asm volatile(
+ YUVTORGB_SETUP // load the conversion coefficients into v24-v31
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 // load 8 alpha values, then 8 Y, 8 U and 8 V
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 // prefetch alpha; produce B, G, R in v16, v17, v18
+ "subs %w[width], %w[width], #8 \n" // 8 pixels per loop
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" // store 8 pixels as B, G, R, A bytes
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
@@ -177,28 +199,23 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n"
+ "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
+ "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [src_a] "+r"(src_a), // %[src_a]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
}
void I422ToRGBARow_NEON(const uint8_t* src_y,
@@ -207,27 +224,22 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
- "1: \n"
- READYUV422
- YUVTORGB(v23, v22, v21)
- "subs %w4, %w4, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v15.8b, #255 \n" /* A */
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v15");
}
void I422ToRGB24Row_NEON(const uint8_t* src_y,
@@ -236,34 +248,29 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
#define ARGBTORGB565 \
- "shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v21.8h, #5 \n" /* RG */ \
- "sri v0.8h, v20.8h, #11 \n" /* RGB */
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v18.8h, v17.8h, #5 \n" /* RG */ \
+ "sri v18.8h, v16.8h, #11 \n" /* RGB */
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -273,33 +280,28 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS);
}
#define ARGBTOARGB1555 \
- "shll v0.8h, v23.8b, #8 \n" /* A */ \
- "shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
- "shll v20.8h, v20.8b, #8 \n" /* B */ \
- "sri v0.8h, v22.8h, #1 \n" /* AR */ \
- "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
- "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+ "shll v0.8h, v19.8b, #8 \n" /* A */ \
+ "shll v18.8h, v18.8b, #8 \n" /* R */ \
+ "shll v17.8h, v17.8b, #8 \n" /* G */ \
+ "shll v16.8h, v16.8b, #8 \n" /* B */ \
+ "sri v0.8h, v18.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v17.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v16.8h, #11 \n" /* ARGB */
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -309,34 +311,31 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n" READYUV422 YUVTORGB(
- v22, v21,
- v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
+ "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
+ // ARGB1555.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
}
#define ARGBTOARGB4444 \
- /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
- "ushr v20.8b, v20.8b, #4 \n" /* B */ \
- "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
- "ushr v22.8b, v22.8b, #4 \n" /* R */ \
- "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
- "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
- "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
+ "ushr v16.8b, v16.8b, #4 \n" /* B */ \
+ "bic v17.8b, v17.8b, v23.8b \n" /* G */ \
+ "ushr v18.8b, v18.8b, #4 \n" /* R */ \
+ "bic v19.8b, v19.8b, v23.8b \n" /* A */ \
+ "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
+ "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
@@ -345,95 +344,109 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- "movi v23.8b, #255 \n"
- ARGBTOARGB4444
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.16b, #0x0f \n" // bits to clear with
+ // bic.
+ "1: \n" READYUV422 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "movi v19.8b, #255 \n" ARGBTOARGB4444
+ "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
+ // pixels
+ // ARGB4444.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_u] "+r"(src_u), // %[src_u]
+ [src_v] "+r"(src_v), // %[src_v]
+ [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
}
-void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUV400
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+void I400ToARGBRow_NEON(const uint8_t* src_y,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
+ : "cc", "memory", YUVTORGB_REGS, "v19");
}
+#if LIBYUV_USE_ST4
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v23.8b, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v20", "v21", "v22", "v23");
}
+#else
+// Expands 8 grey bytes per iteration into 8 four-byte pixels (Y, Y, Y, 255).
+// This variant builds the pixels with zips and a paired store rather than st4.
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi        v20.8b, #255                      \n"  // alpha bytes
+      "1:                                            \n"
+      "ldr         d16, [%[src_y]], #8               \n"  // load 8 Y
+      "subs        %w[width], %w[width], #8          \n"  // 8 pixels per loop
+      "zip1        v18.16b, v16.16b, v16.16b         \n"  // YY
+      "zip1        v19.16b, v16.16b, v20.16b         \n"  // YA
+      "prfm        pldl1keep, [%[src_y], 448]        \n"
+      "zip1        v16.16b, v18.16b, v19.16b         \n"  // YYYA, pixels 0..3
+      "zip2        v17.16b, v18.16b, v19.16b         \n"  // YYYA, pixels 4..7
+      "stp         q16, q17, [%[dst_argb]], #32      \n"  // store 8 pixels
+      "b.gt        1b                                \n"
+      : [src_y] "+r"(src_y),        // %[src_y]
+        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
+        [width] "+r"(width)         // %[width]
+      :
+      : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
+}
+#endif // LIBYUV_USE_ST4
void NV12ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
void NV21ToARGBRow_NEON(const uint8_t* src_y,
@@ -441,26 +454,22 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
@@ -468,25 +477,21 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV12
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
}
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
@@ -494,25 +499,21 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV21
- YUVTORGB(v22, v21, v20)
- "subs %w3, %w3, #8 \n"
- "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_rgb24), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_vu), // %[src_uv]
+ [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV21Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
}
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
@@ -522,72 +523,63 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
- "1: \n" READNV12 YUVTORGB(
- v22, v21,
- v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
- // RGB565.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB] "r"(&yuvconstants->kUVToRB),
- [kUVToG] "r"(&yuvconstants->kUVToG),
- [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb] "r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READNV12 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n" ARGBTORGB565
+ "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
+ // pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : [src_y] "+r"(src_y), // %[src_y]
+ [src_uv] "+r"(src_uv), // %[src_uv]
+ [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2");
}
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUY2
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READYUY2 YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READUYVY
- YUVTORGB(v22, v21, v20)
- "subs %w2, %w2, #8 \n"
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v19.8b, #255 \n"
+ "ldr q2, [%[kNV12Table]] \n"
+ "1: \n" READUYVY YUVTORGB RGBTORGB8
+ "subs %w[width], %w[width], #8 \n"
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
+ "b.gt 1b \n"
+ : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
+ [dst_argb] "+r"(dst_argb), // %[dst_argb]
+ [width] "+r"(width) // %[width]
+ : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
+ [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
+ [kNV12Table] "r"(&kNV12Table)
+ : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
@@ -597,11 +589,12 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store U
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -611,6 +604,53 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
);
}
+// Copies 16 bytes per iteration from src to dst, advancing src by
+// src_tile_stride bytes and dst by 16 bytes each time.
+// MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
+// MM21 UV tiles are 8x16, so src_tile_stride = 256 bytes.
+// width is measured in bytes, so 8 UV = 16.
+void DetileRow_NEON(const uint8_t* src,
+                    ptrdiff_t src_tile_stride,
+                    uint8_t* dst,
+                    int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld1         {v0.16b}, [%[src]], %[stride]     \n"  // load 16 bytes
+      "subs        %w[width], %w[width], #16         \n"  // 16 per loop
+      "prfm        pldl1keep, [%[src], 1792]         \n"  // 7 tiles of 256b ahead
+      "st1         {v0.16b}, [%[dst]], #16           \n"  // store 16 bytes
+      "b.gt        1b                                \n"
+      : [src] "+r"(src),                 // %[src]
+        [dst] "+r"(dst),                 // %[dst]
+        [width] "+r"(width)              // %[width]
+      : [stride] "r"(src_tile_stride)    // %[stride]
+      : "cc", "memory", "v0"             // Clobber List
+  );
+}
+
+// Per iteration: loads 8 interleaved UV pairs (16 bytes), writes the 8 even
+// bytes to dst_u and the 8 odd bytes to dst_v, then advances src_uv by
+// src_tile_stride bytes.
+void DetileSplitUVRow_NEON(const uint8_t* src_uv,
+                           ptrdiff_t src_tile_stride,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld2         {v0.8b,v1.8b}, [%[src_uv]], %[stride] \n"  // split U / V
+      "subs        %w[width], %w[width], #16         \n"  // 16 bytes per loop
+      "prfm        pldl1keep, [%[src_uv], 1792]      \n"
+      "st1         {v0.8b}, [%[dst_u]], #8           \n"  // store 8 U
+      "st1         {v1.8b}, [%[dst_v]], #8           \n"  // store 8 V
+      "b.gt        1b                                \n"
+      : [src_uv] "+r"(src_uv),           // %[src_uv]
+        [dst_u] "+r"(dst_u),             // %[dst_u]
+        [dst_v] "+r"(dst_v),             // %[dst_v]
+        [width] "+r"(width)              // %[width]
+      : [stride] "r"(src_tile_stride)    // %[stride]
+      : "cc", "memory", "v0", "v1"       // Clobber List
+  );
+}
+
+#if LIBYUV_USE_ST2
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
@@ -618,11 +658,13 @@ void MergeUVRow_NEON(const uint8_t* src_u,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
@@ -632,6 +674,86 @@ void MergeUVRow_NEON(const uint8_t* src_u,
);
}
+// Interleaves 8 U and 8 V 16-bit samples per iteration into 8 UV pairs using
+// st2. Each sample is shifted by (16 - depth) bits with ushl, which is a left
+// shift when that amount is positive.
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int depth,
+                        int width) {
+  int shift_amount = 16 - depth;
+  asm volatile(
+      "dup         v2.8h, %w[shift]                  \n"  // per-lane shift
+      "1:                                            \n"
+      "ld1         {v0.8h}, [%[src_u]], #16          \n"  // load 8 U
+      "subs        %w[width], %w[width], #8          \n"  // 8 src pixels per loop
+      "ld1         {v1.8h}, [%[src_v]], #16          \n"  // load 8 V
+      "ushl        v0.8h, v0.8h, v2.8h               \n"
+      "prfm        pldl1keep, [%[src_u], 448]        \n"
+      "ushl        v1.8h, v1.8h, v2.8h               \n"
+      "prfm        pldl1keep, [%[src_v], 448]        \n"
+      "st2         {v0.8h, v1.8h}, [%[dst_uv]], #32  \n"  // store 8 UV pixels
+      "b.gt        1b                                \n"
+      : [src_u] "+r"(src_u),         // %[src_u]
+        [src_v] "+r"(src_v),         // %[src_v]
+        [dst_uv] "+r"(dst_uv),       // %[dst_uv]
+        [width] "+r"(width)          // %[width]
+      : [shift] "r"(shift_amount)    // %[shift]
+      : "cc", "memory", "v0", "v1", "v2");
+}
+#else
+// Interleaves 16 U bytes and 16 V bytes per iteration into 16 UV pairs.
+// This variant uses zip1/zip2 followed by st1 rather than st2.
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld1         {v0.16b}, [%[src_u]], #16         \n"  // load U
+      "ld1         {v1.16b}, [%[src_v]], #16         \n"  // load V
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "zip1        v2.16b, v0.16b, v1.16b            \n"  // UV pairs 0..7
+      "prfm        pldl1keep, [%[src_u], 448]        \n"
+      "zip2        v3.16b, v0.16b, v1.16b            \n"  // UV pairs 8..15
+      "prfm        pldl1keep, [%[src_v], 448]        \n"
+      "st1         {v2.16b,v3.16b}, [%[dst_uv]], #32 \n"  // store 16 pairs of UV
+      "b.gt        1b                                \n"
+      : [src_u] "+r"(src_u),      // %[src_u]
+        [src_v] "+r"(src_v),      // %[src_v]
+        [dst_uv] "+r"(dst_uv),    // %[dst_uv]
+        [width] "+r"(width)       // %[width]
+      :                           // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Interleaves 8 U and 8 V 16-bit samples per iteration into 8 UV pairs.
+// Each sample is shifted by (16 - depth) bits with ushl, which is a left
+// shift when that amount is positive. This variant uses zip1/zip2 followed by
+// st1 rather than st2. Every clobbered vector register is listed once.
+void MergeUVRow_16_NEON(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int depth,
+                        int width) {
+  int shift = 16 - depth;
+  asm volatile(
+      "dup         v4.8h, %w4                        \n"  // per-lane shift
+      "1:                                            \n"
+      "ld1         {v0.8h}, [%0], #16                \n"  // load 8 U
+      "subs        %w3, %w3, #8                      \n"  // 8 src pixels per loop
+      "ld1         {v1.8h}, [%1], #16                \n"  // load 8 V
+      "ushl        v0.8h, v0.8h, v4.8h               \n"
+      "ushl        v1.8h, v1.8h, v4.8h               \n"
+      "prfm        pldl1keep, [%0, 448]              \n"
+      "zip1        v2.8h, v0.8h, v1.8h               \n"  // UV pairs 0..3
+      "zip2        v3.8h, v0.8h, v1.8h               \n"  // UV pairs 4..7
+      "prfm        pldl1keep, [%1, 448]              \n"
+      "st1         {v2.8h, v3.8h}, [%2], #32         \n"  // store 8 UV pixels
+      "b.gt        1b                                \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#endif // LIBYUV_USE_ST2
+
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -640,12 +762,13 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
int width) {
asm volatile(
"1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st1 {v0.16b}, [%1], #16 \n" // store R
- "st1 {v1.16b}, [%2], #16 \n" // store G
- "st1 {v2.16b}, [%3], #16 \n" // store B
- "b.gt 1b \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@@ -664,12 +787,15 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load R
- "ld1 {v1.16b}, [%1], #16 \n" // load G
- "ld1 {v2.16b}, [%2], #16 \n" // load B
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@@ -680,14 +806,403 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
+// Deinterleaves 16 packed 4-byte pixels per iteration: byte 0 of each pixel
+// goes to dst_b, byte 1 to dst_g, byte 2 to dst_r and byte 3 to dst_a.
+void SplitARGBRow_NEON(const uint8_t* src_rgba,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       uint8_t* dst_a,
+                       int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_rgba]], #64 \n"  // load 16 ARGB
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%[src_rgba], 448]     \n"
+      "st1         {v0.16b}, [%[dst_b]], #16         \n"  // store B
+      "st1         {v1.16b}, [%[dst_g]], #16         \n"  // store G
+      "st1         {v2.16b}, [%[dst_r]], #16         \n"  // store R
+      "st1         {v3.16b}, [%[dst_a]], #16         \n"  // store A
+      "b.gt        1b                                \n"
+      : [src_rgba] "+r"(src_rgba),  // %[src_rgba]
+        [dst_r] "+r"(dst_r),        // %[dst_r]
+        [dst_g] "+r"(dst_g),        // %[dst_g]
+        [dst_b] "+r"(dst_b),        // %[dst_b]
+        [dst_a] "+r"(dst_a),        // %[dst_a]
+        [width] "+r"(width)         // %[width]
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+#if LIBYUV_USE_ST4
+// Interleaves 16 planar R, G, B and A bytes per iteration into 16 packed
+// pixels, stored by st4 in B, G, R, A byte order.
+void MergeARGBRow_NEON(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       const uint8_t* src_a,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld1         {v0.16b}, [%[src_b]], #16         \n"  // load B
+      "ld1         {v1.16b}, [%[src_g]], #16         \n"  // load G
+      "ld1         {v2.16b}, [%[src_r]], #16         \n"  // load R
+      "ld1         {v3.16b}, [%[src_a]], #16         \n"  // load A
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%[src_r], 448]        \n"
+      "prfm        pldl1keep, [%[src_g], 448]        \n"
+      "prfm        pldl1keep, [%[src_b], 448]        \n"
+      "prfm        pldl1keep, [%[src_a], 448]        \n"
+      "st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[dst_argb]], #64 \n"  // store 16ARGB
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),        // %[src_r]
+        [src_g] "+r"(src_g),        // %[src_g]
+        [src_b] "+r"(src_b),        // %[src_b]
+        [src_a] "+r"(src_a),        // %[src_a]
+        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
+        [width] "+r"(width)         // %[width]
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+#else
+// Interleaves 16 planar R, G, B and A bytes per iteration into 16 packed
+// pixels in B, G, R, A byte order. This variant interleaves with two rounds of
+// zips (bytes, then halfwords) and a plain st1 rather than st4.
+void MergeARGBRow_NEON(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       const uint8_t* src_a,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld1         {v0.16b}, [%[src_b]], #16         \n"  // load B
+      "ld1         {v1.16b}, [%[src_g]], #16         \n"  // load G
+      "ld1         {v2.16b}, [%[src_r]], #16         \n"  // load R
+      "ld1         {v3.16b}, [%[src_a]], #16         \n"  // load A
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%[src_b], 448]        \n"
+      "zip1        v4.16b, v0.16b, v1.16b            \n"  // BG, pixels 0..7
+      "zip1        v5.16b, v2.16b, v3.16b            \n"  // RA, pixels 0..7
+      "prfm        pldl1keep, [%[src_g], 448]        \n"
+      "zip2        v6.16b, v0.16b, v1.16b            \n"  // BG, pixels 8..15
+      "zip2        v7.16b, v2.16b, v3.16b            \n"  // RA, pixels 8..15
+      "prfm        pldl1keep, [%[src_r], 448]        \n"
+      "zip1        v0.8h, v4.8h, v5.8h               \n"  // BGRA
+      "zip2        v1.8h, v4.8h, v5.8h               \n"
+      "prfm        pldl1keep, [%[src_a], 448]        \n"
+      "zip1        v2.8h, v6.8h, v7.8h               \n"
+      "zip2        v3.8h, v6.8h, v7.8h               \n"
+      "st1         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[dst_argb]], #64 \n"  // store 16ARGB
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),        // %[src_r]
+        [src_g] "+r"(src_g),        // %[src_g]
+        [src_b] "+r"(src_b),        // %[src_b]
+        [src_a] "+r"(src_a),        // %[src_a]
+        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
+        [width] "+r"(width)         // %[width]
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+  );
+}
+#endif // LIBYUV_USE_ST4
+
+// Deinterleaves 16 packed 4-byte pixels per iteration: byte 0 of each pixel
+// goes to dst_b, byte 1 to dst_g and byte 2 to dst_r. Byte 3 is loaded but
+// not stored.
+void SplitXRGBRow_NEON(const uint8_t* src_rgba,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width) {
+  asm volatile(
+      "1:                                            \n"
+      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_rgba]], #64 \n"  // load 16 ARGB
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%[src_rgba], 448]     \n"
+      "st1         {v0.16b}, [%[dst_b]], #16         \n"  // store B
+      "st1         {v1.16b}, [%[dst_g]], #16         \n"  // store G
+      "st1         {v2.16b}, [%[dst_r]], #16         \n"  // store R
+      "b.gt        1b                                \n"
+      : [src_rgba] "+r"(src_rgba),  // %[src_rgba]
+        [dst_r] "+r"(dst_r),        // %[dst_r]
+        [dst_g] "+r"(dst_g),        // %[dst_g]
+        [dst_b] "+r"(dst_b),        // %[dst_b]
+        [width] "+r"(width)         // %[width]
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Interleaves 16 planar R, G and B bytes per iteration into 16 packed pixels
+// in B, G, R, A byte order, with the fourth byte fixed at 255.
+void MergeXRGBRow_NEON(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "movi        v3.16b, #255                      \n"  // load A(255)
+      "1:                                            \n"
+      "ld1         {v2.16b}, [%[src_r]], #16         \n"  // load R
+      "ld1         {v1.16b}, [%[src_g]], #16         \n"  // load G
+      "ld1         {v0.16b}, [%[src_b]], #16         \n"  // load B
+      "subs        %w[width], %w[width], #16         \n"  // 16 processed per loop
+      "prfm        pldl1keep, [%[src_r], 448]        \n"
+      "prfm        pldl1keep, [%[src_g], 448]        \n"
+      "prfm        pldl1keep, [%[src_b], 448]        \n"
+      "st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[dst_argb]], #64 \n"  // store 16ARGB
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),        // %[src_r]
+        [src_g] "+r"(src_g),        // %[src_g]
+        [src_b] "+r"(src_b),        // %[src_b]
+        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
+        [width] "+r"(width)         // %[width]
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Packs 4 pixels per iteration of planar 16-bit R, G, B into 32-bit words:
+// B in bits 0..9, G in bits 10..19, R in bits 20..29, and 0xc0 OR-ed into the
+// top byte. Each sample is first shifted by (10 - depth) bits with ushl (a
+// right shift when that amount is negative) and then clamped to 1023.
+void MergeXR30Row_NEON(const uint16_t* src_r,
+                       const uint16_t* src_g,
+                       const uint16_t* src_b,
+                       uint8_t* dst_ar30,
+                       int depth,
+                       int width) {
+  int shift_amount = 10 - depth;
+  asm volatile(
+      "movi        v30.16b, #255                     \n"
+      "ushr        v30.4s, v30.4s, #22               \n"  // 1023
+      "dup         v31.4s, %w[shift]                 \n"
+      "1:                                            \n"
+      "ldr         d2, [%[src_b]], #8                \n"  // B
+      "ldr         d1, [%[src_g]], #8                \n"  // G
+      "ldr         d0, [%[src_r]], #8                \n"  // R
+      "ushll       v2.4s, v2.4h, #0                  \n"  // widen B to 32 bit
+      "ushll       v1.4s, v1.4h, #0                  \n"  // widen G
+      "ushll       v0.4s, v0.4h, #0                  \n"  // widen R
+      "ushl        v2.4s, v2.4s, v31.4s              \n"  // 000B
+      "ushl        v1.4s, v1.4s, v31.4s              \n"  // G
+      "ushl        v0.4s, v0.4s, v31.4s              \n"  // R
+      "umin        v2.4s, v2.4s, v30.4s              \n"  // clamp to 1023
+      "umin        v1.4s, v1.4s, v30.4s              \n"
+      "umin        v0.4s, v0.4s, v30.4s              \n"
+      "sli         v2.4s, v1.4s, #10                 \n"  // 00GB
+      "sli         v2.4s, v0.4s, #20                 \n"  // 0RGB
+      "orr         v2.4s, #0xc0, lsl #24             \n"  // ARGB (AR30)
+      "subs        %w[width], %w[width], #4          \n"  // 4 pixels per loop
+      "str         q2, [%[dst_ar30]], #16            \n"
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),         // %[src_r]
+        [src_g] "+r"(src_g),         // %[src_g]
+        [src_b] "+r"(src_b),         // %[src_b]
+        [dst_ar30] "+r"(dst_ar30),   // %[dst_ar30]
+        [width] "+r"(width)          // %[width]
+      : [shift] "r"(shift_amount)    // %[shift]
+      : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
+}
+
+// Packs 4 pixels per iteration of planar 16-bit R, G, B into 32-bit words:
+// B in bits 0..9, G in bits 10..19, R in bits 20..29, and 0xc0 OR-ed into the
+// top byte. Samples are clamped to 1023 with no shift; the depth argument is
+// unused.
+void MergeXR30Row_10_NEON(const uint16_t* src_r,
+                          const uint16_t* src_g,
+                          const uint16_t* src_b,
+                          uint8_t* dst_ar30,
+                          int /* depth */,
+                          int width) {
+  asm volatile(
+      "movi        v30.16b, #255                     \n"
+      "ushr        v30.4s, v30.4s, #22               \n"  // 1023
+      "1:                                            \n"
+      "ldr         d2, [%[src_b]], #8                \n"  // B
+      "ldr         d1, [%[src_g]], #8                \n"  // G
+      "ldr         d0, [%[src_r]], #8                \n"  // R
+      "ushll       v2.4s, v2.4h, #0                  \n"  // 000B
+      "ushll       v1.4s, v1.4h, #0                  \n"  // G
+      "ushll       v0.4s, v0.4h, #0                  \n"  // R
+      "umin        v2.4s, v2.4s, v30.4s              \n"  // clamp to 1023
+      "umin        v1.4s, v1.4s, v30.4s              \n"
+      "umin        v0.4s, v0.4s, v30.4s              \n"
+      "sli         v2.4s, v1.4s, #10                 \n"  // 00GB
+      "sli         v2.4s, v0.4s, #20                 \n"  // 0RGB
+      "orr         v2.4s, #0xc0, lsl #24             \n"  // ARGB (AR30)
+      "subs        %w[width], %w[width], #4          \n"  // 4 pixels per loop
+      "str         q2, [%[dst_ar30]], #16            \n"
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),         // %[src_r]
+        [src_g] "+r"(src_g),         // %[src_g]
+        [src_b] "+r"(src_b),         // %[src_b]
+        [dst_ar30] "+r"(dst_ar30),   // %[dst_ar30]
+        [width] "+r"(width)          // %[width]
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v30");
+}
+
+// Merges 8 pixels per iteration of planar 16-bit R, G, B and A into
+// interleaved 16-bit output, stored by st4 in B, G, R, A order. Each sample is
+// clamped to (1 << depth) - 1 and then shifted by (16 - depth) bits with ushl.
+// v30 holds the clamp limit and v31 the shift amount; both are written by the
+// asm, so both are declared in the clobber list.
+void MergeAR64Row_NEON(const uint16_t* src_r,
+                       const uint16_t* src_g,
+                       const uint16_t* src_b,
+                       const uint16_t* src_a,
+                       uint16_t* dst_ar64,
+                       int depth,
+                       int width) {
+  int shift = 16 - depth;
+  int mask = (1 << depth) - 1;
+  asm volatile(
+
+      "dup         v30.8h, %w7                       \n"  // clamp limit
+      "dup         v31.8h, %w6                       \n"  // shift amount
+      "1:                                            \n"
+      "ldr         q2, [%0], #16                     \n"  // R
+      "ldr         q1, [%1], #16                     \n"  // G
+      "ldr         q0, [%2], #16                     \n"  // B
+      "ldr         q3, [%3], #16                     \n"  // A
+      "umin        v2.8h, v2.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%0, 448]              \n"
+      "umin        v1.8h, v1.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%1, 448]              \n"
+      "umin        v0.8h, v0.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%2, 448]              \n"
+      "umin        v3.8h, v3.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%3, 448]              \n"
+      "ushl        v2.8h, v2.8h, v31.8h              \n"
+      "ushl        v1.8h, v1.8h, v31.8h              \n"
+      "ushl        v0.8h, v0.8h, v31.8h              \n"
+      "ushl        v3.8h, v3.8h, v31.8h              \n"
+      "subs        %w5, %w5, #8                      \n"  // 8 pixels per loop
+      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
+      "b.gt        1b                                \n"
+      : "+r"(src_r),     // %0
+        "+r"(src_g),     // %1
+        "+r"(src_b),     // %2
+        "+r"(src_a),     // %3
+        "+r"(dst_ar64),  // %4
+        "+r"(width)      // %5
+      : "r"(shift),      // %6
+        "r"(mask)        // %7
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31");
+}
+
+// Merges 8 pixels per iteration of planar 16-bit R, G and B into interleaved
+// 16-bit output, stored by st4 in B, G, R, A order with A fixed at 0xffff.
+// Each colour sample is clamped to (1 << depth) - 1 and then shifted by
+// (16 - depth) bits with ushl.
+// v30 holds the clamp limit and v31 the shift amount; both are written by the
+// asm, so both are declared in the clobber list.
+void MergeXR64Row_NEON(const uint16_t* src_r,
+                       const uint16_t* src_g,
+                       const uint16_t* src_b,
+                       uint16_t* dst_ar64,
+                       int depth,
+                       int width) {
+  int shift = 16 - depth;
+  int mask = (1 << depth) - 1;
+  asm volatile(
+
+      "movi        v3.16b, #0xff                     \n"  // A (0xffff)
+      "dup         v30.8h, %w6                       \n"  // clamp limit
+      "dup         v31.8h, %w5                       \n"  // shift amount
+
+      "1:                                            \n"
+      "ldr         q2, [%0], #16                     \n"  // R
+      "ldr         q1, [%1], #16                     \n"  // G
+      "ldr         q0, [%2], #16                     \n"  // B
+      "umin        v2.8h, v2.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%0, 448]              \n"
+      "umin        v1.8h, v1.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%1, 448]              \n"
+      "umin        v0.8h, v0.8h, v30.8h              \n"
+      "prfm        pldl1keep, [%2, 448]              \n"
+      "ushl        v2.8h, v2.8h, v31.8h              \n"
+      "ushl        v1.8h, v1.8h, v31.8h              \n"
+      "ushl        v0.8h, v0.8h, v31.8h              \n"
+      "subs        %w4, %w4, #8                      \n"  // 8 pixels per loop
+      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
+      "b.gt        1b                                \n"
+      : "+r"(src_r),     // %0
+        "+r"(src_g),     // %1
+        "+r"(src_b),     // %2
+        "+r"(dst_ar64),  // %3
+        "+r"(width)      // %4
+      : "r"(shift),      // %5
+        "r"(mask)        // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31");
+}
+
+// Merges 8 pixels per iteration of planar 16-bit R, G, B and A into packed
+// 8-bit pixels, stored by st4 in B, G, R, A order. Each sample is shifted by
+// (8 - depth) bits with ushl (a right shift when that amount is negative) and
+// then narrowed to a byte with unsigned saturation.
+void MergeARGB16To8Row_NEON(const uint16_t* src_r,
+                            const uint16_t* src_g,
+                            const uint16_t* src_b,
+                            const uint16_t* src_a,
+                            uint8_t* dst_argb,
+                            int depth,
+                            int width) {
+  int shift_amount = 8 - depth;
+  asm volatile(
+
+      "dup         v31.8h, %w[shift]                 \n"
+      "1:                                            \n"
+      "ldr         q2, [%[src_r]], #16               \n"  // R
+      "ldr         q1, [%[src_g]], #16               \n"  // G
+      "ldr         q0, [%[src_b]], #16               \n"  // B
+      "ldr         q3, [%[src_a]], #16               \n"  // A
+      "ushl        v2.8h, v2.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_r], 448]        \n"
+      "ushl        v1.8h, v1.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_g], 448]        \n"
+      "ushl        v0.8h, v0.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_b], 448]        \n"
+      "ushl        v3.8h, v3.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_a], 448]        \n"
+      "uqxtn       v2.8b, v2.8h                      \n"  // saturate to 8 bit
+      "uqxtn       v1.8b, v1.8h                      \n"
+      "uqxtn       v0.8b, v0.8h                      \n"
+      "uqxtn       v3.8b, v3.8h                      \n"
+      "subs        %w[width], %w[width], #8          \n"  // 8 pixels per loop
+      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst_argb]], #32 \n"
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),         // %[src_r]
+        [src_g] "+r"(src_g),         // %[src_g]
+        [src_b] "+r"(src_b),         // %[src_b]
+        [src_a] "+r"(src_a),         // %[src_a]
+        [dst_argb] "+r"(dst_argb),   // %[dst_argb]
+        [width] "+r"(width)          // %[width]
+      : [shift] "r"(shift_amount)    // %[shift]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
+// Merges 8 pixels per iteration of planar 16-bit R, G and B into packed 8-bit
+// pixels, stored by st4 in B, G, R, A order with A fixed at 0xff. Each colour
+// sample is shifted by (8 - depth) bits with ushl (a right shift when that
+// amount is negative) and then narrowed to a byte with unsigned saturation.
+void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
+                            const uint16_t* src_g,
+                            const uint16_t* src_b,
+                            uint8_t* dst_argb,
+                            int depth,
+                            int width) {
+  int shift_amount = 8 - depth;
+  asm volatile(
+
+      "dup         v31.8h, %w[shift]                 \n"
+      "movi        v3.8b, #0xff                      \n"  // A (0xff)
+      "1:                                            \n"
+      "ldr         q2, [%[src_r]], #16               \n"  // R
+      "ldr         q1, [%[src_g]], #16               \n"  // G
+      "ldr         q0, [%[src_b]], #16               \n"  // B
+      "ushl        v2.8h, v2.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_r], 448]        \n"
+      "ushl        v1.8h, v1.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_g], 448]        \n"
+      "ushl        v0.8h, v0.8h, v31.8h              \n"
+      "prfm        pldl1keep, [%[src_b], 448]        \n"
+      "uqxtn       v2.8b, v2.8h                      \n"  // saturate to 8 bit
+      "uqxtn       v1.8b, v1.8h                      \n"
+      "uqxtn       v0.8b, v0.8h                      \n"
+      "subs        %w[width], %w[width], #8          \n"  // 8 pixels per loop
+      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst_argb]], #32 \n"
+      "b.gt        1b                                \n"
+      : [src_r] "+r"(src_r),         // %[src_r]
+        [src_g] "+r"(src_g),         // %[src_g]
+        [src_b] "+r"(src_b),         // %[src_b]
+        [dst_argb] "+r"(dst_argb),   // %[dst_argb]
+        [width] "+r"(width)          // %[width]
+      : [shift] "r"(shift_amount)    // %[shift]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
+}
+
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"1: \n"
- "ldp q0, q1, [%0], #32 \n"
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- "stp q0, q1, [%1], #32 \n"
- "b.gt 1b \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2 // Output registers
@@ -699,11 +1214,11 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile(
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v8) // %2
@@ -712,89 +1227,157 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile(
- "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
: "r"(v32) // %2
: "cc", "memory", "v0");
}
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v3.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q2, [%0, 16] \n"
+ "ldr q1, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #32 \n" // 32 pixels per loop.
+ "tbl v0.16b, {v2.16b}, v3.16b \n"
+ "tbl v1.16b, {v1.16b}, v3.16b \n"
+ "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirror) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
-void MirrorUVRow_NEON(const uint8_t* src_uv,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+ 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1");
+ "ld1 {v4.16b}, [%3] \n" // shuffler
+ "add %0, %0, %w2, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorUV) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
-void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
asm volatile(
// Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0");
+ "ld1 {v4.16b}, [%4] \n" // shuffler
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #32 \n"
+ "1: \n"
+ "ldr q1, [%0, 16] \n"
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n"
+ "tbl v3.16b, {v0.16b}, v4.16b \n"
+ "uzp1 v0.16b, v2.16b, v3.16b \n" // U
+ "uzp2 v1.16b, v2.16b, v3.16b \n" // V
+ "st1 {v0.16b}, [%1], #16 \n" // dst += 16
+ "st1 {v1.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(&kShuffleMirrorUV) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+ 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
+void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "ld1 {v4.16b}, [%3] \n" // shuffler: reverses the four 4-byte pixels in a 16-byte vector
+ "add %0, %0, %w2, sxtw #2 \n" // src += width * 4 bytes
+ "sub %0, %0, #32 \n" // back up to the last 32 bytes (8 pixels)
+ "1: \n"
+ "ldr q1, [%0, 16] \n" // upper 16 bytes: the 4 pixels that go out first
+ "ldr q0, [%0], -32 \n" // src -= 32
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop.
+ "tbl v2.16b, {v1.16b}, v4.16b \n" // reverse pixel order of the upper half
+ "tbl v3.16b, {v0.16b}, v4.16b \n" // reverse pixel order of the lower half
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleMirrorARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "ld1 {v3.16b}, [%4] \n" // shuffler (kShuffleMirror; presumably a 16-byte reversal, defined elsewhere)
+ "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
+ "add %0, %0, %w2, sxtw \n" // together with the line above: src += width * 3
+ "sub %0, %0, #48 \n" // back up to the last 48 bytes (16 pixels)
+
+ "1: \n"
+ "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "tbl v0.16b, {v0.16b}, v3.16b \n" // permute 1st de-interleaved channel with the shuffler
+ "tbl v1.16b, {v1.16b}, v3.16b \n" // permute 2nd channel
+ "tbl v2.16b, {v2.16b}, v3.16b \n" // permute 3rd channel
+ "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-48), // %3
+ "r"(&kShuffleMirror) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v4.8b, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
+ // RGB24.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -805,14 +1388,15 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v5.8b, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -821,15 +1405,35 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
);
}
+void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
+ asm volatile(
+ "movi v0.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v2.8b, v4.8b, v4.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "orr v1.8b, v5.8b, v5.8b \n" // move b
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgba), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile(
"1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
@@ -855,13 +1459,13 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -911,14 +1515,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // Alpha
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -927,6 +1531,8 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
);
}
+// Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
+// clobbers v3
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
"xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
@@ -944,12 +1550,11 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -963,28 +1568,29 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
- // RGB24.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 16 RGB24 (v3 is loaded but not stored)
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(width) // %2
@@ -996,10 +1602,11 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1011,10 +1618,11 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1029,11 +1637,12 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1049,11 +1658,12 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
int width) {
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1071,14 +1681,15 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
@@ -1098,14 +1709,15 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "prfm pldl1keep, [%0, 448] \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
@@ -1123,13 +1735,14 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -1145,13 +1758,14 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1168,13 +1782,14 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1189,16 +1804,17 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
+ "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+ : "cc", "memory", "v16", "v17", "v18", "v19");
}
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
@@ -1206,20 +1822,22 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
const uint32_t dither4,
int width) {
asm volatile(
- "dup v1.4s, %w2 \n" // dither4
+ "dup v1.4s, %w2 \n" // dither4
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8
+ // pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v16.8b, v16.8b, v1.8b \n" // saturating add of dither to 1st channel
+ "prfm pldl1keep, [%1, 448] \n" // prefetch the source (%1); %0 is the destination
+ "uqadd v17.8b, v17.8b, v1.8b \n" // saturating add of dither to 2nd channel
+ "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
- : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
+ : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); // NOTE(review): %1 and %3 are written by the asm but declared input-only; should be "+r"
}
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
@@ -1227,99 +1845,198 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
int width) {
asm volatile(
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB1555.
- "b.gt 1b \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19");
}
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
- "movi v4.16b, #0x0f \n" // bits to clear with
+ "movi v23.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
- // ARGB4444.
- "b.gt 1b \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
+ // pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
}
-void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+#if LIBYUV_USE_ST2
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "1: \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "mov v1.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_ar64), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
-void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
- uint8_t* dst_a,
- int width) {
+static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
+ 10, 9, 8, 11, 14, 13, 12, 15};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ab64,
+ int width) {
asm volatile(
+ "ldr q4, [%3] \n" // shuffler
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
+ "ldp q0, q2, [%0], #32 \n" // load 8 pixels
+ "tbl v0.16b, {v0.16b}, v4.16b \n"
+ "tbl v2.16b, {v2.16b}, v4.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mov v1.16b, v0.16b \n"
+ "mov v3.16b, v2.16b \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
+ "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToABGR) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+#else
+void ARGBToAR64Row_NEON(const uint8_t* src_argb,
+ uint16_t* dst_ar64,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "zip1 v2.16b, v0.16b, v0.16b \n"
+ "zip2 v3.16b, v0.16b, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v4.16b, v1.16b, v1.16b \n"
+ "zip2 v5.16b, v1.16b, v1.16b \n"
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(dst_a), // %1
+ "+r"(dst_ar64), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+static const uvec8 kShuffleARGBToAB64[2] = {
+ {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
+ {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
+
+void ARGBToAB64Row_NEON(const uint8_t* src_argb, // source row, 4 bytes per pixel
+ uint16_t* dst_ab64, // destination row, 8 bytes per pixel
+ int width) { // pixel count; the loop consumes 8 per iteration
+ asm volatile(
+ "ldp q6, q7, [%3] \n" // 2 shufflers
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
+ "tbl v3.16b, {v0.16b}, v7.16b \n" // second shuffler: upper 8 bytes of v0
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "tbl v4.16b, {v1.16b}, v6.16b \n" // lower 8 bytes of v1
+ "tbl v5.16b, {v1.16b}, v7.16b \n" // upper 8 bytes of v1
+ "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AB64
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_ab64), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleARGBToAB64[0]) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); // every vector register written above
+}
+#endif // LIBYUV_USE_ST2
+
+static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
+ 17, 19, 21, 23, 25, 27, 29, 31};
+
+void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler: selects the odd-indexed bytes of a 32-byte table
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" // keep one byte of each 16-bit channel (pixels 0-3)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" // same for pixels 4-7
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ar64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAR64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
+static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15,
+ 21, 19, 17, 23, 29, 27, 25, 31};
+
+void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "ldr q4, [%3] \n" // shuffler: odd-indexed bytes, with 1st and 3rd channel of each pixel swapped
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 4 pixels
+ "ldp q2, q3, [%0], #32 \n" // load 4 pixels
+ "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" // narrow and reorder pixels 0-3
+ "prfm pldl1keep, [%0, 448] \n" // prefetch source 448 bytes ahead
+ "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" // narrow and reorder pixels 4-7
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_ab64), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleAB64ToARGB) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
-void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
asm volatile(
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_a), // %1
"+r"(width) // %2
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
}
// 8x1 pixels.
@@ -1328,33 +2045,31 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
asm volatile(
- "movi v24.8b, #112 \n" // UB / VR 0.875
+ "movi v24.8b, #112 \n" // UB / VR 0.875
// coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- // pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
-
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
-
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
-
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "prfm pldl1keep, [%0, 448] \n"
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+
+ "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
+ "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1381,10 +2096,8 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
- "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
- "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
- "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
- "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+ "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
@@ -1398,26 +2111,28 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
-
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1429,7 +2144,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
);
}
-// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@@ -1437,31 +2151,33 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
- "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
- "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
- "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
- "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
- "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
@@ -1473,6 +2189,96 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
);
}
+void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, // Each loop: 2 rows of 16 RGB24 pixels -> 8 U and 8 V bytes.
+ int src_stride_rgb24,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // NOTE(review): counted down by 16 per loop; presumably a multiple of 16 - confirm with callers.
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; // second source row
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels from the first row.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts (horizontal pair sums).
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 pixels from the second row.
+ "uadalp v0.8h, v4.16b \n" // B: add second-row pair sums -> 2x2 sums.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v1.8h, v5.16b \n" // G: add second-row pair sums.
+ "uadalp v2.8h, v6.16b \n" // R: add second-row pair sums.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average (2x2 sum, rounded, halved)
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h) // macro defined elsewhere; args passed as B, G, R. Presumably leaves U in v0 and V in v1 - confirm.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
+void RAWToUVJRow_NEON(const uint8_t* src_raw, // Each loop: 2 rows of 16 RAW pixels -> 8 U and 8 V bytes.
+ int src_stride_raw,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) { // NOTE(review): counted down by 16 per loop; presumably a multiple of 16 - confirm with callers.
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw; // second source row
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels from the first row.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts (RAW stores R first).
+ "prfm pldl1keep, [%0, 448] \n" // prefetch first row 448 bytes ahead
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 RAW pixels from the second row.
+ "uadalp v0.8h, v4.16b \n" // R: add second-row pair sums -> 2x2 sums.
+ "prfm pldl1keep, [%1, 448] \n" // prefetch second row 448 bytes ahead
+ "uadalp v1.8h, v5.16b \n" // G: add second-row pair sums.
+ "uadalp v2.8h, v6.16b \n" // B: add second-row pair sums.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average (2x2 sum, rounded, halved)
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h) // args swapped vs RGB24 so B (v2) comes first and R (v0) last.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
@@ -1481,25 +2287,27 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
- "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v3.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
@@ -1519,25 +2327,27 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v3.8h, #1 \n" // 2x average
- "urshr v2.8h, v2.8h, #1 \n"
- "urshr v1.8h, v1.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
@@ -1557,25 +2367,27 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
- "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
@@ -1595,25 +2407,27 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
- "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v0.8h, v0.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v2.8h, v2.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
@@ -1633,25 +2447,27 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
- "1: \n"
- "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
- "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
- "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
-
- "urshr v2.8h, v2.8h, #1 \n" // 2x average
- "urshr v1.8h, v1.8h, #1 \n"
- "urshr v0.8h, v0.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 RAW pixels from the next row.
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
@@ -1663,7 +2479,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
);
}
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@@ -1671,67 +2487,54 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int width) {
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile(
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
- // 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
+ RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27");
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
@@ -1744,50 +2547,43 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
asm volatile(
RGBTOUV_SETUP_REG
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
@@ -1807,52 +2603,45 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int width) {
const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile(
- RGBTOUV_SETUP_REG
+ RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
-
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
-
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
-
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v0.8h, v16.8h, #1 \n" // 2x average
+ "urshr v1.8h, v17.8h, #1 \n"
+ "urshr v2.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
@@ -1868,21 +2657,22 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1895,21 +2685,22 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
+ "movi v4.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1921,21 +2712,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile(
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
+ "movi v24.8b, #25 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #129 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #66 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -1943,119 +2735,175 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
-void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+struct RgbConstants {
+ uint8_t kRGBToY[4];
+ uint16_t kAddY;
+ uint16_t pad;
+};
+
+// RGB to JPeg coefficients
+// B * 0.1140 coefficient = 29
+// G * 0.5870 coefficient = 150
+// R * 0.2990 coefficient = 77
+// Add 0.5 = 0x80
+static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
+ 128,
+ 0};
+
+static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
+
+// RGB to BT.601 coefficients
+// B * 0.1016 coefficient = 25
+// G * 0.5078 coefficient = 129
+// R * 0.2578 coefficient = 66
+// Add 16.5 = 0x1080
+
+static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
+ 0x1080,
+ 0};
+
+static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
+ 0x1080,
+ 0};
+
+// ARGB expects first 3 values to contain RGB and 4th value is ignored.
+void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
+}
+
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
+ ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
+}
+
+// RGBA expects the first value of each pixel to be A (ignored), then 3 values containing RGB.
+// Same code as ARGB, except the LD4 loads into v1..v4 instead of v2..v5.
+void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v6.16b, v0.b[0] \n"
+ "dup v7.16b, v0.b[1] \n"
+ "dup v16.16b, v0.b[2] \n"
+ "dup v17.8h, v0.h[2] \n"
+ "1: \n"
+ "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v6.8b \n" // B
+ "umull2 v1.8h, v2.16b, v6.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v7.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v7.16b \n"
+ "umlal v0.8h, v4.8b, v16.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v16.16b \n"
+ "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v17.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
+}
+
+void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
+ RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
+}
+
+void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_y,
+ int width,
+ const struct RgbConstants* rgbconstants) {
asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
+ "ldr d0, [%3] \n" // load rgbconstants
+ "dup v5.16b, v0.b[0] \n"
+ "dup v6.16b, v0.b[1] \n"
+ "dup v7.16b, v0.b[2] \n"
+ "dup v16.8h, v0.h[2] \n"
+ "1: \n"
+ "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "umull v0.8h, v2.8b, v5.8b \n" // B
+ "umull2 v1.8h, v2.16b, v5.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v0.8h, v3.8b, v6.8b \n" // G
+ "umlal2 v1.8h, v3.16b, v6.16b \n"
+ "umlal v0.8h, v4.8b, v7.8b \n" // R
+ "umlal2 v1.8h, v4.16b, v7.16b \n"
+ "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
+ "addhn v1.8b, v1.8h, v16.8h \n"
+ "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
+void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
+}
+
+void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
+ RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
+}
+
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+ RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
- asm volatile(
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
+ RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
}
// Bilinear filter 16x2 -> 16x1
@@ -2068,44 +2916,49 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
@@ -2118,66 +2971,215 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
: "cc", "memory", "v0", "v1", "v3", "v4", "v5");
}
+// Bilinear filter 8x2 -> 8x1 for 16 bit samples: (row0 * y0 + row1 * y1 + 128) >> 8.
+void InterpolateRow_16_NEON(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction; // weight of the second row
+ int y0_fraction = 256 - y1_fraction; // weight of the first row
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second row; stride counts elements
+
+ asm volatile(
+ "cmp %w4, #0 \n" // y1_fraction == 0: copy first row
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n" // y1_fraction == 128: average rows
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n" // v5 = y1_fraction in every lane
+ "dup v4.8h, %w5 \n" // v4 = y0_fraction in every lane
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n" // load 8 pixels of row 0
+ "ld1 {v1.8h}, [%2], #16 \n" // load 8 pixels of row 1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v2.4s, v0.4h, v4.4h \n" // row0 * y0_fraction (low half)
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n" // row0 * y0_fraction (high half)
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n" // += row1 * y1_fraction
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n" // rounding >> 8, narrow to 16 bit
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "st1 {v0.8h}, [%0], #16 \n" // store 8 pixels
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n" // load 8 pixels of row 0
+ "ld1 {v1.8h}, [%2], #16 \n" // load 8 pixels of row 1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n" // rounding average of the rows
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n" // store 8 pixels
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.8h}, [%1], #16 \n" // load 8 pixels of row 0
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.8h}, [%0], #16 \n" // store 8 pixels
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
+}
+
+// Bilinear filter 8x2 -> 8x1, narrowing 16 bit samples to 8 bit output.
+// scale selects the right shift (15 - clz(scale)) applied before narrowing:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ int scale,
+ int dst_width,
+ int source_y_fraction) {
+ int y1_fraction = source_y_fraction; // weight of the second row
+ int y0_fraction = 256 - y1_fraction; // weight of the first row
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second row; stride counts elements
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
+
+ asm volatile(
+ "dup v6.8h, %w6 \n" // v6 = shift, used by all three paths
+ "cmp %w4, #0 \n" // y1_fraction == 0: copy first row
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n" // y1_fraction == 128: average rows
+ "b.eq 50f \n"
+
+ "dup v5.8h, %w4 \n" // v5 = y1_fraction in every lane
+ "dup v4.8h, %w5 \n" // v4 = y0_fraction in every lane
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.8h}, [%1], #16 \n" // load 8 pixels of row 0
+ "ld1 {v1.8h}, [%2], #16 \n" // load 8 pixels of row 1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v2.4s, v0.4h, v4.4h \n" // row0 * y0_fraction (low half)
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull2 v3.4s, v0.8h, v4.8h \n" // row0 * y0_fraction (high half)
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal v2.4s, v1.4h, v5.4h \n" // += row1 * y1_fraction
+ "umlal2 v3.4s, v1.8h, v5.8h \n"
+ "rshrn v0.4h, v2.4s, #8 \n" // rounding >> 8, narrow to 16 bit
+ "rshrn2 v0.8h, v3.4s, #8 \n"
+ "ushl v0.8h, v0.8h, v6.8h \n" // shr = v6 is negative
+ "uqxtn v0.8b, v0.8h \n" // saturate to 8 bit
+ "st1 {v0.8b}, [%0], #8 \n" // store 8 pixels
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.8h}, [%1], #16 \n" // load 8 pixels of row 0
+ "ld1 {v1.8h}, [%2], #16 \n" // load 8 pixels of row 1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%1, 448] \n"
+ "urhadd v0.8h, v0.8h, v1.8h \n" // rounding average of the rows
+ "prfm pldl1keep, [%2, 448] \n"
+ "ushl v0.8h, v0.8h, v6.8h \n" // shr = v6 is negative
+ "uqxtn v0.8b, v0.8h \n" // saturate to 8 bit
+ "st1 {v0.8b}, [%0], #8 \n" // store 8 pixels
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ldr q0, [%1], #16 \n"
+ "ushl v0.8h, v0.8h, v6.8h \n" // shr = v6 is negative (v2 is unset on this path)
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "str d0, [%0], #8 \n" // store 8 pixels
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width) // %3
+ : "r"(y1_fraction), // %4
+ "r"(y0_fraction), // %5
+ "r"(shift) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+}
+
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
- // pixels
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
- // pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- // pixels
- "b.ge 8b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
+ // ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
+ // ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
- : "+r"(src_argb0), // %0
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2193,17 +3195,17 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
asm volatile(
// Attenuate 8 pixels.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
- // pixels
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2219,32 +3221,33 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_offset,
int width) {
asm volatile(
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
@@ -2261,28 +3264,29 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
int width,
uint32_t value) {
asm volatile(
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
"1: \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2292,23 +3296,24 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
-// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v24.8b, #29 \n" // B * 0.1140 coefficient
+ "movi v25.8b, #150 \n" // G * 0.5870 coefficient
+ "movi v26.8b, #77 \n" // R * 0.2990 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2323,32 +3328,33 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile(
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
@@ -2364,51 +3370,52 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
const int8_t* matrix_argb,
int width) {
asm volatile(
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
-
- "1: \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@@ -2419,27 +3426,29 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "prfm pldl1keep, [%0, 448] \n"
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "prfm pldl1keep, [%1, 448] \n"
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2448,23 +3457,25 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8_t* src_argb0,
+void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2473,23 +3484,25 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile(
// 8 pixel loop.
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
- : "+r"(src_argb0), // %0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@@ -2507,17 +3520,19 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "prfm pldl1keep, [%0, 448] \n"
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2534,12 +3549,14 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
asm volatile(
// 16 pixel loop.
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -2558,15 +3575,17 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
uint8_t* dst_argb,
int width) {
asm volatile(
- "movi v3.8b, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
- "b.gt 1b \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "prfm pldl1keep, [%1, 448] \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -2586,23 +3605,26 @@ void SobelXRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%5 \n" // top
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "prfm pldl1keep, [%2, 448] \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -2624,23 +3646,25 @@ void SobelYRow_NEON(const uint8_t* src_y0,
int width) {
asm volatile(
"1: \n"
- "ld1 {v0.8b}, [%0],%4 \n" // left
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "ld1 {v2.8b}, [%0],%5 \n" // right
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
@@ -2658,16 +3682,17 @@ void HalfFloat1Row_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2681,18 +3706,19 @@ void HalfFloatRow_NEON(const uint16_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2706,17 +3732,18 @@ void ByteToFloatRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
- "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v1.8h, v1.8b \n" // 8 shorts
- "uxtl v2.4s, v1.4h \n" // 8 ints
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
- "b.gt 1b \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "prfm pldl1keep, [%0, 448] \n"
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2730,20 +3757,21 @@ float ScaleMaxSamples_NEON(const float* src,
int width) {
float fmax;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n"
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
- "fmax v5.4s, v5.4s, v1.4s \n" // max
- "fmax v6.4s, v6.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "fmax v5.4s, v5.4s, v6.4s \n" // max
- "fmaxv %s3, v5.4s \n" // signed max acculator
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2759,21 +3787,22 @@ float ScaleSumSamples_NEON(const float* src,
int width) {
float fsum;
asm volatile(
- "movi v5.4s, #0 \n" // max
- "movi v6.4s, #0 \n" // max
-
- "1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
- "fmul v4.4s, v2.4s, %4.s[0] \n"
- "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
- "fmla v6.4s, v2.4s, v2.4s \n"
- "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
- "faddp v5.4s, v5.4s, v6.4s \n"
- "faddp v5.4s, v5.4s, v5.4s \n"
- "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n" // max
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "prfm pldl1keep, [%0, 448] \n"
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
@@ -2786,12 +3815,13 @@ float ScaleSumSamples_NEON(const float* src,
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
- "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
- "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
- "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "prfm pldl1keep, [%0, 448] \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -2808,26 +3838,31 @@ void GaussCol_NEON(const uint16_t* src0,
uint32_t* dst,
int width) {
asm volatile(
- "movi v6.8h, #4 \n" // constant 4
- "movi v7.8h, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
- "ld1 {v2.8h}, [%4], #16 \n"
- "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
- "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
- "ld1 {v2.8h}, [%1], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "ld1 {v2.8h}, [%2], #16 \n"
- "umlal v0.4s, v2.4h, v7.4h \n" // * 6
- "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
- "ld1 {v2.8h}, [%3], #16 \n"
- "umlal v0.4s, v2.4h, v6.4h \n" // * 4
- "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
- "subs %w6, %w6, #8 \n" // 8 processed per loop
- "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%1, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "prfm pldl1keep, [%2, 448] \n"
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "prfm pldl1keep, [%3, 448] \n"
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "prfm pldl1keep, [%4, 448] \n"
+ "b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2845,27 +3880,28 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile(
- "movi v6.4s, #4 \n" // constant 4
- "movi v7.4s, #6 \n" // constant 6
-
- "1: \n"
- "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
- "add v0.4s, v0.4s, v1.4s \n" // * 1
- "add v1.4s, v1.4s, v2.4s \n" // * 1
- "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
- "mla v0.4s, v2.4s, v7.4s \n" // * 6
- "mla v1.4s, v3.4s, v7.4s \n" // * 6
- "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
- "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
- "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
- "add v3.4s, v3.4s, v5.4s \n"
- "mla v0.4s, v2.4s, v6.4s \n" // * 4
- "mla v1.4s, v3.4s, v6.4s \n" // * 4
- "subs %w5, %w5, #8 \n" // 8 processed per loop
- "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
- "uqrshrn2 v0.8h, v1.4s, #8 \n"
- "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
- "b.gt 1b \n"
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
@@ -2876,20 +3912,104 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row (vertical pass: dst = src0 + 4*src1 + 6*src2 + 4*src3 + src4; no normalization is applied in this function).
+void GaussCol_F32_NEON(const float* src0,
+ const float* src1,
+ const float* src2,
+ const float* src3,
+ const float* src4,
+ float* dst,
+ int width) {  // consumed 8 floats per iteration; the loop body runs at least once
+ asm volatile(
+ "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6, broadcast to all lanes of v6 / v7
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples of row 0 (accumulator starts as src0 * 1)
+ "ld1 {v2.4s, v3.4s}, [%1], #32 \n" // load 8 samples of row 1
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%2], #32 \n" // load 8 samples of row 2
+ "fmla v1.4s, v3.4s, v6.4s \n" // * 4 (upper 4 samples)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch hint 448 bytes ahead in row 0
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%3], #32 \n" // load 8 samples of row 3
+ "fmla v1.4s, v5.4s, v7.4s \n" // * 6 (upper 4 samples)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch hint for row 1
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "ld1 {v4.4s, v5.4s}, [%4], #32 \n" // load 8 samples of row 4
+ "fmla v1.4s, v3.4s, v6.4s \n" // * 4 (upper 4 samples)
+ "prfm pldl1keep, [%2, 448] \n" // prefetch hint for row 2
+ "fadd v0.4s, v0.4s, v4.4s \n" // * 1
+ "prfm pldl1keep, [%3, 448] \n" // prefetch hint for row 3
+ "fadd v1.4s, v1.4s, v5.4s \n" // * 1 (upper 4 samples)
+ "prfm pldl1keep, [%4, 448] \n" // prefetch hint for row 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ : "r"(&kGaussCoefficients) // %7 - only the first two floats (4, 6) are read here
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients and scale by 1/256: dst[i] = (src[i] + 4*src[i+1] + 6*src[i+2] + 4*src[i+3] + src[i+4]) / 256.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {  // 8 outputs per iteration; each iteration reads src[0..11]
+ asm volatile(
+ "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
+
+ "1: \n"
+ "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples src[0..11],
+ // then step src by 8 bytes (2 floats)
+ "fadd v0.4s, v0.4s, v1.4s \n" // * 1
+ "ld1 {v4.4s, v5.4s}, [%0], %5 \n" // center taps src[2..9], then step back 4 bytes (to src + 1)
+ "fadd v1.4s, v1.4s, v2.4s \n" // * 1 for outputs 4..7
+ "fmla v0.4s, v4.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s, v3.4s}, [%0], %4 \n" // taps src[1..8], then step 8 bytes (to src + 3)
+ "fmla v1.4s, v5.4s, v7.4s \n" // * 6 for outputs 4..7
+ "ld1 {v4.4s, v5.4s}, [%0], %6 \n" // taps src[3..10], then step 20 bytes (net +32 bytes per loop)
+ "fadd v2.4s, v2.4s, v4.4s \n" // sum the two * 4 taps before multiplying
+ "fadd v3.4s, v3.4s, v5.4s \n"
+ "fmla v0.4s, v2.4s, v6.4s \n" // * 4
+ "fmla v1.4s, v3.4s, v6.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch hint 448 bytes ahead
+ "fmul v0.4s, v0.4s, v8.4s \n" // / 256
+ "fmul v1.4s, v1.4s, v8.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(&kGaussCoefficients), // %3
+ "r"(8LL), // %4 - byte step after the 1st and 3rd loads
+ "r"(-4LL), // %5 - byte step after the center load
+ "r"(20LL) // %6 - byte step after the last load
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
+#if LIBYUV_USE_ST3
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
asm volatile(
- "1: \n"
- "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
- "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
- "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
- "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
- "subs %w3, %w3, #16 \n" // 16 pixels per loop
- "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
- "b.gt 1b \n"
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "prfm pldl1keep, [%0, 448] \n"
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "prfm pldl1keep, [%1, 448] \n"
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
@@ -2897,7 +4017,44 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
:
: "cc", "memory", "v0", "v1", "v2");
}
+#else
+static const uvec8 kYUV24Shuffle[3] = {
+ {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
+ {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
+ {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
+
+// Convert biplanar NV21 to packed YUV24 using table lookups instead of ST3.
+// NV21 has VU in memory for chroma; each VU pair is shared by 2 Y samples.
+// YUV24 is VUY in memory, 3 bytes per pixel.
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {  // consumed 16 pixels per iteration; the loop body runs at least once
+ asm volatile(
+ "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
+ "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
+ "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24: indices 0-15 pick Y (v0), 16-31 pick VU (v1)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch hint for the Y plane
+ "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" // middle 16 output bytes
+ "prfm pldl1keep, [%1, 448] \n" // prefetch hint for the VU plane
+ "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" // last 16 output bytes
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ : "r"(&kYUV24Shuffle[0]) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+#endif // LIBYUV_USE_ST3
+
+// Note ST2 8b version is faster than zip+ST1
+// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
@@ -2905,19 +4062,20 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v2.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_uv), // %2
@@ -2933,19 +4091,20 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels.
- "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
- "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
- "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
- "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
- "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
- "uqrshrn v1.8b, v1.8h, #2 \n"
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_vu), // %2
@@ -2957,12 +4116,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
// Copy row of AYUV Y's into Y
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
- "1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
- // pixels
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "prfm pldl1keep, [%0, 448] \n"
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@@ -2970,61 +4129,170 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3");
}
-void FloatDivToByteRow_NEON(const float* src_weights,
- const float* src_values,
- uint8_t* dst_out,
- uint8_t* dst_mask,
- int width) {
- asm volatile(
- "movi v0.4s, #0 \n"
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+ 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+// Convert UV plane of NV12 to VU of NV21.
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
- "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
- "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
- "subs %w4, %w4, #8 \n" // 8 pixels per loop
-
- "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
- "fdiv v2.4s, v4.4s, v2.4s \n"
-
- "fcvtas v1.4s, v1.4s \n" // float to int
- "fcvtas v2.4s, v2.4s \n" // float to int
- "uqxtn v1.4h, v1.4s \n" // 8 shorts
- "uqxtn2 v1.8h, v2.4s \n"
- "uqxtn v1.8b, v1.8h \n" // 8 bytes
+ "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
+ "ld1 {v1.16b}, [%0], 16 \n"
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "tbl v0.16b, {v0.16b}, v2.16b \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "tbl v1.16b, {v1.16b}, v2.16b \n"
+ "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ : "r"(&kShuffleSwapUV) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
- "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
+void HalfMergeUVRow_NEON(const uint8_t* src_u,  // 2x2 rounded average of U and of V, stored interleaved as UV
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int width) {  // counted in source pixels: 16 in, 8 UV pairs out per iteration
+ const uint8_t* src_u_1 = src_u + src_stride_u;  // second U row
+ const uint8_t* src_v_1 = src_v + src_stride_v;  // second V row
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
+ "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
+ "ld1 {v2.16b}, [%1], #16 \n" // load 16 U values from the second row
+ "ld1 {v3.16b}, [%3], #16 \n" // load 16 V values from the second row
+ "uaddlp v0.8h, v0.16b \n" // half size
+ "prfm pldl1keep, [%0, 448] \n"
+ "uaddlp v1.8h, v1.16b \n" // pairwise add V horizontally
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // accumulate pairwise sums of the second U row
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v1.8h, v3.16b \n" // accumulate pairwise sums of the second V row
+ "prfm pldl1keep, [%3, 448] \n"
+ "uqrshrn v0.8b, v0.8h, #2 \n" // rounded divide by 4, narrow to 8 bit
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w5, %w5, #16 \n" // 16 src pixels per loop
+ "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_u), // %0
+ "+r"(src_u_1), // %1
+ "+r"(src_v), // %2
+ "+r"(src_v_1), // %3
+ "+r"(dst_uv), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
- "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
- "fcmgt v6.4s, v2.4s, v0.4s \n"
- "uqxtn v5.4h, v5.4s \n" // 8 shorts
- "uqxtn2 v5.8h, v6.4s \n"
- "uqxtn v5.8b, v1.8h \n" // 8 bytes
+void SplitUVRow_16_NEON(const uint16_t* src_uv,  // de-interleave 16 bit UV into separate U and V rows, shifting each sample by (depth - 16)
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {  // consumed 8 UV pairs per iteration; the loop body runs at least once
+ int shift = depth - 16; // Negative for right shift.
+ asm volatile(
+ "dup v2.8h, %w4 \n" // broadcast the shift count to every 16 bit lane
+ "1: \n"
+ "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
+ "subs %w3, %w3, #8 \n" // 8 src pixels per loop
+ "ushl v0.8h, v0.8h, v2.8h \n" // shift U by the per-lane count in v2
+ "prfm pldl1keep, [%0, 448] \n"
+ "ushl v1.8h, v1.8h, v2.8h \n" // shift V by the per-lane count in v2
+ "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
+ "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"(shift) // %4
+ : "cc", "memory", "v0", "v1", "v2");
+}
- "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
+void MultiplyRow_16_NEON(const uint16_t* src_y,  // dst_y[i] = low 16 bits of src_y[i] * scale
+ uint16_t* dst_y,
+ int scale,
+ int width) {  // consumed 16 samples per iteration; the loop body runs at least once
+ asm volatile(
+ "dup v2.8h, %w3 \n" // broadcast the low 16 bits of scale to every lane
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n" // load 16 samples
+ "mul v0.8h, v0.8h, v2.8h \n" // 16 bit multiply; the product is truncated to 16 bits
+ "prfm pldl1keep, [%0, 448] \n"
+ "mul v1.8h, v1.8h, v2.8h \n"
+ "stp q0, q1, [%1], #32 \n" // store 16 pixels
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "b.gt 1b \n" // loop while the remaining width is > 0
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "cc", "memory", "v0", "v1", "v2");
+}
- "b.gt 1b \n"
- : "+r"(src_weights), // %0
- "+r"(src_values), // %1
- "+r"(dst_out), // %2
- "+r"(dst_mask), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+// Scale a row of 16 bit samples down: dst_y[i] = (src_y[i] * scale) >> 16.
+// 16 samples are processed per iteration and the loop body always runs at
+// least once.
+void DivideRow_16_NEON(const uint16_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width) {
+  asm volatile(
+      // Broadcast scale into 32 bit lanes. The multiplies below are 32 bit,
+      // so a 16 bit broadcast would make each lane scale | (scale << 16) and
+      // corrupt the upper half of the product that shrn extracts.
+      "dup         v0.4s, %w3                    \n"
+      "1:                                        \n"
+      "ldp         q1, q2, [%0], #32             \n"  // load 16 samples
+      "ushll       v3.4s, v1.4h, #0              \n"  // widen samples 0..3
+      "ushll       v4.4s, v2.4h, #0              \n"  // widen samples 8..11
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "ushll2      v1.4s, v1.8h, #0              \n"  // widen samples 4..7
+      "ushll2      v2.4s, v2.8h, #0              \n"  // widen samples 12..15
+      "mul         v3.4s, v0.4s, v3.4s           \n"  // 32 bit products
+      "mul         v4.4s, v0.4s, v4.4s           \n"
+      "mul         v1.4s, v0.4s, v1.4s           \n"
+      "mul         v2.4s, v0.4s, v2.4s           \n"
+      "shrn        v3.4h, v3.4s, #16             \n"  // >> 16, results 0..3
+      "shrn        v4.4h, v4.4s, #16             \n"  // >> 16, results 8..11
+      "shrn2       v3.8h, v1.4s, #16             \n"  // >> 16, results 4..7
+      "shrn2       v4.8h, v2.4s, #16             \n"  // >> 16, results 12..15
+      // Store both result vectors; storing q3 twice would duplicate results
+      // 0..7 and drop results 8..15.
+      "stp         q3, q4, [%1], #32             \n"  // store 16 pixels
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
+      "b.gt        1b                            \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
-// Convert biplanar UV channel of NV12 to NV21
-void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits = shr 1
+// 16384 = 10 bits = shr 2
+// 4096 = 12 bits = shr 4
+// 256 = 16 bits = shr 8
+void Convert16To8Row_NEON(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile(
- "1: \n"
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
- "orr v2.16b, v0.16b, v0.16b \n" // move U after V
- "subs %w2, %w2, #16 \n" // 16 pixels per loop
- "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_vu), // %1
- "+r"(width) // %2
- :
+ "dup v2.8h, %w3 \n"
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
+ "ushl v1.8h, v1.8h, v2.8h \n"
+ "prfm pldl1keep, [%0, 448] \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn2 v0.16b, v1.8h \n"
+ "subs %w2, %w2, #16 \n" // 16 src pixels per loop
+ "str q0, [%1], #16 \n" // store 16 pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(shift) // %3
: "cc", "memory", "v0", "v1", "v2");
}
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 27e3da7b..c7c1ff60 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -10,9 +10,9 @@
#include "libyuv/row.h"
-// This module is for Visual C 32/64 bit and clangcl 32 bit
+// This module is for Visual C 32/64 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
- (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
+ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#if defined(_M_X64)
#include <emmintrin.h>
@@ -27,12 +27,34 @@ extern "C" {
// 64 bit
#if defined(_M_X64)
+// Read 8 UV from 444 and 8 Y. Expects u_buf, y_buf and offset (V - U) in scope.
+#define READYUV444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); /* 8 U */ \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); /* 8 V */ \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); /* interleave to UV */ \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); /* 8 Y */ \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); /* duplicate each Y byte */ \
+ y_buf += 8;
+
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); /* 8 U */ \
+ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); /* 8 V */ \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); /* interleave to UV */ \
+ u_buf += 8; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); /* 8 Y */ \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); /* duplicate each Y byte */ \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); /* 8 A */ \
+ a_buf += 8;
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -40,10 +62,10 @@ extern "C" {
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \
+ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \
u_buf += 4; \
xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
@@ -52,24 +74,21 @@ extern "C" {
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants) \
- xmm1 = _mm_loadu_si128(&xmm0); \
- xmm2 = _mm_loadu_si128(&xmm0); \
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
- xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
- xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
- xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
- xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
- xmm0 = _mm_adds_epi16(xmm0, xmm4); \
- xmm1 = _mm_adds_epi16(xmm1, xmm4); \
- xmm2 = _mm_adds_epi16(xmm2, xmm4); \
- xmm0 = _mm_srai_epi16(xmm0, 6); \
- xmm1 = _mm_srai_epi16(xmm1, 6); \
- xmm2 = _mm_srai_epi16(xmm2, 6); \
- xmm0 = _mm_packus_epi16(xmm0, xmm0); \
- xmm1 = _mm_packus_epi16(xmm1, xmm1); \
+#define YUVTORGB(yuvconstants) \
+ xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8((char)0x80)); \
+ xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
+ xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
+ xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
+ xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
+ xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
+ xmm0 = _mm_adds_epi16(xmm4, xmm0); \
+ xmm1 = _mm_subs_epi16(xmm4, xmm1); \
+ xmm2 = _mm_adds_epi16(xmm4, xmm2); \
+ xmm0 = _mm_srai_epi16(xmm0, 6); \
+ xmm1 = _mm_srai_epi16(xmm1, 6); \
+ xmm2 = _mm_srai_epi16(xmm2, 6); \
+ xmm0 = _mm_packus_epi16(xmm0, xmm0); \
+ xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
// Store 8 ARGB values.
@@ -90,7 +109,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
@@ -110,7 +129,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
@@ -121,6 +140,44 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
}
#endif
+#if defined(HAS_I444TOARGBROW_SSSE3)
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ const __m128i xmm5 = _mm_set1_epi8(-1); // all bytes 0xff; presumably the alpha consumed by STOREARGB (macro body not visible here)
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; // byte distance from U to V, used by READYUV444
+ while (width > 0) {
+ READYUV444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8; // 8 pixels per iteration
+ }
+}
+#endif // HAS_I444TOARGBROW_SSSE3
+
+#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
+void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; // xmm5 is loaded from a_buf by READYUVA444
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; // byte distance from U to V, used by READYUVA444
+ while (width > 0) {
+ READYUVA444
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ width -= 8; // 8 pixels per iteration
+ }
+}
+#endif // HAS_I444ALPHATOARGBROW_SSSE3
+
// 32 bit
#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -187,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-
-static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+// 8 bit fixed point 0.5, for bias of UV.
+static const ulvec8 kBiasUV128 = {
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@@ -1367,7 +1424,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1380,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1439,7 +1496,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1452,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUVJ128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@@ -1513,7 +1570,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1526,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1581,7 +1638,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1594,9 +1651,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- vbroadcastf128 ymm5, xmmword ptr kAddUV128
- vbroadcastf128 ymm6, xmmword ptr kARGBToV
- vbroadcastf128 ymm7, xmmword ptr kARGBToU
+ vbroadcastf128 ymm5, xmmword ptr kBiasUV128
+ vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
+ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
convertloop:
@@ -1649,7 +1706,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
@@ -1659,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@@ -1707,7 +1764,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1720,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@@ -1779,7 +1836,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1792,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@@ -1851,7 +1908,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1864,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
- movdqa xmm5, xmmword ptr kAddUV128
+ movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@@ -1926,137 +1983,153 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
// Read 16 UV from 444
#define READYUV444_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* U */ \
- __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm3, ymm3, 0xd8 \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
+// Read 16 UV from 444. With 16 Alpha.
+#define READYUVA444_AVX2 \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* U */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm4, ymm4, 0xd8 \
+ __asm vpunpcklbw ymm4, ymm4, ymm4 \
+ __asm lea eax, [eax + 16] \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vpermq ymm5, ymm5, 0xd8 \
+ __asm lea ebp, [ebp + 16]}
+
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
- __asm { \
- __asm vmovq xmm0, qword ptr [esi] /* U */ \
- __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm { \
+ __asm vmovq xmm3, qword ptr [esi] /* U */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16] \
- __asm vmovdqu xmm5, [ebp] /* A */ \
+ __asm vmovdqu xmm5, [ebp] /* A */ \
__asm vpermq ymm5, ymm5, 0xd8 \
__asm lea ebp, [ebp + 16]}
// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 UV from NV21, upsample to 16 UV.
#define READNV21_AVX2 \
- __asm { \
- __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm { \
+ __asm vmovdqu xmm3, [esi] /* UV */ \
__asm lea esi, [esi + 16] \
- __asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
- __asm vmovdqu xmm4, [eax] /* Y */ \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \
+ __asm vmovdqu xmm4, [eax] /* Y */ \
__asm vpermq ymm4, ymm4, 0xd8 \
__asm vpunpcklbw ymm4, ymm4, ymm4 \
__asm lea eax, [eax + 16]}
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
#define READYUY2_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* YUY2 */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 32]}
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
#define READUYVY_AVX2 \
- __asm { \
- __asm vmovdqu ymm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm vmovdqu ymm4, [eax] /* UYVY */ \
__asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
- __asm vmovdqu ymm0, [eax] /* UV */ \
- __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
+ __asm vmovdqu ymm3, [eax] /* UV */ \
+ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 32]}
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
- __asm { \
- __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
- __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
- __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
- __asm vpsubw ymm2, ymm3, ymm2 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
- __asm vpsubw ymm1, ymm3, ymm1 \
- __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
- __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm { \
+ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 /* UV -= 128 */ \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
- __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
- __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
- __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
+ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
+ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
+ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
+ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
+ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
+ __asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
+ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm vpaddw ymm4, ymm3, ymm4 /* Y += Y bias */ \
+ __asm vpaddsw ymm0, ymm0, ymm4 /* B = Y + B UV */ \
+ __asm vpsubsw ymm1, ymm4, ymm1 /* G = Y - G UV */ \
+ __asm vpaddsw ymm2, ymm2, ymm4 /* R = Y + R UV */ \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
- __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
- __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
- __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
- }
+ __asm vpackuswb ymm0, ymm0, ymm0 \
+ __asm vpackuswb ymm1, ymm1, ymm1 \
+ __asm vpackuswb ymm2, ymm2, ymm2}
// Store 16 ARGB values.
#define STOREARGB_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm { \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
__asm vpermq ymm0, ymm0, 0xd8 \
- __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
- __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
__asm vmovdqu 0[edx], ymm1 \
__asm vmovdqu 32[edx], ymm0 \
__asm lea edx, [edx + 64]}
// Store 16 RGBA values.
#define STORERGBA_AVX2 \
- __asm { \
- __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
+ __asm { \
+ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
__asm vpermq ymm1, ymm1, 0xd8 \
- __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
+ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
__asm vpermq ymm2, ymm2, 0xd8 \
- __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
- __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
+ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
+ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
__asm vmovdqu [edx], ymm0 \
__asm vmovdqu [edx + 32], ymm1 \
__asm lea edx, [edx + 64]}
@@ -2183,6 +2256,48 @@ __declspec(naked) void I444ToARGBRow_AVX2(
}
#endif // HAS_I444TOARGBROW_AVX2
+#ifdef HAS_I444ALPHATOARGBROW_AVX2
+// 16 pixels
+// 16 UV values, mixed with 16 Y and 16 A producing 16 ARGB (64 bytes).
+__declspec(naked) void I444AlphaToARGBRow_AVX2(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi // stride from u to v
+ convertloop:
+ READYUVA444_AVX2
+ YUVTORGB_AVX2(ebx)
+ STOREARGB_AVX2
+
+ sub ecx, 16 // 16 pixels per loop
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444ALPHATOARGBROW_AVX2
+
#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
@@ -2361,191 +2476,202 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Read 8 UV from 444.
#define READYUV444 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* U */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
__asm lea esi, [esi + 8] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
+// Read 8 UV from 444. With 8 Alpha.
+#define READYUVA444 \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* U */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm4, xmm4 \
+ __asm lea eax, [eax + 8] \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm lea ebp, [ebp + 8]}
+
// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
- __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm { \
+ __asm movd xmm3, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
__asm lea esi, [esi + 4] \
- __asm punpcklbw xmm0, xmm1 /* UV */ \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
- __asm movq xmm4, qword ptr [eax] /* Y */ \
+ __asm punpcklbw xmm3, xmm1 /* UV */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
+ __asm movq xmm4, qword ptr [eax] /* Y */ \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8] \
- __asm movq xmm5, qword ptr [ebp] /* A */ \
+ __asm movq xmm5, qword ptr [ebp] /* A */ \
__asm lea ebp, [ebp + 8]}
// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 VU from NV21, upsample to 8 UV.
#define READNV21 \
- __asm { \
- __asm movq xmm0, qword ptr [esi] /* UV */ \
+ __asm { \
+ __asm movq xmm3, qword ptr [esi] /* UV */ \
__asm lea esi, [esi + 8] \
- __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
+ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \
__asm movq xmm4, qword ptr [eax] \
__asm punpcklbw xmm4, xmm4 \
__asm lea eax, [eax + 8]}
// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
- __asm { \
- __asm movdqu xmm4, [eax] /* YUY2 */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* YUY2 */ \
__asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \
__asm lea eax, [eax + 16]}
// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
- __asm { \
- __asm movdqu xmm4, [eax] /* UYVY */ \
+ __asm { \
+ __asm movdqu xmm4, [eax] /* UYVY */ \
__asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
- __asm movdqu xmm0, [eax] /* UV */ \
- __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
+ __asm movdqu xmm3, [eax] /* UV */ \
+ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \
__asm lea eax, [eax + 16]}
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
- __asm { \
- __asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm movdqa xmm3, xmm0 \
- __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
- __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
- __asm psubw xmm0, xmm1 \
- __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
- __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
- __asm psubw xmm1, xmm2 \
- __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
- __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
- __asm psubw xmm2, xmm3 \
+ __asm { \
+ __asm psubb xmm3, xmmword ptr kBiasUV128 /* UV -= 128 */ \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
- __asm paddsw xmm0, xmm4 /* B += Y */ \
- __asm paddsw xmm1, xmm4 /* G += Y */ \
- __asm paddsw xmm2, xmm4 /* R += Y */ \
+ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
+ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
+ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
+ __asm pmaddubsw xmm0, xmm3 /* B UV */ \
+ __asm pmaddubsw xmm1, xmm3 /* G UV */ \
+ __asm pmaddubsw xmm2, xmm3 /* R UV */ \
+ __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
+ __asm paddw xmm4, xmm3 /* Y += Y bias */ \
+ __asm paddsw xmm0, xmm4 /* B = Y + B UV */ \
+ __asm paddsw xmm2, xmm4 /* R = Y + R UV; done before xmm4 is clobbered */ \
+ __asm psubsw xmm4, xmm1 /* G = Y - G UV, result in xmm4 */ \
+ __asm movdqa xmm1, xmm4 /* move G back to xmm1 */ \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
#define STOREARGB \
- __asm { \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm { \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm0 \
__asm movdqu 16[edx], xmm1 \
__asm lea edx, [edx + 32]}
// Store 8 BGRA values.
#define STOREBGRA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm0 /* GB */ \
- __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGBA values.
#define STORERGBA \
- __asm { \
- __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
- __asm punpcklbw xmm1, xmm2 /* GR */ \
- __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm { \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
__asm movdqa xmm0, xmm5 \
- __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
- __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
__asm movdqu 0[edx], xmm5 \
__asm movdqu 16[edx], xmm0 \
__asm lea edx, [edx + 32]}
// Store 8 RGB24 values.
#define STORERGB24 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
- __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
- __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
- __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
- __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
- __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
__asm lea edx, [edx + 24]}
// Store 8 RGB565 values.
#define STORERGB565 \
- __asm {/* Weave into RRGB */ \
- __asm punpcklbw xmm0, xmm1 /* BG */ \
- __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm {/* Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
__asm movdqa xmm1, xmm0 \
- __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
- __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
- __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
- __asm movdqa xmm2, xmm0 /* G */ \
- __asm pslld xmm0, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm0, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm0, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm0, xmm3 /* BGR */ \
- __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
- __asm movdqa xmm2, xmm1 /* G */ \
- __asm pslld xmm1, 8 /* R */ \
- __asm psrld xmm3, 3 /* B */ \
- __asm psrld xmm2, 5 /* G */ \
- __asm psrad xmm1, 16 /* R */ \
- __asm pand xmm3, xmm5 /* B */ \
- __asm pand xmm2, xmm6 /* G */ \
- __asm pand xmm1, xmm7 /* R */ \
- __asm por xmm3, xmm2 /* BG */ \
- __asm por xmm1, xmm3 /* BGR */ \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
__asm packssdw xmm0, xmm1 \
- __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
__asm lea edx, [edx + 16]}
// 8 pixels.
@@ -2586,6 +2712,46 @@ __declspec(naked) void I444ToARGBRow_SSSE3(
}
// 8 pixels.
+// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
+__declspec(naked) void I444AlphaToARGBRow_SSSE3(
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov eax, [esp + 16 + 4] // Y
+ mov esi, [esp + 16 + 8] // U
+ mov edi, [esp + 16 + 12] // V
+ mov ebp, [esp + 16 + 16] // A
+ mov edx, [esp + 16 + 20] // argb
+ mov ebx, [esp + 16 + 24] // yuvconstants
+ mov ecx, [esp + 16 + 28] // width
+ sub edi, esi
+
+ convertloop:
+ READYUVA444
+ YUVTORGB(ebx)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
const uint8_t* y_buf,
@@ -2898,10 +3064,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3(
}
#endif // HAS_I422TOARGBROW_SSSE3
+// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2949,6 +3117,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
uint8_t* rgb_buf,
+ const struct YuvConstants*,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -3045,15 +3214,15 @@ __declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
}
#endif // HAS_MIRRORROW_AVX2
-#ifdef HAS_MIRRORUVROW_SSSE3
+#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
- uint8_t* dst_u,
- uint8_t* dst_v,
- int width) {
+__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
@@ -3078,7 +3247,7 @@ __declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
ret
}
}
-#endif // HAS_MIRRORUVROW_SSSE3
+#endif // HAS_MIRRORSPLITUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
@@ -4172,13 +4341,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4267,7 +4436,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
@@ -4312,7 +4481,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4406,7 +4575,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
- mov eax, [esp + 4] // src_argb0
+ mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@@ -4762,20 +4931,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
@@ -4783,8 +4952,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
@@ -4802,13 +4971,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4817,11 +4986,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop49
convertloop4:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4832,11 +5001,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop19
convertloop1:
- movd xmm0, [eax] // read 1 pixels from src_argb0
+ movd xmm0, [eax] // read 1 pixels from src_argb
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
- paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -4851,23 +5020,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
- psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -4881,20 +5050,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
- vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4902,8 +5071,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
vpunpckhbw ymm3, ymm3, ymm5 // high 4
- vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
- vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -4919,19 +5088,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
@@ -4949,21 +5118,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
- mov eax, [esp + 4 + 4] // src_argb0
+ mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
- vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
- vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@@ -5450,7 +5619,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
// 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
diff --git a/files/source/scale.cc b/files/source/scale.cc
index ab085496..e1335f1e 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -17,6 +17,7 @@
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
+#include "libyuv/scale_uv.h" // For UVScale
#ifdef __cplusplus
namespace libyuv {
@@ -28,6 +29,7 @@ static __inline int Abs(int v) {
}
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
// Scale plane, 1/2
// This is an optimized version for scaling down a plane to 1/2 of
@@ -49,7 +51,7 @@ static void ScalePlaneDown2(int src_width,
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
: ScaleRowDown2Box_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -118,18 +120,18 @@ static void ScalePlaneDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown2 =
filtering == kFilterNone
- ? ScaleRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
- : ScaleRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ ? ScaleRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX
+ : ScaleRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX
: (filtering == kFilterLinear
- ? ScaleRowDown2Linear_MMI
- : ScaleRowDown2Box_MMI);
+ ? ScaleRowDown2Linear_LSX
+ : ScaleRowDown2Box_LSX);
}
}
#endif
@@ -161,7 +163,7 @@ static void ScalePlaneDown2_16(int src_width,
? ScaleRowDown2_16_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
: ScaleRowDown2Box_16_C);
- int row_stride = src_stride << 1;
+ int row_stride = src_stride * 2;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -184,14 +186,6 @@ static void ScalePlaneDown2_16(int src_width,
: ScaleRowDown2Box_16_SSE2);
}
#endif
-#if defined(HAS_SCALEROWDOWN2_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
- : (filtering == kFilterLinear
- ? ScaleRowDown2Linear_16_MMI
- : ScaleRowDown2Box_16_MMI);
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -221,7 +215,7 @@ static void ScalePlaneDown4(int src_width,
void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -264,12 +258,12 @@ static void ScalePlaneDown4(int src_width,
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEROWDOWN4_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleRowDown4 =
- filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
- if (IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX;
}
}
#endif
@@ -297,7 +291,7 @@ static void ScalePlaneDown4_16(int src_width,
void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
- int row_stride = src_stride << 2;
+ int row_stride = src_stride * 4;
(void)src_width;
(void)src_height;
if (!filtering) {
@@ -316,11 +310,6 @@ static void ScalePlaneDown4_16(int src_width,
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
-#if defined(HAS_SCALEROWDOWN4_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
- }
-#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -398,6 +387,26 @@ static void ScalePlaneDown34(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX;
+ }
+ if (dst_width % 48 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_LSX;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@@ -613,6 +622,26 @@ static void ScalePlaneDown38(int src_width,
}
}
#endif
+#if defined(HAS_SCALEROWDOWN38_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_LSX;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX;
+ }
+ }
+ }
+#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -886,11 +915,11 @@ static void ScalePlaneBox(int src_width,
}
}
#endif
-#if defined(HAS_SCALEADDROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleAddRow = ScaleAddRow_Any_MMI;
- if (IS_ALIGNED(src_width, 8)) {
- ScaleAddRow = ScaleAddRow_MMI;
+#if defined(HAS_SCALEADDROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleAddRow = ScaleAddRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_LSX;
}
}
#endif
@@ -898,7 +927,7 @@ static void ScalePlaneBox(int src_width,
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint8_t* src = src_ptr + iy * src_stride;
+ const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -949,15 +978,10 @@ static void ScalePlaneBox_16(int src_width,
}
#endif
-#if defined(HAS_SCALEADDROW_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
- ScaleAddRow = ScaleAddRow_16_MMI;
- }
-#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint16_t* src = src_ptr + iy * src_stride;
+ const uint16_t* src = src_ptr + iy * (int64_t)src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -1038,11 +1062,11 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -1068,13 +1092,21 @@ void ScalePlaneBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1123,7 +1155,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1131,7 +1163,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1139,7 +1171,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1147,7 +1179,7 @@ void ScalePlaneBilinearDown_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1165,7 +1197,7 @@ void ScalePlaneBilinearDown_16(int src_width,
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1258,6 +1290,14 @@ void ScalePlaneBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_LSX;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
@@ -1265,11 +1305,6 @@ void ScalePlaneBilinearUp(int src_width,
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -1277,7 +1312,7 @@ void ScalePlaneBilinearUp(int src_width,
}
{
int yi = y >> 16;
- const uint8_t* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
@@ -1292,7 +1327,9 @@ void ScalePlaneBilinearUp(int src_width,
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1300,14 +1337,16 @@ void ScalePlaneBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1323,6 +1362,327 @@ void ScalePlaneBilinearUp(int src_width,
}
}
+// Scale plane, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// This is used to scale U and V planes of I422 to I444.
+void ScalePlaneUp2_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
+ ScaleRowUp2_Linear_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// This is used to scale U and V planes of I420 to I444.
+void ScalePlaneUp2_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale at most 14 bit plane, horizontally up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original width, using linear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I210 to I410 and I212 to I412.
+void ScalePlaneUp2_12_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale at most 12 bit plane, up by 2 times.
+// This is an optimized version for scaling up a plane to 2 times of
+// its original size, using bilinear interpolation.
+// stride is in count of uint16_t.
+// This is used to scale U and V planes of I010 to I410 and I012 to I412.
+void ScalePlaneUp2_12_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+void ScalePlaneUp2_16_Linear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
+ int dst_width) = ScaleRowUp2_Linear_16_Any_C;
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
+ dst_width);
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+void ScalePlaneUp2_16_Bilinear(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleRowUp2_Bilinear_16_Any_C;
+ int x;
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
void ScalePlaneBilinearUp_16(int src_width,
int src_height,
int dst_width,
@@ -1351,7 +1711,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
@@ -1359,7 +1719,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
@@ -1367,7 +1727,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
@@ -1375,7 +1735,7 @@ void ScalePlaneBilinearUp_16(int src_width,
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
+ InterpolateRow = InterpolateRow_16_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
@@ -1397,19 +1757,13 @@ void ScalePlaneBilinearUp_16(int src_width,
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleFilterCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
-
if (y > max_y) {
y = max_y;
}
{
int yi = y >> 16;
- const uint16_t* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
@@ -1424,7 +1778,9 @@ void ScalePlaneBilinearUp_16(int src_width,
src += src_stride;
}
ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -1432,14 +1788,16 @@ void ScalePlaneBilinearUp_16(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_ptr + yi * src_stride;
+ src = src_ptr + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -1487,15 +1845,11 @@ static void ScalePlaneSimple(int src_width,
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
@@ -1528,15 +1882,11 @@ static void ScalePlaneSimple_16(int src_width,
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
-#if defined(HAS_SCALECOLS_16_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
- ScaleCols = ScaleColsUp2_16_MMI;
- }
-#endif
}
for (i = 0; i < dst_height; ++i) {
- ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);
dst_ptr += dst_stride;
y += dy;
}
@@ -1544,7 +1894,6 @@ static void ScalePlaneSimple_16(int src_width,
// Scale a plane.
// This function dispatches to a specialized scaler based on scale factor.
-
LIBYUV_API
void ScalePlane(const uint8_t* src,
int src_stride,
@@ -1562,10 +1911,9 @@ void ScalePlane(const uint8_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
@@ -1574,10 +1922,19 @@ void ScalePlane(const uint8_t* src,
return;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
@@ -1614,6 +1971,17 @@ void ScalePlane(const uint8_t* src,
dst_stride, src, dst);
return;
}
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1645,10 +2013,9 @@ void ScalePlane_16(const uint16_t* src,
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
-
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
@@ -1657,10 +2024,22 @@ void ScalePlane_16(const uint16_t* src,
return;
}
if (dst_width == src_width && filtering != kFilterBox) {
- int dy = FixedDiv(src_height, dst_height);
- // Arbitrary scale vertically, but unscaled vertically.
+ int dy = 0;
+ int y = 0;
+ // When scaling down, use the center 2 rows to filter.
+ // When scaling up, last row of destination uses the last 2 source rows.
+ if (dst_height <= src_height) {
+ dy = FixedDiv(src_height, dst_height);
+ y = CENTERSTART(dy, -32768); // Subtract 0.5 (32768) to center filter.
+ // When scaling up, ensure the last row of destination uses the last
+ // source. Avoid divide by zero for dst_height but will do no scaling
+ // later.
+ } else if (src_height > 1 && dst_height > 1) {
+ dy = FixedDiv1(src_height, dst_height);
+ }
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst, 0, 0, dy, 1, filtering);
+ dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
@@ -1697,6 +2076,17 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
return;
}
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1711,6 +2101,43 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
}
+LIBYUV_API
+void ScalePlane_12(const uint16_t* src,
+ int src_stride,
+ int src_width,
+ int src_height,
+ uint16_t* dst,
+ int dst_stride,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * (int64_t)src_stride;
+ src_stride = -src_stride;
+ }
+
+ if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
+ ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+ (filtering == kFilterBilinear || filtering == kFilterBox)) {
+ ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+
+ ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
+ dst_width, dst_height, filtering);
+}
+
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.
@@ -1736,7 +2163,8 @@ int I420Scale(const uint8_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
@@ -1773,7 +2201,8 @@ int I420Scale_16(const uint16_t* src_y,
int src_halfheight = SUBSAMPLE(src_height, 1, 1);
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
@@ -1788,6 +2217,44 @@ int I420Scale_16(const uint16_t* src_y,
return 0;
}
+LIBYUV_API
+int I420Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+ dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+ ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+ dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
+
// Scale an I444 image.
// This function in turn calls a scaling function for each plane.
@@ -1809,7 +2276,7 @@ int I444Scale(const uint8_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
@@ -1842,7 +2309,7 @@ int I444Scale_16(const uint16_t* src_y,
int dst_width,
int dst_height,
enum FilterMode filtering) {
- if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
@@ -1857,6 +2324,185 @@ int I444Scale_16(const uint16_t* src_y,
return 0;
}
+LIBYUV_API
+int I444Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
+
+// Scale an I422 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I422Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return 0;
+}
+
+LIBYUV_API
+int I422Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return 0;
+}
+
+LIBYUV_API
+int I422Scale_12(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_height, dst_u,
+ dst_stride_u, dst_halfwidth, dst_height, filtering);
+ ScalePlane_12(src_v, src_stride_v, src_halfwidth, src_height, dst_v,
+ dst_stride_v, dst_halfwidth, dst_height, filtering);
+ return 0;
+}
+
+// Scale an NV12 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int NV12Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+
+ if (!src_y || !src_uv || src_width <= 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv,
+ dst_stride_uv, dst_halfwidth, dst_halfheight, filtering);
+ return 0;
+}
+
// Deprecated api
LIBYUV_API
int Scale(const uint8_t* src_y,
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
index 17831372..317041f8 100644
--- a/files/source/scale_any.cc
+++ b/files/source/scale_any.cc
@@ -20,49 +20,6 @@ namespace libyuv {
extern "C" {
#endif
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
- int dx) { \
- int r = dst_width & MASK; \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
- }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MMI
-CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
- ScaleARGBFilterCols_NEON,
- ScaleARGBFilterCols_C,
- 4,
- 3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
- ScaleARGBFilterCols_MSA,
- ScaleARGBFilterCols_C,
- 4,
- 7)
-#endif
-#undef CANY
-
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -113,6 +70,22 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+ ScaleUVRowDown2Box_SSSE3,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+SDANY(ScaleUVRowDown2Box_Any_AVX2,
+ ScaleUVRowDown2Box_AVX2,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -155,6 +128,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+ ScaleUVRowDown2Box_NEON,
+ ScaleUVRowDown2Box_C,
+ 2,
+ 2,
+ 7)
+#endif
+
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -170,26 +152,20 @@ SDANY(ScaleRowDown2Box_Any_MSA,
1,
31)
#endif
-#ifdef HAS_SCALEROWDOWN2_MMI
-SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
-SDANY(ScaleRowDown2Linear_Any_MMI,
- ScaleRowDown2Linear_MMI,
+#ifdef HAS_SCALEROWDOWN2_LSX
+SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_LSX,
+ ScaleRowDown2Linear_LSX,
ScaleRowDown2Linear_C,
2,
1,
- 7)
-SDANY(ScaleRowDown2Box_Any_MMI,
- ScaleRowDown2Box_MMI,
+ 31)
+SDANY(ScaleRowDown2Box_Any_LSX,
+ ScaleRowDown2Box_LSX,
ScaleRowDown2Box_C,
2,
1,
- 7)
-SDODD(ScaleRowDown2Box_Odd_MMI,
- ScaleRowDown2Box_MMI,
- ScaleRowDown2Box_Odd_C,
- 2,
- 1,
- 7)
+ 31)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
@@ -227,14 +203,14 @@ SDANY(ScaleRowDown4Box_Any_MSA,
1,
15)
#endif
-#ifdef HAS_SCALEROWDOWN4_MMI
-SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_MMI,
- ScaleRowDown4Box_MMI,
+#ifdef HAS_SCALEROWDOWN4_LSX
+SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_LSX,
+ ScaleRowDown4Box_LSX,
ScaleRowDown4Box_C,
4,
1,
- 7)
+ 15)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
@@ -296,6 +272,26 @@ SDANY(ScaleRowDown34_1_Box_Any_MSA,
1,
47)
#endif
+#ifdef HAS_SCALEROWDOWN34_LSX
+SDANY(ScaleRowDown34_Any_LSX,
+ ScaleRowDown34_LSX,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_LSX,
+ ScaleRowDown34_0_Box_LSX,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_LSX,
+ ScaleRowDown34_1_Box_LSX,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
@@ -356,6 +352,26 @@ SDANY(ScaleRowDown38_2_Box_Any_MSA,
1,
11)
#endif
+#ifdef HAS_SCALEROWDOWN38_LSX
+SDANY(ScaleRowDown38_Any_LSX,
+ ScaleRowDown38_LSX,
+ ScaleRowDown38_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_3_Box_Any_LSX,
+ ScaleRowDown38_3_Box_LSX,
+ ScaleRowDown38_3_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+SDANY(ScaleRowDown38_2_Box_Any_LSX,
+ ScaleRowDown38_2_Box_LSX,
+ ScaleRowDown38_2_Box_C,
+ 8 / 3,
+ 1,
+ 11)
+#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2,
@@ -417,25 +433,25 @@ SDANY(ScaleARGBRowDown2Box_Any_MSA,
4,
3)
#endif
-#ifdef HAS_SCALEARGBROWDOWN2_MMI
-SDANY(ScaleARGBRowDown2_Any_MMI,
- ScaleARGBRowDown2_MMI,
+#ifdef HAS_SCALEARGBROWDOWN2_LSX
+SDANY(ScaleARGBRowDown2_Any_LSX,
+ ScaleARGBRowDown2_LSX,
ScaleARGBRowDown2_C,
2,
4,
- 1)
-SDANY(ScaleARGBRowDown2Linear_Any_MMI,
- ScaleARGBRowDown2Linear_MMI,
+ 3)
+SDANY(ScaleARGBRowDown2Linear_Any_LSX,
+ ScaleARGBRowDown2Linear_LSX,
ScaleARGBRowDown2Linear_C,
2,
4,
- 1)
-SDANY(ScaleARGBRowDown2Box_Any_MMI,
- ScaleARGBRowDown2Box_MMI,
+ 3)
+SDANY(ScaleARGBRowDown2Box_Any_LSX,
+ ScaleARGBRowDown2Box_LSX,
ScaleARGBRowDown2Box_C,
2,
4,
- 1)
+ 3)
#endif
#undef SDANY
@@ -488,17 +504,24 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
4,
3)
#endif
-#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
-SDAANY(ScaleARGBRowDownEven_Any_MMI,
- ScaleARGBRowDownEven_MMI,
+#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX
+SDAANY(ScaleARGBRowDownEven_Any_LSX,
+ ScaleARGBRowDownEven_LSX,
ScaleARGBRowDownEven_C,
4,
- 1)
-SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
- ScaleARGBRowDownEvenBox_MMI,
+ 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_LSX,
+ ScaleARGBRowDownEvenBox_LSX,
ScaleARGBRowDownEvenBox_C,
4,
- 1)
+ 3)
+#endif
+#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
+SDAANY(ScaleUVRowDownEven_Any_NEON,
+ ScaleUVRowDownEven_NEON,
+ ScaleUVRowDownEven_C,
+ 2,
+ 3)
#endif
#ifdef SASIMDONLY
@@ -533,8 +556,8 @@ SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
#ifdef HAS_SCALEADDROW_MSA
SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#ifdef HAS_SCALEADDROW_LSX
+SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
#endif
#undef SAANY
@@ -562,13 +585,477 @@ SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
-#ifdef HAS_SCALEADDROW_MMI
-SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
+#ifdef HAS_SCALEADDROW_LSX
+SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
#endif
#undef SAANY
#endif // SASIMDONLY
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
+ }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_LSX
+CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_LSX
+CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+ ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C,
+ 4,
+ 3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_LSX
+CANY(ScaleARGBFilterCols_Any_LSX,
+ ScaleARGBFilterCols_LSX,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
+#undef CANY
+
+// Scale up horizontally 2 times using linear filter.
+#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 1, n); \
+ } \
+ C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
+ } \
+ dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; \
+ }
+
+// Even the C versions need to be wrapped, because boundary pixels have to
+// be handled differently
+
+SUH2LANY(ScaleRowUp2_Linear_Any_C,
+ ScaleRowUp2_Linear_C,
+ ScaleRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
+ ScaleRowUp2_Linear_16_C,
+ ScaleRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
+SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
+ ScaleRowUp2_Linear_SSE2,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
+ ScaleRowUp2_Linear_SSSE3,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
+SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3,
+ ScaleRowUp2_Linear_12_SSSE3,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
+ ScaleRowUp2_Linear_16_SSE2,
+ ScaleRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
+ ScaleRowUp2_Linear_AVX2,
+ ScaleRowUp2_Linear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2,
+ ScaleRowUp2_Linear_12_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 31,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
+ ScaleRowUp2_Linear_16_AVX2,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_NEON
+SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
+ ScaleRowUp2_Linear_NEON,
+ ScaleRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_NEON
+SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON,
+ ScaleRowUp2_Linear_12_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_NEON
+SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
+ ScaleRowUp2_Linear_16_NEON,
+ ScaleRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SUH2LANY
+
+// Scale up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 1, db - da, n); \
+ } \
+ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
+ } \
+ da[dst_width - 1] = \
+ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \
+ db[dst_width - 1] = \
+ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \
+ }
+
+SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
+ ScaleRowUp2_Bilinear_C,
+ ScaleRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
+ ScaleRowUp2_Bilinear_16_C,
+ ScaleRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
+ ScaleRowUp2_Bilinear_SSE2,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3,
+ ScaleRowUp2_Bilinear_12_SSSE3,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
+ ScaleRowUp2_Bilinear_16_SSE2,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
+ ScaleRowUp2_Bilinear_SSSE3,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
+ ScaleRowUp2_Bilinear_AVX2,
+ ScaleRowUp2_Bilinear_C,
+ 31,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2,
+ ScaleRowUp2_Bilinear_12_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
+ ScaleRowUp2_Bilinear_16_AVX2,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
+ ScaleRowUp2_Bilinear_NEON,
+ ScaleRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON,
+ ScaleRowUp2_Bilinear_12_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_NEON
+SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
+ ScaleRowUp2_Bilinear_16_NEON,
+ ScaleRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SU2BLANY
+
+// Scale bi-planar plane up horizontally 2 times using linear filter.
+#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ dst_ptr[0] = src_ptr[0]; \
+ dst_ptr[1] = src_ptr[1]; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(src_ptr, dst_ptr + 2, n); \
+ } \
+ C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
+ } \
+ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
+ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
+ }
+
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
+ ScaleUVRowUp2_Linear_C,
+ ScaleUVRowUp2_Linear_C,
+ 0,
+ uint8_t)
+
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
+ ScaleUVRowUp2_Linear_16_C,
+ ScaleUVRowUp2_Linear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
+ ScaleUVRowUp2_Linear_SSSE3,
+ ScaleUVRowUp2_Linear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
+ ScaleUVRowUp2_Linear_AVX2,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41,
+ ScaleUVRowUp2_Linear_16_SSE41,
+ ScaleUVRowUp2_Linear_16_C,
+ 3,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
+ ScaleUVRowUp2_Linear_16_AVX2,
+ ScaleUVRowUp2_Linear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
+ ScaleUVRowUp2_Linear_NEON,
+ ScaleUVRowUp2_Linear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON
+SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
+ ScaleUVRowUp2_Linear_16_NEON,
+ ScaleUVRowUp2_Linear_16_C,
+ 15,
+ uint16_t)
+#endif
+
+#undef SBUH2LANY
+
+// Scale bi-planar plane up 2 times using bilinear filter.
+// This function produces 2 rows at a time.
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
+ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
+ ptrdiff_t dst_stride, int dst_width) { \
+ int work_width = (dst_width - 1) & ~1; \
+ int r = work_width & MASK; \
+ int n = work_width & ~MASK; \
+ const PTYPE* sa = src_ptr; \
+ const PTYPE* sb = src_ptr + src_stride; \
+ PTYPE* da = dst_ptr; \
+ PTYPE* db = dst_ptr + dst_stride; \
+ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
+ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
+ da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
+ db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
+ if (work_width > 0) { \
+ if (n != 0) { \
+ SIMD(sa, sb - sa, da + 2, db - da, n); \
+ } \
+ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
+ } \
+ da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
+ sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
+ 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \
+ 2; \
+ da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
+ sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
+ 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \
+ 2; \
+ }
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
+ ScaleUVRowUp2_Bilinear_C,
+ ScaleUVRowUp2_Bilinear_C,
+ 0,
+ uint8_t)
+
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 0,
+ uint16_t)
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
+ ScaleUVRowUp2_Bilinear_SSSE3,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_AVX2,
+ ScaleUVRowUp2_Bilinear_C,
+ 15,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
+ ScaleUVRowUp2_Bilinear_16_SSE41,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
+ ScaleUVRowUp2_Bilinear_16_AVX2,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
+ ScaleUVRowUp2_Bilinear_NEON,
+ ScaleUVRowUp2_Bilinear_C,
+ 7,
+ uint8_t)
+#endif
+
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON
+SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
+ ScaleUVRowUp2_Bilinear_16_NEON,
+ ScaleUVRowUp2_Bilinear_16_C,
+ 7,
+ uint16_t)
+#endif
+
+#undef SBU2BLANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index beef380a..9c3acf7f 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -58,9 +58,9 @@ static void ScaleARGBDown2(int src_width,
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
// Advance to odd row, even column.
if (filtering == kFilterBilinear) {
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4;
} else {
- src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4;
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
@@ -111,19 +111,19 @@ static void ScaleARGBDown2(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWN2_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
+#if defined(HAS_SCALEARGBROWDOWN2_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_Any_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
- : ScaleARGBRowDown2Box_Any_MMI);
- if (IS_ALIGNED(dst_width, 2)) {
+ ? ScaleARGBRowDown2_Any_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX
+ : ScaleARGBRowDown2Box_Any_LSX);
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 =
filtering == kFilterNone
- ? ScaleARGBRowDown2_MMI
- : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
- : ScaleARGBRowDown2Box_MMI);
+ ? ScaleARGBRowDown2_LSX
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX
+ : ScaleARGBRowDown2Box_LSX);
}
}
#endif
@@ -162,7 +162,7 @@ static void ScaleARGBDown4Box(int src_width,
uint8_t* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4;
(void)src_width;
(void)src_height;
(void)dx;
@@ -214,7 +214,7 @@ static void ScaleARGBDownEven(int src_width,
enum FilterMode filtering) {
int j;
int col_step = dx >> 16;
- int row_stride = (dy >> 16) * src_stride;
+ int row_stride = (dy >> 16) * (int64_t)src_stride;
void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
int src_step, uint8_t* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
@@ -222,7 +222,7 @@ static void ScaleARGBDownEven(int src_width,
(void)src_height;
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
- src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
@@ -253,13 +253,13 @@ static void ScaleARGBDownEven(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
- : ScaleARGBRowDownEven_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX
+ : ScaleARGBRowDownEven_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven =
- filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX;
}
}
#endif
@@ -340,6 +340,14 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -361,6 +369,14 @@ static void ScaleARGBBilinearDown(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
@@ -372,7 +388,7 @@ static void ScaleARGBBilinearDown(int src_width,
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (int64_t)src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
@@ -444,11 +460,11 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width, 2)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -477,6 +493,14 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -498,11 +522,11 @@ static void ScaleARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -513,11 +537,6 @@ static void ScaleARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
if (y > max_y) {
@@ -526,7 +545,7 @@ static void ScaleARGBBilinearUp(int src_width,
{
int yi = y >> 16;
- const uint8_t* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * (int64_t)src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -541,7 +560,9 @@ static void ScaleARGBBilinearUp(int src_width,
src += src_stride;
}
ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
- src += src_stride;
+ if (src_height > 2) {
+ src += src_stride;
+ }
for (j = 0; j < dst_height; ++j) {
yi = y >> 16;
@@ -549,14 +570,16 @@ static void ScaleARGBBilinearUp(int src_width,
if (y > max_y) {
y = max_y;
yi = y >> 16;
- src = src_argb + yi * src_stride;
+ src = src_argb + yi * (int64_t)src_stride;
}
if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
lasty = yi;
- src += src_stride;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
}
}
if (filtering == kFilterLinear) {
@@ -611,6 +634,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+ if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) ==
+ (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_AVX512BW;
+ }
+ }
+#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -627,6 +659,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_I422TOARGBROW_LASX)
+ if (TestCpuFlag(kCpuHasLASX)) {
+ I422ToARGBRow = I422ToARGBRow_Any_LASX;
+ if (IS_ALIGNED(src_width, 32)) {
+ I422ToARGBRow = I422ToARGBRow_LASX;
+ }
+ }
+#endif
void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
@@ -663,6 +703,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
@@ -692,6 +740,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+ if (filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -713,11 +769,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (!filtering && TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBFilterCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -728,11 +784,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
const int max_y = (src_height - 1) << 16;
@@ -742,9 +793,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
int yi = y >> 16;
int uv_yi = yi >> kYShift;
- const uint8_t* src_row_y = src_y + yi * src_stride_y;
- const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
- const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
+ const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y;
+ const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u;
+ const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -782,9 +833,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
y = max_y;
yi = y >> 16;
uv_yi = yi >> kYShift;
- src_row_y = src_y + yi * src_stride_y;
- src_row_u = src_u + uv_yi * src_stride_u;
- src_row_v = src_v + uv_yi * src_stride_v;
+ src_row_y = src_y + yi * (int64_t)src_stride_y;
+ src_row_u = src_u + uv_yi * (int64_t)src_stride_u;
+ src_row_v = src_v + uv_yi * (int64_t)src_stride_v;
}
if (yi != lasty) {
// TODO(fbarchard): Convert the clipped region of row.
@@ -857,11 +908,11 @@ static void ScaleARGBSimple(int src_width,
}
}
#endif
-#if defined(HAS_SCALEARGBCOLS_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- ScaleARGBCols = ScaleARGBCols_Any_MMI;
- if (IS_ALIGNED(dst_width, 1)) {
- ScaleARGBCols = ScaleARGBCols_MMI;
+#if defined(HAS_SCALEARGBCOLS_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ ScaleARGBCols = ScaleARGBCols_Any_LSX;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBCols_LSX;
}
}
#endif
@@ -872,16 +923,11 @@ static void ScaleARGBSimple(int src_width,
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
-#if defined(HAS_SCALEARGBCOLSUP2_MMI)
- if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
- ScaleARGBCols = ScaleARGBColsUp2_MMI;
- }
-#endif
}
for (j = 0; j < dst_height; ++j) {
- ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
- dx);
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride,
+ dst_width, x, dx);
dst_argb += dst_stride;
y += dy;
}
@@ -916,7 +962,7 @@ static void ScaleARGB(const uint8_t* src,
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
- src = src + (src_height - 1) * src_stride;
+ src = src + (src_height - 1) * (int64_t)src_stride;
src_stride = -src_stride;
}
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -931,7 +977,7 @@ static void ScaleARGB(const uint8_t* src,
if (clip_y) {
int64_t clipf = (int64_t)(clip_y)*dy;
y += (clipf & 0xffff);
- src += (clipf >> 16) * src_stride;
+ src += (clipf >> 16) * (int64_t)src_stride;
dst += clip_y * dst_stride;
}
@@ -965,17 +1011,17 @@ static void ScaleARGB(const uint8_t* src,
filtering = kFilterNone;
if (dx == 0x10000 && dy == 0x10000) {
// Straight copy.
- ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
- dst, dst_stride, clip_width, clip_height);
+ ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4,
+ src_stride, dst, dst_stride, clip_width, clip_height);
return;
}
}
}
}
if (dx == 0x10000 && (x & 0xffff) == 0) {
- // Arbitrary scale vertically, but unscaled vertically.
+ // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
- dst_stride, src, dst, x, y, dy, 4, filtering);
+ dst_stride, src, dst, x, y, dy, /*bpp=*/4, filtering);
return;
}
if (filtering && dy < 65536) {
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
index 63690271..b02bdafd 100644
--- a/files/source/scale_common.cc
+++ b/files/source/scale_common.cc
@@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
+// Horizontal 2x upsample of one row using a (3, 1) / 4 linear filter.
+// O marks a source sample position, X marks a destination sample position:
+//
+//      v dst_ptr starts here       v stops here
+//  X O X   X O X   X O X   X O X   X O X
+//    ^ src_ptr starts here
+//
+// Every pair of neighbouring source samples src_ptr[i], src_ptr[i + 1]
+// produces the two destination samples that lie between them, so for
+// dst_width > 0 the row reads dst_width / 2 + 1 source samples.
+// dst_width must be even and non-negative.
+void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    const int left = src_ptr[i];
+    const int right = src_ptr[i + 1];
+    uint8_t* out = dst_ptr + 2 * i;
+    out[0] = (uint8_t)((3 * left + right + 2) >> 2);  // Closer to left.
+    out[1] = (uint8_t)((left + 3 * right + 2) >> 2);  // Closer to right.
+  }
+}
+
+// 2x upsample in both directions using a (9, 3, 3, 1) / 16 bilinear filter.
+// O marks a source sample position, X marks a destination sample position:
+//
+//    src_ptr starts here
+//  X v X   X   X   X   X   X   X   X   X
+//    O       O       O       O       O
+//  X   X   X   X   X   X   X   X   X   X
+//      ^ dst_ptr starts here       ^ stops here
+//  X   X   X   X   X   X   X   X   X   X
+//    O       O       O       O       O
+//  X   X   X   X   X   X   X   X   X   X
+//
+// Reads the source row at src_ptr and the row src_stride bytes after it;
+// writes the destination row at dst_ptr and the row dst_stride bytes after
+// it. dst_width must be even and non-negative.
+void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            ptrdiff_t dst_stride,
+                            int dst_width) {
+  const uint8_t* row0 = src_ptr;
+  const uint8_t* row1 = src_ptr + src_stride;
+  uint8_t* out0 = dst_ptr;
+  uint8_t* out1 = dst_ptr + dst_stride;
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    // The 2x2 source neighbourhood: a b on the first row, c d on the second.
+    const int a = row0[i];
+    const int b = row0[i + 1];
+    const int c = row1[i];
+    const int d = row1[i + 1];
+    out0[2 * i + 0] = (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
+    out0[2 * i + 1] = (uint8_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
+    out1[2 * i + 0] = (uint8_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
+    out1[2 * i + 1] = (uint8_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
+  }
+}
+
+// 16 bit variant of the horizontal 2x (3, 1) / 4 linear upsampler: each pair
+// of neighbouring source samples yields the two destination samples between
+// them. dst_width must be even and non-negative.
+// Only suitable for at most 14 bit range.
+void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
+                             uint16_t* dst_ptr,
+                             int dst_width) {
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    const int left = src_ptr[i];
+    const int right = src_ptr[i + 1];
+    uint16_t* out = dst_ptr + 2 * i;
+    out[0] = (uint16_t)((3 * left + right + 2) >> 2);
+    out[1] = (uint16_t)((left + 3 * right + 2) >> 2);
+  }
+}
+
+// 16 bit variant of the 2x2 (9, 3, 3, 1) / 16 bilinear upsampler. Reads two
+// source rows and writes two destination rows; src_stride and dst_stride are
+// added to uint16_t pointers, i.e. they count samples rather than bytes.
+// dst_width must be even and non-negative.
+// Only suitable for at most 12 bit range.
+void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               ptrdiff_t dst_stride,
+                               int dst_width) {
+  const uint16_t* row0 = src_ptr;
+  const uint16_t* row1 = src_ptr + src_stride;
+  uint16_t* out0 = dst_ptr;
+  uint16_t* out1 = dst_ptr + dst_stride;
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    // The 2x2 source neighbourhood: a b on the first row, c d on the second.
+    const int a = row0[i];
+    const int b = row0[i + 1];
+    const int c = row1[i];
+    const int d = row1[i + 1];
+    out0[2 * i + 0] = (uint16_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
+    out0[2 * i + 1] = (uint16_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
+    out1[2 * i + 0] = (uint16_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
+    out1[2 * i + 1] = (uint16_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
+  }
+}
+
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
@@ -677,18 +766,18 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >>
+ (65536u / 9u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -731,15 +820,15 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
- (65536 / 6) >>
+ (65536u / 6u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >>
+ (65536u / 4u) >>
16;
src_ptr += 8;
dst_ptr += 3;
@@ -776,6 +865,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr,
}
}
+// ARGB scale row functions
+
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -1018,6 +1109,351 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
#undef BLENDERC
#undef BLENDER
+// UV scale row functions
+// same as ARGB but 2 channels
+
+// Point-samples one row of interleaved 2 channel (UV) pixels down by 2
+// horizontally: every destination pixel is a copy of the second pixel of its
+// source pair, so 2 * dst_width source pixels are consumed.
+//   src_uv     source row, 2 bytes per pixel.
+//   src_stride unused; present to match the other row-down kernels.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+void ScaleUVRowDown2_C(const uint8_t* src_uv,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst_uv,
+                       int dst_width) {
+  int x;
+  (void)src_stride;
+  // Copy byte by byte instead of through a uint16_t pointer: the buffers are
+  // only guaranteed to be byte aligned.
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = src_uv[2];  // First channel of the odd source pixel.
+    dst_uv[1] = src_uv[3];  // Second channel of the odd source pixel.
+    src_uv += 4;            // Two source pixels per destination pixel.
+    dst_uv += 2;
+  }
+}
+
+// Scales one row of interleaved 2 channel (UV) pixels down by 2 horizontally,
+// writing the rounded average of each pair of neighbouring source pixels.
+// src_stride is unused. dst_width counts destination pixels.
+void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int i;
+  (void)src_stride;
+  for (i = 0; i < dst_width; ++i) {
+    const uint8_t* in = src_uv + 4 * i;  // Two source pixels, 4 bytes.
+    uint8_t* out = dst_uv + 2 * i;
+    out[0] = (uint8_t)((in[0] + in[2] + 1) >> 1);
+    out[1] = (uint8_t)((in[1] + in[3] + 1) >> 1);
+  }
+}
+
+// Scales interleaved 2 channel (UV) pixels down by 2 in both directions with
+// a 2x2 box filter: each destination pixel is the rounded average of two
+// neighbouring pixels on the row at src_uv and the two pixels below them on
+// the row src_stride bytes further on. dst_width counts destination pixels.
+void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    const uint8_t* top = src_uv + 4 * i;
+    const uint8_t* bottom = top + src_stride;
+    uint8_t* out = dst_uv + 2 * i;
+    out[0] = (uint8_t)((top[0] + top[2] + bottom[0] + bottom[2] + 2) >> 2);
+    out[1] = (uint8_t)((top[1] + top[3] + bottom[1] + bottom[3] + 2) >> 2);
+  }
+}
+
+// Point-samples one row of interleaved 2 channel (UV) pixels, taking one
+// source pixel every src_stepx pixels.
+//   src_uv     source row, 2 bytes per pixel.
+//   src_stride unused; present to match the box variant.
+//   src_stepx  distance, in pixels, between sampled source pixels.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+void ScaleUVRowDownEven_C(const uint8_t* src_uv,
+                          ptrdiff_t src_stride,
+                          int src_stepx,
+                          uint8_t* dst_uv,
+                          int dst_width) {
+  int x;
+  (void)src_stride;
+  // Copy byte by byte instead of through a uint16_t pointer: the buffers are
+  // only guaranteed to be byte aligned.
+  for (x = 0; x < dst_width; ++x) {
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    src_uv += src_stepx * 2;  // 2 bytes per pixel.
+    dst_uv += 2;
+  }
+}
+
+// Box-filtered variant of the even-step UV down scaler: each destination
+// pixel is the rounded average of a 2x2 block of source pixels (two on the
+// row at src_uv, two on the row src_stride bytes further on), and successive
+// blocks start src_stepx pixels apart. dst_width counts destination pixels.
+void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
+                             ptrdiff_t src_stride,
+                             int src_stepx,
+                             uint8_t* dst_uv,
+                             int dst_width) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    const uint8_t* top = src_uv;
+    const uint8_t* bottom = src_uv + src_stride;
+    dst_uv[0] = (uint8_t)((top[0] + top[2] + bottom[0] + bottom[2] + 2) >> 2);
+    dst_uv[1] = (uint8_t)((top[1] + top[3] + bottom[1] + bottom[3] + 2) >> 2);
+    src_uv += src_stepx * 2;  // 2 bytes per pixel.
+    dst_uv += 2;
+  }
+}
+
+// Horizontal 2x upsample of one row of interleaved 2 channel (UV) pixels with
+// a (3, 1) / 4 linear filter, applied to each channel independently. Each
+// pair of neighbouring source pixels produces two destination pixels.
+// dst_width counts destination pixels and must be even and non-negative.
+void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    const uint8_t* in = src_ptr + 2 * i;  // Left pixel; right one is in + 2.
+    uint8_t* out = dst_ptr + 4 * i;
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      const int left = in[ch];
+      const int right = in[ch + 2];
+      out[ch] = (uint8_t)((3 * left + right + 2) >> 2);
+      out[ch + 2] = (uint8_t)((left + 3 * right + 2) >> 2);
+    }
+  }
+}
+
+// 2x upsample in both directions of interleaved 2 channel (UV) pixels with a
+// (9, 3, 3, 1) / 16 bilinear filter, applied to each channel independently.
+// Reads the source row at src_ptr and the row src_stride bytes after it;
+// writes the destination row at dst_ptr and the row dst_stride bytes after
+// it. dst_width counts destination pixels and must be even and non-negative.
+void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              ptrdiff_t dst_stride,
+                              int dst_width) {
+  const uint8_t* row0 = src_ptr;
+  const uint8_t* row1 = src_ptr + src_stride;
+  uint8_t* out0 = dst_ptr;
+  uint8_t* out1 = dst_ptr + dst_stride;
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      // 2x2 neighbourhood of one channel: a b on row0, c d on row1.
+      const int a = row0[2 * i + ch];
+      const int b = row0[2 * i + 2 + ch];
+      const int c = row1[2 * i + ch];
+      const int d = row1[2 * i + 2 + ch];
+      out0[4 * i + ch] = (uint8_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
+      out0[4 * i + 2 + ch] = (uint8_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
+      out1[4 * i + ch] = (uint8_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
+      out1[4 * i + 2 + ch] = (uint8_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
+    }
+  }
+}
+
+// 16 bit variant of the horizontal 2x (3, 1) / 4 linear upsampler for
+// interleaved 2 channel (UV) pixels; channels are filtered independently.
+// dst_width counts destination pixels and must be even and non-negative.
+void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int dst_width) {
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    const uint16_t* in = src_ptr + 2 * i;  // Left pixel; right one is in + 2.
+    uint16_t* out = dst_ptr + 4 * i;
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      const int left = in[ch];
+      const int right = in[ch + 2];
+      out[ch] = (uint16_t)((3 * left + right + 2) >> 2);
+      out[ch + 2] = (uint16_t)((left + 3 * right + 2) >> 2);
+    }
+  }
+}
+
+// 16 bit variant of the 2x2 (9, 3, 3, 1) / 16 bilinear upsampler for
+// interleaved 2 channel (UV) pixels; channels are filtered independently.
+// Reads two source rows and writes two destination rows; src_stride and
+// dst_stride are added to uint16_t pointers, i.e. they count samples rather
+// than bytes. dst_width counts destination pixels and must be even and
+// non-negative.
+void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint16_t* dst_ptr,
+                                 ptrdiff_t dst_stride,
+                                 int dst_width) {
+  const uint16_t* row0 = src_ptr;
+  const uint16_t* row1 = src_ptr + src_stride;
+  uint16_t* out0 = dst_ptr;
+  uint16_t* out1 = dst_ptr + dst_stride;
+  const int pair_count = dst_width >> 1;
+  int i;
+  assert((dst_width % 2 == 0) && (dst_width >= 0));
+  for (i = 0; i < pair_count; ++i) {
+    int ch;
+    for (ch = 0; ch < 2; ++ch) {
+      // 2x2 neighbourhood of one channel: a b on row0, c d on row1.
+      const int a = row0[2 * i + ch];
+      const int b = row0[2 * i + 2 + ch];
+      const int c = row1[2 * i + ch];
+      const int d = row1[2 * i + 2 + ch];
+      out0[4 * i + ch] = (uint16_t)((9 * a + 3 * b + 3 * c + d + 8) >> 4);
+      out0[4 * i + 2 + ch] = (uint16_t)((3 * a + 9 * b + c + 3 * d + 8) >> 4);
+      out1[4 * i + ch] = (uint16_t)((3 * a + b + 9 * c + 3 * d + 8) >> 4);
+      out1[4 * i + 2 + ch] = (uint16_t)((a + 3 * b + 3 * c + 9 * d + 8) >> 4);
+    }
+  }
+}
+
+// Scales a single row of interleaved 2 channel (UV) pixels using point
+// sampling.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   src_uv     source row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+//   x          16.16 fixed point source position of the first pixel.
+//   dx         16.16 fixed point source step per destination pixel.
+void ScaleUVCols_C(uint8_t* dst_uv,
+                   const uint8_t* src_uv,
+                   int dst_width,
+                   int x,
+                   int dx) {
+  int j;
+  // Copy byte by byte instead of through a uint16_t pointer: the buffers are
+  // only guaranteed to be byte aligned.
+  for (j = 0; j < dst_width; ++j) {
+    const uint8_t* in = src_uv + (x >> 16) * 2;  // Integer part selects pixel.
+    dst_uv[0] = in[0];
+    dst_uv[1] = in[1];
+    dst_uv += 2;
+    x += dx;
+  }
+}
+
+// Point-sampling column scaler for interleaved 2 channel (UV) pixels that
+// tracks the source position in 64 bits, so the position cannot overflow an
+// int while stepping across the row.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   src_uv     source row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+//   x32        16.16 fixed point source position of the first pixel.
+//   dx         16.16 fixed point source step per destination pixel.
+void ScaleUVCols64_C(uint8_t* dst_uv,
+                     const uint8_t* src_uv,
+                     int dst_width,
+                     int x32,
+                     int dx) {
+  int64_t x = (int64_t)(x32);
+  int j;
+  // Copy byte by byte instead of through a uint16_t pointer: the buffers are
+  // only guaranteed to be byte aligned.
+  for (j = 0; j < dst_width; ++j) {
+    const uint8_t* in = src_uv + (x >> 16) * 2;  // Integer part selects pixel.
+    dst_uv[0] = in[0];
+    dst_uv[1] = in[1];
+    dst_uv += 2;
+    x += dx;
+  }
+}
+
+// Scales a single row of interleaved 2 channel (UV) pixels up by 2x using
+// point sampling: every source pixel is written twice. If dst_width is odd
+// the last source pixel is written once.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   src_uv     source row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+//   x, dx      unused; present to match the other column scalers.
+void ScaleUVColsUp2_C(uint8_t* dst_uv,
+                      const uint8_t* src_uv,
+                      int dst_width,
+                      int x,
+                      int dx) {
+  int j;
+  (void)x;
+  (void)dx;
+  // Copy byte by byte instead of through a uint16_t pointer: the buffers are
+  // only guaranteed to be byte aligned.
+  for (j = 0; j < dst_width; ++j) {
+    const uint8_t* in = src_uv + (j >> 1) * 2;  // Destination j <- source j/2.
+    dst_uv[2 * j + 0] = in[0];
+    dst_uv[2 * j + 1] = in[1];
+  }
+}
+
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
+// Mimics SSSE3 blender
+// BLENDER1 blends two 8 bit values with a 7 bit fraction f (weights 0x7f ^ f
+// and f). BLENDERC applies it to the byte at bit offset s of two 16 bit
+// pixels and BLENDER combines both bytes of a 2 channel pixel.
+// Arguments and expansions are fully parenthesized so the macros can be used
+// inside larger expressions and with expression arguments.
+#define BLENDER1(a, b, f) ((((a) * (0x7f ^ (f))) + ((b) * (f))) >> 7)
+#define BLENDERC(a, b, f, s) \
+  ((uint16_t)(BLENDER1(((a) >> (s)) & 255, ((b) >> (s)) & 255, f) << (s)))
+#define BLENDER(a, b, f) (BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0))
+
+// Scales a single row of interleaved 2 channel (UV) pixels with horizontal
+// linear filtering: each destination pixel blends the source pixel at the
+// integer part of the position with its right-hand neighbour, using a 7 bit
+// fraction taken from bits 9..15 of the position.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   src_uv     source row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+//   x          16.16 fixed point source position of the first pixel.
+//   dx         16.16 fixed point source step per destination pixel.
+void ScaleUVFilterCols_C(uint8_t* dst_uv,
+                         const uint8_t* src_uv,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  int j;
+  for (j = 0; j < dst_width; ++j) {
+    const uint8_t* in = src_uv + (x >> 16) * 2;
+    const int xf = (x >> 9) & 0x7f;
+    // Assemble and split the 16 bit pixels from bytes instead of reading
+    // through a uint16_t pointer: the buffers are only guaranteed to be byte
+    // aligned. BLENDER treats both bytes alike, so byte order is irrelevant.
+    const uint16_t a = (uint16_t)(in[0] | (in[1] << 8));
+    const uint16_t b = (uint16_t)(in[2] | (in[3] << 8));
+    const uint16_t blended = (uint16_t)(BLENDER(a, b, xf));
+    dst_uv[0] = (uint8_t)(blended & 255);
+    dst_uv[1] = (uint8_t)(blended >> 8);
+    dst_uv += 2;
+    x += dx;
+  }
+}
+
+// Linear-filtering column scaler for interleaved 2 channel (UV) pixels that
+// tracks the source position in 64 bits, so the position cannot overflow an
+// int while stepping across the row. Each destination pixel blends the source
+// pixel at the integer part of the position with its right-hand neighbour,
+// using a 7 bit fraction taken from bits 9..15 of the position.
+//   dst_uv     destination row, 2 bytes per pixel.
+//   src_uv     source row, 2 bytes per pixel.
+//   dst_width  number of destination pixels.
+//   x32        16.16 fixed point source position of the first pixel.
+//   dx         16.16 fixed point source step per destination pixel.
+void ScaleUVFilterCols64_C(uint8_t* dst_uv,
+                           const uint8_t* src_uv,
+                           int dst_width,
+                           int x32,
+                           int dx) {
+  int64_t x = (int64_t)(x32);
+  int j;
+  for (j = 0; j < dst_width; ++j) {
+    const uint8_t* in = src_uv + (x >> 16) * 2;
+    const int xf = (int)((x >> 9) & 0x7f);
+    // Assemble and split the 16 bit pixels from bytes instead of reading
+    // through a uint16_t pointer: the buffers are only guaranteed to be byte
+    // aligned. BLENDER treats both bytes alike, so byte order is irrelevant.
+    const uint16_t a = (uint16_t)(in[0] | (in[1] << 8));
+    const uint16_t b = (uint16_t)(in[2] | (in[3] << 8));
+    const uint16_t blended = (uint16_t)(BLENDER(a, b, xf));
+    dst_uv[0] = (uint8_t)(blended & 255);
+    dst_uv[1] = (uint8_t)(blended >> 8);
+    dst_uv += 2;
+    x += dx;
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
// Scale plane vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width,
@@ -1029,7 +1465,7 @@ void ScalePlaneVertical(int src_height,
int x,
int y,
int dy,
- int bpp,
+ int bpp, // bytes per pixel. 4 for ARGB.
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
@@ -1075,11 +1511,11 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_MMI)
- if (TestCpuFlag(kCpuHasMMI)) {
- InterpolateRow = InterpolateRow_Any_MMI;
- if (IS_ALIGNED(dst_width_bytes, 8)) {
- InterpolateRow = InterpolateRow_MMI;
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
}
}
#endif
@@ -1097,6 +1533,7 @@ void ScalePlaneVertical(int src_height,
y += dy;
}
}
+
void ScalePlaneVertical_16(int src_height,
int dst_width,
int dst_height,
@@ -1107,7 +1544,7 @@ void ScalePlaneVertical_16(int src_height,
int x,
int y,
int dy,
- int wpp,
+ int wpp, /* words per pixel. normally 1 */
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
@@ -1123,32 +1560,32 @@ void ScalePlaneVertical_16(int src_height,
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
- InterpolateRow = InterpolateRow_Any_16_SSE2;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSE2;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
- InterpolateRow = InterpolateRow_Any_16_SSSE3;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_words, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
- InterpolateRow = InterpolateRow_Any_16_AVX2;
- if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_16_Any_AVX2;
+ if (IS_ALIGNED(dst_width_words, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
- InterpolateRow = InterpolateRow_Any_16_NEON;
- if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_16_Any_NEON;
+ if (IS_ALIGNED(dst_width_words, 8)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
@@ -1168,6 +1605,70 @@ void ScalePlaneVertical_16(int src_height,
}
}
+// Scale a plane of 16 bit samples vertically, writing 8 bit samples.
+// Each destination row is produced by the selected InterpolateRow_16To8
+// kernel from source row (y >> 16), with an 8 bit vertical fraction when
+// filtering is enabled and fraction 0 otherwise.
+//   src_stride is added to a uint16_t pointer (counts samples); dst_stride is
+//   added to a uint8_t pointer (counts bytes).
+//   x, y, dy are 16.16 fixed point; only the integer part of x is used, as a
+//   starting column.
+// Use scale to convert lsb formats to msb, depending how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+// TODO(fbarchard): change scale to bits
+void ScalePlaneVertical_16To8(int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint16_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int y,
+                              int dy,
+                              int wpp, /* words per pixel. normally 1 */
+                              int scale,
+                              enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
+  void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb,
+                               ptrdiff_t src_stride, int scale, int dst_width,
+                               int source_y_fraction) = InterpolateRow_16To8_C;
+  // Clamp y just below the last row.
+  // NOTE(review): presumably so the row after yi, which the interpolator
+  // blends with, stays inside the plane - confirm against InterpolateRow.
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+
+  // The kernels are called with dst_width_words samples, so that is the
+  // count whose alignment decides whether the non-Any kernel can be used.
+#if defined(HAS_INTERPOLATEROW_16TO8_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON;
+    if (IS_ALIGNED(dst_width_words, 8)) {
+      InterpolateRow_16To8 = InterpolateRow_16To8_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow_16To8 = InterpolateRow_16To8_AVX2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    // Widen before multiplying so large planes cannot overflow int.
+    InterpolateRow_16To8(dst_argb, src_argb + yi * (int64_t)src_stride,
+                         src_stride, scale, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width,
int src_height,
@@ -1181,8 +1682,8 @@ enum FilterMode ScaleFilterReduce(int src_width,
src_height = -src_height;
}
if (filtering == kFilterBox) {
- // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
- if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ // If scaling either axis to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
}
@@ -1217,7 +1718,7 @@ int FixedDiv_C(int num, int div) {
return (int)(((int64_t)(num) << 16) / div);
}
-// Divide num by div and return as 16.16 fixed point result.
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div) {
return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
}
@@ -1260,14 +1761,14 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
if (dst_height <= src_height) {
*dy = FixedDiv(src_height, dst_height);
*y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_height > 1) {
+ } else if (src_height > 1 && dst_height > 1) {
*dy = FixedDiv1(src_height, dst_height);
*y = 0;
}
@@ -1276,7 +1777,7 @@ void ScaleSlope(int src_width,
if (dst_width <= Abs(src_width)) {
*dx = FixedDiv(Abs(src_width), dst_width);
*x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
- } else if (dst_width > 1) {
+ } else if (src_width > 1 && dst_width > 1) {
*dx = FixedDiv1(Abs(src_width), dst_width);
*x = 0;
}
diff --git a/files/source/scale_dspr2.cc b/files/source/scale_dspr2.cc
deleted file mode 100644
index ddedcbf4..00000000
--- a/files/source/scale_dspr2.cc
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- // TODO(fbarchard): Use odd pixels instead of even.
- "precrq.qb.ph $t8, $t1, $t0 \n" // |7|5|3|1|
- "precrq.qb.ph $t0, $t3, $t2 \n" // |15|13|11|9|
- "precrq.qb.ph $t1, $t5, $t4 \n" // |23|21|19|17|
- "precrq.qb.ph $t2, $t7, $t6 \n" // |31|29|27|25|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t8, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t1, 8(%[dst]) \n"
- "sw $t2, 12(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 16 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0xf \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t0, 1(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 2 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t0, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- const uint8* t = src_ptr + src_stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
- "bltz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 0(%[t]) \n" // |19|18|17|16|
- "lw $t5, 4(%[t]) \n" // |23|22|21|20|
- "lw $t6, 8(%[t]) \n" // |27|26|25|24|
- "lw $t7, 12(%[t]) \n" // |31|30|29|28|
- "addiu $t9, $t9, -1 \n"
- "srl $t8, $t0, 16 \n" // |X|X|3|2|
- "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
- "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
- "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
- "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
- "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
- "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
- "srl $t8, $t1, 16 \n" // |X|X|7|6|
- "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
- "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
- "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
- "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
- "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
- "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
- "srl $t8, $t2, 16 \n" // |X|X|11|10|
- "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
- "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
- "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
- "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
- "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
- "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
- "srl $t8, $t3, 16 \n" // |X|X|15|14|
- "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
- "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
- "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
- "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
- "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
- "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
- "addiu %[src_ptr], %[src_ptr], 16 \n"
- "addiu %[t], %[t], 16 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "sb $t1, 2(%[dst]) \n"
- "sb $t5, 3(%[dst]) \n"
- "sb $t2, 4(%[dst]) \n"
- "sb $t6, 5(%[dst]) \n"
- "sb $t3, 6(%[dst]) \n"
- "sb $t7, 7(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 0x7 \n" // x = residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lwr $t1, 0(%[src_ptr]) \n"
- "lwl $t1, 3(%[src_ptr]) \n"
- "lwr $t2, 0(%[t]) \n"
- "lwl $t2, 3(%[t]) \n"
- "srl $t8, $t1, 16 \n"
- "ins $t1, $t2, 16, 16 \n"
- "ins $t2, $t8, 0, 16 \n"
- "raddu.w.qb $t1, $t1 \n"
- "raddu.w.qb $t2, $t2 \n"
- "shra_r.w $t1, $t1, 2 \n"
- "shra_r.w $t2, $t2, 2 \n"
- "sb $t1, 0(%[dst]) \n"
- "sb $t2, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -2 \n"
- "addiu %[t], %[t], 4 \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 2 \n"
-
- "3: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 3 \n"
- "beqz $t9, 2f \n"
- " nop \n"
-
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
- "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
- "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
- "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
- "precrq.qb.ph $t1, $t2, $t1 \n" // |14|10|6|2|
- "precrq.qb.ph $t5, $t6, $t5 \n" // |30|26|22|18|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu $t9, $t9, -1 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t5, 4(%[dst]) \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 8 \n"
-
- "2: \n"
- "andi $t9, %[dst_width], 7 \n" // residue
- "beqz $t9, 3f \n"
- " nop \n"
-
- "21: \n"
- "lbu $t1, 2(%[src_ptr]) \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "addiu $t9, $t9, -1 \n"
- "sb $t1, 0(%[dst]) \n"
- "bgtz $t9, 21b \n"
- " addiu %[dst], %[dst], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
- : [dst_width] "r"(dst_width)
- : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- const uint8* s2 = s1 + stride;
- const uint8* s3 = s2 + stride;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "srl $t9, %[dst_width], 1 \n"
- "andi $t8, %[dst_width], 1 \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
- "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
- "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
- "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
- "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
- "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "add $t4, $t4, $t5 \n"
- "add $t6, $t6, $t7 \n"
- "add $t4, $t4, $t6 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "shra_r.w $t4, $t4, 4 \n"
- "sb $t0, 0(%[dst]) \n"
- "sb $t4, 1(%[dst]) \n"
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[s3], %[s3], 8 \n"
- "addiu $t9, $t9, -1 \n"
- "bgtz $t9, 1b \n"
- " addiu %[dst], %[dst], 2 \n"
- "beqz $t8, 2f \n"
- " nop \n"
-
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
- "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
- "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
- "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
- "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
- "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
- "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
- "add $t0, $t0, $t1 \n"
- "add $t1, $t2, $t3 \n"
- "add $t0, $t0, $t1 \n"
- "shra_r.w $t0, $t0, 4 \n"
- "sb $t0, 0(%[dst]) \n"
-
- "2: \n"
- ".set pop \n"
-
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
- [s3] "+r"(s3)
- : [dst_width] "r"(dst_width)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "1: \n"
- "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
- "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
- "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
- "addiu %[dst_width], %[dst_width], -24 \n"
- "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
- "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
- "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
- "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
- "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
- "prepend $t1, $t2, 8 \n" // |4|3|1|0|
- "prepend $t3, $t4, 24 \n" // |15|13|12|11|
- "prepend $t5, $t6, 8 \n" // |20|19|17|16|
- "prepend $t7, $t8, 24 \n" // |31|29|28|27|
- "sw $t1, 0(%[dst]) \n"
- "sw $t0, 4(%[dst]) \n"
- "sw $t3, 8(%[dst]) \n"
- "sw $t5, 12(%[dst]) \n"
- "sw $t9, 16(%[dst]) \n"
- "sw $t7, 20(%[dst]) \n"
- "bnez %[dst_width], 1b \n"
- " addiu %[dst], %[dst], 24 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t3, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
- "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t2, $t2, $t4 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "sll $t5, $t0, 1 \n"
- "add $t0, $t5, $t0 \n"
- "shra_r.ph $t2, $t2, 2 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shll.ph $t4, $t2, 1 \n"
- "addq.ph $t4, $t4, $t2 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.w $t0, $t0, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "srl $t1, $t6, 16 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "repl.ph $t2, 3 \n" // 0x00030003
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
- "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
- "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
- "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
- "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
- "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
- "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t1, $t1 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "shra_r.w $t1, $t1, 1 \n"
- "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
- "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
- "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
- "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
- "addu.ph $t4, $t4, $t3 \n"
- "addu.ph $t6, $t6, $t5 \n"
- "shra_r.ph $t6, $t6, 2 \n"
- "shra_r.ph $t4, $t4, 2 \n"
- "addu.ph $t6, $t6, $t4 \n"
- "addiu %[src_ptr], %[src_ptr], 4 \n"
- "shra_r.ph $t6, $t6, 1 \n"
- "addu $t0, $t0, $t1 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "shra_r.w $t0, $t0, 1 \n"
- "srl $t1, $t6, 16 \n"
- "sb $t1, 0(%[d]) \n"
- "sb $t0, 1(%[d]) \n"
- "sb $t6, 2(%[d]) \n"
- "bgtz %[dst_width], 1b \n"
- " addiu %[d], %[d], 3 \n"
- "3: \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
- [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width) {
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
- "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
- "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
- "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
- "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
- "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
- "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
- "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
- "wsbh $t0, $t0 \n" // |2|3|0|1|
- "wsbh $t6, $t6 \n" // |26|27|24|25|
- "srl $t0, $t0, 8 \n" // |X|2|3|0|
- "srl $t3, $t3, 16 \n" // |X|X|15|14|
- "srl $t5, $t5, 16 \n" // |X|X|23|22|
- "srl $t7, $t7, 16 \n" // |X|X|31|30|
- "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
- "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
- "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
- "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
- "prepend $t2, $t3, 24 \n" // |X|15|14|11|
- "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
- "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
- "addiu %[src_ptr], %[src_ptr], 32 \n"
- "addiu %[dst_width], %[dst_width], -12 \n"
- "addiu $t8,%[dst_width], -12 \n"
- "sw $t1, 0(%[dst]) \n"
- "sw $t4, 4(%[dst]) \n"
- "sw $t6, 8(%[dst]) \n"
- "bgez $t8, 1b \n"
- " addiu %[dst], %[dst], 12 \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
- :
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* t = src_ptr + stride;
- const int c = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
- "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
- "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
- "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
- "srl $t4, $t4, 2 \n" // t4 / 4
- "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
- "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
- "addu $t6, $t5, $t6 \n"
- "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
- "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
- "addu $t0, $t0, $t2 \n"
- "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[t], %[t], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t4, -1(%[dst_ptr]) \n"
- "sb $t6, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
- [dst_width] "+r"(dst_width)
- : [c] "r"(c)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width) {
- intptr_t stride = src_stride;
- const uint8* s1 = src_ptr + stride;
- stride += stride;
- const uint8* s2 = src_ptr + stride;
- const int c1 = 0x1C71;
- const int c2 = 0x2AAA;
-
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
-
- "1: \n"
- "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
- "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
- "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
- "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
- "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
- "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
- "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
- "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
- "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
- "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
- "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
- "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
- "raddu.w.qb $t8, $t8 \n" // R5+R4
- "addu $t7, $t7, $t8 \n"
- "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
- "raddu.w.qb $t8, $t8 \n" // R7 + R6
- "addu $t6, $t6, $t8 \n"
- "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
- "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
- "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
- "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
- "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
- "addu $t7, $t7, $t8 \n"
- "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
- "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
- "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
- "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
- "raddu.w.qb $t0, $t0 \n"
- "raddu.w.qb $t2, $t2 \n"
- "raddu.w.qb $t4, $t4 \n"
- "addu $t0, $t0, $t2 \n"
- "addu $t0, $t0, $t4 \n"
- "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
- "addiu %[src_ptr], %[src_ptr], 8 \n"
- "addiu %[s1], %[s1], 8 \n"
- "addiu %[s2], %[s2], 8 \n"
- "addiu %[dst_width], %[dst_width], -3 \n"
- "addiu %[dst_ptr], %[dst_ptr], 3 \n"
- "srl $t6, $t6, 16 \n"
- "srl $t7, $t7, 16 \n"
- "srl $t0, $t0, 16 \n"
- "sb $t6, -1(%[dst_ptr]) \n"
- "sb $t7, -2(%[dst_ptr]) \n"
- "bgtz %[dst_width], 1b \n"
- " sb $t0, -3(%[dst_ptr]) \n"
- ".set pop \n"
- : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
- [s2] "+r"(s2), [dst_width] "+r"(dst_width)
- : [c1] "r"(c1), [c2] "r"(c2)
- : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
-}
-
-void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- int x;
- for (x = 0; x < ((src_width - 1)); x += 8) {
- uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
- uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
- __asm__ __volatile__(
- ".set push \n"
- ".set noreorder \n"
- "lw %[tmp_t5], 0(%[src_ptr]) \n"
- "lw %[tmp_t6], 4(%[src_ptr]) \n"
- "lw %[tmp_t1], 0(%[dst_ptr]) \n"
- "lw %[tmp_t2], 4(%[dst_ptr]) \n"
- "lw %[tmp_t3], 8(%[dst_ptr]) \n"
- "lw %[tmp_t4], 12(%[dst_ptr]) \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t5] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t5] \n"
- "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t7] \n"
- "addu.ph %[tmp_t2], %[tmp_t2], %[tmp_t8] \n"
- "preceu.ph.qbr %[tmp_t7], %[tmp_t6] \n"
- "preceu.ph.qbl %[tmp_t8], %[tmp_t6] \n"
- "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t7] \n"
- "addu.ph %[tmp_t4], %[tmp_t4], %[tmp_t8] \n"
- "sw %[tmp_t1], 0(%[dst_ptr]) \n"
- "sw %[tmp_t2], 4(%[dst_ptr]) \n"
- "sw %[tmp_t3], 8(%[dst_ptr]) \n"
- "sw %[tmp_t4], 12(%[dst_ptr]) \n"
- ".set pop \n"
- :
- [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
- [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
- [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
- : [dst_ptr] "r"(dst_ptr));
- src_ptr += 8;
- dst_ptr += 8;
- }
-
- if ((src_width)&7) {
- for (x = 0; x < ((src_width - 1) & 7); x += 1) {
- dst_ptr[0] += src_ptr[0];
- src_ptr += 1;
- dst_ptr += 1;
- }
- }
-}
-
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
index 90a49f30..edaf2e29 100644
--- a/files/source/scale_gcc.cc
+++ b/files/source/scale_gcc.cc
@@ -17,8 +17,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
@@ -102,16 +101,16 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
// 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -125,25 +124,25 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -156,33 +155,33 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -196,27 +195,25 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
@@ -225,26 +222,26 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -258,34 +255,34 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -301,24 +298,24 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -332,46 +329,46 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea 0x00(%4,%4,2),%3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%4,2),%%xmm2 \n"
- "movdqu 0x10(%0,%4,2),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -387,26 +384,26 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "lea 0x40(%0),%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -420,46 +417,46 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm0 \n"
- "vmovdqu 0x20(%0),%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
- "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
- "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
- "lea 0x40(%0),%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@@ -476,37 +473,35 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
@@ -514,65 +509,63 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
}
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
@@ -580,69 +573,67 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
:
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
:
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm6 \n"
- "movdqu 0x00(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqu 0x10(%0),%%xmm6 \n"
- "movdqu 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
- "xmm7");
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
}
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
@@ -651,23 +642,23 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -681,44 +672,43 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
:
: "m"(kShufAb0), // %0
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6");
}
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
@@ -726,126 +716,1105 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
:
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
+ asm volatile(LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
+}
+
+static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,  // byte-shuffle control: swaps the two 16-bit
+ 10, 11, 8, 9, 14, 15, 12, 13};  // halves of every 32-bit group; used with pshufb to pick the "far" sample
+
+static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,  // repeating 3,1,1,3 byte pattern
+ 3, 1, 1, 3, 3, 1, 1, 3};  // NOTE(review): presumably multiply-add weights for 3:1 blends; no use visible here - confirm
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSE2  // 2x horizontal linear upsample of one row of 8-bit samples
+void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,  // source row
+ uint8_t* dst_ptr,  // destination row
+ int dst_width) {  // output sample count; each pass emits 16 and loops while the remainder is > 0
asm volatile(
+ "pxor %%xmm0,%%xmm0 \n" // 0 (zero source for byte->word unpacks)
+ "pcmpeqw %%xmm6,%%xmm6 \n" // all bits set
+ "psrlw $15,%%xmm6 \n" // all 1
+ "psllw $1,%%xmm6 \n" // all 2 (rounding term for the >>2 below)
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x00(%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqu 0x00(%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678 (byte 8 lies one past the 8 consumed per pass)
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n" // near+far (lo)
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm6,%%xmm4 \n" // near+far+2 (lo)
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n" // 2*near (lo)
+ "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
+ "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n" // near+far (hi)
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm6,%%xmm1 \n" // near+far+2 (hi)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near (hi)
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+
+ "packuswb %%xmm1,%%xmm5 \n" // narrow lo|hi words back to 16 bytes
+ "movdqu %%xmm5,(%1) \n" // store 16 output samples
+
+ "lea 0x8(%0),%0 \n" // source advances 8 samples
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif  // HAS_SCALEROWUP2_LINEAR_SSE2
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSE2  // 2x bilinear upsample: two 8-bit source rows in, two output rows out
+void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,  // upper source row
+ ptrdiff_t src_stride,  // byte offset from the upper to the lower source row
+ uint8_t* dst_ptr,  // upper destination row
+ ptrdiff_t dst_stride,  // byte offset from the upper to the lower destination row
+ int dst_width) {  // output samples per row; each pass emits 16 and loops while the remainder is > 0
+ asm volatile(
+ LABELALIGN
+ "1: \n"
+ "pxor %%xmm0,%%xmm0 \n" // 0 (re-zeroed every pass: xmm0 is reused as the constant 8 below)
+ // above line
+ "movq (%0),%%xmm1 \n" // 01234567
+ "movq 1(%0),%%xmm2 \n" // 12345678
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
+ "paddw %%xmm5,%%xmm4 \n" // near+far
+ "movdqa %%xmm3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
+ "paddw %%xmm5,%%xmm5 \n" // 2*near
+ "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
+
+ "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm2,%%xmm1 \n" // near+far
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // below line
+ "movq (%0,%3),%%xmm6 \n" // 01234567 (row at src_ptr + src_stride bytes)
+ "movq 1(%0,%3),%%xmm2 \n" // 12345678
+ "movdqa %%xmm6,%%xmm3 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
+ "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
+ "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
+
+ "movdqa %%xmm6,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
+ "movdqa %%xmm2,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
+ "paddw %%xmm7,%%xmm5 \n" // near+far
+ "movdqa %%xmm3,%%xmm7 \n"
+ "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
+ "paddw %%xmm7,%%xmm7 \n" // 2*near
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
+
+ "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
+ "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
+ "paddw %%xmm6,%%xmm2 \n" // near+far
+ "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
+
+ // xmm4 xmm1  (horizontally filtered upper row: lo, hi)
+ // xmm5 xmm2  (horizontally filtered lower row: lo, hi)
+ "pcmpeqw %%xmm0,%%xmm0 \n" // all bits set (the zero in xmm0 is no longer needed this pass)
+ "psrlw $15,%%xmm0 \n" // all 1
+ "psllw $3,%%xmm0 \n" // all 8
+
+ "movdqa %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+
+ "movdqa %%xmm1,%%xmm7 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm7 \n" // ^ div by 16
+
+ "packuswb %%xmm7,%%xmm3 \n" // narrow lo|hi words to 16 bytes
+ "movdqu %%xmm3,(%1) \n" // save above line
+
+ "movdqa %%xmm5,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
+ "psrlw $4,%%xmm2 \n" // ^ div by 16
+
+ "packuswb %%xmm2,%%xmm5 \n" // narrow lo|hi words to 16 bytes
+ "movdqu %%xmm5,(%1,%4) \n" // save below line (dst_ptr + dst_stride bytes)
+
+ "lea 0x8(%0),%0 \n" // source advances 8 samples
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
+#endif  // HAS_SCALEROWUP2_BILINEAR_SSE2
-// Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
+#ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3  // 2x horizontal linear upsample of one row of 16-bit-stored samples
+void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,  // source row
+ uint16_t* dst_ptr,  // destination row
+ int dst_width) {  // output sample count; each pass emits 16 and loops while the remainder is > 0
+ asm volatile(
+ "movdqa %3,%%xmm5 \n" // kLinearShuffleFar: swaps adjacent 16-bit lanes
+ "pcmpeqw %%xmm4,%%xmm4 \n" // all bits set
+ "psrlw $15,%%xmm4 \n" // all 1
+ "psllw $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) (sample 8 lies one past the 8 consumed per pass)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far)
+
+ "paddw %%xmm4,%%xmm1 \n" // far+2
+ "paddw %%xmm4,%%xmm3 \n" // far+2
+ "paddw %%xmm0,%%xmm1 \n" // near+far+2
+ "paddw %%xmm2,%%xmm3 \n" // near+far+2
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo); 16-bit lanes, wraps if the sum exceeds 65535
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi)
+
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far
+ "movdqu %%xmm0,(%1) \n" // outputs 0..7
+ "movdqu %%xmm2,16(%1) \n" // outputs 8..15
+
+ "lea 0x10(%0),%0 \n" // source advances 8 samples (16 bytes)
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEROWUP2_LINEAR_12_SSSE3
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+// 2x bilinear upsample of 16-bit-stored samples: reads two source rows
+// (src_ptr and src_ptr + src_stride elements) and writes two output rows
+// (dst_ptr and dst_ptr + dst_stride elements). Each output is a 9:3:3:1
+// weighted sum of its four neighbours plus 8, shifted right by 4.
+// All sums stay in 16-bit lanes (paddw), so 16*sample+8 must fit in 16 bits,
+// i.e. samples must not exceed 4095.
+// Each pass consumes 8 source samples per row and emits 16 outputs per row;
+// the loop repeats while dst_width - 16 remains > 0.
+void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
asm volatile(
+ "pcmpeqw %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n"
+ "psllw $3,%%xmm7 \n" // all 8
+ "movdqa %5,%%xmm6 \n" // kLinearShuffleFar
- "pxor %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ // above line
+ "movdqu (%0),%%xmm0 \n" // 01234567 (16)
+ "movdqu 2(%0),%%xmm1 \n" // 12345678 (16)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16)
+ "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16)
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far)
+ "paddw %%xmm0,%%xmm1 \n" // near+far
+ "paddw %%xmm2,%%xmm3 \n" // near+far
+ "paddw %%xmm0,%%xmm0 \n" // 2*near
+ "paddw %%xmm2,%%xmm2 \n" // 2*near
+ "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi)
+
+ // below line (index register scaled by 2: stride is in 16-bit elements)
+ "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16)
+ "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16)
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16)
+ "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16)
+ "movdqa %%xmm3,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far)
+ "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far)
+ "paddw %%xmm1,%%xmm4 \n" // near+far
+ "paddw %%xmm3,%%xmm5 \n" // near+far
+ "paddw %%xmm1,%%xmm1 \n" // 2*near
+ "paddw %%xmm3,%%xmm3 \n" // 2*near
+ "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo)
+ "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ // xmm0 xmm2
+ // xmm1 xmm3
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,(%1) \n"
+
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16
+ "movdqu %%xmm4,0x10(%1) \n"
+
+ "movdqa %%xmm1,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16
+ "movdqu %%xmm1,(%1,%4,2) \n"
+
+ "movdqa %%xmm3,%%xmm4 \n"
+ "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm3 \n" // ^ div by 16
+ "movdqu %%xmm3,0x10(%1,%4,2) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ // xmm7 holds the rounding constant for the whole loop, so it must be
+ // declared clobbered along with xmm0-xmm6.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif  // HAS_SCALEROWUP2_BILINEAR_12_SSSE3
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2  // 2x horizontal linear upsample of 16-bit samples using 32-bit intermediates
+void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,  // source row
+ uint16_t* dst_ptr,  // destination row
+ int dst_width) {  // output sample count; each pass emits 8 and loops while the remainder is > 0
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n" // 0 (zero source for word->dword unpacks)
+ "pcmpeqd %%xmm4,%%xmm4 \n" // all bits set
+ "psrld $31,%%xmm4 \n" // all 1
+ "pslld $1,%%xmm4 \n" // all 2
- // 16 pixel loop.
LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n" // src_ptr += 16
- "movdqu (%1),%%xmm0 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b) (sample 4 lies one past the 4 consumed per pass)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packssdw %%xmm1,%%xmm0 \n" // NOTE(review): signed saturation - a lane above 0x7FFF is stored as 0x7FFF; confirm the expected input range
+ "pshufd $0b11011000,%%xmm0,%%xmm0 \n" // swap the middle dwords so outputs 0..7 are in order
+ "movdqu %%xmm0,(%1) \n" // store 8 output samples
+
+ "lea 0x8(%0),%0 \n" // source advances 4 samples (8 bytes)
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(src_width) // %2
+ "+r"(dst_width) // %2
:
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
+#endif  // HAS_SCALEROWUP2_LINEAR_16_SSE2
-#ifdef HAS_SCALEADDROW_AVX2
-// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8_t* src_ptr,
- uint16_t* dst_ptr,
- int src_width) {
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
+// Upsamples two rows of 16-bit samples by 2x horizontally and vertically,
+// producing two destination rows. Each output sample is
+//   (9*nearest + 3*horizontal + 3*vertical + 1*diagonal + 8) >> 4
+// and is computed in 32-bit lanes.
+//
+// src_ptr, src_stride: first source row; the second source row is read at
+//   src_ptr + src_stride, with the stride counted in uint16_t elements
+//   (the asm addresses it as (%0,%3,2)).
+// dst_ptr, dst_stride: first destination row; the second destination row is
+//   written at dst_ptr + dst_stride, again in uint16_t elements.
+// dst_width: output samples per row. Each loop pass reads 5 samples from
+//   each source row and stores 8 samples to each destination row. The loop
+//   body always runs at least once and repeats while the remaining width
+//   stays positive after subtracting 8.
+//
+// NOTE(review): packssdw saturates as signed 16-bit, so any result above
+// 0x7FFF is stored as 0x7FFF. That is only exact when samples never use the
+// top bit -- confirm against callers (SSE2 has no unsigned dword pack).
+void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n" // zero, used to widen 16b to 32b
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ // Row 1: horizontal 3*near+far.
+ "movq (%0),%%xmm0 \n" // 0123 (16b)
+ "movq 2(%0),%%xmm1 \n" // 1234 (16b)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ // Row 2: horizontal 3*near+far.
+ "movq (%0,%3,2),%%xmm2 \n"
+ "movq 2(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b)
+ "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b)
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far)
+ "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ // Vertical blend: 3*(own row) + (other row) + 8, then >> 4.
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packssdw %%xmm0,%%xmm4 \n" // signed-saturating pack to 16b
+ "pshufd $0b11011000,%%xmm4,%%xmm4 \n" // restore sample order
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packssdw %%xmm2,%%xmm5 \n" // signed-saturating pack to 16b
+ "pshufd $0b11011000,%%xmm5,%%xmm5 \n" // restore sample order
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
+// Upsamples one row of 8-bit samples by 2x horizontally. Each output is
+// (3*near + far + 2) >> 2, where near/far are adjacent source bytes.
+//
+// src_ptr: source row; each loop pass reads 9 bytes (8 at offset 0 and 8 at
+//   offset 1).
+// dst_ptr: destination row; each loop pass stores 16 bytes.
+// dst_width: output bytes. The loop body always runs at least once and
+//   repeats while the remaining width stays positive after subtracting 16.
+//
+// kLinearMadd31 is defined outside this function; per the inline comments it
+// presumably holds the 3/1 byte weights for pmaddubsw -- confirm at its
+// definition. It is loaded with movdqa, so it must be 16-byte aligned.
+void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n" // pmaddubsw weights
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ // Only xmm0-xmm4 are written by the asm above.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
+void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, // first source row
+ ptrdiff_t src_stride, // byte offset from src_ptr to the second source row
+ uint8_t* dst_ptr, // first destination row
+ ptrdiff_t dst_stride, // byte offset from dst_ptr to the second destination row
+ int dst_width) { // output bytes per row; 16 are produced per loop pass
+ asm volatile( // 2x bilinear upsample of 8-bit samples: two rows in, two rows out
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n" // pmaddubsw weights (aligned load of kLinearMadd31)
+ // Each pass reads 9 bytes from each source row; the body always runs at least once.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 01234567
+ "movq 1(%0),%%xmm1 \n" // 12345678
+ "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
+ "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
+ "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
+ // Same horizontal pass for the second source row.
+ "movq (%0,%3),%%xmm1 \n" // 01234567 (row 2)
+ "movq 1(%0,%3),%%xmm4 \n" // 12345678 (row 2)
+ "punpcklwd %%xmm1,%%xmm1 \n" // 0101232345456767
+ "punpcklwd %%xmm4,%%xmm4 \n" // 1212343456567878
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm4,%%xmm3 \n" // 4545565667677878
+ "punpckldq %%xmm4,%%xmm1 \n" // 0101121223233434
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+ // Horizontal sums now live in:
+ // row 1: xmm0 (lo) xmm2 (hi)
+ // row 2: xmm1 (lo) xmm3 (hi)
+ // Vertical blend: 3*(own row) + (other row) + 8, then >> 4.
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n" // row 1: 16 words -> 16 bytes
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n" // row 2: 16 words -> 16 bytes
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n"
+ "lea 0x10(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
+void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, // source row; 17 bytes are read per loop pass
+ uint8_t* dst_ptr, // destination row; 32 bytes are stored per loop pass
+ int dst_width) { // output bytes; the loop body always runs at least once
+ asm volatile( // 2x horizontal upsample of 8-bit samples: (3*near + far + 2) >> 2
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n" // kLinearMadd31 copied into both 128-bit lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // low lane: 01234567, high lane: 89ABCDEF
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // low lane: 12345678, high lane: 9ABCDEF0
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" // duplicate each byte pair, per lane
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" // duplicate each byte pair, per lane
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // adjacent-byte pairs (hi half of each lane)
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // adjacent-byte pairs (lo half of each lane)
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // words -> bytes, per lane
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
+void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, // first source row
+ ptrdiff_t src_stride, // byte offset from src_ptr to the second source row
+ uint8_t* dst_ptr, // first destination row
+ ptrdiff_t dst_stride, // byte offset from dst_ptr to the second destination row
+ int dst_width) { // output bytes per row; 32 are produced per loop pass
+ asm volatile( // 2x bilinear upsample of 8-bit samples: two rows in, two rows out
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n" // kLinearMadd31 copied into both 128-bit lanes
+ // Each pass reads 17 bytes from each source row; the body always runs at least once.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // low lane: 01234567, high lane: 89ABCDEF
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // low lane: 12345678, high lane: 9ABCDEF0
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+ // Same horizontal pass for the second source row.
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+ // Horizontal sums now live in:
+ // row 1: ymm0 (lo) ymm1 (hi)
+ // row 2: ymm2 (lo) ymm3 (hi)
+ // Vertical blend: 3*(own row) + (other row) + 8, then >> 4.
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" // row 1: words -> bytes, per lane
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" // row 2: words -> bytes, per lane
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
+void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, // source row; 17 samples are read per loop pass
+ uint16_t* dst_ptr, // destination row; 32 samples are stored per loop pass
+ int dst_width) { // output samples; the loop body always runs at least once
+ asm volatile( // 2x horizontal upsample: (3*near + far + 2) >> 2, all in 16-bit lanes
+ "vbroadcastf128 %3,%%ymm5 \n" // kLinearShuffleFar copied into both 128-bit lanes
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ // NOTE(review): sums use wrapping 16-bit adds; presumably inputs are at most 12 bits -- confirm.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
+ "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
+
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
+
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
+ "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
+
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
+ "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
+ "vmovdqu %%ymm0,(%1) \n" // first 16 output samples
+ "vmovdqu %%ymm2,32(%1) \n" // next 16 output samples
+
+ "lea 0x20(%0),%0 \n"
+ "lea 0x40(%1),%1 \n" // 16 sample to 32 sample
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kLinearShuffleFar) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
+void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, // first source row
+ ptrdiff_t src_stride, // offset to the second source row, in uint16_t elements
+ uint16_t* dst_ptr, // first destination row
+ ptrdiff_t dst_stride, // offset to the second destination row, in uint16_t elements
+ int dst_width) { // output samples per row; 16 are produced per loop pass
 asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n" // kLinearShuffleFar copied into both 128-bit lanes
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ // Row 1: 9 samples read, horizontal 3*near+far in 16-bit lanes.
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
+ // Row 2: same horizontal pass on the row at src_ptr + src_stride.
+ "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
+ "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
+ "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
+ "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
+ "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
+ // Vertical blend: 3*(own row) + (other row) + 8, then >> 4.
+ "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
+ "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1) \n" // store above
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
+ "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
+ "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
+ "vmovdqu %%ymm0,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 sample to 16 sample
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kLinearShuffleFar) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
+void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, // source row; 9 samples are read per loop pass
+ uint16_t* dst_ptr, // destination row; 16 samples are stored per loop pass
+ int dst_width) { // output samples; the loop body always runs at least once
+ asm volatile( // 2x horizontal upsample: (3*near + far + 2) >> 2, computed in 32-bit lanes
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
- "vmovdqu (%0),%%ymm3 \n"
- "lea 0x20(%0),%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw (%1),%%ymm2,%%ymm0 \n"
- "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0,(%1) \n"
- "vmovdqu %%ymm1,0x20(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, zero-extended)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, zero-extended)
+
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" // 32b -> 16b with unsigned saturation, per lane
+ "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" // restore sample order within each lane
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(src_width) // %2
+ "+r"(dst_width) // %2
:
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+
+#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
+void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, // first source row
+ ptrdiff_t src_stride, // offset to the second source row, in uint16_t elements
+ uint16_t* dst_ptr, // first destination row
+ ptrdiff_t dst_stride, // offset to the second destination row, in uint16_t elements
+ int dst_width) { // output samples per row; 16 are produced per loop pass
+ asm volatile( // 2x bilinear upsample of 16-bit samples, computed in 32-bit lanes
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+ // Each pass reads 9 samples from each source row; the body always runs at least once.
+ LABELALIGN
+ "1: \n"
+ // Row 1: horizontal 3*near+far.
+ "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
+ "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, zero-extended)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, zero-extended)
+ "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
+ // Row 2: same horizontal pass on the row at src_ptr + src_stride.
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
+ "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, zero-extended)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, zero-extended)
+ "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
+ "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
+ // Vertical blend: 3*(own row) + (other row) + 8, then >> 4.
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" // 32b -> 16b with unsigned saturation, per lane
+ "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" // restore sample order within each lane
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" // 32b -> 16b with unsigned saturation, per lane
+ "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" // restore sample order within each lane
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+// Adds 16 source bytes per loop pass into 16 uint16_t accumulators at dst_ptr, saturating at 0xFFFF.
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, // bytes to add; 16 are read per loop pass
+ uint16_t* dst_ptr, // running sums, read and written in place
+ int src_width) { // source byte count; the loop body always runs at least once
+ asm volatile("pxor %%xmm5,%%xmm5 \n" // zero, used to widen bytes to words
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n" // 16 source bytes
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n" // accumulators 0-7
+ "movdqu 0x10(%1),%%xmm1 \n" // accumulators 8-15
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n" // source bytes 0-7 as words
+ "punpckhbw %%xmm5,%%xmm3 \n" // source bytes 8-15 as words
+ "paddusw %%xmm2,%%xmm0 \n" // unsigned saturating add
+ "paddusw %%xmm3,%%xmm1 \n" // unsigned saturating add
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n" // dst_ptr += 16 words
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time (sums saturate at 0xFFFF).
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, // bytes to add; 32 are read per loop pass
+ uint16_t* dst_ptr, // running sums, read and written in place
+ int src_width) { // source byte count; the loop body always runs at least once
+ asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero, used to widen bytes to words
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm3 \n" // 32 source bytes
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" // pre-shuffle so the per-lane unpacks come out in order
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" // source bytes 0-15 as words
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" // source bytes 16-31 as words
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n" // unsigned saturating add to accumulators 0-15
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" // unsigned saturating add to accumulators 16-31
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n" // dst_ptr += 32 words
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
@@ -866,69 +1835,69 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
-
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
+
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movzwl 0x00(%1,%4,1),%k2 \n"
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) +
// 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2,(%0) \n"
- "lea 0x2(%0),%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- "movzwl 0x00(%1,%3,1),%k2 \n"
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2,(%0) \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -962,26 +1931,24 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@@ -989,23 +1956,21 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
@@ -1013,56 +1978,52 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN // 2x2 box downscale: 8 four-byte pixels from each of two rows -> 4 pixels
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x00(%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+ "movdqu (%0),%%xmm0 \n" // row 0, pixels 0-3
+ "movdqu 0x10(%0),%%xmm1 \n" // row 0, pixels 4-7
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // row 1 (src_stride bytes below), pixels 0-3
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n" // row 1, pixels 4-7
+ "lea 0x20(%0),%0 \n" // src_argb += 32 bytes
+ "pavgb %%xmm2,%%xmm0 \n" // rounded vertical average, pixels 0-3
+ "pavgb %%xmm3,%%xmm1 \n" // rounded vertical average, pixels 4-7
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even pixels 0,2,4,6
+ "shufps $0xdd,%%xmm1,%%xmm2 \n" // odd pixels 1,3,5,7
+ "pavgb %%xmm2,%%xmm0 \n" // rounded horizontal average of each pair
+ "movdqu %%xmm0,(%1) \n" // store 4 output pixels
+ "lea 0x10(%1),%1 \n" // dst_argb += 16 bytes
+ "sub $0x4,%2 \n" // 4 output pixels done; body always runs at least once
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Reads 4 pixels at a time.
@@ -1076,23 +2037,23 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
"1: \n"
- "movd (%0),%%xmm0 \n"
- "movd 0x00(%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%0,%1,2),%%xmm2 \n"
- "movd 0x00(%0,%4,1),%%xmm3 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1113,32 +2074,32 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
- "lea 0x00(,%1,4),%1 \n"
- "lea 0x00(%1,%1,2),%4 \n"
- "lea 0x00(%0,%5,1),%5 \n"
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
LABELALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps 0x00(%0,%1,1),%%xmm0 \n"
- "movq 0x00(%0,%1,2),%%xmm1 \n"
- "movhps 0x00(%0,%4,1),%%xmm1 \n"
- "lea 0x00(%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps 0x00(%5,%1,1),%%xmm2 \n"
- "movq 0x00(%5,%1,2),%%xmm3 \n"
- "movhps 0x00(%5,%4,1),%%xmm3 \n"
- "lea 0x00(%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
@@ -1156,56 +2117,56 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
LABELALIGN
"40: \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd 0x00(%3,%0,4),%%xmm1 \n"
- "movd 0x00(%3,%1,4),%%xmm4 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
"49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd 0x00(%3,%1,4),%%xmm1 \n"
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%2) \n"
- "lea 0x8(%2),%2 \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
"29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- "movd 0x00(%3,%0,4),%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
@@ -1226,26 +2187,24 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
- asm volatile(
-
- LABELALIGN
+ asm volatile(LABELALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0,(%0) \n"
- "movdqu %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
-
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width) // %2
- ::"memory",
- "cc", "xmm0", "xmm1");
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
@@ -1267,63 +2226,64 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int dx) {
intptr_t x0, x1;
asm volatile(
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
LABELALIGN
"2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps 0x00(%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
LABELALIGN
"29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq 0x00(%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
-
- LABELALIGN "99: \n" // clang-format error.
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+ LABELALIGN
+ "99: \n" // clang-format error.
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@@ -1339,10 +2299,10 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
@@ -1353,19 +2313,637 @@ int FixedDiv_X86(int num, int div) {
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
- "shld $0x10,%%eax,%%edx \n"
- "shl $0x10,%%eax \n"
- "sub $0x10001,%%eax \n"
- "sbb $0x0,%%edx \n"
- "sub $0x1,%1 \n"
- "idiv %1 \n"
- "mov %0, %%eax \n"
+ "shld $0x10,%%eax,%%edx \n" // high half of the sign-extended num shifted left by 16
+ "shl $0x10,%%eax \n" // low half: edx:eax now holds num << 16 as a 64-bit value
+ "sub $0x10001,%%eax \n" // 64-bit subtract of 0x10001 (low half)
+ "sbb $0x0,%%edx \n" // propagate the borrow into the high half
+ "sub $0x1,%1 \n" // div - 1. NOTE(review): %1 is an input-only operand ("c") but is written here
+ "idiv %1 \n" // eax = edx:eax / (div - 1), edx = remainder
+ "mov %0, %%eax \n" // %0 is constrained to eax ("+a"), so this moves eax onto itself
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
+ defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+// pshufb control tables used by the ScaleUVRowDown2Box SSSE3 and AVX2 kernels.
+// Split table: gathers the even-indexed bytes into the low 8 bytes and the odd-indexed bytes into the high 8 bytes.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+ 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, // alternates bytes 0,2,4,6 with bytes 8,10,12,14
+ 6u, 14u, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80}; // 0x80 selectors make pshufb write zero bytes
+#endif
+// 2x2 box filter for interleaved UV: each output U (or V) is the rounded average of 2 horizontal x 2 vertical source samples.
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Per loop iteration: reads 16 bytes from src_ptr and from src_ptr + src_stride, writes 8 bytes, and subtracts 4 from dst_width.
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n" // all ones
+ "psrlw $0xf,%%xmm4 \n" // 0x0001 in every word
+ "packuswb %%xmm4,%%xmm4 \n" // 0x01 in every byte
+ "pxor %%xmm5, %%xmm5 \n" // zero
+ "movdqa %4,%%xmm1 \n" // split shuffler
+ "movdqa %5,%%xmm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // 8 UV row 0
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
+ "pshufb %%xmm1,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add of byte pairs (weights are all 1)
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n" // vertical add: sum of 4 samples per word
+ "psrlw $0x1,%%xmm0 \n" // sum >> 1
+ "pavgw %%xmm5,%%xmm0 \n" // (x + 1) >> 1 against zero, i.e. (sum + 2) >> 2 overall
+ "pshufb %%xmm3,%%xmm0 \n" // merge uv
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n" // 4 UV
+ "sub $0x4,%2 \n"
+ "jg 1b \n" // loop while dst_width > 0
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
+// AVX2 2x2 box filter for interleaved UV: per iteration reads 32 bytes from each of two rows and writes 16 bytes (dst_width -= 8).
+#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
+void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // all ones
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n" // 0x0001 in every word
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" // 0x01 in every byte
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
+ "vbroadcastf128 %4,%%ymm1 \n" // split shuffler
+ "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
+ "lea 0x20(%0),%0 \n"
+ "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
+ "vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add of byte pairs (weights are all 1)
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add: sum of 4 samples per word
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // sum >> 1
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" // (x + 1) >> 1 against zero, i.e. (sum + 2) >> 2 overall
+ "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" // qword order 0,2,1,3: both lanes' results land in the low 128 bits
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n" // 8 UV
+ "sub $0x8,%2 \n"
+ "jg 1b \n" // loop while dst_width > 0
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kShuffleSplitUV), // %4
+ "m"(kShuffleMergeUV) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
+// pmaddubsw weights for the 8-bit UV upsamplers: byte pairs weighted (3,1) then (1,3), giving 3*a+b and a+3*b.
+static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
+ 3, 1, 3, 1, 1, 3, 1, 3};
+// Horizontal 2x upsample of interleaved 8-bit UV: each output is (3*near + far + 2) >> 2 of two neighbouring UV pairs.
+#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
+void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm4,%%xmm4 \n"
+ "psrlw $15,%%xmm4 \n"
+ "psllw $1,%%xmm4 \n" // all 2
+ "movdqa %3,%%xmm3 \n" // (3,1)/(1,3) weights
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+ "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
+ "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n" // advance 8 source bytes; the loads above touch bytes 0..9
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); // NOTE(review): xmm5/xmm6 are not referenced in this asm
+}
+#endif
+// 2x bilinear upsample of interleaved 8-bit UV: reads rows src_ptr and src_ptr + src_stride, writes rows dst_ptr and dst_ptr + dst_stride.
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
+void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pcmpeqw %%xmm6,%%xmm6 \n"
+ "psrlw $15,%%xmm6 \n"
+ "psllw $3,%%xmm6 \n" // all 8
+ "movdqa %5,%%xmm7 \n" // (3,1)/(1,3) weights
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 00112233 (1u1v)
+ "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
+ "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
+ "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
+ "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
+ "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
+
+ "movq (%0,%3),%%xmm1 \n" // same steps for the row at src_ptr + src_stride
+ "movq 2(%0,%3),%%xmm4 \n"
+ "punpcklbw %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "punpckhdq %%xmm1,%%xmm3 \n"
+ "punpckldq %%xmm1,%%xmm1 \n"
+ "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
+ "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
+
+ // row 1 horizontal sums: xmm0 (lo) xmm2 (hi)
+ // row 2 horizontal sums: xmm1 (lo) xmm3 (hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm1,%%xmm5 \n"
+ "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
+ "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm1 \n"
+ "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
+ "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
+ "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
+ "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
+
+ "packuswb %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packuswb %%xmm1,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4) \n" // store below
+
+ "lea 0x8(%0),%0 \n" // advance 8 source bytes; the loads above touch bytes 0..9 of each row
+ "lea 0x10(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+// AVX2 horizontal 2x upsample of interleaved 8-bit UV: each output is (3*near + far + 2) >> 2 of two neighbouring UV pairs.
+#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
+// Per loop iteration: advances src_ptr by 16 bytes (loads touch bytes 0..17), writes 32 bytes, subtracts 16 from dst_width.
+void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $15,%%ymm4,%%ymm4 \n"
+ "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
+ "vbroadcastf128 %3,%%ymm3 \n" // (3,1)/(1,3) weights in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 8 UV pairs starting at pair 0
+ "vmovdqu 2(%0),%%xmm1 \n" // 8 UV pairs starting at pair 1
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // qword order 0,2,1,3: one loaded qword per 128-bit lane
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // interleave bytes of neighbouring pairs
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" // duplicate dwords (hi)
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // duplicate dwords (lo)
+ "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
+ "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
+ "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kUVLinearMadd31) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+// AVX2 2x bilinear upsample of interleaved 8-bit UV: reads rows src_ptr and src_ptr + src_stride, writes rows dst_ptr and dst_ptr + dst_stride.
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
+void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrlw $15,%%ymm6,%%ymm6 \n"
+ "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
+ "vbroadcastf128 %5,%%ymm7 \n" // (3,1)/(1,3) weights in both lanes
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 8 UV pairs starting at pair 0 (row 1)
+ "vmovdqu 2(%0),%%xmm1 \n" // 8 UV pairs starting at pair 1 (row 1)
+ "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // qword order 0,2,1,3: one loaded qword per 128-bit lane
+ "vpermq $0b11011000,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
+ "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
+
+ "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
+ "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
+ "vpermq $0b11011000,%%ymm2,%%ymm2 \n"
+ "vpermq $0b11011000,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
+ "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
+
+ // row 1 horizontal sums: ymm0 (lo) ymm1 (hi)
+ // row 2 horizontal sums: ymm2 (lo) ymm3 (hi)
+
+ "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4) \n" // store below
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x20(%1),%1 \n" // 8 uv to 16 uv
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)), // %4
+ "m"(kUVLinearMadd31) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+// Horizontal 2x upsample of interleaved 16-bit UV: samples are widened to 32 bits and each output is (3*near + far + 2) >> 2.
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
+void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm5,%%xmm5 \n" // zero, used to widen 16 -> 32 bits
+ "pcmpeqd %%xmm4,%%xmm4 \n"
+ "psrld $31,%%xmm4 \n"
+ "pslld $1,%%xmm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+
+ "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v)
+ "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v)
+
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far)
+
+ "paddd %%xmm4,%%xmm2 \n" // far+2 (lo)
+ "paddd %%xmm4,%%xmm3 \n" // far+2 (hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
+
+ "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
+ "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
+ "packusdw %%xmm1,%%xmm0 \n" // narrow back to 16 bits with unsigned saturation
+ "movdqu %%xmm0,(%1) \n"
+
+ "lea 0x8(%0),%0 \n" // advance 8 source bytes; the loads above touch bytes 0..11
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif
+// 2x bilinear upsample of interleaved 16-bit UV; both strides are scaled by 2 in the addressing, i.e. counted in uint16_t elements.
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
+void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "pxor %%xmm7,%%xmm7 \n" // zero, used to widen 16 -> 32 bits
+ "pcmpeqd %%xmm6,%%xmm6 \n"
+ "psrld $31,%%xmm6 \n"
+ "pslld $3,%%xmm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v)
+ "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v)
+ "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v)
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo)
+ "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi)
+ "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo)
+ "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi)
+ "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo)
+ "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo)
+ "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
+
+ "movq (%0,%3,2),%%xmm2 \n" // same steps for the row src_stride elements further on
+ "movq 4(%0,%3,2),%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo)
+ "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi)
+ "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo)
+ "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi)
+ "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo)
+ "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi)
+ "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo)
+ "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi)
+
+ "movdqa %%xmm0,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
+ "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
+ "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
+ "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo)
+
+ "movdqa %%xmm2,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo)
+ "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
+ "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo)
+ "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo)
+
+ "movdqa %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi)
+ "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi)
+ "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi)
+ "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi)
+
+ "movdqa %%xmm3,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi)
+ "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi)
+ "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
+ "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi)
+
+ "packusdw %%xmm0,%%xmm4 \n"
+ "movdqu %%xmm4,(%1) \n" // store above
+ "packusdw %%xmm2,%%xmm5 \n"
+ "movdqu %%xmm5,(%1,%4,2) \n" // store below
+
+ "lea 0x8(%0),%0 \n" // advance 8 source bytes; the loads above touch bytes 0..11 of each row
+ "lea 0x10(%1),%1 \n" // 2 uv to 4 uv
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+// AVX2 horizontal 2x upsample of interleaved 16-bit UV: samples are widened to 32 bits and each output is (3*near + far + 2) >> 2.
+#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
+void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrld $31,%%ymm4,%%ymm4 \n"
+ "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+
+ "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
+ "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
+
+ "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
+ "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
+ "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" // narrow back to 16 bits with unsigned saturation
+ "vmovdqu %%ymm0,(%1) \n"
+
+ "lea 0x10(%0),%0 \n" // advance 16 source bytes; the loads above touch bytes 0..19
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif
+// AVX2 2x bilinear upsample of interleaved 16-bit UV; both strides are scaled by 2 in the addressing, i.e. counted in uint16_t elements.
+#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
+void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ asm volatile(
+ "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
+ "vpsrld $31,%%ymm6,%%ymm6 \n"
+ "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
+
+ LABELALIGN
+ "1: \n"
+
+ "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
+ "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
+ "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
+ "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
+
+ "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
+ "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
+ "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
+ "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
+ "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
+ "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
+ "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
+ "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
+ "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
+ "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
+ "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
+
+ "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
+ "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
+ "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
+ "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
+ "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
+
+ "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
+ "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
+ "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
+ "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
+ "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
+
+ "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
+ "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
+ "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
+ "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
+ "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
+
+ "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
+ "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
+ "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
+ "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
+ "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
+
+ "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
+ "vmovdqu %%ymm4,(%1) \n" // store above
+ "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
+ "vmovdqu %%ymm5,(%1,%4,2) \n" // store below
+
+ "lea 0x10(%0),%0 \n" // advance 16 source bytes; the loads above touch bytes 0..19 of each row
+ "lea 0x20(%1),%1 \n" // 4 uv to 8 uv
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/files/source/scale_lsx.cc b/files/source/scale_lsx.cc
new file mode 100644
index 00000000..bfe5e9fb
--- /dev/null
+++ b/files/source/scale_lsx.cc
@@ -0,0 +1,739 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Gathers four elements of _src, selected by the four 32-bit lane values of
+// _in, into the four 32-bit lanes of _out (lane i <- _src[_in lane i]).
+// Lane 0 is written with a replicate instead of an insert so that the previous
+// (possibly uninitialized) contents of _out are never read.
+// _in is evaluated four times; pass a plain variable.
+#define LOAD_DATA(_src, _in, _out)                           \
+  do {                                                       \
+    const int _idx0 = __lsx_vpickve2gr_w((_in), 0);          \
+    const int _idx1 = __lsx_vpickve2gr_w((_in), 1);          \
+    const int _idx2 = __lsx_vpickve2gr_w((_in), 2);          \
+    const int _idx3 = __lsx_vpickve2gr_w((_in), 3);          \
+    (_out) = __lsx_vreplgr2vr_w((_src)[_idx0]);              \
+    (_out) = __lsx_vinsgr2vr_w((_out), (_src)[_idx1], 1);    \
+    (_out) = __lsx_vinsgr2vr_w((_out), (_src)[_idx2], 2);    \
+    (_out) = __lsx_vinsgr2vr_w((_out), (_src)[_idx3], 3);    \
+  } while (0)
+
+// 2x horizontal ARGB downscale by point sampling: from every pair of 32-bit
+// source pixels the second (odd-indexed) one is copied to the destination.
+// Produces 4 pixels (16 bytes) per iteration from 8 source pixels; a
+// dst_width % 4 remainder is not processed here. src_stride is unused.
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  int blocks = dst_width / 4;
+  (void)src_stride;
+
+  while (blocks-- > 0) {
+    const __m128i lo = __lsx_vld(src_argb, 0);
+    const __m128i hi = __lsx_vld(src_argb, 16);
+    const __m128i odd_pixels = __lsx_vpickod_w(hi, lo);
+    __lsx_vst(odd_pixels, dst_argb, 0);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+// 2x horizontal ARGB downscale with linear filtering: each output pixel is
+// the per-byte rounded average of two adjacent source pixels.
+// Produces 4 pixels (16 bytes) per iteration; a dst_width % 4 remainder is
+// not processed here. src_stride is unused.
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int blocks = dst_width / 4;
+  (void)src_stride;
+
+  while (blocks-- > 0) {
+    const __m128i lo = __lsx_vld(src_argb, 0);
+    const __m128i hi = __lsx_vld(src_argb, 16);
+    const __m128i even_pixels = __lsx_vpickev_w(hi, lo);
+    const __m128i odd_pixels = __lsx_vpickod_w(hi, lo);
+    __lsx_vst(__lsx_vavgr_bu(odd_pixels, even_pixels), dst_argb, 0);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,  // 2x2 box filter: each output ARGB pixel averages a 2x2 block of source pixels
+ ptrdiff_t src_stride,  // byte offset from src_argb to the second source row
+ uint8_t* dst_argb,  // receives 16 bytes (4 pixels) per iteration
+ int dst_width) {  // only dst_width / 4 whole iterations run; remainder is not handled here
+ int x;
+ int len = dst_width / 4;
+ const uint8_t* s = src_argb;  // first source row
+ const uint8_t* t = src_argb + src_stride;  // second source row
+ __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};  // byte order 0,4,1,5,2,6,3,7,8,12,...: puts same-channel bytes of two adjacent pixels side by side
+
+ for (x = 0; x < len; x++) {
+ DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
+ DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
+ shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+ tmp3, reg0, reg1, reg2, reg3);  // sum each adjacent byte pair into a 16-bit lane (horizontal add)
+ DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);  // add row 0 sums to row 1 sums
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);  // rounding shift right by 2 (divide by 4) and narrow to bytes
+ __lsx_vst(dst0, dst_argb, 0);
+ s += 32;
+ t += 32;
+ dst_argb += 16;
+ }
+}
+
+// ARGB point-sampling downscale with a whole-pixel step: copies one 4-byte
+// pixel, then advances the source by src_stepx pixels. Four pixels are
+// gathered and then stored per iteration; a dst_width % 4 remainder is not
+// processed here. src_stride is unused.
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  const int32_t step_bytes = src_stepx << 2;  // 4 bytes per ARGB pixel
+  int remaining = dst_width / 4;
+  (void)src_stride;
+
+  for (; remaining > 0; --remaining) {
+    // Gather: all four loads happen before any store, as in a plain copy.
+    const __m128i px0 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += step_bytes;
+    const __m128i px1 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += step_bytes;
+    const __m128i px2 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += step_bytes;
+    const __m128i px3 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += step_bytes;
+
+    // Scatter lane 0 of each vector to consecutive destination pixels.
+    __lsx_vstelm_w(px0, dst_argb, 0, 0);
+    __lsx_vstelm_w(px1, dst_argb, 4, 0);
+    __lsx_vstelm_w(px2, dst_argb, 8, 0);
+    __lsx_vstelm_w(px3, dst_argb, 12, 0);
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,  // Stepped ARGB downscale where each output pixel averages a 2x2 source block
+ ptrdiff_t src_stride,  // byte offset from src_argb to the second source row
+ int src_stepx,  // distance, in pixels, between consecutive sampled 2x2 blocks
+ uint8_t* dst_argb,  // receives 16 bytes (4 pixels) per iteration
+ int dst_width) {  // only dst_width / 4 whole iterations run; remainder is not handled here
+ int x;
+ int len = dst_width / 4;
+ int32_t stepx = src_stepx * 4;  // step in bytes (4 bytes per pixel)
+ const uint8_t* next_argb = src_argb + src_stride;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, dst0;
+
+ for (x = 0; x < len; x++) {
+ tmp0 = __lsx_vldrepl_d(src_argb, 0);  // 8 bytes = two adjacent pixels of row 0
+ src_argb += stepx;
+ tmp1 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp2 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp3 = __lsx_vldrepl_d(src_argb, 0);
+ src_argb += stepx;
+ tmp4 = __lsx_vldrepl_d(next_argb, 0);  // matching pixel pairs of row 1
+ next_argb += stepx;
+ tmp5 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp6 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ tmp7 = __lsx_vldrepl_d(next_argb, 0);
+ next_argb += stepx;
+ DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+ src0, src1, src2, src3);  // src0/src1 = row 0 pairs, src2/src3 = row 1 pairs
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);  // vertical sums of even bytes, widened to 16 bits
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);  // vertical sums of odd bytes, widened to 16 bits
+ DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
+ DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);  // add left-pixel and right-pixel column sums
+ dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);  // rounding shift right by 2 (divide by 4) and narrow to bytes
+ dst0 = __lsx_vshuf4i_b(dst0, 0xD8);  // 0xD8 selects bytes 0,2,1,3 within every 4-byte group
+ __lsx_vst(dst0, dst_argb, 0);
+ dst_argb += 16;
+ }
+}
+
+// 2x horizontal downscale of a byte row by point sampling: keeps the
+// odd-indexed byte of every source pair. Consumes 64 source bytes and writes
+// 32 destination bytes per iteration; a dst_width % 32 remainder is not
+// processed here. src_stride is unused.
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int blocks = dst_width / 32;
+  (void)src_stride;
+
+  while (blocks-- > 0) {
+    const __m128i in0 = __lsx_vld(src_ptr, 0);
+    const __m128i in1 = __lsx_vld(src_ptr, 16);
+    const __m128i in2 = __lsx_vld(src_ptr, 32);
+    const __m128i in3 = __lsx_vld(src_ptr, 48);
+    const __m128i out_lo = __lsx_vpickod_b(in1, in0);
+    const __m128i out_hi = __lsx_vpickod_b(in3, in2);
+    __lsx_vst(out_lo, dst, 0);
+    __lsx_vst(out_hi, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+// 2x horizontal downscale of a byte row with linear filtering: each output
+// byte is the rounded average of two adjacent source bytes. Consumes 64
+// source bytes and writes 32 destination bytes per iteration; a
+// dst_width % 32 remainder is not processed here. src_stride is unused.
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int blocks = dst_width / 32;
+  (void)src_stride;
+
+  while (blocks-- > 0) {
+    const __m128i in0 = __lsx_vld(src_ptr, 0);
+    const __m128i in1 = __lsx_vld(src_ptr, 16);
+    const __m128i in2 = __lsx_vld(src_ptr, 32);
+    const __m128i in3 = __lsx_vld(src_ptr, 48);
+    const __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    const __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    const __m128i odd_lo = __lsx_vpickod_b(in1, in0);
+    const __m128i odd_hi = __lsx_vpickod_b(in3, in2);
+    __lsx_vst(__lsx_vavgr_bu(even_lo, odd_lo), dst, 0);
+    __lsx_vst(__lsx_vavgr_bu(even_hi, odd_hi), dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,  // 2x2 box filter on a byte plane: each output byte is the rounded mean of a 2x2 source block
+ ptrdiff_t src_stride,  // byte offset from src_ptr to the second source row
+ uint8_t* dst,  // receives 32 bytes per iteration
+ int dst_width) {  // only dst_width / 32 whole iterations run; remainder is not handled here
+ int x;
+ int len = dst_width / 32;
+ const uint8_t* src_nex = src_ptr + src_stride;  // second source row
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i dst0, dst1;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);  // row0 + row1 for even columns, widened to 16 bits
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);  // row0 + row1 for odd columns, widened to 16 bits
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3);  // even + odd column sums; outputs reuse inputs, so evaluation order matters
+ DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);  // rounding shift right by 2 (divide by 4) and narrow to bytes
+ __lsx_vst(dst0, dst, 0);
+ __lsx_vst(dst1, dst, 16);
+ src_ptr += 64;
+ src_nex += 64;
+ dst += 32;
+ }
+}
+
+// 4x horizontal downscale of a byte row by point sampling: keeps the byte at
+// index 2 of every group of four source bytes. Consumes 64 source bytes and
+// writes 16 destination bytes per iteration; a dst_width % 16 remainder is
+// not processed here. src_stride is unused.
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int blocks = dst_width / 16;
+  (void)src_stride;
+
+  while (blocks-- > 0) {
+    const __m128i in0 = __lsx_vld(src_ptr, 0);
+    const __m128i in1 = __lsx_vld(src_ptr, 16);
+    const __m128i in2 = __lsx_vld(src_ptr, 32);
+    const __m128i in3 = __lsx_vld(src_ptr, 48);
+    // First keep even bytes (0,2,4,...), then the odd ones of those (2,6,...).
+    const __m128i even_lo = __lsx_vpickev_b(in1, in0);
+    const __m128i even_hi = __lsx_vpickev_b(in3, in2);
+    __lsx_vst(__lsx_vpickod_b(even_hi, even_lo), dst, 0);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,  // 4x4 box filter on a byte plane: each output byte is the rounded mean of a 4x4 source block
+ ptrdiff_t src_stride,  // byte offset between consecutive source rows (four rows are read)
+ uint8_t* dst,  // receives 16 bytes per iteration
+ int dst_width) {  // only dst_width / 16 whole iterations run; remainder is not handled here
+ int x;
+ int len = dst_width / 16;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ const uint8_t* ptr3 = ptr2 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);  // rows 0+1, even columns, widened to 16 bits
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);  // rows 0+1, odd columns, widened to 16 bits
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg0, reg1, reg2, reg3);  // 2x2 sums for rows 0-1
+ DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48, src0, src1,
+ src2, src3);
+ DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48, src4, src5,
+ src6, src7);
+ DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp0, tmp2, tmp4, tmp6);
+ DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+ src7, tmp1, tmp3, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ reg4, reg5, reg6, reg7);  // 2x2 sums for rows 2-3
+ DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);  // 2x4 sums (two columns, four rows)
+ DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+ reg3, reg0, reg1, reg2, reg3);  // add adjacent 16-bit sums into 32-bit lanes: full 4x4 totals
+ DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);  // rounding shift right by 4 (divide by 16), narrow to 16 bits
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);  // keep the low byte of each 16-bit result
+ __lsx_vst(dst0, dst, 0);
+ src_ptr += 64;
+ ptr1 += 64;
+ ptr2 += 64;
+ ptr3 += 64;
+ dst += 16;
+ }
+}
+
+// 3/8 horizontal downscale of a byte row by point sampling: from every 8
+// source bytes the bytes at offsets 0, 3 and 6 are kept. Consumes 32 source
+// bytes and writes 12 destination bytes per iteration; a dst_width % 12
+// remainder is not processed here. src_stride is unused.
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  // Source byte picks: 0,3,6,8,11,14,16,19 | 22,24,27,30 (top four lanes are
+  // never stored).
+  __m128i pick = {0x13100E0B08060300, 0x000000001E1B1816};
+  int groups;
+
+  assert(dst_width % 3 == 0);
+  groups = dst_width / 12;
+  (void)src_stride;
+
+  while (groups-- > 0) {
+    const __m128i lo = __lsx_vld(src_ptr, 0);
+    const __m128i hi = __lsx_vld(src_ptr, 16);
+    const __m128i picked = __lsx_vshuf_b(hi, lo, pick);
+    __lsx_vstelm_d(picked, dst, 0, 0);  // bytes 0..7
+    __lsx_vstelm_w(picked, dst, 8, 2);  // bytes 8..11
+    src_ptr += 32;
+    dst += 12;
+  }
+}
+
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,  // 3/8 downscale using box sums over two source rows; 32 source bytes per row -> 12 output bytes per iteration
+ ptrdiff_t src_stride,  // byte offset from src_ptr to the second source row
+ uint8_t* dst_ptr,  // receives 12 bytes per iteration
+ int dst_width) {  // must be a positive multiple of 3 (asserted); only dst_width / 12 whole iterations run
+ int x, len;
+ const uint8_t* src_nex = src_ptr + src_stride;
+ __m128i src0, src1, src2, src3, dst0;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3;
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};  // indices below 0x10 pick from tmp7, 0x10 and above from tmp2; top four lanes are never stored
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);  // 0x2AAA == 65536 / 6 truncated: high-half multiply divides a 6-sample sum by 6
+ __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);  // 0x4000 == 65536 / 4: bits 16..23 of the product hold a 4-sample sum divided by 4
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16, src0,
+ src1, src2, src3);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);  // row0 + row1, even columns, widened to 16 bits
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);  // row0 + row1, odd columns, widened to 16 bits
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);  // three column sums combined: 3-wide by 2-tall totals
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);  // high 16 bits of product: total / 6
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);  // remaining two columns: 2-wide by 2-tall totals in 32-bit lanes
+ tmp2 = __lsx_vmul_w(tmp1, const_0x4000);  // byte 2 of each 32-bit product is total / 4
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);  // interleave the /6 and /4 results into output order
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ src_nex += 32;
+ dst_ptr += 12;
+ }
+}
+
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,  // 3/8 downscale using box sums over three source rows; 32 source bytes per row -> 12 output bytes per iteration
+ ptrdiff_t src_stride,  // byte offset between consecutive source rows (three rows are read)
+ uint8_t* dst_ptr,  // receives 12 bytes per iteration
+ int dst_width) {  // must be a positive multiple of 3 (asserted); only dst_width / 12 whole iterations run
+ int x, len;
+ const uint8_t* ptr1 = src_ptr + src_stride;
+ const uint8_t* ptr2 = ptr1 + src_stride;
+ __m128i src0, src1, src2, src3, src4, src5;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, dst0;
+ __m128i zero = __lsx_vldi(0);
+ __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};  // indices below 0x10 pick from tmp7, 0x10 and above from tmp2; top four lanes are never stored
+ __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);  // 0x1C71 == 65536 / 9 truncated: high-half multiply divides a 9-sample sum by 9
+ __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);  // 0x2AAA == 65536 / 6 truncated: bits 16..23 of the product hold a 6-sample sum divided by 6
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ len = dst_width / 12;
+
+ for (x = 0; x < len; x++) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16, src0, src1,
+ src2, src3);
+ DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
+ DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);  // row0 + row1, even columns, widened to 16 bits
+ DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);  // row0 + row1, odd columns, widened to 16 bits
+ DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);  // row2 even columns zero-extended to 16 bits
+ DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);  // row2 odd columns zero-extended to 16 bits
+ DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+ tmp0, tmp1, tmp2, tmp3);  // add row2: three-row column sums
+ DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+ DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+ tmp4 = __lsx_vpickev_w(reg3, reg2);
+ tmp5 = __lsx_vadd_h(reg0, reg1);
+ tmp6 = __lsx_vadd_h(tmp5, tmp4);  // three column sums combined: 3-wide by 3-tall totals
+ tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);  // high 16 bits of product: total / 9
+ tmp0 = __lsx_vpickod_w(reg3, reg2);
+ tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);  // remaining two columns: 2-wide by 3-tall totals in 32-bit lanes
+ tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);  // byte 2 of each 32-bit product is total / 6
+ dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);  // interleave the /9 and /6 results into output order
+ __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+ __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+ src_ptr += 32;
+ ptr1 += 32;
+ ptr2 += 32;
+ dst_ptr += 12;
+ }
+}
+
+// Accumulates a row of bytes into a row of 16-bit sums:
+// dst_ptr[i] += src_ptr[i]. Handles 16 elements per iteration; a
+// src_width % 16 remainder is not processed here.
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int groups = src_width / 16;
+  const __m128i zeros = __lsx_vldi(0);
+
+  assert(src_width > 0);
+
+  for (; groups > 0; --groups) {
+    const __m128i pixels = __lsx_vld(src_ptr, 0);
+    __m128i acc_lo = __lsx_vld(dst_ptr, 0);
+    __m128i acc_hi = __lsx_vld(dst_ptr, 16);
+    // Interleaving with zero widens the bytes to unsigned 16-bit lanes.
+    const __m128i wide_lo = __lsx_vilvl_b(zeros, pixels);
+    const __m128i wide_hi = __lsx_vilvh_b(zeros, pixels);
+    acc_lo = __lsx_vadd_h(acc_lo, wide_lo);
+    acc_hi = __lsx_vadd_h(acc_hi, wide_hi);
+    __lsx_vst(acc_lo, dst_ptr, 0);
+    __lsx_vst(acc_hi, dst_ptr, 16);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
+
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,  // Horizontal linear-interpolating column scaler for a byte row; writes 16 bytes per iteration
+ const uint8_t* src_ptr,  // source row, indexed at x >> 16 and (x >> 16) + 1
+ int dst_width,  // only dst_width / 16 whole iterations run; remainder is not handled here
+ int x,  // start position; upper bits (x >> 16) are the source index, low 16 bits the fraction
+ int dx) {  // position increment per output pixel, same fixed-point format as x
+ int j;
+ int len = dst_width / 16;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);  // mask for the 16-bit fraction
+ __m128i const2 = __lsx_vreplgr2vr_w(0x40);  // rounding term for the later >> 7
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};  // lane multipliers 0,1,2,3
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);  // 0, dx, 2*dx, 3*dx
+ vec1 = __lsx_vslli_w(vec_dx, 2);  // 4*dx: advance for the next group of four pixels
+ vec_x = __lsx_vadd_w(vec_x, vec0);  // positions of the first four output pixels
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);  // integer source indices
+ tmp4 = __lsx_vand_v(vec_x, const1);  // fractional parts
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);
+ tmp5 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp2 = __lsx_vsrai_w(vec_x, 16);
+ tmp6 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp3 = __lsx_vsrai_w(vec_x, 16);
+ tmp7 = __lsx_vand_v(vec_x, const1);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9, tmp4, tmp5,
+ tmp6, tmp7);  // reduce fractions to 7 bits
+ LOAD_DATA(src_ptr, tmp0, reg0);  // a = src_ptr[index]
+ LOAD_DATA(src_ptr, tmp1, reg1);
+ LOAD_DATA(src_ptr, tmp2, reg2);
+ LOAD_DATA(src_ptr, tmp3, reg3);
+ DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1, tmp0, tmp1,
+ tmp2, tmp3);  // index + 1
+ LOAD_DATA(src_ptr, tmp0, reg4);  // b = src_ptr[index + 1]
+ LOAD_DATA(src_ptr, tmp1, reg5);
+ LOAD_DATA(src_ptr, tmp2, reg6);
+ LOAD_DATA(src_ptr, tmp3, reg7);
+ DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
+ reg4, reg5, reg6, reg7);  // b - a
+ DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
+ reg4, reg5, reg6, reg7);  // (b - a) * fraction
+ DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
+ const2, reg4, reg5, reg6, reg7);  // + 0x40 for rounding
+ DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7, reg4, reg5,
+ reg6, reg7);  // >> 7
+ DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+ reg0, reg1, reg2, reg3);  // a + (((b - a) * fraction + 0x40) >> 7)
+ DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);  // narrow 32-bit lanes to 16 bits
+ dst0 = __lsx_vpickev_b(tmp1, tmp0);  // narrow to bytes (low byte of each lane)
+ __lsx_vst(dst0, dst_ptr, 0);
+ dst_ptr += 16;
+ }
+}
+
+// Nearest-neighbour horizontal scaler for ARGB rows. x and dx are fixed-point
+// positions whose integer part (x >> 16) is the source pixel index; each
+// output pixel copies src pixel [x >> 16] and x then advances by dx.
+// Four pixels (16 bytes) are written per iteration; a dst_width % 4
+// remainder is not processed here.
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  const uint32_t* src_pixels = (const uint32_t*)src_argb;
+  __m128i lane_mul = {0x0000000100000000, 0x0000000300000002};  // 0,1,2,3
+  __m128i step = __lsx_vreplgr2vr_w(dx);
+  __m128i pos = __lsx_vreplgr2vr_w(x);
+  __m128i gathered;
+  int groups = dst_width / 4;
+
+  // pos = {x, x + dx, x + 2*dx, x + 3*dx}; step = 4*dx per iteration.
+  pos = __lsx_vadd_w(pos, __lsx_vmul_w(step, lane_mul));
+  step = __lsx_vslli_w(step, 2);
+
+  for (; groups > 0; --groups) {
+    __m128i index = __lsx_vsrai_w(pos, 16);
+    pos = __lsx_vadd_w(pos, step);
+    LOAD_DATA(src_pixels, index, gathered);
+    __lsx_vst(gathered, dst_argb, 0);
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,  // Horizontal linear-interpolating column scaler for ARGB rows; writes 8 pixels (32 bytes) per iteration
+ const uint8_t* src_argb,  // source row, read as 32-bit pixels at index x >> 16 and (x >> 16) + 1
+ int dst_width,  // only dst_width / 8 whole iterations run; remainder is not handled here
+ int x,  // start position; x >> 16 is the source pixel index, bits 9..15 the 7-bit blend fraction
+ int dx) {  // position increment per output pixel, same fixed-point format as x
+ const uint32_t* src = (const uint32_t*)src_argb;
+ int j;
+ int len = dst_width / 8;
+ __m128i src0, src1, src2, src3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ __m128i vec0, vec1, dst0, dst1;
+ __m128i vec_x = __lsx_vreplgr2vr_w(x);
+ __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+ __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};  // lane multipliers 0,1,2,3
+ __m128i const_7f = __lsx_vldi(0x7F);  // used both as the 7-bit mask and to form (0x7F - f) via XOR
+
+ vec0 = __lsx_vmul_w(vec_dx, const_tmp);  // 0, dx, 2*dx, 3*dx
+ vec1 = __lsx_vslli_w(vec_dx, 2);  // 4*dx: advance for the next group of four pixels
+ vec_x = __lsx_vadd_w(vec_x, vec0);  // positions of the first four output pixels
+
+ for (j = 0; j < len; j++) {
+ tmp0 = __lsx_vsrai_w(vec_x, 16);  // source pixel indices, pixels 0..3
+ reg0 = __lsx_vsrai_w(vec_x, 9);  // fraction bits moved to the low byte
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ tmp1 = __lsx_vsrai_w(vec_x, 16);  // source pixel indices, pixels 4..7
+ reg1 = __lsx_vsrai_w(vec_x, 9);
+ vec_x = __lsx_vadd_w(vec_x, vec1);
+ DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);  // f = 7-bit fraction
+ DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);  // copy byte 0 of each lane to all four bytes (one weight per channel)
+ DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);  // 0x7F - f
+ DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);  // byte pairs (0x7F - f, f)
+ DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
+ LOAD_DATA(src, tmp0, src0);  // a = src[index]
+ LOAD_DATA(src, tmp1, src1);
+ DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);  // index + 1
+ LOAD_DATA(src, tmp0, src2);  // b = src[index + 1]
+ LOAD_DATA(src, tmp1, src3);
+ DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);  // byte pairs (a, b) per channel
+ DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
+ tmp0, tmp1, tmp2, tmp3);  // a * (0x7F - f) + b * f per channel, 16-bit
+ DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);  // >> 7 (no rounding) and narrow to bytes
+ __lsx_vst(dst0, dst_argb, 0);
+ __lsx_vst(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
+// 3/4 horizontal downscale of a byte row by point sampling: from every 4
+// source bytes the bytes at offsets 0, 1 and 3 are kept. Consumes 64 source
+// bytes and writes 48 destination bytes per iteration; the loop always
+// processes whole 48-byte groups until at least dst_width bytes are written.
+// src_stride is unused.
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  // Each mask indexes a 32-byte window formed from two neighbouring vectors.
+  __m128i pick_a = {0x0908070504030100, 0x141311100F0D0C0B};
+  __m128i pick_b = {0x0F0D0C0B09080705, 0x1918171514131110};
+  __m128i pick_c = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
+  int done = 0;
+  (void)src_stride;
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  while (done < dst_width) {
+    const __m128i in0 = __lsx_vld(src_ptr, 0);
+    const __m128i in1 = __lsx_vld(src_ptr, 16);
+    const __m128i in2 = __lsx_vld(src_ptr, 32);
+    const __m128i in3 = __lsx_vld(src_ptr, 48);
+    const __m128i out0 = __lsx_vshuf_b(in1, in0, pick_a);
+    const __m128i out1 = __lsx_vshuf_b(in2, in1, pick_b);
+    const __m128i out2 = __lsx_vshuf_b(in3, in2, pick_c);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    __lsx_vst(out2, dst, 32);
+    src_ptr += 64;
+    dst += 48;
+    done += 48;
+  }
+}
+
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,  // 3/4 downscale with horizontal filtering, blending two rows as (3 * row0 + row1 + 2) >> 2
+ ptrdiff_t src_stride,  // byte offset from src_ptr to the second source row
+ uint8_t* d,  // receives 48 bytes per iteration
+ int dst_width) {  // must be a positive multiple of 3 (asserted); loop advances 48 outputs at a time
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};  // per-byte horizontal weights (pairs 3,1 / 1,1 / 1,3), phase 0
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};  // same weight pattern, phase 1
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};  // same weight pattern, phase 2
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};  // gathers the source byte pairs that feed each output
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};  // per-lane shift: 2 where weights sum to 4, 1 where they sum to 2
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);  // tmp0..tmp5 come from row 0, tmp6.. from row 1
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);  // weighted pair sums, 16-bit (row 0)
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);  // src4, src5: row 0; src6, src7: row 1
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);  // remaining row 1 sums
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);  // rounded horizontal normalisation
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1, tmp5, tmp6,
+ tmp7, tmp8);  // 2 * row0
+ DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
+ DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
+ src0, src1, src2, src3);  // 3 * row0
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);  // 3 * row0 + row1
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);  // rounding shift right by 2 and narrow to bytes
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,  // 3/4 downscale with horizontal filtering, blending two rows equally as (row0 + row1 + 1) >> 1
+ ptrdiff_t src_stride,  // byte offset from src_ptr to the second source row
+ uint8_t* d,  // receives 48 bytes per iteration
+ int dst_width) {  // must be a positive multiple of 3 (asserted); loop advances 48 outputs at a time
+ const uint8_t* src_nex = src_ptr + src_stride;
+ int x;
+ __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+ __m128i tmp10, tmp11, dst0, dst1, dst2;
+ __m128i const0 = {0x0103030101010103, 0x0101010303010101};  // per-byte horizontal weights (pairs 3,1 / 1,1 / 1,3), phase 0
+ __m128i const1 = {0x0301010101030301, 0x0103030101010103};  // same weight pattern, phase 1
+ __m128i const2 = {0x0101010303010101, 0x0301010101030301};  // same weight pattern, phase 2
+ __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};  // gathers the source byte pairs that feed each output
+ __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+ __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+ __m128i shift0 = {0x0002000200010002, 0x0001000200020001};  // per-lane shift: 2 where weights sum to 4, 1 where they sum to 2
+ __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+ __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+ src0, src1, src2, src3);
+ DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+ src4, src5, src6, src7);
+ DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1, src1,
+ shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4, src4,
+ shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);  // tmp0..tmp5 come from row 0, tmp6.. from row 1
+ DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7, src6,
+ shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+ const0, src0, src1, src2, src3);  // weighted pair sums, 16-bit (row 0)
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+ const1, src4, src5, src6, src7);  // src4, src5: row 0; src6, src7: row 1
+ DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1, tmp11,
+ const2, tmp0, tmp1, tmp2, tmp3);  // remaining row 1 sums
+ DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+ shift0, src0, src1, src2, src3);  // rounded horizontal normalisation
+ DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+ shift1, src4, src5, src6, src7);
+ DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+ shift2, tmp0, tmp1, tmp2, tmp3);
+ DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+ src0, src1, src2, src3);  // row0 + row1
+ DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+ DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);  // rounding shift right by 1 and narrow to bytes
+ dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
+ __lsx_vst(dst0, d, 0);
+ __lsx_vst(dst1, d, 16);
+ __lsx_vst(dst2, d, 32);
+ src_ptr += 64;
+ src_nex += 64;
+ d += 48;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc
index 990463c2..1226ef3e 100644
--- a/files/source/scale_mmi.cc
+++ b/files/source/scale_mmi.cc
@@ -1103,6 +1103,61 @@ void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
: "memory");
}
+// 3/4 horizontal downscale of a byte row (Loongson MMI). Each loop iteration
+// loads 16 source bytes, stores 12 destination bytes and subtracts 12 from
+// the width counter. The loop exits only when the counter reaches exactly
+// zero, so callers must pass a dst_width that is a multiple of 12; the
+// assert only checks divisibility by 3. src_stride is unused.
+//
+// src_ptr, dst and dst_width are advanced inside the asm, so they are
+// declared as read-write ("+r") operands; listing them as inputs would let
+// the compiler assume their registers are unchanged after the statement.
+void ScaleRowDown34_MMI(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  uint64_t src[2];
+  uint64_t tmp[2];
+  __asm__ volatile (
+      "1: \n\t"
+      "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+      "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+      "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+      "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+      "and %[tmp1], %[src0], %[mask1] \n\t"
+      "psrlw %[tmp0], %[src0], %[rmov] \n\t"
+      "psllw %[tmp0], %[tmp0], %[lmov1] \n\t"
+      "or %[src0], %[tmp0], %[tmp1] \n\t"
+      "punpckhwd %[tmp0], %[src0], %[src0] \n\t"
+      "psllw %[tmp1], %[tmp0], %[rmov] \n\t"
+      "or %[src0], %[src0], %[tmp1] \n\t"
+      "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t"
+      "pextrh %[tmp0], %[tmp0], %[zero] \n\t"
+      "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t"
+      "pextrh %[tmp0], %[src1], %[zero] \n\t"
+      "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t"
+
+      "punpckhwd %[tmp0], %[src1], %[src1] \n\t"
+      "pextrh %[tmp1], %[tmp0], %[zero] \n\t"
+      "psrlw %[src1], %[src1], %[rmov] \n\t"
+      "psllw %[tmp1], %[tmp1], %[rmov8] \n\t"
+      "or %[src1], %[src1], %[tmp1] \n\t"
+      "and %[tmp0], %[tmp0], %[mask2] \n\t"
+      "or %[src1], %[src1], %[tmp0] \n\t"
+
+      "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t"
+      "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t"
+      "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t"
+      "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t"
+
+      "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+      "daddi %[width], %[width], -0x0c \n\t"
+      "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+      "bnez %[width], 1b \n\t"
+
+      : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]),
+        [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]),
+        [src_ptr]"+r"(src_ptr), [dst_ptr]"+r"(dst),
+        [width]"+r"(dst_width)
+      : [rmov]"f"(0x18),
+        [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8),
+        [zero]"f"(0x0), [mask2]"f"(0xff000000),
+        [lmov1]"f"(0x10)
+      : "memory"
+  );
+}
// clang-format on
#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index 366b155b..6a0d6e1b 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -31,10 +31,10 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -51,11 +51,11 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -71,21 +71,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
// row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
// pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -102,10 +102,10 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -122,20 +122,20 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [%3]! \n"
- "vld1.8 {q2}, [%4]! \n"
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -156,11 +156,11 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -173,49 +173,49 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ // (3 * line_0 + line_1 + 2) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -230,31 +230,31 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -282,15 +282,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "vld1.8 {q3}, [%3] \n"
- "1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -306,57 +306,57 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile(
- "vld1.16 {q13}, [%5] \n"
- "vld1.8 {q14}, [%6] \n"
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -364,24 +364,24 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
 // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -390,14 +390,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -416,46 +416,46 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -463,22 +463,22 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ "vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "vadd.u16 q0, q1 \n"
 // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
@@ -487,14 +487,14 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// Align for table lookup, vtbl requires registers to
// be adjacent
- "vmov.u8 d2, d4 \n"
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- "vst1.8 {d3}, [%1]! \n"
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -504,6 +504,484 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, // 8-bit source row
+ uint8_t* dst_ptr, // 8-bit destination row, 16 bytes written per pass
+ int dst_width) { // output sample count, reduced by 16 per pass
+ const uint8_t* src_temp = src_ptr + 1; // same row, one sample ahead
+ asm volatile( // 2x horizontal upsample: each output = (3*near + far + 2) >> 2
+ "vmov.u8 d30, #3 \n" // d30 = weight 3 for the nearer sample
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%3]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd), rounded
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even), rounded
+
+ "vst2.8 {d0, d1}, [%1]! \n" // store, interleaving even (d0) and odd (d1)
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List (d30 is half of q15)
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, // first 8-bit source row
+ ptrdiff_t src_stride, // offset from src_ptr to the second source row
+ uint8_t* dst_ptr, // first 8-bit output row
+ ptrdiff_t dst_stride, // offset from dst_ptr to the second output row
+ int dst_width) { // output samples per row, reduced by 16 per pass
+ const uint8_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride; // second output row
+ const uint8_t* src_temp = src_ptr + 1; // first row, one sample ahead
+ const uint8_t* src_temp1 = src_ptr1 + 1; // second row, one sample ahead
+
+ asm volatile( // 2x2 upsample: each output = (9a + 3b + 3c + d + 8) >> 4
+ "vmov.u16 q15, #3 \n" // 16-bit weight 3 for the vertical pass
+ "vmov.u8 d28, #3 \n" // 8-bit weight 3 for the horizontal pass
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 01234567
+ "vld1.8 {d5}, [%5]! \n" // 12345678
+
+ "vmovl.u8 q0, d4 \n" // 01234567 (16b)
+ "vmovl.u8 q1, d5 \n" // 12345678 (16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n" // 01234567 (row 2)
+ "vld1.8 {d9}, [%6]! \n" // 12345678 (row 2)
+
+ "vmovl.u8 q2, d8 \n" // 01234567 (row 2, 16b)
+ "vmovl.u8 q3, d9 \n" // 12345678 (row 2, 16b)
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n" // copy row 2 odd; q2 is still needed below
+ "vmovq q5, q3 \n" // copy row 2 even; q3 is still needed below
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.8 {d0, d1}, [%2]! \n" // store output row 1, even/odd interleaved
+ "vst2.8 {d2, d3}, [%3]! \n" // store output row 2, even/odd interleaved
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, // source row of 16-bit samples
+ uint16_t* dst_ptr, // destination row, 16 samples written per pass
+ int dst_width) { // output sample count, reduced by 16 per pass
+ const uint16_t* src_temp = src_ptr + 1; // same row, one sample ahead
+ asm volatile( // 2x upsample in 16-bit lanes only; NOTE(review): presumably relies on <= 12-bit input so sums fit - confirm
+ "vmov.u16 q15, #3 \n" // weight 3 for the nearer sample
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n" // keep 12345678; q0 is overwritten next
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (even): 3*s[x] + s[x+1]
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (odd): s[x] + 3*s[x+1]
+
+ "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (even), rounded
+ "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (odd), rounded
+
+ "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store, interleaving q0 (even) with q1 (odd)
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, // first source row of 16-bit samples
+ ptrdiff_t src_stride, // offset, in uint16_t elements, to the second source row
+ uint16_t* dst_ptr, // first output row
+ ptrdiff_t dst_stride, // offset, in uint16_t elements, to the second output row
+ int dst_width) { // output samples per row, reduced by 16 per pass
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second output row
+ const uint16_t* src_temp = src_ptr + 1; // first row, one sample ahead
+ const uint16_t* src_temp1 = src_ptr1 + 1; // second row, one sample ahead
+
+ asm volatile( // 2x2 upsample in 16-bit lanes only; NOTE(review): presumably relies on <= 12-bit input so 16x-weighted sums fit - confirm
+ "vmov.u16 q15, #3 \n" // weight 3, used by both filter passes
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
+
+ "vmovq q2, q0 \n" // keep 01234567; q0 is overwritten next
+ "vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
+ "vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
+
+ "vld1.16 {q2}, [%1]! \n" // 01234567 (row 2, 16b)
+ "vld1.16 {q3}, [%6]! \n" // 12345678 (row 2, 16b)
+
+ "vmovq q4, q2 \n" // keep row 2 01234567; q2 is overwritten next
+ "vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
+ "vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
+
+ "vmovq q4, q2 \n" // copy row 2 odd; q2 is still needed below
+ "vmovq q5, q3 \n" // copy row 2 even; q3 is still needed below
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ "vrshr.u16 q2, q1, #4 \n" // 2, even
+ "vrshr.u16 q3, q0, #4 \n" // 2, odd
+ "vrshr.u16 q0, q5, #4 \n" // 1, even
+ "vrshr.u16 q1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store output row 1, q0 (even) / q1 (odd) interleaved
+ "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store output row 2, q2 (even) / q3 (odd) interleaved
+ "subs %4, %4, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
+// Upsamples one row of 16-bit samples to twice the width with a 3:1 linear
+// filter. Each loop pass reads 8 samples from src_ptr and 8 from src_ptr + 1
+// and stores 16 interleaved outputs:
+//   dst[2x]     = (3 * s[x] + s[x + 1] + 2) >> 2
+//   dst[2x + 1] = (s[x] + 3 * s[x + 1] + 2) >> 2
+// Sums are accumulated in 32-bit lanes, so full-range 16-bit input does not
+// overflow. dst_width is reduced by 16 per pass and the loop body always runs
+// at least once.
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1; // same row, one sample ahead
+ asm volatile(
+ "vmov.u16 d31, #3 \n" // weight 3 for the nearer sample
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
+ "vld1.16 {q1}, [%3]! \n" // 12345678 (16b)
+
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 4567 (32b)
+ "vmovl.u16 q4, d2 \n" // 1234 (32b)
+ "vmovl.u16 q5, d3 \n" // 5678 (32b)
+
+ "vmlal.u16 q2, d2, d31 \n" // 3*near+far (odd, low half)
+ "vmlal.u16 q3, d3, d31 \n" // 3*near+far (odd, high half)
+ "vmlal.u16 q4, d0, d31 \n" // 3*near+far (even, low half)
+ "vmlal.u16 q5, d1, d31 \n" // 3*near+far (even, high half)
+
+ "vrshrn.u32 d0, q4, #2 \n" // 3/4*near+1/4*far (even, low half)
+ "vrshrn.u32 d1, q5, #2 \n" // 3/4*near+1/4*far (even, high half)
+ "vrshrn.u32 d2, q2, #2 \n" // 3/4*near+1/4*far (odd, low half)
+ "vrshrn.u32 d3, q3, #2 \n" // 3/4*near+1/4*far (odd, high half)
+
+ "vst2.16 {q0, q1}, [%1]! \n" // store, interleaving q0 (even) with q1 (odd)
+ "subs %2, %2, #16 \n" // 8 sample -> 16 sample
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ // q3, q4 and q5 are written above and must be declared: q4/q5 overlap
+ // d8-d11, which are callee-saved, so the compiler has to know to
+ // preserve them.
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, // first source row of 16-bit samples
+ ptrdiff_t src_stride, // offset, in uint16_t elements, to the second source row
+ uint16_t* dst_ptr, // first output row
+ ptrdiff_t dst_stride, // offset, in uint16_t elements, to the second output row
+ int dst_width) { // output samples per row, reduced by 8 per pass
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second output row
+ const uint16_t* src_temp = src_ptr + 1; // first row, one sample ahead
+ const uint16_t* src_temp1 = src_ptr1 + 1; // second row, one sample ahead
+
+ asm volatile( // 2x2 upsample with 32-bit accumulation: (9a + 3b + 3c + d + 8) >> 4
+ "vmov.u16 d31, #3 \n" // 16-bit weight 3 for the widening horizontal pass
+ "vmov.u32 q14, #3 \n" // 32-bit weight 3 for the vertical pass
+
+ "1: \n"
+ "vld1.16 {d0}, [%0]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%5]! \n" // 1234 (16b)
+ "vmovl.u16 q2, d0 \n" // 0123 (32b)
+ "vmovl.u16 q3, d1 \n" // 1234 (32b)
+ "vmlal.u16 q2, d1, d31 \n" // 3*near+far (1, odd)
+ "vmlal.u16 q3, d0, d31 \n" // 3*near+far (1, even)
+
+ "vld1.16 {d0}, [%1]! \n" // 0123 (16b)
+ "vld1.16 {d1}, [%6]! \n" // 1234 (16b)
+ "vmovl.u16 q4, d0 \n" // 0123 (32b)
+ "vmovl.u16 q5, d1 \n" // 1234 (32b)
+ "vmlal.u16 q4, d1, d31 \n" // 3*near+far (2, odd)
+ "vmlal.u16 q5, d0, d31 \n" // 3*near+far (2, even)
+
+ "vmovq q0, q4 \n" // copy row 2 odd; q4 is overwritten next
+ "vmovq q1, q5 \n" // copy row 2 even; q5 is overwritten next
+ "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
+ "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
+ "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
+ "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
+
+ "vrshrn.u32 d1, q4, #4 \n" // 1, odd
+ "vrshrn.u32 d0, q5, #4 \n" // 1, even
+ "vrshrn.u32 d3, q2, #4 \n" // 2, odd
+ "vrshrn.u32 d2, q3, #4 \n" // 2, even
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store output row 1, even/odd interleaved
+ "vst2.16 {d2, d3}, [%3]! \n" // store output row 2, even/odd interleaved
+ "subs %4, %4, #8 \n" // 4 sample -> 8 sample
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, // source row of interleaved 8-bit U,V pairs
+ uint8_t* dst_ptr, // destination row, 8 UV pairs (16 bytes) written per pass
+ int dst_width) { // output UV pair count, reduced by 8 per pass
+ const uint8_t* src_temp = src_ptr + 2; // same row, one UV pair (2 bytes) ahead
+ asm volatile( // 2x horizontal upsample of UV pairs: each output = (3*near + far + 2) >> 2
+ "vmov.u8 d30, #3 \n" // weight 3 for the nearer pair
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
+ "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.16 {d0, d1}, [%1]! \n" // store; 16-bit interleave keeps each U,V byte pair together
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, // first source row of interleaved 8-bit U,V pairs
+ ptrdiff_t src_stride, // offset from src_ptr to the second source row
+ uint8_t* dst_ptr, // first output row
+ ptrdiff_t dst_stride, // offset from dst_ptr to the second output row
+ int dst_width) { // output UV pairs per row, reduced by 8 per pass
+ const uint8_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride; // second output row
+ const uint8_t* src_temp = src_ptr + 2; // first row, one UV pair (2 bytes) ahead
+ const uint8_t* src_temp1 = src_ptr1 + 2; // second row, one UV pair (2 bytes) ahead
+
+ asm volatile( // 2x2 upsample of UV pairs: each output = (9a + 3b + 3c + d + 8) >> 4
+ "vmov.u16 q15, #3 \n" // 16-bit weight 3 for the vertical pass
+ "vmov.u8 d28, #3 \n" // 8-bit weight 3 for the horizontal pass
+
+ "1: \n"
+ "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
+ "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
+ "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
+ "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
+
+ "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
+ "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
+ "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
+ "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
+
+ // e o
+ // q1 q0
+ // q3 q2
+
+ "vmovq q4, q2 \n" // copy row 2 odd; q2 is still needed below
+ "vmovq q5, q3 \n" // copy row 2 even; q3 is still needed below
+ "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
+ "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
+ "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
+ "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
+
+ // e o
+ // q5 q4
+ // q1 q0
+
+ "vrshrn.u16 d2, q1, #4 \n" // 2, even
+ "vrshrn.u16 d3, q0, #4 \n" // 2, odd
+ "vrshrn.u16 d0, q5, #4 \n" // 1, even
+ "vrshrn.u16 d1, q4, #4 \n" // 1, odd
+
+ "vst2.16 {d0, d1}, [%2]! \n" // store output row 1; 16-bit interleave keeps U,V together
+ "vst2.16 {d2, d3}, [%3]! \n" // store output row 2; 16-bit interleave keeps U,V together
+ "subs %4, %4, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
+ "q15" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, // source row of interleaved 16-bit U,V pairs
+ uint16_t* dst_ptr, // destination row, 8 UV pairs written per pass
+ int dst_width) { // output UV pair count, reduced by 8 per pass
+ const uint16_t* src_temp = src_ptr + 2; // same row, one UV pair (2 elements) ahead
+ asm volatile( // 2x horizontal upsample of UV pairs with 32-bit accumulation
+ "vmov.u16 d30, #3 \n" // weight 3 for the nearer pair
+
+ "1: \n"
+ "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
+ "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
+
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b)
+ "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b)
+ "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b)
+ "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even)
+ "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd)
+ "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even)
+
+ "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even)
+ "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd)
+ "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even)
+
+ "vst2.32 {d0, d1}, [%1]! \n" // store first 4 pairs; 32-bit interleave keeps U,V together
+ "vst2.32 {d2, d3}, [%1]! \n" // store next 4 pairs
+ "subs %2, %2, #8 \n" // 4 uv -> 8 uv
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_temp) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
+ "d30" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, // first source row of interleaved 16-bit U,V pairs
+ ptrdiff_t src_stride, // offset, in uint16_t elements, to the second source row
+ uint16_t* dst_ptr, // first output row
+ ptrdiff_t dst_stride, // offset, in uint16_t elements, to the second output row
+ int dst_width) { // output UV pairs per row, reduced by 4 per pass
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second output row
+ const uint16_t* src_temp = src_ptr + 2; // first row, one UV pair (2 elements) ahead
+ const uint16_t* src_temp1 = src_ptr1 + 2; // second row, one UV pair (2 elements) ahead
+
+ asm volatile( // 2x2 upsample of UV pairs with 32-bit accumulation: (9a + 3b + 3c + d + 8) >> 4
+ "vmov.u16 d30, #3 \n" // 16-bit weight 3 for the widening horizontal pass
+ "vmov.u32 q14, #3 \n" // 32-bit weight 3 for the vertical pass
+
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v); byte-wise load of 4 x 16-bit values
+ "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
+ "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd)
+ "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even)
+
+ "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v)
+ "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v)
+ "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b)
+ "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b)
+ "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd)
+ "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even)
+
+ "vmovq q0, q4 \n" // copy row 2 odd; q4 is overwritten next
+ "vmovq q1, q5 \n" // copy row 2 even; q5 is overwritten next
+ "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
+ "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
+ "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
+ "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
+
+ "vrshrn.u32 d1, q4, #4 \n" // 1, odd
+ "vrshrn.u32 d0, q5, #4 \n" // 1, even
+ "vrshrn.u32 d3, q2, #4 \n" // 2, odd
+ "vrshrn.u32 d2, q3, #4 \n" // 2, even
+
+ "vst2.32 {d0, d1}, [%2]! \n" // store output row 1; 32-bit interleave keeps U,V together
+ "vst2.32 {d2, d3}, [%3]! \n" // store output row 2; 32-bit interleave keeps U,V together
+ "subs %4, %4, #4 \n" // 2 uv -> 4 uv
+ "bgt 1b \n" // repeat while dst_width > 0 (body runs at least once)
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_ptr1), // %3
+ "+r"(dst_width), // %4
+ "+r"(src_temp), // %5
+ "+r"(src_temp1) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
+ "d30" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
@@ -511,13 +989,13 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "vld1.16 {q1, q2}, [%1] \n" // load accumulator
- "vld1.8 {q0}, [%0]! \n" // load 16 bytes
- "vaddw.u8 q2, q2, d1 \n" // add
- "vaddw.u8 q1, q1, d0 \n"
- "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bgt 1b \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -547,17 +1025,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_ptr;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q3, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "vadd.s32 q2, q1, q3 \n"
- "vshl.i32 q0, q3, #1 \n" // 8 * dx
- "1: \n"
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -566,27 +1044,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "vmov q10, q1 \n"
- "vmov q11, q2 \n"
- "vuzp.16 q10, q11 \n"
- "vmovl.u8 q8, d6 \n"
- "vmovl.u8 q9, d7 \n"
- "vsubl.s16 q11, d18, d16 \n"
- "vsubl.s16 q12, d19, d17 \n"
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q10, d21 \n"
- "vmul.s32 q11, q11, q13 \n"
- "vmul.s32 q12, q12, q10 \n"
- "vrshrn.s32 d18, q11, #16 \n"
- "vrshrn.s32 d19, q12, #16 \n"
- "vadd.s16 q8, q8, q9 \n"
- "vmovn.s16 d6, q8 \n"
-
- "vst1.8 {d6}, [%0]! \n" // store pixels
- "vadd.s32 q1, q1, q0 \n"
- "vadd.s32 q2, q2, q0 \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vrshrn.s32 d18, q11, #16 \n"
+ "vrshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -609,75 +1087,75 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
asm volatile(
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
-
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "vld1.8 {q0}, [%1]! \n"
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "vld1.8 {q1}, [%1]! \n"
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
"99: \n"
- "vst1.8 {d1[7]}, [%0] \n"
+ "vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -694,12 +1172,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vmov q2, q1 \n" // load next 8 ARGB
- "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -722,13 +1200,13 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vrhadd.u8 q0, q0, q1 \n" // rounding half add
- "vrhadd.u8 q1, q2, q3 \n" // rounding half add
- "vst2.32 {q0, q1}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -743,27 +1221,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -781,15 +1259,15 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
- "mov r12, %3, lsl #2 \n"
- "1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -805,30 +1283,30 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- "1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -865,8 +1343,8 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(d3, 1)
// clang-format on
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -897,16 +1375,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int* tmp = dx_offset;
const uint8_t* src_tmp = src_argb;
asm volatile (
- "vdup.32 q0, %3 \n" // x
- "vdup.32 q1, %4 \n" // dx
- "vld1.32 {q2}, [%5] \n" // 0 1 2 3
- "vshl.i32 q9, q1, #2 \n" // 4 * dx
- "vmul.s32 q1, q1, q2 \n"
- "vmov.i8 q3, #0x7f \n" // 0x7F
- "vmov.i16 q15, #0x7f \n" // 0x7F
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "vadd.s32 q8, q1, q0 \n"
- "1: \n"
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
@@ -950,6 +1428,64 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,  // first source row (interleaved U,V bytes)
+ ptrdiff_t src_stride,  // byte offset from src_ptr to the second source row
+ uint8_t* dst,  // output row, written as interleaved U,V bytes
+ int dst_width) {  // output UV pairs; 8 are produced per loop pass (presumably a multiple of 8 - confirm with callers)
+ asm volatile(
+ // 2x2 box filter on interleaved UV. First turn the stride (%1) into a pointer to row 2 by adding the row 1 address.
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // row 1: load 8 UV pairs, de-interleaved (U -> d0, V -> d2).
+ "vld2.8 {d1, d3}, [%0]! \n" // row 1: load next 8 UV pairs (U -> d1, V -> d3)
+ "subs %3, %3, #8 \n" // 8 output pairs processed per loop; sets flags for bgt.
+ "vpaddl.u8 q0, q0 \n" // U: add horizontally adjacent bytes, 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // V: add horizontally adjacent bytes, 16 bytes -> 8 shorts.
+ "vld2.8 {d16, d18}, [%1]! \n" // row 2: load 8 UV pairs (U -> d16, V -> d18)
+ "vld2.8 {d17, d19}, [%1]! \n" // row 2: load last 8 UV pairs (U -> d17, V -> d19)
+ "vpadal.u8 q0, q8 \n" // U: accumulate row 2 adjacent-pair sums into the row 1 sums.
+ "vpadal.u8 q1, q9 \n" // V: accumulate row 2 adjacent-pair sums into the row 1 sums.
+ "vrshrn.u16 d0, q0, #2 \n" // U: rounding shift right by 2 (sum of 4 -> average), pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n" // V: same rounding average, pack to bytes
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 averaged pairs, re-interleaved as U,V
+ "bgt 1b \n" // loop while the remaining count is > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1 (holds the row 2 pointer after the initial add)
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
+// Point-samples interleaved UV: copies every src_stepx-th 2-byte UV pair from one source row to dst_ptr. Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,  // unused; only a single source row is read
+ int src_stepx, // pixel step (in UV pairs; each pair is 2 bytes)
+ uint8_t* dst_ptr,
+ int dst_width) {  // output UV pairs; 4 are written per loop pass (presumably a multiple of 4 - confirm with callers)
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;  // 1 step past src_ptr
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;  // 2 steps past src_ptr
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;  // 3 steps past src_ptr
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "vld1.16 {d0[0]}, [%0], %6 \n" // load one UV pair into lane 0, then advance the pointer by %6 bytes
+ "vld1.16 {d0[1]}, [%1], %6 \n" // lane 1 from the pointer one step further along
+ "vld1.16 {d0[2]}, [%2], %6 \n" // lane 2
+ "vld1.16 {d0[3]}, [%3], %6 \n" // lane 3
+ "subs %5, %5, #4 \n" // 4 pixels per loop.
+ "vst1.8 {d0}, [%4]! \n" // store the 4 gathered UV pairs (8 bytes)
+ "bgt 1b \n" // loop while the remaining count is > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"(src_stepx * 8) // %6: per-iteration advance in bytes (4 steps of 2-byte pairs)
+ : "memory", "cc", "d0");
+}
+
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
index 0a7b80ce..9f9636e6 100644
--- a/files/source/scale_neon64.cc
+++ b/files/source/scale_neon64.cc
@@ -29,10 +29,11 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -50,11 +51,12 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -70,19 +72,21 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddlp v1.8h, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -99,10 +103,11 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -119,19 +124,23 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile(
"1: \n"
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- "ld1 {v1.16b}, [%2], #16 \n"
- "ld1 {v2.16b}, [%3], #16 \n"
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.8h, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "uadalp v0.8h, v2.16b \n"
+ "prfm pldl1keep, [%3, 448] \n"
+ "uadalp v0.8h, v3.16b \n"
+ "prfm pldl1keep, [%4, 448] \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_ptr1), // %2
@@ -151,12 +160,13 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -169,49 +179,51 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
-
- // (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
-
- "b.gt 1b \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // (3 * line_0 + line_1 + 2) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -226,33 +238,35 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
-
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+ "prfm pldl1keep, [%3, 448] \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@@ -279,14 +293,15 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- "st1 {v2.8b}, [%1], #8 \n"
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -303,68 +318,68 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v29.8h}, [%5] \n"
- "ld1 {v30.16b}, [%6] \n"
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -372,35 +387,38 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ "add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ "prfm pldl1keep, [%3, 448] \n"
// Align for table lookup, vtbl requires registers to be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -422,53 +440,53 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
asm volatile(
- "ld1 {v30.8h}, [%4] \n"
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
// 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
@@ -478,33 +496,35 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+ "prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- "st1 {v3.8b}, [%1], #8 \n"
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@@ -515,6 +535,488 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
+void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 1; // same row, one byte ahead: right-hand neighbour of each sample
+ asm volatile( // 2x horizontal upsample of bytes: every output is (3*near + far + 2) >> 2 of two adjacent inputs
+ "movi v31.8b, #3 \n" // v31 = 3 in every byte lane (weight of the near sample)
+
+ "1: \n" // each pass reads 8 bytes through each pointer and writes 16 bytes
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%1], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead (7 lines if a cache line is 64 bytes)
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd): v2 = v0 + 3*v1
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even): v3 = v1 + 3*v0
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd), rounded and narrowed to bytes
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even), rounded and narrowed to bytes
+
+ "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store, interleaving even and odd results
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 16 outputs per pass; presumably dst_width is a positive multiple of 16 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride; // second destination row
+ const uint8_t* src_temp = src_ptr + 1; // first source row, one byte ahead
+ const uint8_t* src_temp1 = src_ptr1 + 1; // second source row, one byte ahead
+
+ asm volatile( // 2x2 upsample of bytes: two source rows in, two destination rows out, weights 9/3/3/1 over 16
+ "movi v31.8b, #3 \n" // weight 3 for the byte-wide horizontal pass
+ "movi v30.8h, #3 \n" // weight 3 for the 16-bit vertical pass
+
+ "1: \n" // each pass reads 8 bytes per source pointer and writes 16 bytes per destination row
+ "ldr d0, [%0], #8 \n" // 01234567
+ "ldr d1, [%2], #8 \n" // 12345678
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // second row, 01234567
+ "ldr d1, [%3], #8 \n" // second row, 12345678
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
+ "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n" // keep row-2 horizontal sums; v4/v5 are overwritten next
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 2 (dst_ptr1)
+ "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 1 (dst_ptr)
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 16 outputs per row per pass; presumably dst_width is a positive multiple of 16 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1; // same row, one 16-bit sample ahead
+ asm volatile( // 2x horizontal upsample of 16-bit samples, computed entirely in 16-bit lanes
+ "movi v31.8h, #3 \n" // v31 = 3 in every 16-bit lane
+
+ "1: \n" // each pass reads 8 samples per pointer and writes 16 samples
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v2.16b, v0.16b \n" // keep the unmodified v0 for the second mla
+ "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd); no widening, so the sum must fit in 16 bits - presumably inputs are at most 12-bit, confirm
+ "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
+
+ "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store, interleaving even and odd results
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 16 outputs per pass; presumably dst_width is a positive multiple of 16 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row; stride is applied in uint16_t elements
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second destination row; stride is applied in uint16_t elements
+ const uint16_t* src_temp = src_ptr + 1; // first source row, one sample ahead
+ const uint16_t* src_temp1 = src_ptr1 + 1; // second source row, one sample ahead
+
+ asm volatile( // 2x2 upsample of 16-bit samples with weights 9/3/3/1 over 16, computed entirely in 16-bit lanes
+ "movi v31.8h, #3 \n" // v31 = 3 in every 16-bit lane, used for both passes
+
+ "1: \n" // each pass reads 8 samples per source pointer and writes 16 samples per destination row
+ "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v2.16b \n" // keep the unmodified v2 for the second mla
+ "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
+ "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "mov v0.16b, v4.16b \n" // keep the unmodified v4 for the second mla
+ "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
+ "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "mov v0.16b, v4.16b \n" // keep row-2 horizontal sums; v4/v5 are overwritten next
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd); sum of 16 weighted samples must fit in 16 bits - presumably inputs are at most 12-bit, confirm
+ "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
+
+ "urshr v2.8h, v2.8h, #4 \n" // 2, odd
+ "urshr v1.8h, v3.8h, #4 \n" // 2, even
+ "urshr v4.8h, v4.8h, #4 \n" // 1, odd
+ "urshr v3.8h, v5.8h, #4 \n" // 1, even
+
+ "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
+ "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
+
+ "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 16 outputs per row per pass; presumably dst_width is a positive multiple of 16 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 1; // same row, one 16-bit sample ahead
+ asm volatile( // 2x horizontal upsample of 16-bit samples; sums are widened to 32 bits so full-range input cannot overflow
+ "movi v31.8h, #3 \n" // v31 = 3 in every 16-bit lane (weight of the near sample)
+
+ "1: \n" // each pass reads 8 samples per pointer and writes 16 samples
+ "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b)
+ "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd)
+ "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even)
+
+ "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far
+ "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd)
+
+ "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store, interleaving even and odd results
+ "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 16 outputs per pass; presumably dst_width is a positive multiple of 16 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List: v3-v5 are written by ushll/umlal above and must be declared
+ );
+}
+
+void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row; stride is applied in uint16_t elements
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second destination row; stride is applied in uint16_t elements
+ const uint16_t* src_temp = src_ptr + 1; // first source row, one sample ahead
+ const uint16_t* src_temp1 = src_ptr1 + 1; // second source row, one sample ahead
+
+ asm volatile( // 2x2 upsample of 16-bit samples with weights 9/3/3/1 over 16, accumulated in 32-bit lanes
+ "movi v31.4h, #3 \n" // weight 3 for the 16-bit horizontal pass
+ "movi v30.4s, #3 \n" // weight 3 for the 32-bit vertical pass
+
+ "1: \n" // each pass reads 4 samples per source pointer and writes 8 samples per destination row
+ "ldr d0, [%0], #8 \n" // 0123 (16b)
+ "ldr d1, [%2], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // 0123 (16b)
+ "ldr d1, [%3], #8 \n" // 1234 (16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n" // keep row-2 horizontal sums; v4/v5 are overwritten next
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v4.4s, #4 \n" // 1, odd
+ "rshrn v0.4h, v5.4s, #4 \n" // 1, even
+ "rshrn v5.4h, v2.4s, #4 \n" // 2, odd
+ "rshrn v4.4h, v3.4s, #4 \n" // 2, even
+
+ "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1
+ "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2
+
+ "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 8 outputs per row per pass; presumably dst_width is a positive multiple of 8 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src_temp = src_ptr + 2; // same row, two bytes (one UV pair) ahead
+ asm volatile( // 2x horizontal upsample of interleaved byte pairs: neighbours are 2 bytes apart, outputs are (3*near + far + 2) >> 2
+ "movi v31.8b, #3 \n" // v31 = 3 in every byte lane (weight of the near sample)
+
+ "1: \n" // each pass reads 8 bytes (4 pairs) per pointer and writes 16 bytes (8 pairs)
+ "ldr d0, [%0], #8 \n" // 00112233 (1u1v)
+ "ldr d1, [%1], #8 \n" // 11223344 (1u1v)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
+ "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
+
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
+
+ "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store; .4h interleaves whole 2-byte pairs rather than single bytes
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 8 pairs per pass; presumably dst_width is a positive multiple of 8 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint8_t* src_ptr1 = src_ptr + src_stride; // second source row
+ uint8_t* dst_ptr1 = dst_ptr + dst_stride; // second destination row
+ const uint8_t* src_temp = src_ptr + 2; // first source row, two bytes (one UV pair) ahead
+ const uint8_t* src_temp1 = src_ptr1 + 2; // second source row, two bytes (one UV pair) ahead
+
+ asm volatile( // 2x2 upsample of interleaved byte pairs: two source rows in, two destination rows out, weights 9/3/3/1 over 16
+ "movi v31.8b, #3 \n" // weight 3 for the byte-wide horizontal pass
+ "movi v30.8h, #3 \n" // weight 3 for the 16-bit vertical pass
+
+ "1: \n" // each pass reads 8 bytes (4 pairs) per source pointer and writes 16 bytes (8 pairs) per destination row
+ "ldr d0, [%0], #8 \n" // row 1: 00112233 (1u1v)
+ "ldr d1, [%2], #8 \n" // row 1: 11223344 (1u1v)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.8h, v0.8b, #0 \n" // widen to 16 bits
+ "ushll v3.8h, v1.8b, #0 \n" // widen to 16 bits
+ "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
+ "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // row 2: 00112233 (1u1v)
+ "ldr d1, [%3], #8 \n" // row 2: 11223344 (1u1v)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v4.8h, v0.8b, #0 \n" // widen to 16 bits
+ "ushll v5.8h, v1.8b, #0 \n" // widen to 16 bits
+ "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
+ "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n" // keep row-2 horizontal sums; v4/v5 are overwritten next
+ "mov v1.16b, v5.16b \n"
+ "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
+ "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
+ "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
+ "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
+
+ "rshrn v2.8b, v2.8h, #4 \n" // 2, odd
+ "rshrn v1.8b, v3.8h, #4 \n" // 2, even
+ "rshrn v4.8b, v4.8h, #4 \n" // 1, odd
+ "rshrn v3.8b, v5.8h, #4 \n" // 1, even
+
+ "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
+ "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 8 pairs per row per pass; presumably dst_width is a positive multiple of 8 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ int dst_width) {
+ const uint16_t* src_temp = src_ptr + 2; // same row, two 16-bit elements (one UV pair) ahead
+ asm volatile( // 2x horizontal upsample of interleaved 16-bit pairs; sums are widened to 32 bits before rounding
+ "movi v31.8h, #3 \n" // v31 = 3 in every 16-bit lane (weight of the near sample)
+
+ "1: \n" // each pass reads 8 elements (4 pairs) per pointer and writes 16 elements (8 pairs)
+ "ld1 {v0.8h}, [%0], #16 \n" // 00112233 (1u1v, 16b)
+ "ld1 {v1.8h}, [%1], #16 \n" // 11223344 (1u1v, 16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
+ "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)
+
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
+ "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
+ "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)
+
+ "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
+ "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
+ "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
+
+ "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store low half; .2s interleaves whole 32-bit pairs
+ "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store high half
+ "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 8 pairs per pass; presumably dst_width is a positive multiple of 8 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_temp), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(dst_width) // %3
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v31" // Clobber List
+ );
+}
+
+void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst_ptr,
+ ptrdiff_t dst_stride,
+ int dst_width) {
+ const uint16_t* src_ptr1 = src_ptr + src_stride; // second source row; stride is applied in uint16_t elements
+ uint16_t* dst_ptr1 = dst_ptr + dst_stride; // second destination row; stride is applied in uint16_t elements
+ const uint16_t* src_temp = src_ptr + 2; // first source row, two elements (one UV pair) ahead
+ const uint16_t* src_temp1 = src_ptr1 + 2; // second source row, two elements (one UV pair) ahead
+
+ asm volatile( // 2x2 upsample of interleaved 16-bit pairs with weights 9/3/3/1 over 16, accumulated in 32-bit lanes
+ "movi v31.4h, #3 \n" // weight 3 for the 16-bit horizontal pass
+ "movi v30.4s, #3 \n" // weight 3 for the 32-bit vertical pass
+
+ "1: \n" // each pass reads 4 elements (2 pairs) per source pointer and writes 8 elements (4 pairs) per destination row
+ "ldr d0, [%0], #8 \n" // row 1: 0011 (1u1v, 16b)
+ "ldr d1, [%2], #8 \n" // row 1: 1122 (1u1v, 16b)
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
+ "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
+
+ "ldr d0, [%1], #8 \n" // row 2: 0011 (1u1v, 16b)
+ "ldr d1, [%3], #8 \n" // row 2: 1122 (1u1v, 16b)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
+ "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
+ "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
+ "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
+
+ "mov v0.16b, v4.16b \n" // keep row-2 horizontal sums; v4/v5 are overwritten next
+ "mov v1.16b, v5.16b \n"
+ "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
+ "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
+ "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
+ "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
+
+ "rshrn v1.4h, v2.4s, #4 \n" // 2, odd
+ "rshrn v0.4h, v3.4s, #4 \n" // 2, even
+ "rshrn v3.4h, v4.4s, #4 \n" // 1, odd
+ "rshrn v2.4h, v5.4s, #4 \n" // 1, even
+
+ "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
+ "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
+ "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
+ "b.gt 1b \n" // NOTE(review): body runs at least once, 4 pairs per row per pass; presumably dst_width is a positive multiple of 4 - confirm
+ : "+r"(src_ptr), // %0
+ "+r"(src_ptr1), // %1
+ "+r"(src_temp), // %2
+ "+r"(src_temp1), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_ptr1), // %5
+ "+r"(dst_width) // %6
+ : // no input-only operands
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
+ "v31" // Clobber List
+ );
+}
+
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
@@ -522,13 +1024,14 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
int src_width) {
asm volatile(
"1: \n"
- "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
- "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
- "uaddw v1.8h, v1.8h, v0.8b \n"
- "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@@ -560,17 +1063,17 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v3.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v1.4s, v1.4s, v0.4s \n"
+ "add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
- "add v2.4s, v1.4s, v3.4s \n"
- "shl v0.4s, v3.4s, #1 \n" // 8 * dx
- "1: \n"
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
@@ -579,27 +1082,27 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
- "mov v6.16b, v1.16b \n"
- "mov v7.16b, v2.16b \n"
- "uzp1 v6.8h, v6.8h, v7.8h \n"
- "ushll v4.8h, v4.8b, #0 \n"
- "ushll v5.8h, v5.8b, #0 \n"
- "ssubl v16.4s, v5.4h, v4.4h \n"
- "ssubl2 v17.4s, v5.8h, v4.8h \n"
- "ushll v7.4s, v6.4h, #0 \n"
- "ushll2 v6.4s, v6.8h, #0 \n"
- "mul v16.4s, v16.4s, v7.4s \n"
- "mul v17.4s, v17.4s, v6.4s \n"
- "rshrn v6.4h, v16.4s, #16 \n"
- "rshrn2 v6.8h, v17.4s, #16 \n"
- "add v4.8h, v4.8h, v6.8h \n"
- "xtn v4.8b, v4.8h \n"
-
- "st1 {v4.8b}, [%0], #8 \n" // store pixels
- "add v1.4s, v1.4s, v0.4s \n"
- "add v2.4s, v2.4s, v0.4s \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "rshrn v6.4h, v16.4s, #16 \n"
+ "rshrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@@ -623,74 +1126,83 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile(
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
-
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
// Blend 25 / 75.
"25: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
// Blend 50 / 50.
"50: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
// Blend 75 / 25.
"75: \n"
- "ld1 {v1.16b}, [%1], #16 \n"
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "prfm pldl1keep, [%2, 448] \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
- "st1 {v0.b}[15], [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
@@ -709,11 +1221,12 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "mov v2.16b, v3.16b \n"
- "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
@@ -730,13 +1243,14 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
// load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
- "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
-
- "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
- "urhadd v1.16b, v2.16b, v3.16b \n"
- "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
- "b.gt 1b \n"
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -751,25 +1265,27 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -788,13 +1304,14 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
(void)src_stride;
asm volatile(
"1: \n"
- "ld1 {v0.s}[0], [%0], %3 \n"
- "ld1 {v0.s}[1], [%0], %3 \n"
- "ld1 {v0.s}[2], [%0], %3 \n"
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
@@ -812,33 +1329,35 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
- "add %1, %1, %0 \n"
+ "add %1, %1, %0 \n"
"1: \n"
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
- "ld1 {v1.8b}, [%1], %4 \n"
- "ld1 {v2.8b}, [%0], %4 \n"
- "ld1 {v3.8b}, [%1], %4 \n"
- "ld1 {v4.8b}, [%0], %4 \n"
- "ld1 {v5.8b}, [%1], %4 \n"
- "ld1 {v6.8b}, [%0], %4 \n"
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
@@ -875,10 +1394,11 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
// clang-format on
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -911,16 +1431,16 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
int64_t x64 = (int64_t)x; // NOLINT
int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
- "dup v0.4s, %w3 \n" // x
- "dup v1.4s, %w4 \n" // dx
- "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
- "shl v6.4s, v1.4s, #2 \n" // 4 * dx
- "mul v1.4s, v1.4s, v2.4s \n"
- "movi v3.16b, #0x7f \n" // 0x7F
- "movi v4.8h, #0x7f \n" // 0x7F
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
- "add v5.4s, v1.4s, v0.4s \n"
- "1: \n"
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
@@ -941,15 +1461,15 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
+ "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
-
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@@ -972,19 +1492,21 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
- "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
- "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #8 \n" // 8 processed per loop
- "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
- "uaddlp v1.4s, v1.8h \n"
- "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
- "uadalp v1.4s, v3.8h \n"
- "rshrn v0.4h, v0.4s, #2 \n" // round and pack
- "rshrn2 v0.8h, v1.4s, #2 \n"
- "st1 {v0.8h}, [%2], #16 \n"
- "b.gt 1b \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1001,38 +1523,40 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
uint16_t* dst,
int dst_width) {
asm volatile(
- "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
- "movi v0.8h, #9 \n" // constants
- "movi v1.4s, #3 \n"
+ "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
"1: \n"
- "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
- "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
- "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
- "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
- "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
- "umull v16.4s, v3.4h, v0.4h \n"
- "umull2 v7.4s, v3.8h, v0.8h \n"
- "umull v18.4s, v4.4h, v0.4h \n"
- "umull2 v17.4s, v4.8h, v0.8h \n"
- "uaddw v16.4s, v16.4s, v6.4h \n"
- "uaddl2 v19.4s, v6.8h, v3.8h \n"
- "uaddl v3.4s, v6.4h, v3.4h \n"
- "uaddw2 v6.4s, v7.4s, v6.8h \n"
- "uaddl2 v7.4s, v5.8h, v4.8h \n"
- "uaddl v4.4s, v5.4h, v4.4h \n"
- "uaddw v18.4s, v18.4s, v5.4h \n"
- "mla v16.4s, v4.4s, v1.4s \n"
- "mla v18.4s, v3.4s, v1.4s \n"
- "mla v6.4s, v7.4s, v1.4s \n"
- "uaddw2 v4.4s, v17.4s, v5.8h \n"
- "uqrshrn v16.4h, v16.4s, #4 \n"
- "mla v4.4s, v19.4s, v1.4s \n"
- "uqrshrn2 v16.8h, v6.4s, #4 \n"
- "uqrshrn v17.4h, v18.4s, #4 \n"
- "uqrshrn2 v17.8h, v4.4s, #4 \n"
- "st2 {v16.8h-v17.8h}, [%2], #32 \n"
- "b.gt 1b \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "prfm pldl1keep, [%1, 448] \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
@@ -1044,6 +1568,64 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
);
}
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,  // 2x2 box downsample of interleaved UV
+ ptrdiff_t src_stride,  // added to src_ptr below to form the row 2 pointer
+ uint8_t* dst,
+ int dst_width) {  // counted down by 8 per loop iteration
+ asm volatile(
+ // %1 = src_stride + src_ptr: the stride register becomes the row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV from row 1 (v0 = U, v1 = V)
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
+ "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 UV from row 2
+ "uadalp v0.8h, v16.16b \n" // add row 2 U pairs onto the row 1 sums
+ "uadalp v1.8h, v17.16b \n" // add row 2 V pairs onto the row 1 sums
+ "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack: 4-sample sum back to bytes
+ "prfm pldl1keep, [%1, 448] \n"
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 re-interleaved UV pairs
+ "b.gt 1b \n" // repeat while the count left by subs is > 0
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
+// Point-samples UV pairs spaced src_stepx pixels apart. Reads 4 pixels at a time.
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,  // unused: see the (void) cast below
+ int src_stepx, // pixel step
+ uint8_t* dst_ptr,
+ int dst_width) {
+ const uint8_t* src1_ptr = src_ptr + src_stepx * 2;  // 2nd sample (2 bytes per UV pixel)
+ const uint8_t* src2_ptr = src_ptr + src_stepx * 4;  // 3rd sample
+ const uint8_t* src3_ptr = src_ptr + src_stepx * 6;  // 4th sample
+ (void)src_stride;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.h}[0], [%0], %6 \n"  // load one 16 bit UV pair, then advance by %6 bytes
+ "ld1 {v1.h}[0], [%1], %6 \n"
+ "ld1 {v2.h}[0], [%2], %6 \n"
+ "ld1 {v3.h}[0], [%3], %6 \n"
+ "subs %w5, %w5, #4 \n" // 4 pixels per loop.
+ "st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"  // store the 4 sampled UV pairs (8 bytes)
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src1_ptr), // %1
+ "+r"(src2_ptr), // %2
+ "+r"(src3_ptr), // %3
+ "+r"(dst_ptr), // %4
+ "+r"(dst_width) // %5
+ : "r"((int64_t)(src_stepx * 8)) // %6: bytes per loop = 4 samples * src_stepx * 2
+ : "memory", "cc", "v0", "v1", "v2", "v3");
+}
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/scale_rgb.cc b/files/source/scale_rgb.cc
new file mode 100644
index 00000000..8db59b56
--- /dev/null
+++ b/files/source/scale_rgb.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h" /* For FilterMode */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/scale_rgb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Scale a 24 bit image.
+// Converts to ARGB as intermediate step: RGB24 -> ARGB -> ARGBScale -> RGB24,
+// using a single temporary allocation that holds both ARGB images.
+//
+// A negative src_height is forwarded to RGB24ToARGB; the scale step then uses
+// the positive row count of the intermediate image. dst_height must be > 0.
+//
+// Returns 0 on success, -1 when an argument is invalid or the temporary
+// buffer size cannot be represented, 1 when the temporary buffer cannot be
+// allocated, otherwise the non-zero result of the failing conversion step.
+
+LIBYUV_API
+int RGBScale(const uint8_t* src_rgb,
+             int src_stride_rgb,
+             int src_width,
+             int src_height,
+             uint8_t* dst_rgb,
+             int dst_stride_rgb,
+             int dst_width,
+             int dst_height,
+             enum FilterMode filtering) {
+  int r;
+  // Widths are limited to INT_MAX / 4 because width * 4 is passed on as an
+  // int row stride. INT_MIN is rejected because it cannot be negated.
+  if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
+      src_height == 0 || src_height == INT_MIN || dst_width <= 0 ||
+      dst_width > INT_MAX / 4 || dst_height <= 0) {
+    return -1;
+  }
+  const int abs_src_height = (src_height < 0) ? -src_height : src_height;
+
+  // Size the temporary buffer in 64 bits: with the bounds above each product
+  // is below 2^62, so neither the products nor their sum can wrap.
+  const uint64_t src_argb_size = (uint64_t)src_width * abs_src_height * 4;
+  const uint64_t dst_argb_size = (uint64_t)dst_width * dst_height * 4;
+  const uint64_t argb_size = src_argb_size + dst_argb_size;
+  if (argb_size > SIZE_MAX) {
+    return -1;  // Does not fit in size_t on this platform.
+  }
+
+  uint8_t* src_argb = (uint8_t*)malloc((size_t)argb_size);
+  if (!src_argb) {
+    return 1;  // Out of memory.
+  }
+  // Only derive the second image pointer once the allocation is known good.
+  uint8_t* dst_argb = src_argb + (size_t)src_argb_size;
+
+  r = RGB24ToARGB(src_rgb, src_stride_rgb, src_argb, src_width * 4, src_width,
+                  src_height);
+  if (!r) {
+    // The intermediate image always has abs_src_height rows stored top-down.
+    r = ARGBScale(src_argb, src_width * 4, src_width, abs_src_height, dst_argb,
+                  dst_width * 4, dst_width, dst_height, filtering);
+    if (!r) {
+      r = ARGBToRGB24(dst_argb, dst_width * 4, dst_rgb, dst_stride_rgb,
+                      dst_width, dst_height);
+    }
+  }
+  free(src_argb);
+  return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_uv.cc b/files/source/scale_uv.cc
new file mode 100644
index 00000000..3b3d7b8e
--- /dev/null
+++ b/files/source/scale_uv.cc
@@ -0,0 +1,1161 @@
+/*
+ * Copyright 2020 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyUV
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Macros to enable specialized scalers
+
+#ifndef HAS_SCALEUVDOWN2
+#define HAS_SCALEUVDOWN2 1
+#endif
+#ifndef HAS_SCALEUVDOWN4BOX
+#define HAS_SCALEUVDOWN4BOX 1
+#endif
+#ifndef HAS_SCALEUVDOWNEVEN
+#define HAS_SCALEUVDOWNEVEN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARDOWN
+#define HAS_SCALEUVBILINEARDOWN 1
+#endif
+#ifndef HAS_SCALEUVBILINEARUP
+#define HAS_SCALEUVBILINEARUP 1
+#endif
+#ifndef HAS_UVCOPY
+#define HAS_UVCOPY 1
+#endif
+#ifndef HAS_SCALEPLANEVERTICAL
+#define HAS_SCALEPLANEVERTICAL 1
+#endif
+
+static __inline int Abs(int v) {  // magnitude of v
+ return v >= 0 ? v : -v;  // negates negative inputs; no special case for the most negative int
+}
+
+// ScaleUV, 1/2
+// Scales interleaved UV down to 1/2 width: one row kernel call per output row,
+// with the source advanced by src_stride * (dy >> 16) bytes between rows.
+#if HAS_SCALEUVDOWN2
+static void ScaleUVDown2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);  // source bytes to skip per output row
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_C
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
+ : ScaleUVRowDown2Box_C);  // C default; may be replaced below
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2;
+ } else {
+ src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2;  // one UV pair (2 bytes) earlier
+ }
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && filtering) {  // box kernel serves every filtered mode
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && filtering) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+ : ScaleUVRowDown2Box_Any_SSSE3);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_SSSE3
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+ : ScaleUVRowDown2Box_SSSE3);
+ }
+ }
+#endif
+// This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
+ : ScaleUVRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_NEON
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
+ : ScaleUVRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_Any_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
+ : ScaleUVRowDown2Box_Any_MSA);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleUVRowDown2 =
+ filtering == kFilterNone
+ ? ScaleUVRowDown2_MSA
+ : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
+ : ScaleUVRowDown2Box_MSA);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;  // row kernel gets a zero stride; row_stride was computed earlier and is unaffected
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWN2
+
+// ScaleUV, 1/4
+// Scales UV down to 1/4 size by running the 1/2 box row kernel twice: first
+// on two pairs of source rows, then on the two intermediate rows it produced.
+#if HAS_SCALEUVDOWN4BOX
+static void ScaleUVDown4Box(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;  // 2 * dst_width UV pairs, rounded up to 16 bytes
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);  // source bytes to skip per output row
+ void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowDown2Box_C;  // C default; may be replaced below
+ // Advance to odd row, even column.
+ src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2;
+ (void)src_width;
+ (void)src_height;
+ (void)dx;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);  // source rows 0-1 -> first buffer row
+ ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
+ dst_width * 2);  // source rows 2-3 -> second buffer row
+ ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);  // halve the two buffer rows into dst
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+#endif // HAS_SCALEUVDOWN4BOX
+
+// ScaleUV Even
+// Scales UV down by an integer step: one row kernel call per output row,
+// with the kernel given col_step = dx >> 16 as its source pixel step.
+#if HAS_SCALEUVDOWNEVEN
+static void ScaleUVDownEven(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;  // integer part of the 16.16 horizontal step
+ int row_stride = (dy >> 16) * (int64_t)src_stride;  // source bytes to skip per output row
+ void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_uv, int dst_width) =
+ filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C;  // C default
+ (void)src_width;
+ (void)src_height;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2;  // start row and column (2 bytes per UV)
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+ : ScaleUVRowDownEven_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;  // NOTE(review): _SSE2 beside _SSSE3 names - confirm intended
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && !filtering) {  // unfiltered only
+ ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif // TODO(fbarchard): Enable Box filter
+#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
+ : ScaleUVRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVRowDownEven =
+ filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;  // row kernel gets a zero stride; row_stride above is unaffected
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width);
+ src_uv += row_stride;
+ dst_uv += dst_stride;
+ }
+}
+#endif // HAS_SCALEUVDOWNEVEN
+
+// Scale UV down with bilinear interpolation: rows are blended into a temporary row, then columns are filtered.
+#if HAS_SCALEUVBILINEARDOWN
+static void ScaleUVBilinearDown(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C;  // 64 variant for wide sources
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;  // 16.16 source x of the last output pixel
+ int64_t xl = (dx >= 0) ? x : xlast;  // leftmost source x (dx may be negative)
+ int64_t xr = (dx >= 0) ? xlast : x;  // rightmost source x
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2.
+ src_uv += xl * 2;  // skip the columns left of the clipped span (2 bytes per UV)
+ x -= (int)(xl << 16);  // make x relative to the clipped span
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of UV.
+ {
+ align_buffer_64(row, clip_src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;  // 16.16 position of the last source row
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;  // integer source row
+ const uint8_t* src = src_uv + yi * (int64_t)src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleUVFilterCols(dst_uv, src, dst_width, x, dx);  // columns only, straight from the source row
+ } else {
+ int yf = (y >> 8) & 255;  // top 8 bits of the 16 bit row fraction
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleUVFilterCols(dst_uv, row, dst_width, x, dx);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARDOWN
+
+// Scale UV up with bilinear interpolation: columns are scaled into a 2 row buffer, then rows are blended into dst.
+#if HAS_SCALEUVBILINEARUP
+static void ScaleUVBilinearUp(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleUVFilterCols_C : ScaleUVCols_C;  // C default; may be replaced below
+ const int max_y = (src_height - 1) << 16;  // 16.16 position of the last source row
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_LSX)
+ if (TestCpuFlag(kCpuHasLSX)) {
+ InterpolateRow = InterpolateRow_Any_LSX;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_LSX;
+ }
+ }
+#endif
+ if (src_width >= 32768) {  // wide sources use the 64 variants
+ ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;
+ }
+#if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVFilterCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVFilterCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleUVFilterCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVFilterCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {  // exact 2x, unfiltered
+ ScaleUVFilterCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;  // integer source row
+ const uint8_t* src = src_uv + yi * (int64_t)src_stride;
+
+ // Allocate 2 rows of UV.
+ const int kRowSize = (dst_width * 2 + 15) & ~15;  // dst_width UV pairs, rounded up to 16 bytes
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8_t* rowptr = row;  // buffer row paired with the current source row
+ int rowstride = kRowSize;  // signed offset from rowptr to the other buffer row
+ int lasty = yi;
+
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);  // prime the first buffer row
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx);  // prime the second buffer row
+ if (src_height > 2) {
+ src += src_stride;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {  // moved on to a new source row
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_uv + yi * (int64_t)src_stride;
+ }
+ if (yi != lasty) {
+ ScaleUVFilterCols(rowptr, src, dst_width, x, dx);  // overwrite the older buffer row
+ rowptr += rowstride;  // swap the roles of the two buffer rows
+ rowstride = -rowstride;
+ lasty = yi;
+ if ((y + 65536) < max_y) {
+ src += src_stride;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0);  // stride 0, fraction 0: no second row involved
+ } else {
+ int yf = (y >> 8) & 255;  // top 8 bits of the 16 bit row fraction
+ InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf);
+ }
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+#endif // HAS_SCALEUVBILINEARUP
+
+// Scale UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// Each output row is produced by one row kernel call on a single source row;
+// the source row index is stepped in 16.16 fixed point.
+// This is used to scale U and V planes of NV16 to NV24.
+void ScaleUVLinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv) {
+ void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_Any_C;  // C default; may be replaced below
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv,
+ dst_width);  // single output row: use source row (src_height - 1) / 2
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;  // starting row position in 16.16 (just under one half)
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width);
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale plane, up by 2 times.
+// The row kernel takes a source row pair and writes a destination row pair;
+// the first (and, for even dst_height, last) row is done with both strides 0.
+// This is used to scale U and V planes of NV12 to NV24.
+void ScaleUVBilinearUp2(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_Any_C;  // C default; may be replaced below
+ int x;  // row counter for the loop below
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);  // first dst row from the first source row only
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);  // 2 source rows -> 2 dst rows
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {  // even dst_height: one more row from the last source row
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale 16 bit UV, horizontally up by 2 times.
+// Uses linear filter horizontally, nearest vertically.
+// Each output row is produced by one row kernel call on a single source row;
+// the source row index is stepped in 16.16 fixed point.
+// This is used to scale U and V planes of P210 to P410.
+void ScaleUVLinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_uv,
+ uint16_t* dst_uv) {
+ void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
+ ScaleUVRowUp2_Linear_16_Any_C;  // C default; may be replaced below
+ int i;
+ int y;
+ int dy;
+
+ // This function can only scale up by 2 times horizontally.
+ assert(src_width == ((dst_width + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
+ }
+#endif
+
+ if (dst_height == 1) {
+ ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv,
+ dst_width);  // single output row: use source row (src_height - 1) / 2
+ } else {
+ dy = FixedDiv(src_height - 1, dst_height - 1);
+ y = (1 << 15) - 1;  // starting row position in 16.16 (just under one half)
+ for (i = 0; i < dst_height; ++i) {
+ ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width);  // offsets here are in uint16_t elements
+ dst_uv += dst_stride;
+ y += dy;
+ }
+ }
+}
+
+// Scale 16 bit UV, up by 2 times.
+// The row kernel takes a source row pair and writes a destination row pair;
+// the first (and, for even dst_height, last) row is done with both strides 0.
+// This is used to scale U and V planes of P010 to P410.
+void ScaleUVBilinearUp2_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
+ void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
+ ScaleUVRowUp2_Bilinear_16_Any_C;  // C default; may be replaced below
+ int x;  // row counter for the loop below
+
+ // This function can only scale up by 2 times.
+ assert(src_width == ((dst_width + 1) / 2));
+ assert(src_height == ((dst_height + 1) / 2));
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
+ }
+#endif
+
+#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
+ if (TestCpuFlag(kCpuHasNEON)) {
+ Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
+ }
+#endif
+
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);  // first dst row from the first source row only
+ dst_ptr += dst_stride;
+ for (x = 0; x < src_height - 1; ++x) {
+ Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);  // 2 source rows -> 2 dst rows
+ src_ptr += src_stride;
+ // TODO(fbarchard): Test performance of writing one row of destination at a
+ // time.
+ dst_ptr += 2 * dst_stride;
+ }
+ if (!(dst_height & 1)) {  // even dst_height: one more row from the last source row
+ Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
+ }
+}
+
+// Scale UV to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part. y and dy are used the same way.
+
+static void ScaleUVSimple(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ int src_stride,
+ int dst_stride,
+ const uint8_t* src_uv,
+ uint8_t* dst_uv,
+ int x,
+ int dx,
+ int y,
+ int dy) {
+ int j;
+ void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
+ int x, int dx) =
+ (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;  // 64 variant for wide sources
+ (void)src_height;
+#if defined(HAS_SCALEUVCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleUVCols = ScaleUVCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleUVCols = ScaleUVCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEUVCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleUVCols = ScaleUVCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleUVCols = ScaleUVCols_MSA;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {  // exact 2x horizontal upscale
+ ScaleUVCols = ScaleUVColsUp2_C;
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
+ ScaleUVCols = ScaleUVColsUp2_SSSE3;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x,
+ dx);  // source row = integer part of y
+ dst_uv += dst_stride;
+ y += dy;
+ }
+}
+
+// Copy UV with optional flipping
+#if HAS_UVCOPY
+// Copies a width x height image of interleaved UV pairs (2 bytes per pair)
+// with CopyPlane. A negative height starts at the last source row and walks
+// upwards, so the rows are copied in reverse order.
+// Returns 0 on success, -1 for a null pointer, width <= 0 or height == 0.
+static int UVCopy(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
+  if (width <= 0 || height == 0 || !src_uv || !dst_uv) {
+    return -1;
+  }
+
+  if (height < 0) {
+    // Inverted image: begin at the final row and negate the source stride.
+    const int rows = -height;
+    const uint8_t* last_row = src_uv + (rows - 1) * (int64_t)src_stride_uv;
+    CopyPlane(last_row, -src_stride_uv, dst_uv, dst_stride_uv, width * 2,
+              rows);
+  } else {
+    CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
+  }
+  return 0;
+}
+
+// 16 bit counterpart of UVCopy: copies a width x height image of interleaved
+// UV pairs (2 uint16_t values per pair) with CopyPlane_16. A negative height
+// starts at the last source row and walks upwards.
+// Returns 0 on success, -1 for a null pointer, width <= 0 or height == 0.
+static int UVCopy_16(const uint16_t* src_uv,
+                     int src_stride_uv,
+                     uint16_t* dst_uv,
+                     int dst_stride_uv,
+                     int width,
+                     int height) {
+  if (width <= 0 || height == 0 || !src_uv || !dst_uv) {
+    return -1;
+  }
+
+  if (height < 0) {
+    // Inverted image: begin at the final row and negate the source stride.
+    const int rows = -height;
+    const uint16_t* last_row = src_uv + (rows - 1) * (int64_t)src_stride_uv;
+    CopyPlane_16(last_row, -src_stride_uv, dst_uv, dst_stride_uv, width * 2,
+                 rows);
+  } else {
+    CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2,
+                 height);
+  }
+  return 0;
+}
+#endif // HAS_UVCOPY
+
+// Scale a UV plane (from NV12).
+// This function in turn calls a scaling function suitable for handling the
+// desired resolutions.
+//
+// src, src_stride, src_width, src_height describe the source plane; a
+// negative src_height means the image is inverted.
+// dst, dst_stride, dst_width, dst_height describe the destination plane.
+// clip_x and clip_y advance dst (by clip_x UV pixels and clip_y rows) and the
+// matching source position; clip_width and clip_height are the output
+// dimensions handed to the selected scaler.
+// filtering is the requested filter; it is first simplified by
+// ScaleFilterReduce().
+static void ScaleUV(const uint8_t* src,
+                    int src_stride,
+                    int src_width,
+                    int src_height,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int dst_width,
+                    int dst_height,
+                    int clip_x,
+                    int clip_y,
+                    int clip_width,
+                    int clip_height,
+                    enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // UV does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * (int64_t)src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    // Integer part of the clipped offset moves src (2 bytes per UV pixel);
+    // the fractional part is folded into the starting x.
+    int64_t clipf = (int64_t)(clip_x)*dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 2;
+    dst += clip_x * 2;
+  }
+  if (clip_y) {
+    int64_t clipf = (int64_t)(clip_y)*dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * (int64_t)src_stride;
+    // Widen before multiplying, like the source row offsets, so the row
+    // offset is not computed in int.
+    dst += clip_y * (int64_t)dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+#if HAS_SCALEUVDOWN2
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleUVDown2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst, x, dx, y, dy,
+                       filtering);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWN4BOX
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleUVDown4Box(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy);
+          return;
+        }
+#endif
+#if HAS_SCALEUVDOWNEVEN
+        ScaleUVDownEven(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+        return;
+#endif
+      }
+      // Optimized odd scale down. ie 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        // Use the same #if test that guards the UVCopy definition, so that
+        // HAS_UVCOPY defined as 0 disables both consistently.
+#if HAS_UVCOPY
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2,
+                 src_stride, dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+#endif
+      }
+    }
+  }
+  // Vertical-only scaling (ScalePlaneVertical).
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering);
+    return;
+  }
+  // 2x upsample in both directions with bilinear or box filtering. This is
+  // tested before the horizontal 2x case below because that test accepts any
+  // non-zero filter mode and would otherwise shadow this branch whenever
+  // clip_width equals dst_width (as it does when called from UVScale).
+  if ((clip_height + 1) / 2 == src_height &&
+      (clip_width + 1) / 2 == src_width &&
+      (filtering == kFilterBilinear || filtering == kFilterBox)) {
+    ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
+                       src_stride, dst_stride, src, dst);
+    return;
+  }
+  // 2x upsample horizontally with any filtering enabled.
+  if (filtering && (dst_width + 1) / 2 == src_width) {
+    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
+                     dst_stride, src, dst);
+    return;
+  }
+#if HAS_SCALEUVBILINEARUP
+  // Filtered scaling with a vertical step below 1.0 (dy < 65536 in 16.16).
+  if (filtering && dy < 65536) {
+    ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,
+                      src_stride, dst_stride, src, dst, x, dx, y, dy,
+                      filtering);
+    return;
+  }
+#endif
+#if HAS_SCALEUVBILINEARDOWN
+  if (filtering) {
+    ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
+    return;
+  }
+#endif
+  // Fallback when no specialized path applied.
+  ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                dst_stride, src, dst, x, dx, y, dy);
+}
+
+// Scale a UV image.
+// Validates the arguments and then scales the whole destination with
+// ScaleUV(), passing a clip rectangle of (0, 0, dst_width, dst_height).
+// Returns 0 on success, or -1 if an argument is rejected.
+LIBYUV_API
+int UVScale(const uint8_t* src_uv,
+            int src_stride_uv,
+            int src_width,
+            int src_height,
+            uint8_t* dst_uv,
+            int dst_stride_uv,
+            int dst_width,
+            int dst_height,
+            enum FilterMode filtering) {
+  // Both planes are required.
+  if (!src_uv || !dst_uv) {
+    return -1;
+  }
+  // Source width must be 1..32768; source height must be non-zero and at
+  // most 32768 (negative values are passed through to ScaleUV).
+  if (src_width <= 0 || src_width > 32768 || src_height == 0 ||
+      src_height > 32768) {
+    return -1;
+  }
+  // Destination dimensions must be positive.
+  if (dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv,
+          dst_width, dst_height, /*clip_x=*/0, /*clip_y=*/0,
+          /*clip_width=*/dst_width, /*clip_height=*/dst_height, filtering);
+  return 0;
+}
+
+// Scale a 16 bit UV image.
+// This function is currently incomplete, it can't handle all cases.
+// Handled cases, in the order they are tested:
+//  - no filtering, equal widths and src_height a multiple of dst_height:
+//    rows are picked from the source with UVCopy_16 (only if HAS_UVCOPY);
+//  - bilinear or box filtering with both dimensions doubled:
+//    ScaleUVBilinearUp2_16;
+//  - any filtering with the width doubled: ScaleUVLinearUp2_16.
+// Returns 0 on success, or -1 for invalid arguments or an unhandled case.
+LIBYUV_API
+int UVScale_16(const uint16_t* src_uv,
+               int src_stride_uv,
+               int src_width,
+               int src_height,
+               uint16_t* dst_uv,
+               int dst_stride_uv,
+               int dst_width,
+               int dst_height,
+               enum FilterMode filtering) {
+  if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  // UV does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+  src_width = Abs(src_width);
+
+  // Use the same #if test that guards the UVCopy_16 definition, so that
+  // HAS_UVCOPY defined as 0 disables both consistently.
+#if HAS_UVCOPY
+  if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
+    if (dst_height == 1) {
+      // Single output row: copy the middle source row.
+      UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv,
+                src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height);
+    } else {
+      // Copy every dy-th source row, starting (dy - 1) / 2 rows in. The
+      // 64 bit stride product is converted to UVCopy_16's int stride
+      // parameter.
+      int dy = src_height / dst_height;
+      UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv,
+                dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width,
+                dst_height);
+    }
+
+    return 0;
+  }
+#endif
+
+  // 2x upsample in both directions with bilinear or box filtering. This is
+  // tested before the horizontal 2x case below because that test accepts any
+  // non-zero filter mode and would otherwise make this branch unreachable.
+  if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
+      (filtering == kFilterBilinear || filtering == kFilterBox)) {
+    ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
+                          src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+    return 0;
+  }
+
+  // 2x upsample horizontally with any filtering enabled.
+  if (filtering && (dst_width + 1) / 2 == src_width) {
+    ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
+                        src_stride_uv, dst_stride_uv, src_uv, dst_uv);
+    return 0;
+  }
+
+  // Unhandled combination of sizes and filtering.
+  return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
index c5fc86f3..ea1f95c6 100644
--- a/files/source/scale_win.cc
+++ b/files/source/scale_win.cc
@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for 32 bit Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ !defined(__clang__) && defined(_M_IX86)
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
diff --git a/files/source/test.sh b/files/source/test.sh
new file mode 100755
index 00000000..7f12c3c1
--- /dev/null
+++ b/files/source/test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -x
+
+# Profiles one libyuv_test benchmark with perf, then prints the perf report
+# lines that contain "AVX".
+# $1: test name; used as the gunit filter "*$1". The filter is quoted so the
+# shell passes the pattern to the test binary instead of expanding it.
+function runbenchmark1 {
+  perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter="*$1" --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
+  perf report | grep AVX
+}
+
+runbenchmark1 ABGRToI420
+runbenchmark1 Android420ToI420
+runbenchmark1 ARGBToI420
+runbenchmark1 Convert16To8Plane
+runbenchmark1 ConvertToARGB
+runbenchmark1 ConvertToI420
+runbenchmark1 CopyPlane
+runbenchmark1 H010ToAB30
+runbenchmark1 H010ToAR30
+runbenchmark1 HalfFloatPlane
+runbenchmark1 I010ToAB30
+runbenchmark1 I010ToAR30
+runbenchmark1 I420Copy
+runbenchmark1 I420Psnr
+runbenchmark1 I420Scale
+runbenchmark1 I420Ssim
+runbenchmark1 I420ToARGB
+runbenchmark1 I420ToNV12
+runbenchmark1 I420ToUYVY
+runbenchmark1 I422ToI420
+runbenchmark1 InitCpuFlags
+runbenchmark1 J420ToARGB
+runbenchmark1 NV12ToARGB
+runbenchmark1 NV12ToI420
+runbenchmark1 NV12ToI420Rotate
+runbenchmark1 SetCpuFlags
+runbenchmark1 YUY2ToI420