diff options
Diffstat (limited to 'source/row_gcc.cc')
-rw-r--r-- | source/row_gcc.cc | 108 |
1 files changed, 97 insertions, 11 deletions
diff --git a/source/row_gcc.cc b/source/row_gcc.cc index cf87d46e..faf0fc91 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3653,22 +3653,18 @@ void MergeUVRow_SSE2(const uint8_t* src_u, } #endif // HAS_MERGEUVROW_SSE2 -// Use scale to convert lsb formats to msb, depending how many bits there are: -// 128 = 9 bits -// 64 = 10 bits -// 16 = 12 bits -// 1 = 16 bits #ifdef HAS_MERGEUVROW_16_AVX2 void MergeUVRow_16_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, - int scale, + int depth, int width) { + depth = 16 - depth; // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" + "vbroadcastss %%xmm3,%%xmm3 \n" "sub %0,%1 \n" // 16 pixels per loop. @@ -3678,8 +3674,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "vmovdqu (%0,%1,1),%%ymm1 \n" "add $0x20,%0 \n" - "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm2,(%2) \n" @@ -3694,12 +3690,62 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 - : "r"(scale) // %4 + : "r"(depth) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 +#ifdef HAS_MERGEUVROW_16_AVX2 +const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, + 2, 3, 6, 7, 10, 11, 14, 15}; +void SplitUVRow_16_AVX2(const uint16_t* src_uv, + uint16_t* dst_u, + uint16_t* dst_v, + int depth, + int width) { + depth = 16 - depth; + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width), // %3 + "+r"(depth) // %4 + : + "m"(kSplitUVShuffle16) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits // 64 = 10 bits @@ -3717,7 +3763,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, "vbroadcastss %%xmm3,%%ymm3 \n" "sub %0,%1 \n" - // 16 pixels per loop. + // 32 pixels per loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" @@ -3739,6 +3785,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, } #endif // HAS_MULTIPLYROW_16_AVX2 +// Use scale to convert msb formats to lsb, depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// 65536 = 16 bits +#ifdef HAS_DIVIDEROW_16_AVX2 +void DivideRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width), // %2 + "+r"(scale) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits |