aboutsummaryrefslogtreecommitdiff
path: root/source/row_gcc.cc
diff options
context:
space:
mode:
Diffstat (limited to 'source/row_gcc.cc')
-rw-r--r--source/row_gcc.cc108
1 files changed, 97 insertions, 11 deletions
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index cf87d46e..faf0fc91 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3653,22 +3653,18 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
}
#endif // HAS_MERGEUVROW_SSE2
-// Use scale to convert lsb formats to msb, depending how many bits there are:
-// 128 = 9 bits
-// 64 = 10 bits
-// 16 = 12 bits
-// 1 = 16 bits
#ifdef HAS_MERGEUVROW_16_AVX2
void MergeUVRow_16_AVX2(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
- int scale,
+ int depth,
int width) {
+ depth = 16 - depth;
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vbroadcastss %%xmm3,%%xmm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
@@ -3678,8 +3674,8 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
"vmovdqu (%0,%1,1),%%ymm1 \n"
"add $0x20,%0 \n"
- "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
"vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm2,(%2) \n"
@@ -3694,12 +3690,62 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
- : "r"(scale) // %4
+ : "r"(depth) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
// clang-format on
}
#endif // HAS_MERGEUVROW_AVX2
+#ifdef HAS_MERGEUVROW_16_AVX2
+const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15};
+void SplitUVRow_16_AVX2(const uint16_t* src_uv,
+ uint16_t* dst_u,
+ uint16_t* dst_v,
+ int depth,
+ int width) {
+ depth = 16 - depth;
+ // clang-format off
+ asm volatile (
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%xmm3 \n"
+ "vbroadcastf128 %5,%%ymm4 \n"
+ "sub %1,%2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+
+ "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
+ "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
+ "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width), // %3
+ "+r"(depth) // %4
+ :
+ "m"(kSplitUVShuffle16) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_AVX2
+
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
@@ -3717,7 +3763,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
"vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n"
- // 16 pixels per loop.
+ // 32 pixels per loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@@ -3739,6 +3785,46 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
}
#endif // HAS_MULTIPLYROW_16_AVX2
+// Use scale to convert msb formats to lsb, depending how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// 65536 = 16 bits
+#ifdef HAS_DIVIDEROW_16_AVX2
+void DivideRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width), // %2
+ "+r"(scale) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
+
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits