aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: George Steed <george.steed@arm.com> 2024-04-10 16:36:25 +0100
committer: Frank Barchard <fbarchard@chromium.org> 2024-05-21 07:35:07 +0000
commit: 9fac9a4a823476da831419acb13a4ece67358d21 (patch)
tree: 2ef9fc30845794358f9060d6415ed29e8b084fdb
parent: 83c48c782afc1e8b6cc9795633c88837471e96fe (diff)
download: libyuv-9fac9a4a823476da831419acb13a4ece67358d21.tar.gz
[AArch64] Add Neon implementations for {ARGB,ABGR}ToAR30Row
There are existing x86 implementations for these kernels but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | ABGRToAR30Row | ARGBToAR30Row
Cortex-A55  |        -55.1% |        -55.1%
Cortex-A510 |        -39.3% |        -40.1%
Cortex-A76  |        -62.3% |        -63.6%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I307f03bddcbe5429c2d3ab2f42aa023a3539ddd0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465592
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- include/libyuv/row.h | 11
-rw-r--r-- source/convert_from_argb.cc | 16
-rw-r--r-- source/row_any.cc | 6
-rw-r--r-- source/row_neon64.cc | 53
4 files changed, 86 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 43ffe247..b0ee4bf2 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -553,6 +553,9 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_ARGBTOAR30ROW_NEON
+#define HAS_ABGRTOAR30ROW_NEON
+
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON_DOTPROD
#define HAS_ARGBTOYJROW_NEON_DOTPROD
@@ -5136,6 +5139,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
+void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index c684ac00..264d7388 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -2268,6 +2268,14 @@ int ABGRToAR30(const uint8_t* src_abgr,
height = 1;
src_stride_abgr = dst_stride_ar30 = 0;
}
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToAR30Row = ABGRToAR30Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToAR30Row = ABGRToAR30Row_NEON;
+ }
+ }
+#endif
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
@@ -2317,6 +2325,14 @@ int ARGBToAR30(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_ar30 = 0;
}
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToAR30Row = ARGBToAR30Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR30Row = ARGBToAR30Row_NEON;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index a466e342..e603d754 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -948,6 +948,12 @@ ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
#endif
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7)
+#endif
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
#endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 9dec07a1..4259f425 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1722,6 +1722,59 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
);
}
+static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};  // USHL counts per 16-bit lane (loaded into v3 by ABCDToAR30Row_NEON): even lanes are left unchanged, odd lanes are shifted right by 6 (a negative USHL count shifts right).
+
+static const uint8_t kABGRToAR30Row_BoxIndices[] = {  // 32 TBL byte indices for ABGRToAR30Row_NEON: first 16 are loaded into v4, last 16 into v5.
+ 2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13,  // v4: bytes 2 and 1 of each 4-byte pixel, each byte duplicated (x -> 16-bit x*0x0101).
+ 0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15};  // v5: bytes 0 and 3 of each 4-byte pixel, each byte duplicated.
+static const uint8_t kARGBToAR30Row_BoxIndices[] = {  // Same layout for ARGBToAR30Row_NEON; only the per-pixel byte order differs.
+ 0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13,  // v4: bytes 0 and 1 of each 4-byte pixel, each byte duplicated.
+ 2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};  // v5: bytes 2 and 3 of each 4-byte pixel, each byte duplicated.
+
+// ARGB or ABGR as input, reordering based on TBL indices parameter. Packs each 4-byte pixel into one 32-bit word with three 10-bit fields (bits 0..9, 10..19, 20..29) and one 2-bit field (bits 30..31).
+static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,  // input pixels, 4 bytes each; 32 bytes are read per loop iteration
+ uint8_t* dst_ar30,  // output words, 4 bytes each; 32 bytes are written per loop iteration
+ int width,  // pixel count; the loop body runs at least once and consumes 8 pixels per pass (callers in this patch gate on IS_ALIGNED(width, 8) or go through the Any wrapper)
+ const uint8_t* indices) {  // 32 TBL byte indices: [0..15] select the two low fields, [16..31] the two high fields
+ asm volatile(
+ "movi v2.4s, #0xf, msl 16 \n" // 0xfffff: per-word mask of the low 20 bits, used by BIF below
+ "ldr q3, [%[kAR30Row_BoxShifts]] \n"  // v3 = per-16-bit-lane USHL counts {0, -6, ...}
+ "ldp q4, q5, [%[indices]] \n"  // v4 = indices[0..15], v5 = indices[16..31]
+ "1: \n"  // loop: 8 pixels (2 x 16 bytes) per iteration
+ "ldp q0, q20, [%[src]], #32 \n"  // v0 = pixels 0-3, v20 = pixels 4-7; src += 32
+ "subs %w[width], %w[width], #8 \n"  // width -= 8; flags are consumed by b.gt at the bottom
+ "tbl v1.16b, {v0.16b}, v5.16b \n"  // v1 = high-field bytes of pixels 0-3, duplicated; must run before v0 is overwritten
+ "tbl v21.16b, {v20.16b}, v5.16b \n"  // v21 = high-field bytes of pixels 4-7, duplicated; must run before v20 is overwritten
+ "tbl v0.16b, {v0.16b}, v4.16b \n"  // v0 = low-field bytes of pixels 0-3, duplicated (in place)
+ "tbl v20.16b, {v20.16b}, v4.16b \n"  // v20 = low-field bytes of pixels 4-7, duplicated (in place)
+ "ushl v0.8h, v0.8h, v3.8h \n"  // even 16-bit lanes unchanged; odd lanes >> 6, leaving a 10-bit value
+ "ushl v20.8h, v20.8h, v3.8h \n"  // same for pixels 4-7
+ "ushl v1.8h, v1.8h, v3.8h \n"  // same treatment for the high-field lanes
+ "ushl v21.8h, v21.8h, v3.8h \n"  // same for pixels 4-7
+ "ushr v0.4s, v0.4s, #6 \n"  // per word: top 10 bits of the even lane -> bits 0..9, odd-lane value -> bits 10..19
+ "ushr v20.4s, v20.4s, #6 \n"  // same for pixels 4-7
+ "shl v1.4s, v1.4s, #14 \n"  // per word: top 10 bits of the even lane -> bits 20..29, top 2 bits of the odd-lane source byte -> bits 30..31
+ "shl v21.4s, v21.4s, #14 \n"  // same for pixels 4-7
+ "bif v0.16b, v1.16b, v2.16b \n"  // keep bits 0..19 of v0 (mask set), take bits 20..31 from v1 (mask clear)
+ "bif v20.16b, v21.16b, v2.16b \n"  // same for pixels 4-7
+ "stp q0, q20, [%[dst]], #32 \n"  // store 8 packed words; dst += 32
+ "b.gt 1b \n"  // repeat while the remaining width is > 0
+ : [src] "+r"(src_abcd), // %[src]
+ [dst] "+r"(dst_ar30), // %[dst]
+ [width] "+r"(width) // %[width]
+ : [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts), // %[kAR30Row_BoxShifts]
+ [indices] "r"(indices) // %[indices]
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");  // clobbers: condition flags, memory, and every vector register used above
+}
+
+void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {  // Public entry point: converts a row via the shared kernel using the ABGR index table.
+ ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);  // only the TBL index table differs from the ARGB variant
+}
+
+void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {  // Public entry point: converts a row via the shared kernel using the ARGB index table.
+ ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);  // only the TBL index table differs from the ABGR variant
+}
+
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb24,
int width) {