diff options
author | George Steed <george.steed@arm.com> | 2024-04-10 16:36:25 +0100 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2024-05-21 07:35:07 +0000 |
commit | 9fac9a4a823476da831419acb13a4ece67358d21 (patch) | |
tree | 2ef9fc30845794358f9060d6415ed29e8b084fdb | |
parent | 83c48c782afc1e8b6cc9795633c88837471e96fe (diff) | |
download | libyuv-9fac9a4a823476da831419acb13a4ece67358d21.tar.gz |
[AArch64] Add Neon implementations for {ARGB,ABGR}ToAR30Row
There are existing x86 implementations for these kernels but not for
AArch64, so add them.
Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:
            | ABGRToAR30Row | ARGBToAR30Row
Cortex-A55  | -55.1%        | -55.1%
Cortex-A510 | -39.3%        | -40.1%
Cortex-A76  | -62.3%        | -63.6%
Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I307f03bddcbe5429c2d3ab2f42aa023a3539ddd0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465592
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | include/libyuv/row.h | 11 |
-rw-r--r-- | source/convert_from_argb.cc | 16 |
-rw-r--r-- | source/row_any.cc | 6 |
-rw-r--r-- | source/row_neon64.cc | 53 |
4 files changed, 86 insertions, 0 deletions
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 43ffe247..b0ee4bf2 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -553,6 +553,9 @@ extern "C" {
 
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_ARGBTOAR30ROW_NEON
+#define HAS_ABGRTOAR30ROW_NEON
+
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
 #define HAS_ARGBTOYJROW_NEON_DOTPROD
@@ -5136,6 +5139,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index c684ac00..264d7388 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -2268,6 +2268,14 @@ int ABGRToAR30(const uint8_t* src_abgr,
     height = 1;
     src_stride_abgr = dst_stride_ar30 = 0;
   }
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToAR30Row = ABGRToAR30Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOAR30ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
@@ -2317,6 +2325,14 @@ int ARGBToAR30(const uint8_t* src_argb,
     height = 1;
     src_stride_argb = dst_stride_ar30 = 0;
   }
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOAR30ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index a466e342..e603d754 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -948,6 +948,12 @@ ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #if defined(HAS_ABGRTOAR30ROW_SSSE3)
 ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
 #endif
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7)
+#endif
 #if defined(HAS_ARGBTOAR30ROW_SSSE3)
 ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
 #endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 9dec07a1..4259f425 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1722,6 +1722,59 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
   );
 }
 
+static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};
+
+static const uint8_t kABGRToAR30Row_BoxIndices[] = {
+    2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9, 9, 14, 14, 13, 13,
+    0, 0, 3, 3, 4, 4, 7, 7, 8, 8, 11, 11, 12, 12, 15, 15};
+static const uint8_t kARGBToAR30Row_BoxIndices[] = {
+    0, 0, 1, 1, 4, 4, 5, 5, 8, 8, 9, 9, 12, 12, 13, 13,
+    2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};
+
+// ARGB or ABGR as input, reordering based on TBL indices parameter.
+static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
+                               uint8_t* dst_ar30,
+                               int width,
+                               const uint8_t* indices) {
+  asm volatile(
+      "movi v2.4s, #0xf, msl 16 \n"  // 0xfffff
+      "ldr q3, [%[kAR30Row_BoxShifts]] \n"
+      "ldp q4, q5, [%[indices]] \n"
+      "1: \n"
+      "ldp q0, q20, [%[src]], #32 \n"
+      "subs %w[width], %w[width], #8 \n"
+      "tbl v1.16b, {v0.16b}, v5.16b \n"
+      "tbl v21.16b, {v20.16b}, v5.16b \n"
+      "tbl v0.16b, {v0.16b}, v4.16b \n"
+      "tbl v20.16b, {v20.16b}, v4.16b \n"
+      "ushl v0.8h, v0.8h, v3.8h \n"
+      "ushl v20.8h, v20.8h, v3.8h \n"
+      "ushl v1.8h, v1.8h, v3.8h \n"
+      "ushl v21.8h, v21.8h, v3.8h \n"
+      "ushr v0.4s, v0.4s, #6 \n"
+      "ushr v20.4s, v20.4s, #6 \n"
+      "shl v1.4s, v1.4s, #14 \n"
+      "shl v21.4s, v21.4s, #14 \n"
+      "bif v0.16b, v1.16b, v2.16b \n"
+      "bif v20.16b, v21.16b, v2.16b \n"
+      "stp q0, q20, [%[dst]], #32 \n"
+      "b.gt 1b \n"
+      : [src] "+r"(src_abcd),  // %[src]
+        [dst] "+r"(dst_ar30),  // %[dst]
+        [width] "+r"(width)    // %[width]
+      : [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts),  // %[kAR30Row_BoxShifts]
+        [indices] "r"(indices)                         // %[indices]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
+}
+
+void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);
+}
+
+void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);
+}
+
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb24,
                          int width) {