diff options
author | Frank Barchard <fbarchard@google.com> | 2022-09-16 11:12:39 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2022-09-16 19:46:47 +0000 |
commit | f71c83552d373f0ff41833b17e2880632d8561d7 (patch) | |
tree | 09088188086a6b03d07a5ebaa8edf01658466ad8 /source/row_neon64.cc | |
parent | 3e38ce50589d9319badc0501f96d6c5b2b177472 (diff) | |
download | libyuv-f71c83552d373f0ff41833b17e2880632d8561d7.tar.gz |
I420ToRGB24MatrixFilter function added
- Implemented as 3 steps: Upsample UV to 4:4:4, I444ToARGB, ARGBToRGB24
- Fix some build warnings for missing prototypes.
Pixel 4
I420ToRGB24_Opt (743 ms)
I420ToRGB24Filter_Opt (1331 ms)
Windows with Skylake Xeon:
x86 32 bit
I420ToRGB24_Opt (387 ms)
I420ToRGB24Filter_Opt (571 ms)
x64 64 bit
I420ToRGB24_Opt (384 ms)
I420ToRGB24Filter_Opt (582 ms)
Bug: libyuv:938, libyuv:830
Change-Id: Ie27f70816ec084437014f8a1c630ae011ee2348c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3900298
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Diffstat (limited to 'source/row_neon64.cc')
-rw-r--r-- | source/row_neon64.cc | 46 |
1 file changed, 23 insertions, 23 deletions
Note: the removed (`-`) and added (`+`) lines below read identically because they differ only in whitespace, which this rendering collapses.

```diff
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index e166ce04..37962378 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -653,11 +653,11 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
 #if LIBYUV_USE_ST2
 // Read 16 Y, 8 UV, and write 8 YUY2
 void DetileToYUY2_NEON(const uint8_t* src_y,
- ptrdiff_t src_y_tile_stride,
- const uint8_t* src_uv,
- ptrdiff_t src_uv_tile_stride,
- uint8_t* dst_yuy2,
- int width) {
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
 asm volatile(
 "1: \n"
 "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
@@ -667,23 +667,23 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 "subs %w3, %w3, #16 \n" // store 8 YUY2
 "st2 {v0.16b,v1.16b}, [%2], #32 \n"
 "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_yuy2), // %2
- "+r"(width) // %3
- : "r"(src_y_tile_stride), // %4
- "r"(src_uv_tile_stride) // %5
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
 : "cc", "memory", "v0", "v1" // Clobber list
 );
 }
 #else
 // Read 16 Y, 8 UV, and write 8 YUY2
 void DetileToYUY2_NEON(const uint8_t* src_y,
- ptrdiff_t src_y_tile_stride,
- const uint8_t* src_uv,
- ptrdiff_t src_uv_tile_stride,
- uint8_t* dst_yuy2,
- int width) {
+ ptrdiff_t src_y_tile_stride,
+ const uint8_t* src_uv,
+ ptrdiff_t src_uv_tile_stride,
+ uint8_t* dst_yuy2,
+ int width) {
 asm volatile(
 "1: \n"
 "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
@@ -694,13 +694,13 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 "prfm pldl1keep, [%1, 1792] \n"
 "zip2 v3.16b, v0.16b, v1.16b \n"
 "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_yuy2), // %2
- "+r"(width) // %3
- : "r"(src_y_tile_stride), // %4
- "r"(src_uv_tile_stride) // %5
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_yuy2), // %2
+ "+r"(width) // %3
+ : "r"(src_y_tile_stride), // %4
+ "r"(src_uv_tile_stride) // %5
 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list
 );
 }
```