diff options
author | Frank Barchard <fbarchard@google.com> | 2023-02-13 10:52:58 -0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-02-13 20:14:57 +0000 |
commit | 2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2 (patch) | |
tree | c446d71538c965d0e5391ef77cd49b45ba51463d /source/row_lsx.cc | |
parent | b2528b0be934de1918e20c85fc170d809eeb49ab (diff) | |
download | libyuv-2bdc210be9eb11ded16bf3ef1f6cadb0d4dcb0c2.tar.gz |
MergeUV_AVX512BW for I420ToNV12
On Skylake Xeon, 640x360, 100000 iterations:
AVX512 MergeUVPlane_Opt (1196 ms)
AVX2 MergeUVPlane_Opt (1565 ms)
SSE2 MergeUVPlane_Opt (1780 ms)
Pixel 7 MergeUVPlane_Opt (1177 ms)
Bug: None
Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Diffstat (limited to 'source/row_lsx.cc')
-rw-r--r-- | source/row_lsx.cc | 80 |
1 file changed, 42 insertions, 38 deletions
diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 0825b633..9c1e16f2 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -1679,7 +1679,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1688,31 +1688,32 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, "vld $vr4, %0, 0 \n\t" "vld $vr5, %0, 16 \n\t" "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of ARGB + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // ARGB "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" //BR + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" //GA + "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" //G + "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" "addi.d %0, %0, 64 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" "vst $vr10, %1, 0 \n\t" "addi.d %1, %1, 16 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_argb), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 : "r"(rgbconstants) - : "memory" - ); + : "memory"); } void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1737,7 +1738,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* 
src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1746,31 +1747,32 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, "vld $vr4, %0, 0 \n\t" "vld $vr5, %0, 16 \n\t" "vld $vr6, %0, 32 \n\t" - "vld $vr7, %0, 48 \n\t" // load 16 pixels of RGBA + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // RGBA "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. - "vpickev.b $vr8, $vr5, $vr4 \n\t" //AG + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG "vpickev.b $vr10, $vr7, $vr6 \n\t" - "vpickod.b $vr9, $vr5, $vr4 \n\t" //BR + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR "vpickod.b $vr11, $vr7, $vr6 \n\t" - "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" //G + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" - "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" "addi.d %0, %0, 64 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" "vst $vr10, %1, 0 \n\t" "addi.d %1, %1, 16 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 : "r"(rgbconstants) - : "memory" - ); + : "memory"); } void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -1789,11 +1791,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, - 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 
10, 12, 13, 15, - 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, - 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile( + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -1805,19 +1808,21 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, "1: \n\t" "vld $vr8, %0, 0 \n\t" "vld $vr9, %0, 16 \n\t" - "vld $vr10, %0, 32 \n\t" // load 16 pixels of RGB + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB "vor.v $vr12, $vr3, $vr3 \n\t" "vor.v $vr13, $vr3, $vr3 \n\t" - "addi.d %2, %2, -16 \n\t" // 16 processed per loop. + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" - "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" //G + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" - "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" //B + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" - "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" //R + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" "addi.d %0, %0, 48 \n\t" "vpickod.b $vr10, $vr13, $vr12 \n\t" @@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, "+&r"(width) // %2 : "r"(rgbconstants), // %3 "r"(shuff) // %4 - : "memory" - ); + : "memory"); } void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { |