diff options
author | Lu Wang <wanglu@loongson.cn> | 2023-05-08 21:13:25 +0800 |
---|---|---|
committer | libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2023-05-10 00:25:48 +0000 |
commit | 1d940cc570212c8979d81e78738296fe39f9df43 (patch) | |
tree | 7bcaeb72858520f5bf99655359cd8e9cc723d1db /source/row_lsx.cc | |
parent | b372510c5699abdde5d50b60e89daa5b71b7792c (diff) | |
download | libyuv-1d940cc570212c8979d81e78738296fe39f9df43.tar.gz |
Optimize the following functions with LSX:
MirrorRow_LSX, MirrorUVRow_LSX, ARGBMirrorRow_LSX,
I422ToYUY2Row_LSX, I422ToUYVYRow_LSX, I422ToARGBRow_LSX,
I422ToRGBARow_LSX, I422AlphaToARGBRow_LSX, I422ToRGB24Row_LSX,
I422ToRGB565Row_LSX, I422ToARGB4444Row_LSX, I422ToARGB1555Row_LSX,
YUY2ToYRow_LSX, YUY2ToUVRow_LSX, YUY2ToUV422Row_LSX.
Bug: libyuv:913
Change-Id: I46cec605001d7ddd73846eed6d0a77f936b6dc53
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4515191
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Diffstat (limited to 'source/row_lsx.cc')
-rw-r--r-- | source/row_lsx.cc | 559 |
1 file changed, 559 insertions, 0 deletions
diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 9c1e16f2..48baafb7 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -31,6 +31,91 @@ extern "C" { yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = __lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = __lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. 
+#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, \ + b_h, g_l,g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + 
DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + // Convert 8 pixels of YUV420 to RGB. #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ { \ @@ -118,6 +203,26 @@ extern "C" { out_g = __lsx_vpackev_h(tmp1, tmp0); \ } +// Pack and Store 16 ARGB values. +#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + // Pack and Store 8 ARGB values. 
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ { \ @@ -155,6 +260,460 @@ extern "C" { _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ } +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 32; + __m128i src0, src1; + __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 8; + __m128i src, dst; + __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 8) << 1; + for (x = 0; x < len; x++) { + src = __lsx_vld(src_uv, 0); + dst = __lsx_vshuf_h(shuffler, src, src); + __lsx_vst(dst, dst_uv, 0); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 8; + __m128i src0, src1; + __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; + + src += (width * 4) - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_yuy2_0, vec_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); + __lsx_vst(vec_yuy2_0, 
dst_yuy2, 0); + __lsx_vst(vec_yuy2_1, dst_yuy2, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_yuy2 += 32; + } +} + +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_uyvy0, vec_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); + __lsx_vst(vec_uyvy0, dst_uyvy, 0); + __lsx_vst(vec_uyvy1, dst_uyvy, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_uyvy += 32; + } +} + +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + 
__m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + int res = width & 15; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i zero = __lsx_vldi(0); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lsx_vld(src_a, 0); + a_l = __lsx_vilvl_b(zero, y); + a_h = __lsx_vilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + src_a += 16; + } + if (res) { + __m128i y, uv, r, g, b, a; + a = __lsx_vld(src_a, 0); + a = __lsx_vsllwil_hu_bu(a, 0); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const 
struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i shuffler0 = {0x0504120302100100, 0x0A18090816070614}; + __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m128i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lsx_vpackev_b(g_l, b_l); + temp1 = __lsx_vpackev_b(g_h, b_h); + DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, + r_l, temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lsx_vilvl_d(temp1, temp2); + b_h = __lsx_vilvh_d(temp3, temp1); + __lsx_vst(temp0, dst_argb, 0); + __lsx_vst(b_l, dst_argb, 16); + __lsx_vst(b_h, dst_argb, 32); + dst_argb += 48; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
+void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 2); + g_h = __lsx_vsrli_h(g_h, 2); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 11); + r_h = __lsx_vslli_h(r_h, 11); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vor_v(r_l, g_l); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, g_h); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_rgb565, 0); + __lsx_vst(r_h, dst_rgb565, 16); + dst_rgb565 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0xF000F000F000F000, 0xF000F000F000F000}; + __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 4); + b_h = __lsx_vsrli_h(b_h, 4); + r_l = __lsx_vsrli_h(r_l, 4); + r_h = __lsx_vsrli_h(r_h, 4); + g_l = __lsx_vand_v(g_l, mask); + g_h = __lsx_vand_v(g_h, mask); + r_l = __lsx_vslli_h(r_l, 8); + r_h = __lsx_vslli_h(r_h, 8); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb4444, 0); + __lsx_vst(r_h, dst_argb4444, 16); + dst_argb4444 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0x8000800080008000, 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = 
__lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 3); + + g_h = __lsx_vsrli_h(g_h, 3); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 10); + r_h = __lsx_vslli_h(r_h, 10); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb1555, 0); + __lsx_vst(r_h, dst_argb1555, 16); + dst_argb1555 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + dst0 = __lsx_vpickev_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_yuy2 += 32; + dst_y += 16; + } +} + +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, + src_yuy2_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickod_b(src1, src0); + src1 = __lsx_vpickod_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + src_yuy2_next += 32; + dst_u += 8; 
+ dst_v += 8; + } +} + +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + tmp0 = __lsx_vpickod_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + dst_u += 8; + dst_v += 8; + } +} + void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { |