diff options
author | Frank Barchard <fbarchard@google.com> | 2016-10-26 11:12:17 -0700 |
---|---|---|
committer | Frank Barchard <fbarchard@google.com> | 2016-10-26 11:12:17 -0700 |
commit | 532f5708a95b02185db463a481c0315524742dd8 (patch) | |
tree | abec3366300f458be77f6df7a5010f0716158dc2 /source | |
parent | 02ae8b60c54171bbd171d2518d12cd0e1f9be191 (diff) | |
download | libyuv-532f5708a95b02185db463a481c0315524742dd8.tar.gz |
Add MSA optimized I422AlphaToARGBRow_MSA and I422ToRGB24Row_MSA functions
R=fbarchard@google.com
BUG=libyuv:634
Performance Gain (vs C vectorized)
I422AlphaToARGBRow_MSA : ~1.4x
I422AlphaToARGBRow_Any_MSA : ~1.4x
I422ToRGB24Row_MSA : ~4.8x
I422ToRGB24Row_Any_MSA : ~4.8x
Performance Gain (vs C non-vectorized)
I422AlphaToARGBRow_MSA : ~7.0x
I422AlphaToARGBRow_Any_MSA : ~7.0x
I422ToRGB24Row_MSA : ~7.9x
I422ToRGB24Row_Any_MSA : ~7.7x
Review URL: https://codereview.chromium.org/2454433003 .
Diffstat (limited to 'source')
-rw-r--r-- | source/convert_argb.cc | 8 |
-rw-r--r-- | source/convert_from.cc | 8 |
-rw-r--r-- | source/row_any.cc | 4 |
-rw-r--r-- | source/row_msa.cc | 244 |
-rw-r--r-- | source/row_neon64.cc | 26 |
5 files changed, 188 insertions, 102 deletions
diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 6e23cf74..f3f0d838 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -564,6 +564,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; } #endif +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; diff --git a/source/convert_from.cc b/source/convert_from.cc index 7847622c..9859f5f3 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -558,6 +558,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); diff --git a/source/row_any.cc b/source/row_any.cc index 6b896257..0d15df67 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -53,6 +53,9 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif #undef ANY41C // Any 3 planes to 1. 
@@ -168,6 +171,7 @@ ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #ifdef HAS_I422TOARGBROW_MSA ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) #endif #undef ANY31C diff --git a/source/row_msa.cc b/source/row_msa.cc index b747a77e..7b3c2067 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -19,65 +19,79 @@ namespace libyuv { extern "C" { #endif +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) { \ + uint64 y_m; \ + uint32 u_m, v_m; \ + v4i32 zero_m = { 0 }; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8) __msa_insert_d((v2i64) zero_m, 0, (int64) y_m); \ + out_u = (v16u8) __msa_insert_w(zero_m, 0, (int32) u_m); \ + out_v = (v16u8) __msa_insert_w(zero_m, 0, (int32) v_m); \ +} + // Convert 8 pixels of YUV 420 to RGB. -#define I422TORGB(in0, in1, in2, ub, vr, ug, vg, \ - bb, bg, br, yg, out0, out1, out2) { \ - v8i16 vec0_m; \ - v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ - v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ - v4i32 max_val_m = __msa_ldi_w(255); \ - v8i16 zero_m = { 0 }; \ - \ - in1 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in1); \ - in2 = (v16u8) __msa_ilvr_b((v16i8) in2, (v16i8) in2); \ - vec0_m = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in0); \ - reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \ - reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \ - reg0_m *= vec_yg; \ - reg1_m *= vec_yg; \ - reg0_m = __msa_srai_w(reg0_m, 16); \ - reg1_m = __msa_srai_w(reg1_m, 16); \ - reg4_m = reg0_m + br; \ - reg5_m = reg1_m + br; \ - reg2_m = reg0_m + bg; \ - reg3_m = reg1_m + bg; \ - reg0_m += bb; \ - reg1_m += bb; \ - vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in1); \ - reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ - reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ - vec0_m = (v8i16) 
__msa_ilvr_b((v16i8) zero_m, (v16i8) in2); \ - reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ - reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ - reg0_m -= reg6_m * ub; \ - reg1_m -= reg7_m * ub; \ - reg2_m -= reg6_m * ug; \ - reg3_m -= reg7_m * ug; \ - reg4_m -= reg8_m * vr; \ - reg5_m -= reg9_m * vr; \ - reg2_m -= reg8_m * vg; \ - reg3_m -= reg9_m * vg; \ - reg0_m = __msa_srai_w(reg0_m, 6); \ - reg1_m = __msa_srai_w(reg1_m, 6); \ - reg2_m = __msa_srai_w(reg2_m, 6); \ - reg3_m = __msa_srai_w(reg3_m, 6); \ - reg4_m = __msa_srai_w(reg4_m, 6); \ - reg5_m = __msa_srai_w(reg5_m, 6); \ - reg0_m = __msa_maxi_s_w(reg0_m, 0); \ - reg1_m = __msa_maxi_s_w(reg1_m, 0); \ - reg2_m = __msa_maxi_s_w(reg2_m, 0); \ - reg3_m = __msa_maxi_s_w(reg3_m, 0); \ - reg4_m = __msa_maxi_s_w(reg4_m, 0); \ - reg5_m = __msa_maxi_s_w(reg5_m, 0); \ - reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ - reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ - reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ - reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ - reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ - reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ - out0 = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \ - out1 = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \ - out2 = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \ +#define I422TORGB(in_y, in_u, in_v, \ + ub, vr, ug, vg, bb, bg, br, yg, \ + out_b, out_g, out_r) { \ + v8i16 vec0_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \ + v4i32 max_val_m = __msa_ldi_w(255); \ + v8i16 zero_m = { 0 }; \ + \ + in_u = (v16u8) __msa_ilvr_b((v16i8) in_u, (v16i8) in_u); \ + in_v = (v16u8) __msa_ilvr_b((v16i8) in_v, (v16i8) in_v); \ + vec0_m = (v8i16) __msa_ilvr_b((v16i8) in_y, (v16i8) in_y); \ + reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m); \ + reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m); \ + reg0_m *= vec_yg; \ + reg1_m *= vec_yg; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); 
\ + reg4_m = reg0_m + br; \ + reg5_m = reg1_m + br; \ + reg2_m = reg0_m + bg; \ + reg3_m = reg1_m + bg; \ + reg0_m += bb; \ + reg1_m += bb; \ + vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_u); \ + reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ + reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ + vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in_v); \ + reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m); \ + reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m); \ + reg0_m -= reg6_m * ub; \ + reg1_m -= reg7_m * ub; \ + reg2_m -= reg6_m * ug; \ + reg3_m -= reg7_m * ug; \ + reg4_m -= reg8_m * vr; \ + reg5_m -= reg9_m * vr; \ + reg2_m -= reg8_m * vg; \ + reg3_m -= reg9_m * vg; \ + reg0_m = __msa_srai_w(reg0_m, 6); \ + reg1_m = __msa_srai_w(reg1_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg0_m = __msa_maxi_s_w(reg0_m, 0); \ + reg1_m = __msa_maxi_s_w(reg1_m, 0); \ + reg2_m = __msa_maxi_s_w(reg2_m, 0); \ + reg3_m = __msa_maxi_s_w(reg3_m, 0); \ + reg4_m = __msa_maxi_s_w(reg4_m, 0); \ + reg5_m = __msa_maxi_s_w(reg5_m, 0); \ + reg0_m = __msa_min_s_w(reg0_m, max_val_m); \ + reg1_m = __msa_min_s_w(reg1_m, max_val_m); \ + reg2_m = __msa_min_s_w(reg2_m, max_val_m); \ + reg3_m = __msa_min_s_w(reg3_m, max_val_m); \ + reg4_m = __msa_min_s_w(reg4_m, max_val_m); \ + reg5_m = __msa_min_s_w(reg5_m, max_val_m); \ + out_b = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m); \ + out_g = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m); \ + out_r = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m); \ } void MirrorRow_MSA(const uint8* src, uint8* dst, int width) { @@ -166,13 +180,10 @@ void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; - int32 data_u, data_v; - int64 data_y; v16u8 src0, src1, src2, dst0, dst1; v8i16 vec0, vec1, vec2; 
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v16u8 const_255 = (v16u8) __msa_ldi_b(255); - v4i32 zero = { 0 }; vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); @@ -184,12 +195,7 @@ void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u, vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); for (x = 0; x < width; x += 8) { - data_y = LD(src_y); - data_u = LW(src_u); - data_v = LW(src_v); - src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y); - src1 = (v16u8) __msa_insert_w(zero, 0, data_u); - src2 = (v16u8) __msa_insert_w(zero, 0, data_v); + READYUV422(src_y, src_u, src_v, src0, src1, src2); I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0); @@ -208,13 +214,10 @@ void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; - int64 data_y; - int32 data_u, data_v; v16u8 src0, src1, src2, dst0, dst1; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v16u8 const_255 = (v16u8) __msa_ldi_b(255); - v4i32 zero = { 0 }; vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); @@ -226,12 +229,7 @@ void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); for (x = 0; x < width; x += 8) { - data_y = LD(src_y); - data_u = LW(src_u); - data_v = LW(src_v); - src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y); - src1 = (v16u8) __msa_insert_w(zero, 0, data_u); - src2 = (v16u8) __msa_insert_w(zero, 0, data_v); + READYUV422(src_y, src_u, src_v, src0, src1, src2); I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); vec0 = (v8i16) __msa_ilvev_b((v16i8) vec0, (v16i8) const_255); @@ -246,6 +244,100 @@ 
void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u, } } +void I422AlphaToARGBRow_MSA(const uint8* src_y, const uint8* src_u, + const uint8* src_v, const uint8* src_a, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64 data_a; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 zero = { 0 }; + + vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); + vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); + vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]); + vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]); + vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]); + vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]); + vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]); + vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src3 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_a); + I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, + vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); + src3 = (v16u8) __msa_ilvr_b((v16i8) src3, (v16i8) src3); + vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0); + vec1 = (v8i16) __msa_ilvev_b((v16i8) src3, (v16i8) vec2); + dst0 = (v16u8) __msa_ilvr_h((v8i16) vec1, (v8i16) vec0); + dst1 = (v16u8) __msa_ilvl_h((v8i16) vec1, (v8i16) vec0); + ST_UB2(dst0, dst1, rgb_buf, 16); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + rgb_buf += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* rgb_buf, + const struct YuvConstants* yuvconstants, int32 width) { + int x; + int64 data_u, data_v; + v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = { 0 }; + v16i8 shuffler0 = { 
0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10 }; + v16i8 shuffler1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10 }; + v16i8 shuffler2 = + { 26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31 }; + + vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]); + vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]); + vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]); + vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]); + vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]); + vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]); + vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]); + vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8) __msa_ld_b((v16u8*) src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8) __msa_insert_d(zero, 0, data_u); + src2 = (v16u8) __msa_insert_d(zero, 0, data_v); + src3 = (v16u8) __msa_sldi_b((v16i8) src0, (v16i8) src0, 8); + src4 = (v16u8) __msa_sldi_b((v16i8) src1, (v16i8) src1, 4); + src5 = (v16u8) __msa_sldi_b((v16i8) src2, (v16i8) src2, 4); + I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, + vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); + I422TORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, + vec_bb, vec_bg, vec_br, vec_yg, vec3, vec4, vec5); + reg0 = (v16u8) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0); + reg2 = (v16u8) __msa_ilvev_b((v16i8) vec4, (v16i8) vec3); + reg3 = (v16u8) __msa_pckev_b((v16i8) vec5, (v16i8) vec2); + reg1 = (v16u8) __msa_sldi_b((v16i8) reg2, (v16i8) reg0, 11); + dst0 = (v16u8) __msa_vshf_b(shuffler0, (v16i8) reg3, (v16i8) reg0); + dst1 = (v16u8) __msa_vshf_b(shuffler1, (v16i8) reg3, (v16i8) reg1); + dst2 = (v16u8) __msa_vshf_b(shuffler2, (v16i8) reg3, (v16i8) reg2); + ST_UB2(dst0, dst1, rgb_buf, 16); + ST_UB(dst2, (rgb_buf + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + rgb_buf += 48; + } +} + void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; diff --git 
a/source/row_neon64.cc b/source/row_neon64.cc index 9508d465..2f0227cf 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,7 +2711,6 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ); } - void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile ( "1: \n" @@ -2735,31 +2734,6 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { ); } -void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "subs %w2, %w2, #8 \n" // 8 pixels per loop - "uxtl v2.4s, v1.4h \n" // 8 int's - "uxtl2 v1.4s, v1.8h \n" - "scvtf v2.4s, v2.4s \n" // 8 floats - "scvtf v1.4s, v1.4s \n" - "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent - "fmul v1.4s, v1.4s, %3.s[0] \n" - "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat - "uqshrn2 v4.8h, v1.4s, #13 \n" - MEMACCESS(1) - "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "w"(scale * 1.9259299444e-34f) // %3 - : "cc", "memory", "v1", "v2", "v4" - ); -} - void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" |