diff options
author | xiangmingzhu <xiangzhu@cisco.com> | 2014-04-30 15:48:07 +0800 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2014-10-03 21:16:00 -0400 |
commit | c95c9a048f3283afb2e412b10085d4f7c19e1412 (patch) | |
tree | ed8873af6559d7a98922e0fed85be47c826ef521 /celt | |
parent | 80460334b77d70e665a390503cd8992cdad06c10 (diff) | |
download | libopus-c95c9a048f3283afb2e412b10085d4f7c19e1412.tar.gz |
Cisco optimization for x86 & fixed point
1. Only for fixed point on x86 platform (32bit and 64bit, uses SIMD
intrinsics up to SSE4.2)
2. Use "configure --enable-fixed-point --enable-intrinsics" to enable
optimization, default is disabled.
3. Official test cases are verified and passed.
Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
Diffstat (limited to 'celt')
-rw-r--r-- | celt/bands.c | 20 | ||||
-rw-r--r-- | celt/bands.h | 19 | ||||
-rw-r--r-- | celt/celt_decoder.c | 10 | ||||
-rw-r--r-- | celt/celt_encoder.c | 18 | ||||
-rw-r--r-- | celt/celt_lpc.c | 14 | ||||
-rw-r--r-- | celt/celt_lpc.h | 19 | ||||
-rw-r--r-- | celt/cpu_support.h | 13 | ||||
-rw-r--r-- | celt/entenc.c | 2 | ||||
-rw-r--r-- | celt/mips/vq_mipsr1.h | 6 | ||||
-rw-r--r-- | celt/pitch.c | 13 | ||||
-rw-r--r-- | celt/pitch.h | 34 | ||||
-rw-r--r-- | celt/tests/test_unit_mathops.c | 14 | ||||
-rw-r--r-- | celt/tests/test_unit_rotation.c | 11 | ||||
-rw-r--r-- | celt/vq.c | 10 | ||||
-rw-r--r-- | celt/vq.h | 4 | ||||
-rw-r--r-- | celt/x86/celt_lpc_sse.c | 128 | ||||
-rw-r--r-- | celt/x86/celt_lpc_sse.h | 58 | ||||
-rw-r--r-- | celt/x86/pitch_sse.c | 251 | ||||
-rw-r--r-- | celt/x86/pitch_sse.h | 58 | ||||
-rw-r--r-- | celt/x86/x86_celt_map.c | 84 | ||||
-rw-r--r-- | celt/x86/x86cpu.c | 111 | ||||
-rw-r--r-- | celt/x86/x86cpu.h | 63 |
22 files changed, 894 insertions, 66 deletions
diff --git a/celt/bands.c b/celt/bands.c index 30a5894e..c643b093 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -164,7 +164,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band for (i=0;i<end;i++) { opus_val32 sum; - sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM); + sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM); bandE[i+c*m->nbEBands] = celt_sqrt(sum); /*printf ("%f ", bandE[i+c*m->nbEBands]);*/ } @@ -266,7 +266,7 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X, /* This prevents energy collapse for transients with multiple short MDCTs */ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size, int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE, - const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed) + const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch) { int c, i, j, k; for (i=start;i<end;i++) @@ -355,7 +355,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas } /* We just added some energy, so we need to renormalise */ if (renormalize) - renormalise_vector(X, N0<<LM, Q15ONE); + renormalise_vector(X, N0<<LM, Q15ONE, arch); } while (++c<C); } } @@ -656,6 +656,7 @@ struct band_ctx { opus_int32 remaining_bits; const celt_ener *bandE; opus_uint32 seed; + int arch; }; struct split_ctx { @@ -707,7 +708,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx, side and mid. With just that parameter, we can re-scale both mid and side because we know that 1) they have unit norm and 2) they are orthogonal. */ - itheta = stereo_itheta(X, Y, stereo, N); + itheta = stereo_itheta(X, Y, stereo, N, ctx->arch); } tell = ec_tell_frac(ec); if (qn!=1) @@ -1055,7 +1056,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X, } cm = fill; } - renormalise_vector(X, N, gain); + renormalise_vector(X, N, gain, ctx->arch); } } } @@ -1360,9 +1361,11 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm void quant_all_bands(int encode, const CELTMode *m, int start, int end, - celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses, - int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res, - opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed) + celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, + const celt_ener *bandE, int *pulses, int shortBlocks, int spread, + int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits, + opus_int32 balance, ec_ctx *ec, int LM, int codedBands, + opus_uint32 *seed, int arch) { int i; opus_int32 remaining_bits; @@ -1404,6 +1407,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end, ctx.m = m; ctx.seed = *seed; ctx.spread = spread; + ctx.arch = arch; for (i=start;i<end;i++) { opus_int32 tell; diff --git a/celt/bands.h b/celt/bands.h index 69901b1e..e8bef4ba 100644 --- a/celt/bands.h +++ b/celt/bands.h @@ -98,15 +98,20 @@ void haar1(celt_norm *X, int N0, int stride); * @param LM log2() of the number of 2.5 subframes in the frame * @param codedBands Last band to receive bits + 1 * @param seed Random generator seed + * @param arch Run-time architecture (see opus_select_arch()) */ void quant_all_bands(int encode, const CELTMode *m, int start, int end, - celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses, - int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res, - opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed); - -void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size, - int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE, - const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed); + celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, + const celt_ener *bandE, int *pulses, int shortBlocks, int spread, + int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits, + opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed, + int arch); + +void anti_collapse(const CELTMode *m, celt_norm *X_, + unsigned char *collapse_masks, int LM, int C, int size, int start, + int end, const opus_val16 *logE, const opus_val16 *prev1logE, + const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, + int arch); opus_uint32 celt_lcg_rand(opus_uint32 seed); diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 8af96b79..4304a3e8 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -499,7 +499,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) seed = celt_lcg_rand(seed); X[boffs+j] = (celt_norm)((opus_int32)seed>>20); } - renormalise_vector(X+boffs, blen, Q15ONE); + renormalise_vector(X+boffs, blen, Q15ONE, st->arch); } } st->rng = seed; @@ -583,7 +583,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } /* Compute the excitation for exc_length samples before the loss. */ celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER, - exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem); + exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch); } /* Check if the waveform is decaying, and if so how fast. @@ -650,7 +650,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) the signal domain. */ celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER, buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER, - lpc_mem); + lpc_mem, st->arch); } /* Check if the synthesis energy is higher than expected, which can @@ -982,7 +982,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks, NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res, - len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng); + len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch); if (anti_collapse_rsv > 0) { @@ -994,7 +994,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (anti_collapse_on) anti_collapse(mode, X, collapse_masks, LM, C, N, - start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng); + start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch); if (silence) { diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 7387ad5f..6e2827f6 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -751,7 +751,7 @@ static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X, const opus_val16 *bandLogE, int end, int LM, int C, int N0, AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate, - int intensity, opus_val16 surround_trim) + int intensity, opus_val16 surround_trim, int arch) { int i; opus_val32 diff=0; @@ -767,7 +767,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X, for (i=0;i<8;i++) { opus_val32 partial; - partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM); + partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], + (m->eBands[i+1]-m->eBands[i])<<LM, arch); sum = ADD16(sum, EXTRACT16(SHR32(partial, 18))); } sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum); @@ -776,7 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X, for (i=8;i<intensity;i++) { opus_val32 partial; - partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM); + partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], + (m->eBands[i+1]-m->eBands[i])<<LM, arch); minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18)))); } minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC)); @@ -1097,7 +1099,7 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, pitch_index = COMBFILTER_MAXPERIOD-pitch_index; gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD, - N, &pitch_index, st->prefilter_period, st->prefilter_gain); + N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch); if (pitch_index > COMBFILTER_MAXPERIOD-2) pitch_index = COMBFILTER_MAXPERIOD-2; gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1); @@ -1887,7 +1889,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, alloc_trim = 5; else alloc_trim = alloc_trim_analysis(mode, X, bandLogE, - end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim); + end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, + st->intensity, surround_trim, st->arch); ec_enc_icdf(enc, alloc_trim, trim_icdf, 7); tell = ec_tell_frac(enc); } @@ -2022,8 +2025,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, /* Residual quantisation */ ALLOC(collapse_masks, C*nbEBands, unsigned char); quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks, - bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res, - nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng); + bandE, pulses, shortBlocks, st->spread_decision, + dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, + balance, enc, LM, codedBands, &st->rng, st->arch); if (anti_collapse_rsv > 0) { diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index fa29d626..48566484 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -88,12 +88,15 @@ int p #endif } -void celt_fir(const opus_val16 *_x, + +void celt_fir_c( + const opus_val16 *_x, const opus_val16 *num, opus_val16 *_y, int N, int ord, - opus_val16 *mem) + opus_val16 *mem, + int arch) { int i,j; VARDECL(opus_val16, rnum); @@ -124,7 +127,7 @@ void celt_fir(const opus_val16 *_x, for (i=0;i<N-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; - xcorr_kernel(rnum, x+i, sum, ord); + xcorr_kernel(rnum, x+i, sum, ord, arch); _y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT))); _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT))); _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT))); @@ -146,7 +149,8 @@ void celt_iir(const opus_val32 *_x, opus_val32 *_y, int N, int ord, - opus_val16 *mem) + opus_val16 *mem, + int arch) { #ifdef SMALL_FOOTPRINT int i,j; @@ -187,7 +191,7 @@ void celt_iir(const opus_val32 *_x, sum[1]=_x[i+1]; sum[2]=_x[i+2]; sum[3]=_x[i+3]; - xcorr_kernel(rden, y+i, sum, ord); + xcorr_kernel(rden, y+i, sum, ord, arch); /* Patch up the result to compensate for the fact that this is an IIR */ y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT); diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index dc2a0a3d..dc8967f0 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -29,24 +29,37 @@ #define PLC_H #include "arch.h" +#include "cpu_support.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/celt_lpc_sse.h" +#endif #define LPC_ORDER 24 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p); -void celt_fir(const opus_val16 *x, +void celt_fir_c( + const opus_val16 *x, const opus_val16 *num, opus_val16 *y, int N, int ord, - opus_val16 *mem); + opus_val16 *mem, + int arch); + +#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + (celt_fir_c(x, num, y, N, ord, mem, arch)) +#endif void celt_iir(const opus_val32 *x, const opus_val16 *den, opus_val32 *y, int N, int ord, - opus_val16 *mem); + opus_val16 *mem, + int arch); int _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n, int arch); diff --git a/celt/cpu_support.h b/celt/cpu_support.h index d68dbe62..71efff14 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -42,6 +42,18 @@ */ #define OPUS_ARCHMASK 3 +#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) + +#include "x86/x86cpu.h" +/* We currently support 3 x86 variants: + * arch[0] -> non-sse + * arch[1] -> sse2 + * arch[2] -> sse4.1 + * arch[3] -> NULL + */ +#define OPUS_ARCHMASK 3 +int opus_select_arch(void); + #else #define OPUS_ARCHMASK 0 @@ -50,5 +62,4 @@ static OPUS_INLINE int opus_select_arch(void) return 0; } #endif - #endif diff --git a/celt/entenc.c b/celt/entenc.c index 271e4d30..f1750d25 100644 --- a/celt/entenc.c +++ b/celt/entenc.c @@ -98,7 +98,7 @@ static void ec_enc_carry_out(ec_enc *_this,int _c){ else _this->ext++; } -static void ec_enc_normalize(ec_enc *_this){ +static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){ /*If the range is too small, output some bits and rescale it.*/ while(_this->rng<=EC_CODE_BOT){ ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT)); diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h index 3cea0709..0affae01 100644 --- a/celt/mips/vq_mipsr1.h +++ b/celt/mips/vq_mipsr1.h @@ -73,7 +73,11 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_ } #define OVERRIDE_renormalise_vector -void renormalise_vector(celt_norm *X, int N, opus_val16 gain) + +#define renormalise_vector(X, N, gain, arch) \ + ((void)(arch), renormalize_vector_mips(x, N, gain)) + +void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain) { int i; #ifdef FIXED_POINT diff --git a/celt/pitch.c b/celt/pitch.c index 2f0b14b7..154c8484 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -250,7 +250,8 @@ opus_val32 #else void #endif -celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch) +celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch, int arch) { int i; /*The EDSP version requires that max_pitch is at least 1, and that _x is @@ -264,7 +265,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr for (i=0;i<max_pitch-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; - xcorr_kernel(_x, _y+i, sum, len); + xcorr_kernel(_x, _y+i, sum, len, arch); xcorr[i]=sum[0]; xcorr[i+1]=sum[1]; xcorr[i+2]=sum[2]; @@ -280,7 +281,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr for (;i<max_pitch;i++) { opus_val32 sum; - sum = celt_inner_prod(_x, _y+i, len); + sum = celt_inner_prod(_x, _y+i, len, arch); xcorr[i] = sum; #ifdef FIXED_POINT maxcorr = MAX32(maxcorr, sum); @@ -369,7 +370,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR for (j=0;j<len>>1;j++) sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift); #else - sum = celt_inner_prod(x_lp, y+i, len>>1); + sum = celt_inner_prod_c(x_lp, y+i, len>>1); #endif xcorr[i] = MAX32(-1, sum); #ifdef FIXED_POINT @@ -405,7 +406,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2}; opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, - int N, int *T0_, int prev_period, opus_val16 prev_gain) + int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch) { int k, i, T, T0; opus_val16 g, g0; @@ -517,7 +518,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, pg = SHR32(frac_div32(best_xy,best_yy+1),16); for (k=0;k<3;k++) - xcorr[k] = celt_inner_prod(x, x-(T+k-1), N); + xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch); if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0])) offset = 1; else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2])) diff --git a/celt/pitch.h b/celt/pitch.h index b199976f..e692c594 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -37,7 +37,8 @@ #include "modes.h" #include "cpu_support.h" -#if defined(__SSE__) && !defined(FIXED_POINT) +#if defined(__SSE__) && !defined(FIXED_POINT) \ + || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) #include "x86/pitch_sse.h" #endif @@ -56,12 +57,13 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR int len, int max_pitch, int *pitch, int arch); opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, - int N, int *T0, int prev_period, opus_val16 prev_gain); + int N, int *T0, int prev_period, opus_val16 prev_gain, int arch); + /* OPT: This is the kernel you really want to optimize. It gets used a lot by the prefilter and by the PLC. */ #ifndef OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) +static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) { int j; opus_val16 y_0, y_1, y_2, y_3; @@ -126,8 +128,15 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, sum[3] = MAC16_16(sum[3],tmp,y_1); } } + +#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)(arch),xcorr_kernel_c(x, y, sum, len)) +#endif + #endif /* OVERRIDE_XCORR_KERNEL */ + #ifndef OVERRIDE_DUAL_INNER_PROD static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) @@ -145,9 +154,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y } #endif -#ifndef OVERRIDE_CELT_INNER_PROD -static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y, - int N) +/*We make sure a C version is always available for cases where the overhead of + vectorization and passing around an arch flag aren't worth it.*/ +static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x, + const opus_val16 *y, int N) { int i; opus_val32 xy=0; @@ -155,6 +165,10 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va xy = MAC16_16(xy, x[i], y[i]); return xy; } + +#if !defined(OVERRIDE_CELT_INNER_PROD) +# define celt_inner_prod(x, y, N, arch) \ + ((void)(arch),celt_inner_prod_c(x, y, N)) #endif #ifdef FIXED_POINT @@ -163,11 +177,11 @@ opus_val32 void #endif celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, - opus_val32 *xcorr, int len, int max_pitch); + opus_val32 *xcorr, int len, int max_pitch, int arch); #if !defined(OVERRIDE_PITCH_XCORR) /*Is run-time CPU detection enabled on this platform?*/ -# if defined(OPUS_HAVE_RTCD) +# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM) extern # if defined(FIXED_POINT) opus_val32 @@ -179,10 +193,10 @@ void # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \ - xcorr, len, max_pitch)) + xcorr, len, max_pitch, arch)) # else # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ - ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch)) + ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch)) # endif #endif diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 4bb780e6..5060f7f2 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -36,6 +36,8 @@ #define CELT_C +#include <stdio.h> +#include <math.h> #include "mathops.c" #include "entenc.c" #include "entdec.c" @@ -45,8 +47,16 @@ #include "laplace.c" #include "vq.c" #include "cwrs.c" -#include <stdio.h> -#include <math.h> +#include "pitch.c" +#include "celt_lpc.c" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse.c" +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/celt_lpc_sse.c" +#endif +#include "x86/x86_celt_map.c" +#endif #ifdef FIXED_POINT #define WORD "%d" diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index ce5f0967..4dce1af7 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -44,7 +44,18 @@ #include "entdec.c" #include "mathops.c" #include "bands.h" +#include "pitch.c" +#include "celt_lpc.c" #include <math.h> + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse.c" +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/celt_lpc_sse.c" +#endif +#include "x86/x86_celt_map.c" +#endif + #define MAX_SIZE 100 int ret=0; @@ -350,7 +350,7 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B, } #ifndef OVERRIDE_renormalise_vector -void renormalise_vector(celt_norm *X, int N, opus_val16 gain) +void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch) { int i; #ifdef FIXED_POINT @@ -360,7 +360,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain) opus_val16 g; opus_val32 t; celt_norm *xptr; - E = EPSILON + celt_inner_prod(X, X, N); + E = EPSILON + celt_inner_prod(X, X, N, arch); #ifdef FIXED_POINT k = celt_ilog2(E)>>1; #endif @@ -377,7 +377,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain) } #endif /* OVERRIDE_renormalise_vector */ -int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N) +int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch) { int i; int itheta; @@ -396,8 +396,8 @@ int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N) Eside = MAC16_16(Eside, s, s); } } else { - Emid += celt_inner_prod(X, X, N); - Eside += celt_inner_prod(Y, Y, N); + Emid += celt_inner_prod(X, X, N, arch); + Eside += celt_inner_prod(Y, Y, N, arch); } mid = celt_sqrt(Emid); side = celt_sqrt(Eside); @@ -63,8 +63,8 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B, ec_dec *dec, opus_val16 gain); -void renormalise_vector(celt_norm *X, int N, opus_val16 gain); +void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch); -int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N); +int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch); #endif /* VQ_H */ diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c new file mode 100644 index 00000000..9fb97798 --- /dev/null +++ b/celt/x86/celt_lpc_sse.c @@ -0,0 +1,128 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" +#include "x86cpu.h" + +void celt_fir_sse4_1(const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + opus_val16 *mem, + int arch) +{ + int i,j; + VARDECL(opus_val16, rnum); + VARDECL(opus_val16, x); + + __m128i vecNoA; + opus_int32 noA ; + SAVE_STACK; + + ALLOC(rnum, ord, opus_val16); + ALLOC(x, N+ord, opus_val16); + for(i=0;i<ord;i++) + rnum[i] = num[ord-i-1]; + for(i=0;i<ord;i++) + x[i] = mem[ord-i-1]; + + for (i=0;i<N-7;i+=8) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + x[i+ord+4]=_x[i+4]; + x[i+ord+5]=_x[i+5]; + x[i+ord+6]=_x[i+6]; + x[i+ord+7]=_x[i+7]; + } + + for (;i<N-3;i+=4) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + } + + for (;i<N;i++) + x[i+ord]=_x[i]; + + for(i=0;i<ord;i++) + mem[i] = _x[N-i-1]; +#ifdef SMALL_FOOTPRINT + for (i=0;i<N;i++) + { + opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + for (j=0;j<ord;j++) + { + sum = MAC16_16(sum,rnum[j],x[i+j]); + } + _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + } +#else + noA = EXTEND32(1) << SIG_SHIFT >> 1; + vecNoA = _mm_set_epi32(noA, noA, noA, noA); + + for (i=0;i<N-3;i+=4) + { + opus_val32 sums[4] = {0}; + __m128i vecSum, vecX; + + xcorr_kernel(rnum, x+i, sums, ord, arch); + + vecSum = _mm_loadu_si128((__m128i *)sums); + vecSum = _mm_add_epi32(vecSum, vecNoA); + vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); + vecX = OP_CVTEPI16_EPI32_M64(_x + i); + vecSum = _mm_add_epi32(vecSum, vecX); + vecSum = _mm_packs_epi32(vecSum, vecSum); + _mm_storel_epi64((__m128i *)(_y + i), vecSum); + } + for (;i<N;i++) + { + opus_val32 sum = 0; + for (j=0;j<ord;j++) + sum = MAC16_16(sum, rnum[j], x[i + j]); + _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + } + +#endif + RESTORE_STACK; +} diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h new file mode 100644 index 00000000..f1114202 --- /dev/null +++ b/celt/x86/celt_lpc_sse.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CELT_LPC_SSE_H +#define CELT_LPC_SSE_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +void celt_fir_sse4_1( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +# define celt_fir(x, num, y, N, ord, mem, arch) \ + ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch)) + +#endif +#endif diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c new file mode 100644 index 00000000..e3bc6d7d --- /dev/null +++ b/celt/x86/pitch_sse.c @@ -0,0 +1,251 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include <smmintrin.h> +#include "x86cpu.h" + +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + __m128i inVec1_3210, inVec2_3210; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32(acc1, acc2); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + if (N - i >= 4) + { + inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); + inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); + + inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); + + acc1 = _mm_add_epi32(acc1, inVec1_3210); + i += 4; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); + + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) + { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} + +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) +{ + int j; + + __m128i vecX, vecX0, vecX1, vecX2, vecX3; + __m128i vecY0, vecY1, vecY2, vecY3; + __m128i sum0, sum1, sum2, sum3, vecSum; + __m128i initSum; + + celt_assert(len >= 3); + + sum0 = _mm_setzero_si128(); + sum1 = _mm_setzero_si128(); + sum2 = _mm_setzero_si128(); + sum3 = _mm_setzero_si128(); + + for (j=0;j<(len-7);j+=8) + { + vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); + vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); + vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); + vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); + vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); + + sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); + sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); + sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); + sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); + } + + sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); + sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); + + sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); + sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); + + sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); + sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); + + sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); + sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); + + vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), + _mm_unpacklo_epi32(sum2, sum3)); + + for (;j<(len-3);j+=4) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX1 = _mm_shuffle_epi32(vecX, 0x55); + vecX2 = _mm_shuffle_epi32(vecX, 0xaa); + vecX3 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); + vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + sum3 = _mm_mullo_epi32(vecX3, vecY3); + + sum0 = _mm_add_epi32(sum0, sum1); + sum2 = _mm_add_epi32(sum2, sum3); + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum2); + } + + for (;j<len;j++) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); + } + + initSum = _mm_loadu_si128((__m128i *)(&sum[0])); + initSum = _mm_add_epi32(initSum, vecSum); + _mm_storeu_si128((__m128i *)sum, initSum); +} +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32( acc1, acc2 ); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} +#endif diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 58f83246..837e8ae2 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -1,4 +1,5 @@ -/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ +/* Copyright (c) 2013 Jean-Marc Valin and John Ridges + Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/ /** @file pitch_sse.h @brief Pitch analysis @@ -32,11 +33,55 @@ #ifndef PITCH_SSE_H #define PITCH_SSE_H +#if defined(HAVE_CONFIG_H) +#include "config.h" +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +void xcorr_kernel_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + opus_val32 sum[4], + int len ); + +extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_int16 *x, + const opus_int16 *y, + opus_val32 sum[4], + int len ); + +#define xcorr_kernel(x, y, sum, len, arch) \ + ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) + +opus_val32 celt_inner_prod_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) +opus_val32 celt_inner_prod_sse2( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_int16 *x, + const opus_int16 *y, + int N); + +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) +#else + #include <xmmintrin.h> #include "arch.h" #define OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) { int j; __m128 xsum1, xsum2; @@ -71,6 +116,9 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); } +#define xcorr_kernel(_x, _y, _z, len, arch) \ + ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) + #define OVERRIDE_DUAL_INNER_PROD static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) @@ -102,7 +150,7 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y } #define OVERRIDE_CELT_INNER_PROD -static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y, +static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, int N) { int i; @@ -127,6 +175,9 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va return xy; } +# define celt_inner_prod(_x, _y, len, arch) \ + ((void)(arch),celt_inner_prod_sse(_x, _y, len)) + #define OVERRIDE_COMB_FILTER_CONST static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) @@ -180,3 +231,4 @@ static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i } #endif +#endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c new file mode 100644 index 00000000..83410dbc --- /dev/null +++ b/celt/x86/x86_celt_map.c @@ -0,0 +1,84 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if defined(HAVE_CONFIG_H) +#include "config.h" +#endif + +#include "x86/x86cpu.h" +#include "celt_lpc.h" +#include "pitch.h" +#include "pitch_sse.h" + +#if defined(OPUS_HAVE_RTCD) + +# if defined(FIXED_POINT) + +void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch +) = { + celt_fir_c, /* non-sse */ + celt_fir_c, + MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ + NULL +}; + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* non-sse */ + xcorr_kernel_c, + MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ + NULL +}; + +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N +) = { + celt_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE2(celt_inner_prod), + MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ + NULL +}; + +# else +# error "Floating-point implementation is not supported by x86 RTCD yet." \ + "Reconfigure with --disable-rtcd or send patches." +# endif + +#endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c new file mode 100644 index 00000000..c82a4b7b --- /dev/null +++ b/celt/x86/x86cpu.c @@ -0,0 +1,111 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cpu_support.h" +#include "macros.h" +#include "main.h" +#include "pitch.h" +#include "x86cpu.h" + +#if defined(_MSC_VER) + +#include <intrin.h> +#define cpuid(info,x) __cpuid(info,x) +#else + +#if defined(CPU_INFO_BY_C) +#include <cpuid.h> +#endif + +static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) +{ +#if defined(CPU_INFO_BY_ASM) + __asm__ __volatile__ ( + "cpuid": + "=a" (CPUInfo[0]), + "=b" (CPUInfo[1]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType), "c" (0) + ); +#elif defined(CPU_INFO_BY_C) + __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); +#endif +} + +#endif + +#include "SigProc_FIX.h" +#include "celt_lpc.h" + +typedef struct CPU_Feature{ + /* SIMD: 128-bit */ + int HW_SSE2; + int HW_SSE41; +} CPU_Feature; + +static void opus_cpu_feature_check(CPU_Feature *cpu_feature) +{ + unsigned int info[4] = {0}; + unsigned int nIds = 0; + + cpuid(info, 0); + nIds = info[0]; + + if (nIds >= 1){ + cpuid(info, 1); + cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; + cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; + } +} + +int opus_select_arch(void) +{ + CPU_Feature cpu_feature = {0}; + int arch; + + opus_cpu_feature_check(&cpu_feature); + + arch = 0; + if (!cpu_feature.HW_SSE2) + { + return arch; + } + arch++; + + if (!cpu_feature.HW_SSE41) + { + return arch; + } + arch++; + + return arch; +} diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h new file mode 100644 index 00000000..2394b05e --- /dev/null +++ b/celt/x86/x86cpu.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(X86CPU_H) +# define X86CPU_H + +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# define MAY_HAVE_SSE2(name) name ## _sse2 +# else +# define MAY_HAVE_SSE2(name) name ## _c +# endif + +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# define MAY_HAVE_SSE4_1(name) name ## _sse4_1 +# else +# define MAY_HAVE_SSE4_1(name) name ## _c +# endif + +# if defined(OPUS_HAVE_RTCD) +int opus_select_arch(void); +# endif + +/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi16_epi32() + when optimizations are disabled, even though the actual PMOVSXWD instruction + takes an m64. Unlike a normal m64 reference, these require 16-byte alignment + and load 16 bytes instead of 8, possibly reading out of bounds. + + We can insert an explicit MOVQ using _mm_loadl_epi64(), which should have the + same semantics as an m64 reference in the PMOVSXWD instruction itself, but + gcc is not smart enough to optimize this out when optimizations ARE enabled.*/ +# if !defined(__OPTIMIZE__) +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) +# else +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(*(__m128i *)(x))) +# endif + +#endif |