diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-12-09 02:33:42 -0500 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-12-09 15:26:58 -0500 |
commit | 57cd849cf71e6abdfedfea1d381d4e06581015d5 (patch) | |
tree | c08ca3adc7ae370c550034e9ba1ac8f1d09b46d7 /celt/x86 | |
parent | ff072009fe5bdd3540ac6ac331e9961e83da722a (diff) | |
download | libopus-57cd849cf71e6abdfedfea1d381d4e06581015d5.tar.gz |
Defining celt_inner_prod() and using it instead of explicit loops.
Also adds an SSE-optimized celt_inner_prod().
Diffstat (limited to 'celt/x86')
-rw-r--r-- | celt/x86/pitch_sse.h | 26 |
1 files changed, 26 insertions, 0 deletions
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 695122a5..58f83246 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -101,6 +101,32 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
    }
 }
 
+#define OVERRIDE_CELT_INNER_PROD
+static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+      int N)
+{  /* Inner product of x and y over N samples: SSE in groups of four, then a scalar tail. */
+   int i;
+   float xy;       /* scalar result; returned as opus_val32 */
+   __m128 sum;     /* four running partial sums, one per SSE lane */
+   sum = _mm_setzero_ps();
+   /* FIXME: We should probably go 8-way and use 2 sums. */
+   for (i=0;i<N-3;i+=4)  /* runs only while at least four elements remain */
+   {
+      __m128 xi = _mm_loadu_ps(x+i);  /* unaligned load; NOTE(review): presumably opus_val16 is float in this build -- confirm */
+      __m128 yi = _mm_loadu_ps(y+i);
+      sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));  /* lane k accumulates x[i+k]*y[i+k] */
+   }
+   /* Horizontal sum: fold the four lanes of sum (s0..s3) into lane 0 */
+   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));  /* lane 0 = s0+s2, lane 1 = s1+s3 */
+   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));  /* 0x55 selects lane 1 everywhere; lane 0 += lane 1 */
+   _mm_store_ss(&xy, sum);  /* copy lane 0 into xy */
+   for (;i<N;i++)  /* leftover elements (at most three) that did not fill a group of four */
+   {
+      xy = MAC16_16(xy, x[i], y[i]);  /* presumably xy + x[i]*y[i]; MAC16_16 is defined outside this hunk */
+   }
+   return xy;
+}
+
 #define OVERRIDE_COMB_FILTER_CONST
 static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
       opus_val16 g10, opus_val16 g11, opus_val16 g12) |