aboutsummaryrefslogtreecommitdiff
path: root/celt
diff options
context:
space:
mode:
authorxiangmingzhu <xiangzhu@cisco.com>2014-04-30 15:48:07 +0800
committerJean-Marc Valin <jmvalin@jmvalin.ca>2014-10-03 21:16:00 -0400
commitc95c9a048f3283afb2e412b10085d4f7c19e1412 (patch)
treeed8873af6559d7a98922e0fed85be47c826ef521 /celt
parent80460334b77d70e665a390503cd8992cdad06c10 (diff)
downloadlibopus-c95c9a048f3283afb2e412b10085d4f7c19e1412.tar.gz
Cisco optimization for x86 & fixed point
1. Only for fixed point on x86 platform (32bit and 64bit, uses SIMD intrinsics up to SSE4.2) 2. Use "configure --enable-fixed-point --enable-intrinsics" to enable optimization, default is disabled. 3. Official test cases are verified and passed. Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org>
Diffstat (limited to 'celt')
-rw-r--r--celt/bands.c20
-rw-r--r--celt/bands.h19
-rw-r--r--celt/celt_decoder.c10
-rw-r--r--celt/celt_encoder.c18
-rw-r--r--celt/celt_lpc.c14
-rw-r--r--celt/celt_lpc.h19
-rw-r--r--celt/cpu_support.h13
-rw-r--r--celt/entenc.c2
-rw-r--r--celt/mips/vq_mipsr1.h6
-rw-r--r--celt/pitch.c13
-rw-r--r--celt/pitch.h34
-rw-r--r--celt/tests/test_unit_mathops.c14
-rw-r--r--celt/tests/test_unit_rotation.c11
-rw-r--r--celt/vq.c10
-rw-r--r--celt/vq.h4
-rw-r--r--celt/x86/celt_lpc_sse.c128
-rw-r--r--celt/x86/celt_lpc_sse.h58
-rw-r--r--celt/x86/pitch_sse.c251
-rw-r--r--celt/x86/pitch_sse.h58
-rw-r--r--celt/x86/x86_celt_map.c84
-rw-r--r--celt/x86/x86cpu.c111
-rw-r--r--celt/x86/x86cpu.h63
22 files changed, 894 insertions, 66 deletions
diff --git a/celt/bands.c b/celt/bands.c
index 30a5894e..c643b093 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -164,7 +164,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band
for (i=0;i<end;i++)
{
opus_val32 sum;
- sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
+ sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
bandE[i+c*m->nbEBands] = celt_sqrt(sum);
/*printf ("%f ", bandE[i+c*m->nbEBands]);*/
}
@@ -266,7 +266,7 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
/* This prevents energy collapse for transients with multiple short MDCTs */
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
- const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed)
+ const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch)
{
int c, i, j, k;
for (i=start;i<end;i++)
@@ -355,7 +355,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
}
/* We just added some energy, so we need to renormalise */
if (renormalize)
- renormalise_vector(X, N0<<LM, Q15ONE);
+ renormalise_vector(X, N0<<LM, Q15ONE, arch);
} while (++c<C);
}
}
@@ -656,6 +656,7 @@ struct band_ctx {
opus_int32 remaining_bits;
const celt_ener *bandE;
opus_uint32 seed;
+ int arch;
};
struct split_ctx {
@@ -707,7 +708,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
side and mid. With just that parameter, we can re-scale both
mid and side because we know that 1) they have unit norm and
2) they are orthogonal. */
- itheta = stereo_itheta(X, Y, stereo, N);
+ itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
}
tell = ec_tell_frac(ec);
if (qn!=1)
@@ -1055,7 +1056,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
}
cm = fill;
}
- renormalise_vector(X, N, gain);
+ renormalise_vector(X, N, gain, ctx->arch);
}
}
}
@@ -1360,9 +1361,11 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
- celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
- int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
- opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
+ celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
+ const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+ int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+ opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
+ opus_uint32 *seed, int arch)
{
int i;
opus_int32 remaining_bits;
@@ -1404,6 +1407,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
ctx.m = m;
ctx.seed = *seed;
ctx.spread = spread;
+ ctx.arch = arch;
for (i=start;i<end;i++)
{
opus_int32 tell;
diff --git a/celt/bands.h b/celt/bands.h
index 69901b1e..e8bef4ba 100644
--- a/celt/bands.h
+++ b/celt/bands.h
@@ -98,15 +98,20 @@ void haar1(celt_norm *X, int N0, int stride);
* @param LM log2() of the number of 2.5 subframes in the frame
* @param codedBands Last band to receive bits + 1
* @param seed Random generator seed
+ * @param arch Run-time architecture (see opus_select_arch())
*/
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
- celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
- int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
- opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
-
-void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
- int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
- const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed);
+ celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
+ const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+ int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+ opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
+ int arch);
+
+void anti_collapse(const CELTMode *m, celt_norm *X_,
+ unsigned char *collapse_masks, int LM, int C, int size, int start,
+ int end, const opus_val16 *logE, const opus_val16 *prev1logE,
+ const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
+ int arch);
opus_uint32 celt_lcg_rand(opus_uint32 seed);
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 8af96b79..4304a3e8 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -499,7 +499,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
seed = celt_lcg_rand(seed);
X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
}
- renormalise_vector(X+boffs, blen, Q15ONE);
+ renormalise_vector(X+boffs, blen, Q15ONE, st->arch);
}
}
st->rng = seed;
@@ -583,7 +583,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
/* Compute the excitation for exc_length samples before the loss. */
celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
- exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
+ exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
}
/* Check if the waveform is decaying, and if so how fast.
@@ -650,7 +650,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
the signal domain. */
celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
- lpc_mem);
+ lpc_mem, st->arch);
}
/* Check if the synthesis energy is higher than expected, which can
@@ -982,7 +982,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
- len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
+ len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{
@@ -994,7 +994,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (anti_collapse_on)
anti_collapse(mode, X, collapse_masks, LM, C, N,
- start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
+ start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
if (silence)
{
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index 7387ad5f..6e2827f6 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -751,7 +751,7 @@ static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM,
static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
const opus_val16 *bandLogE, int end, int LM, int C, int N0,
AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
- int intensity, opus_val16 surround_trim)
+ int intensity, opus_val16 surround_trim, int arch)
{
int i;
opus_val32 diff=0;
@@ -767,7 +767,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=0;i<8;i++)
{
opus_val32 partial;
- partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
+ partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+ (m->eBands[i+1]-m->eBands[i])<<LM, arch);
sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
}
sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
@@ -776,7 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=8;i<intensity;i++)
{
opus_val32 partial;
- partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
+ partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+ (m->eBands[i+1]-m->eBands[i])<<LM, arch);
minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
}
minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
@@ -1097,7 +1099,7 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem,
pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
- N, &pitch_index, st->prefilter_period, st->prefilter_gain);
+ N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
if (pitch_index > COMBFILTER_MAXPERIOD-2)
pitch_index = COMBFILTER_MAXPERIOD-2;
gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
@@ -1887,7 +1889,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
alloc_trim = 5;
else
alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
- end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
+ end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
+ st->intensity, surround_trim, st->arch);
ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
tell = ec_tell_frac(enc);
}
@@ -2022,8 +2025,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
/* Residual quantisation */
ALLOC(collapse_masks, C*nbEBands, unsigned char);
quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
- bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
- nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
+ bandE, pulses, shortBlocks, st->spread_decision,
+ dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
+ balance, enc, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index fa29d626..48566484 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -88,12 +88,15 @@ int p
#endif
}
-void celt_fir(const opus_val16 *_x,
+
+void celt_fir_c(
+ const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
- opus_val16 *mem)
+ opus_val16 *mem,
+ int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
@@ -124,7 +127,7 @@ void celt_fir(const opus_val16 *_x,
for (i=0;i<N-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
- xcorr_kernel(rnum, x+i, sum, ord);
+ xcorr_kernel(rnum, x+i, sum, ord, arch);
_y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT)));
_y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
_y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
@@ -146,7 +149,8 @@ void celt_iir(const opus_val32 *_x,
opus_val32 *_y,
int N,
int ord,
- opus_val16 *mem)
+ opus_val16 *mem,
+ int arch)
{
#ifdef SMALL_FOOTPRINT
int i,j;
@@ -187,7 +191,7 @@ void celt_iir(const opus_val32 *_x,
sum[1]=_x[i+1];
sum[2]=_x[i+2];
sum[3]=_x[i+3];
- xcorr_kernel(rden, y+i, sum, ord);
+ xcorr_kernel(rden, y+i, sum, ord, arch);
/* Patch up the result to compensate for the fact that this is an IIR */
y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT);
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index dc2a0a3d..dc8967f0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -29,24 +29,37 @@
#define PLC_H
#include "arch.h"
+#include "cpu_support.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.h"
+#endif
#define LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
-void celt_fir(const opus_val16 *x,
+void celt_fir_c(
+ const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
- opus_val16 *mem);
+ opus_val16 *mem,
+ int arch);
+
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+ (celt_fir_c(x, num, y, N, ord, mem, arch))
+#endif
void celt_iir(const opus_val32 *x,
const opus_val16 *den,
opus_val32 *y,
int N,
int ord,
- opus_val16 *mem);
+ opus_val16 *mem,
+ int arch);
int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
const opus_val16 *window, int overlap, int lag, int n, int arch);
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index d68dbe62..71efff14 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -42,6 +42,18 @@
*/
#define OPUS_ARCHMASK 3
+#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#include "x86/x86cpu.h"
+/* We currently support 3 x86 variants:
+ * arch[0] -> non-sse
+ * arch[1] -> sse2
+ * arch[2] -> sse4.1
+ * arch[3] -> NULL
+ */
+#define OPUS_ARCHMASK 3
+int opus_select_arch(void);
+
#else
#define OPUS_ARCHMASK 0
@@ -50,5 +62,4 @@ static OPUS_INLINE int opus_select_arch(void)
return 0;
}
#endif
-
#endif
diff --git a/celt/entenc.c b/celt/entenc.c
index 271e4d30..f1750d25 100644
--- a/celt/entenc.c
+++ b/celt/entenc.c
@@ -98,7 +98,7 @@ static void ec_enc_carry_out(ec_enc *_this,int _c){
else _this->ext++;
}
-static void ec_enc_normalize(ec_enc *_this){
+static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
/*If the range is too small, output some bits and rescale it.*/
while(_this->rng<=EC_CODE_BOT){
ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));
diff --git a/celt/mips/vq_mipsr1.h b/celt/mips/vq_mipsr1.h
index 3cea0709..0affae01 100644
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h
@@ -73,7 +73,11 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_
}
#define OVERRIDE_renormalise_vector
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+
+#define renormalise_vector(X, N, gain, arch) \
+ ((void)(arch), renormalize_vector_mips(x, N, gain))
+
+void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain)
{
int i;
#ifdef FIXED_POINT
diff --git a/celt/pitch.c b/celt/pitch.c
index 2f0b14b7..154c8484 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -250,7 +250,8 @@ opus_val32
#else
void
#endif
-celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch, int arch)
{
int i;
/*The EDSP version requires that max_pitch is at least 1, and that _x is
@@ -264,7 +265,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
- xcorr_kernel(_x, _y+i, sum, len);
+ xcorr_kernel(_x, _y+i, sum, len, arch);
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
@@ -280,7 +281,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (;i<max_pitch;i++)
{
opus_val32 sum;
- sum = celt_inner_prod(_x, _y+i, len);
+ sum = celt_inner_prod(_x, _y+i, len, arch);
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
@@ -369,7 +370,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
- sum = celt_inner_prod(x_lp, y+i, len>>1);
+ sum = celt_inner_prod_c(x_lp, y+i, len>>1);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
@@ -405,7 +406,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
- int N, int *T0_, int prev_period, opus_val16 prev_gain)
+ int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
{
int k, i, T, T0;
opus_val16 g, g0;
@@ -517,7 +518,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
pg = SHR32(frac_div32(best_xy,best_yy+1),16);
for (k=0;k<3;k++)
- xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
+ xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
offset = 1;
else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
diff --git a/celt/pitch.h b/celt/pitch.h
index b199976f..e692c594 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -37,7 +37,8 @@
#include "modes.h"
#include "cpu_support.h"
-#if defined(__SSE__) && !defined(FIXED_POINT)
+#if defined(__SSE__) && !defined(FIXED_POINT) \
+ || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.h"
#endif
@@ -56,12 +57,13 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
int len, int max_pitch, int *pitch, int arch);
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
- int N, int *T0, int prev_period, opus_val16 prev_gain);
+ int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
+
/* OPT: This is the kernel you really want to optimize. It gets used a lot
by the prefilter and by the PLC. */
#ifndef OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
opus_val16 y_0, y_1, y_2, y_3;
@@ -126,8 +128,15 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
sum[3] = MAC16_16(sum[3],tmp,y_1);
}
}
+
+#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define xcorr_kernel(x, y, sum, len, arch) \
+ ((void)(arch),xcorr_kernel_c(x, y, sum, len))
+#endif
+
#endif /* OVERRIDE_XCORR_KERNEL */
+
#ifndef OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
@@ -145,9 +154,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#endif
-#ifndef OVERRIDE_CELT_INNER_PROD
-static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
- int N)
+/*We make sure a C version is always available for cases where the overhead of
+ vectorization and passing around an arch flag aren't worth it.*/
+static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
+ const opus_val16 *y, int N)
{
int i;
opus_val32 xy=0;
@@ -155,6 +165,10 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va
xy = MAC16_16(xy, x[i], y[i]);
return xy;
}
+
+#if !defined(OVERRIDE_CELT_INNER_PROD)
+# define celt_inner_prod(x, y, N, arch) \
+ ((void)(arch),celt_inner_prod_c(x, y, N))
#endif
#ifdef FIXED_POINT
@@ -163,11 +177,11 @@ opus_val32
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
- opus_val32 *xcorr, int len, int max_pitch);
+ opus_val32 *xcorr, int len, int max_pitch, int arch);
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
-# if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
extern
# if defined(FIXED_POINT)
opus_val32
@@ -179,10 +193,10 @@ void
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
- xcorr, len, max_pitch))
+ xcorr, len, max_pitch, arch))
# else
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
- ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
+ ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
# endif
#endif
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 4bb780e6..5060f7f2 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -36,6 +36,8 @@
#define CELT_C
+#include <stdio.h>
+#include <math.h>
#include "mathops.c"
#include "entenc.c"
#include "entdec.c"
@@ -45,8 +47,16 @@
#include "laplace.c"
#include "vq.c"
#include "cwrs.c"
-#include <stdio.h>
-#include <math.h>
+#include "pitch.c"
+#include "celt_lpc.c"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse.c"
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.c"
+#endif
+#include "x86/x86_celt_map.c"
+#endif
#ifdef FIXED_POINT
#define WORD "%d"
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index ce5f0967..4dce1af7 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -44,7 +44,18 @@
#include "entdec.c"
#include "mathops.c"
#include "bands.h"
+#include "pitch.c"
+#include "celt_lpc.c"
#include <math.h>
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/pitch_sse.c"
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.c"
+#endif
+#include "x86/x86_celt_map.c"
+#endif
+
#define MAX_SIZE 100
int ret=0;
diff --git a/celt/vq.c b/celt/vq.c
index b047b227..0c58cdd4 100644
--- a/celt/vq.c
+++ b/celt/vq.c
@@ -350,7 +350,7 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
}
#ifndef OVERRIDE_renormalise_vector
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
{
int i;
#ifdef FIXED_POINT
@@ -360,7 +360,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
opus_val16 g;
opus_val32 t;
celt_norm *xptr;
- E = EPSILON + celt_inner_prod(X, X, N);
+ E = EPSILON + celt_inner_prod(X, X, N, arch);
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
@@ -377,7 +377,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
}
#endif /* OVERRIDE_renormalise_vector */
-int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
{
int i;
int itheta;
@@ -396,8 +396,8 @@ int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
Eside = MAC16_16(Eside, s, s);
}
} else {
- Emid += celt_inner_prod(X, X, N);
- Eside += celt_inner_prod(Y, Y, N);
+ Emid += celt_inner_prod(X, X, N, arch);
+ Eside += celt_inner_prod(Y, Y, N, arch);
}
mid = celt_sqrt(Emid);
side = celt_sqrt(Eside);
diff --git a/celt/vq.h b/celt/vq.h
index 84115cbc..f8958206 100644
--- a/celt/vq.h
+++ b/celt/vq.h
@@ -63,8 +63,8 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
ec_dec *dec, opus_val16 gain);
-void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
-int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N);
+int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);
#endif /* VQ_H */
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
new file mode 100644
index 00000000..9fb97798
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.c
@@ -0,0 +1,128 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+void celt_fir_sse4_1(const opus_val16 *_x,
+ const opus_val16 *num,
+ opus_val16 *_y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch)
+{
+ int i,j;
+ VARDECL(opus_val16, rnum);
+ VARDECL(opus_val16, x);
+
+ __m128i vecNoA;
+ opus_int32 noA ;
+ SAVE_STACK;
+
+ ALLOC(rnum, ord, opus_val16);
+ ALLOC(x, N+ord, opus_val16);
+ for(i=0;i<ord;i++)
+ rnum[i] = num[ord-i-1];
+ for(i=0;i<ord;i++)
+ x[i] = mem[ord-i-1];
+
+ for (i=0;i<N-7;i+=8)
+ {
+ x[i+ord ]=_x[i ];
+ x[i+ord+1]=_x[i+1];
+ x[i+ord+2]=_x[i+2];
+ x[i+ord+3]=_x[i+3];
+ x[i+ord+4]=_x[i+4];
+ x[i+ord+5]=_x[i+5];
+ x[i+ord+6]=_x[i+6];
+ x[i+ord+7]=_x[i+7];
+ }
+
+ for (;i<N-3;i+=4)
+ {
+ x[i+ord ]=_x[i ];
+ x[i+ord+1]=_x[i+1];
+ x[i+ord+2]=_x[i+2];
+ x[i+ord+3]=_x[i+3];
+ }
+
+ for (;i<N;i++)
+ x[i+ord]=_x[i];
+
+ for(i=0;i<ord;i++)
+ mem[i] = _x[N-i-1];
+#ifdef SMALL_FOOTPRINT
+ for (i=0;i<N;i++)
+ {
+ opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+ for (j=0;j<ord;j++)
+ {
+ sum = MAC16_16(sum,rnum[j],x[i+j]);
+ }
+ _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+ }
+#else
+ noA = EXTEND32(1) << SIG_SHIFT >> 1;
+ vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+ for (i=0;i<N-3;i+=4)
+ {
+ opus_val32 sums[4] = {0};
+ __m128i vecSum, vecX;
+
+ xcorr_kernel(rnum, x+i, sums, ord, arch);
+
+ vecSum = _mm_loadu_si128((__m128i *)sums);
+ vecSum = _mm_add_epi32(vecSum, vecNoA);
+ vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
+ vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+ vecSum = _mm_add_epi32(vecSum, vecX);
+ vecSum = _mm_packs_epi32(vecSum, vecSum);
+ _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+ }
+ for (;i<N;i++)
+ {
+ opus_val32 sum = 0;
+ for (j=0;j<ord;j++)
+ sum = MAC16_16(sum, rnum[j], x[i + j]);
+ _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+ }
+
+#endif
+ RESTORE_STACK;
+}
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
new file mode 100644
index 00000000..f1114202
--- /dev/null
+++ b/celt/x86/celt_lpc_sse.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void celt_fir_sse4_1(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch);
+
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch);
+
+# define celt_fir(x, num, y, N, ord, mem, arch) \
+ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+
+#endif
+#endif
diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c
new file mode 100644
index 00000000..e3bc6d7d
--- /dev/null
+++ b/celt/x86/pitch_sse.c
@@ -0,0 +1,251 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "macros.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include <smmintrin.h>
+#include "x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+ int N)
+{
+ opus_int i, dataSize16;
+ opus_int32 sum;
+ __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+ __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+ __m128i inVec1_3210, inVec2_3210;
+
+ sum = 0;
+ dataSize16 = N & ~15;
+
+ acc1 = _mm_setzero_si128();
+ acc2 = _mm_setzero_si128();
+
+ for (i=0;i<dataSize16;i+=16) {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+ inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+ inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+ }
+
+ acc1 = _mm_add_epi32(acc1, acc2);
+
+ if (N - i >= 8)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ i += 8;
+ }
+
+ if (N - i >= 4)
+ {
+ inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+ inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+ inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_3210);
+ i += 4;
+ }
+
+ acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+ acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+ sum += _mm_cvtsi128_si32(acc1);
+
+ for (;i<N;i++)
+ {
+ sum = silk_SMLABB(sum, x[i], y[i]);
+ }
+
+ return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+ int j;
+
+ __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+ __m128i vecY0, vecY1, vecY2, vecY3;
+ __m128i sum0, sum1, sum2, sum3, vecSum;
+ __m128i initSum;
+
+ celt_assert(len >= 3);
+
+ sum0 = _mm_setzero_si128();
+ sum1 = _mm_setzero_si128();
+ sum2 = _mm_setzero_si128();
+ sum3 = _mm_setzero_si128();
+
+ for (j=0;j<(len-7);j+=8)
+ {
+ vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+ vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+ vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+ vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+ vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+ sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+ sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+ sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+ sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+ }
+
+ sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+ sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+ sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+ sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+ sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+ sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+ sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+ sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+
+ vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+ _mm_unpacklo_epi32(sum2, sum3));
+
+ for (;j<(len-3);j+=4)
+ {
+ vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+ vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+ sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+ sum0 = _mm_add_epi32(sum0, sum1);
+ sum2 = _mm_add_epi32(sum2, sum3);
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+
+ for (;j<len;j++)
+ {
+ vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ }
+
+ initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+ initSum = _mm_add_epi32(initSum, vecSum);
+ _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+ int N)
+{
+ opus_int i, dataSize16;
+ opus_int32 sum;
+
+ __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+ __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+ sum = 0;
+ dataSize16 = N & ~15;
+
+ acc1 = _mm_setzero_si128();
+ acc2 = _mm_setzero_si128();
+
+ for (i=0;i<dataSize16;i+=16)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+ inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+ inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+ }
+
+ acc1 = _mm_add_epi32( acc1, acc2 );
+
+ if (N - i >= 8)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ i += 8;
+ }
+
+ acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
+ acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
+ sum += _mm_cvtsi128_si32(acc1);
+
+ for (;i<N;i++) {
+ sum = silk_SMLABB(sum, x[i], y[i]);
+ }
+
+ return sum;
+}
+#endif
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index 58f83246..837e8ae2 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+ Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
/**
@file pitch_sse.h
@brief Pitch analysis
@@ -32,11 +33,55 @@
#ifndef PITCH_SSE_H
#define PITCH_SSE_H
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void xcorr_kernel_sse4_1(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ opus_val32 sum[4],
+ int len );
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ opus_val32 sum[4],
+ int len );
+
+#define xcorr_kernel(x, y, sum, len, arch) \
+ ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+opus_val32 celt_inner_prod_sse4_1(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ int N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+opus_val32 celt_inner_prod_sse2(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ int N);
+#endif
+
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ int N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+ ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#else
+
#include <xmmintrin.h>
#include "arch.h"
#define OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
int j;
__m128 xsum1, xsum2;
@@ -71,6 +116,9 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, o
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
+#define xcorr_kernel(_x, _y, _z, len, arch) \
+ ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len))
+
#define OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
@@ -102,7 +150,7 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#define OVERRIDE_CELT_INNER_PROD
-static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
+static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
int N)
{
int i;
@@ -127,6 +175,9 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va
return xy;
}
+# define celt_inner_prod(_x, _y, len, arch) \
+ ((void)(arch),celt_inner_prod_sse(_x, _y, len))
+
#define OVERRIDE_COMB_FILTER_CONST
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
@@ -180,3 +231,4 @@ static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, i
}
#endif
+#endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
new file mode 100644
index 00000000..83410dbc
--- /dev/null
+++ b/celt/x86/x86_celt_map.c
@@ -0,0 +1,84 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "celt_lpc.h"
+#include "pitch.h"
+#include "pitch_sse.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch
+) = {
+ celt_fir_c, /* non-sse */
+ celt_fir_c,
+ MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
+ NULL
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len
+) = {
+ xcorr_kernel_c, /* non-sse */
+ xcorr_kernel_c,
+ MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
+ NULL
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N
+) = {
+ celt_inner_prod_c, /* non-sse */
+ MAY_HAVE_SSE2(celt_inner_prod),
+ MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
+ NULL
+};
+
+# else
+# error "Floating-point implementation is not supported by x86 RTCD yet." \
+ "Reconfigure with --disable-rtcd or send patches."
+# endif
+
+#endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
new file mode 100644
index 00000000..c82a4b7b
--- /dev/null
+++ b/celt/x86/x86cpu.c
@@ -0,0 +1,111 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "cpu_support.h"
+#include "macros.h"
+#include "main.h"
+#include "pitch.h"
+#include "x86cpu.h"
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+#define cpuid(info,x) __cpuid(info,x)
+#else
+
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+#if defined(CPU_INFO_BY_ASM)
+ __asm__ __volatile__ (
+ "cpuid":
+ "=a" (CPUInfo[0]),
+ "=b" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "a" (InfoType), "c" (0)
+ );
+#elif defined(CPU_INFO_BY_C)
+ __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+#endif
+}
+
+#endif
+
+#include "SigProc_FIX.h"
+#include "celt_lpc.h"
+
+typedef struct CPU_Feature{
+ /* SIMD: 128-bit */
+ int HW_SSE2;
+ int HW_SSE41;
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+ unsigned int info[4] = {0};
+ unsigned int nIds = 0;
+
+ cpuid(info, 0);
+ nIds = info[0];
+
+ if (nIds >= 1){
+ cpuid(info, 1);
+ cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
+ cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
+ }
+}
+
+int opus_select_arch(void)
+{
+ CPU_Feature cpu_feature = {0};
+ int arch;
+
+ opus_cpu_feature_check(&cpu_feature);
+
+ arch = 0;
+ if (!cpu_feature.HW_SSE2)
+ {
+ return arch;
+ }
+ arch++;
+
+ if (!cpu_feature.HW_SSE41)
+ {
+ return arch;
+ }
+ arch++;
+
+ return arch;
+}
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
new file mode 100644
index 00000000..2394b05e
--- /dev/null
+++ b/celt/x86/x86cpu.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+# define MAY_HAVE_SSE2(name) name ## _sse2
+# else
+# define MAY_HAVE_SSE2(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else
+# define MAY_HAVE_SSE4_1(name) name ## _c
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi16_epi32()
+ when optimizations are disabled, even though the actual PMOVSXWD instruction
+ takes an m64. Unlike a normal m64 reference, these require 16-byte alignment
+ and load 16 bytes instead of 8, possibly reading out of bounds.
+
+ We can insert an explicit MOVQ using _mm_loadl_epi64(), which should have the
+ same semantics as an m64 reference in the PMOVSXWD instruction itself, but
+ gcc is not smart enough to optimize this out when optimizations ARE enabled.*/
+# if !defined(__OPTIMIZE__)
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif
+
+#endif