aboutsummaryrefslogtreecommitdiff
path: root/celt
diff options
context:
space:
mode:
Diffstat (limited to 'celt')
-rw-r--r--celt/arm/armcpu.c9
-rw-r--r--celt/arm/pitch_neon_intr.c51
-rw-r--r--celt/bands.c9
-rw-r--r--celt/celt_decoder.c49
-rw-r--r--celt/celt_encoder.c14
-rw-r--r--celt/celt_lpc.c62
-rw-r--r--celt/cpu_support.h5
-rw-r--r--celt/fixed_debug.h67
-rw-r--r--celt/fixed_generic.h10
-rw-r--r--celt/float_cast.h4
-rw-r--r--celt/mathops.h2
-rw-r--r--celt/meson.build8
-rw-r--r--celt/modes.c3
-rw-r--r--celt/pitch.c23
-rw-r--r--celt/rate.c2
-rw-r--r--celt/tests/test_unit_dft.c3
-rw-r--r--celt/tests/test_unit_entropy.c4
-rw-r--r--celt/tests/test_unit_mdct.c3
-rw-r--r--celt/x86/celt_lpc_sse.h5
-rw-r--r--celt/x86/pitch_sse.h6
-rw-r--r--celt/x86/pitch_sse4_1.c51
-rw-r--r--celt/x86/x86cpu.c37
-rw-r--r--celt/x86/x86cpu.h46
23 files changed, 323 insertions, 150 deletions
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index cce3ae3a..c7d16e6d 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void)
"your platform. Reconfigure with --disable-rtcd (or send patches)."
#endif
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
{
opus_uint32 flags = opus_cpu_capabilities();
int arch = 0;
@@ -184,4 +184,11 @@ int opus_select_arch(void)
return arch;
}
+int opus_select_arch(void) {
+ int arch = opus_select_arch_impl();
+#ifdef FUZZING
+ arch = rand()%(arch+1);
+#endif
+ return arch;
+}
#endif
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 1ac38c43..35cc46e2 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output. */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
{
int i;
+ *err = 0;
opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
for (i = 0; i < N - 3; i += 4) {
xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+ *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
}
xy0 += xy2;
xy1 += xy3;
xy = xy0 + xy1;
+ *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
+ *err += ABS32(xy);
}
+ *err = *err*2e-7 + N*1e-37;
return xy;
}
@@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
- int N, opus_val32 *xy1, opus_val32 *xy2)
+ int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
{
- int i;
- opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
- for (i = 0; i < N - 3; i += 4) {
- xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
- xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
- xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
- xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
- xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
- xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
- xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
- xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
- }
- xy01_0 += xy01_2;
- xy02_0 += xy02_2;
- xy01_1 += xy01_3;
- xy02_1 += xy02_3;
- xy01 = xy01_0 + xy01_1;
- xy02 = xy02_0 + xy02_1;
- for (; i < N; i++) {
- xy01 = MAC16_16(xy01, x[i], y01[i]);
- xy02 = MAC16_16(xy02, x[i], y02[i]);
- }
- *xy1 = xy01;
- *xy2 = xy02;
+ *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+ *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
}
#endif /* OPUS_CHECK_ASM */
@@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
}
#ifdef OPUS_CHECK_ASM
- celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+ {
+ float err, res;
+ res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+ /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+ celt_assert(ABS32(res - xy) <= err);
+ }
#endif
return xy;
@@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
#ifdef OPUS_CHECK_ASM
{
opus_val32 xy1_c, xy2_c;
- dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
- celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
- celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+ float err[2];
+ dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+ /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+ if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+ celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+ celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
}
#endif
}
diff --git a/celt/bands.c b/celt/bands.c
index 2702963c..5320ffab 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
sctx->itheta = itheta;
sctx->qalloc = qalloc;
}
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
celt_norm *lowband_out)
{
int c;
@@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
sign = ec_dec_bits(ec, 1);
}
ctx->remaining_bits -= 1<<BITRES;
- b-=1<<BITRES;
}
if (ctx->resynth)
x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
/* Special case for one sample */
if (N==1)
{
- return quant_band_n1(ctx, X, NULL, b, lowband_out);
+ return quant_band_n1(ctx, X, NULL, lowband_out);
}
if (tf_change>0)
@@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
/* Special case for one sample */
if (N==1)
{
- return quant_band_n1(ctx, X, Y, b, lowband_out);
+ return quant_band_n1(ctx, X, Y, lowband_out);
}
orig_fill = fill;
@@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
return cm;
}
+#ifndef DISABLE_UPDATE_DRAFT
static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
{
int n1, n2;
@@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm
if (dual_stereo)
OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
}
+#endif
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 74ca3b74..35a2073a 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -90,7 +90,7 @@ struct OpusCustomDecoder {
opus_uint32 rng;
int error;
int last_pitch_index;
- int loss_count;
+ int loss_duration;
int skip_plc;
int postfilter_period;
int postfilter_period_old;
@@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
int nbEBands;
int overlap;
int start;
- int loss_count;
+ int loss_duration;
int noise_based;
const opus_int16 *eBands;
SAVE_STACK;
@@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
oldLogE2 = oldLogE + 2*nbEBands;
backgroundLogE = oldLogE2 + 2*nbEBands;
- loss_count = st->loss_count;
+ loss_duration = st->loss_duration;
start = st->start;
- noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+ noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
if (noise_based)
{
/* Noise-based PLC/CNG */
@@ -557,9 +557,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
#else
ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */
#endif
+ c=0; do {
+ OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+ DECODE_BUFFER_SIZE-N+(overlap>>1));
+ } while (++c<C);
/* Energy decay */
- decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+ decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
c=0; do
{
for (i=start;i<end;i++)
@@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
st->rng = seed;
- c=0; do {
- OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
- DECODE_BUFFER_SIZE-N+(overlap>>1));
- } while (++c<C);
-
celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
} else {
int exc_length;
@@ -602,7 +601,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
VARDECL(opus_val16, _exc);
VARDECL(opus_val16, fir_tmp);
- if (loss_count == 0)
+ if (loss_duration == 0)
{
st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
} else {
@@ -632,7 +631,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
- if (loss_count == 0)
+ if (loss_duration == 0)
{
opus_val32 ac[LPC_ORDER+1];
/* Compute LPC coefficients for the last MAX_PERIOD samples before
@@ -812,7 +811,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
} while (++c<C);
}
- st->loss_count = loss_count+1;
+ /* Saturate to soemthing large to avoid wrap-around. */
+ st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
RESTORE_STACK;
}
@@ -868,6 +868,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
int nbEBands;
int overlap;
const opus_int16 *eBands;
+ opus_val16 max_background_increase;
ALLOC_STACK;
VALIDATE_CELT_DECODER(st);
@@ -942,7 +943,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
/* Check if there are at least two packets received consecutively before
* turning on the pitch-based PLC */
- st->skip_plc = st->loss_count != 0;
+ st->skip_plc = st->loss_duration != 0;
if (dec == NULL)
{
@@ -1140,25 +1141,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (C==1)
OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
- /* In case start or end were to change */
if (!isTransient)
{
- opus_val16 max_background_increase;
OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
- /* In normal circumstances, we only allow the noise floor to increase by
- up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
- increase for each update.*/
- if (st->loss_count < 10)
- max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
- else
- max_background_increase = QCONST16(1.f,DB_SHIFT);
- for (i=0;i<2*nbEBands;i++)
- backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
} else {
for (i=0;i<2*nbEBands;i++)
oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
}
+ /* In normal circumstances, we only allow the noise floor to increase by
+ up to 2.4 dB/second, but when we're in DTX we give the weight of
+ all missing packets to the update packet. */
+ max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+ for (i=0;i<2*nbEBands;i++)
+ backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+ /* In case start or end were to change */
c=0; do
{
for (i=0;i<start;i++)
@@ -1175,7 +1172,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
st->rng = dec->rng;
deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
- st->loss_count = 0;
+ st->loss_duration = 0;
RESTORE_STACK;
if (ec_tell(dec) > 8*len)
return OPUS_INTERNAL_ERROR;
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index d6f8afc2..637d442c 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
- for (i=0;i<C*nbEBands;i++)
- bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+ for (c=0;c<C;c++)
+ {
+ for (i=0;i<end;i++)
+ bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+ }
}
compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
/* Compensate for the scaling of short vs long mdcts */
- for (i=0;i<C*nbEBands;i++)
- bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+ for (c=0;c<C;c++)
+ {
+ for (i=0;i<end;i++)
+ bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+ }
tf_estimate = QCONST16(.2f,14);
}
}
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 8ecb693e..242e6df5 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -50,17 +50,21 @@ int p
#endif
OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
if (ac[0] != 0)
+#else
+ if (ac[0] > 1e-10f)
+#endif
{
for (i = 0; i < p; i++) {
/* Sum up this iteration's reflection coefficient */
opus_val32 rr = 0;
for (j = 0; j < i; j++)
rr += MULT32_32_Q31(lpc[j],ac[i - j]);
- rr += SHR32(ac[i + 1],3);
- r = -frac_div32(SHL32(rr,3), error);
+ rr += SHR32(ac[i + 1],6);
+ r = -frac_div32(SHL32(rr,6), error);
/* Update LPC coefficients and total error */
- lpc[i] = SHR32(r,3);
+ lpc[i] = SHR32(r,6);
for (j = 0; j < (i+1)>>1; j++)
{
opus_val32 tmp1, tmp2;
@@ -73,17 +77,61 @@ int p
error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
/* Bail out once we get 30 dB gain */
#ifdef FIXED_POINT
- if (error<SHR32(ac[0],10))
+ if (error<=SHR32(ac[0],10))
break;
#else
- if (error<.001f*ac[0])
+ if (error<=.001f*ac[0])
break;
#endif
}
}
#ifdef FIXED_POINT
- for (i=0;i<p;i++)
- _lpc[i] = ROUND16(lpc[i],16);
+ {
+ /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+ This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+ fixes should also be applied there. */
+ int iter, idx = 0;
+ opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+ for (iter = 0; iter < 10; iter++) {
+ maxabs = 0;
+ for (i = 0; i < p; i++) {
+ absval = ABS32(lpc[i]);
+ if (absval > maxabs) {
+ maxabs = absval;
+ idx = i;
+ }
+ }
+ maxabs = PSHR32(maxabs, 13); /* Q25->Q12 */
+
+ if (maxabs > 32767) {
+ maxabs = MIN32(maxabs, 163838);
+ chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+ SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+ chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+ /* Apply bandwidth expansion. */
+ for (i = 0; i < p - 1; i++) {
+ lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+ chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+ }
+ lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+ } else {
+ break;
+ }
+ }
+
+ if (iter == 10) {
+ /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+ fall back to the A(z)=1 filter. */
+ OPUS_CLEAR(lpc, p);
+ _lpc[0] = 4096; /* Q12 */
+ } else {
+ for (i = 0; i < p; i++) {
+ _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13)); /* Q25->Q12 */
+ }
+ }
+ }
#endif
}
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 68fc6067..7b5c56ca 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,10 +43,11 @@
*/
#define OPUS_ARCHMASK 3
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
#include "x86/x86cpu.h"
/* We currently support 5 x86 variants:
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f4352952..c2cf5a83 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
#define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
{
- int res;
+ opus_int32 res;
if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
{
fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
celt_assert(0);
#endif
}
- res = a<<shift;
+ res = (opus_int32)((opus_uint32)a<<shift);
if (!VERIFY_SHORT(res))
{
fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
opus_int64 res;
if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
{
- fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+ fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
}
- res = a<<shift;
+ res = (opus_int64)((opus_uint64)a<<shift);
if (!VERIFY_INT(res))
{
- fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+ fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
opus_uint64 res;
if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
{
- fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
res = a+b;
if (!VERIFY_UINT(res))
{
- fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+ fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
opus_uint64 res;
if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
{
- fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
}
if (a<b)
{
- fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+ fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
res = a-b;
if (!VERIFY_UINT(res))
{
- fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+ fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
#ifdef FIXED_DEBUG_ASSERT
celt_assert(0);
#endif
@@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b)
return res;
}
+/* result fits in 32 bits */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+ opus_int64 res;
+ if (!VERIFY_INT(a) || !VERIFY_INT(b))
+ {
+ fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ res = a*b;
+ if (!VERIFY_INT(res))
+ {
+ fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ celt_mips+=5;
+ return res;
+}
+
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
+{
+ opus_int64 res;
+ if (!VERIFY_INT(a) || !VERIFY_INT(b))
+ {
+ fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ res = ((opus_int64)(a)*(opus_int64)(b)) >> 16;
+ if (!VERIFY_INT(res))
+ {
+ fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+ celt_assert(0);
+#endif
+ }
+ celt_mips+=5;
+ return res;
+}
+
#define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
{
@@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
#undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
#endif
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 0ecbb899..8f29d46b 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -57,6 +57,13 @@
#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
#endif
+/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
/** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
#if OPUS_FAST_INT64
#define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -131,6 +138,9 @@
/** 16x16 multiplication where the result fits in 16 bits */
#define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b))))
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b) ((((opus_val32)(a))*((opus_val32)(b))))
+
/* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
/** 16x16 multiplication where the result fits in 32 bits */
#define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 9d34976e..8915a5fd 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
return intgr ;
}
-#elif defined(HAVE_LRINTF)
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* These defines enable functionality introduced with the 1999 ISO C
** standard. They must be defined before the inclusion of math.h to
@@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
#include <math.h>
#define float2int(x) lrintf(x)
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define _ISOC9X_SOURCE 1
#define _ISOC99_SOURCE 1
diff --git a/celt/mathops.h b/celt/mathops.h
index fe29dac1..478ac918 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x)
float f;
opus_uint32 i;
} res;
- integer = floor(x);
+ integer = (int)floor(x);
if (integer < -50)
return 0;
frac = x-integer;
diff --git a/celt/meson.build b/celt/meson.build
index 370ea1fe..ad95d949 100644
--- a/celt/meson.build
+++ b/celt/meson.build
@@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR']
celt_static_libs = []
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+ celt_sources += sources['CELT_SOURCES_X86_RTCD']
+endif
+
foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr']
have_intr = get_variable('have_' + intr_name)
if not have_intr
@@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio
endif
if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
- celt_sources += sources['CELT_SOURCES_ARM']
+ if opus_conf.has('OPUS_HAVE_RTCD')
+ celt_sources += sources['CELT_SOURCES_ARM_RTCD']
+ endif
if have_arm_ne10
celt_sources += sources['CELT_SOURCES_ARM_NE10']
endif
diff --git a/celt/modes.c b/celt/modes.c
index 390c5e8a..23f7cde6 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode)
mode->nbAllocVectors = BITALLOC_SIZE;
allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
if (allocVectors==NULL)
+ {
+ mode->allocVectors = NULL;
return;
+ }
/* Check for standard mode */
if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
diff --git a/celt/pitch.c b/celt/pitch.c
index 872582a4..7998db41 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
shift=0;
if (C==2)
shift++;
-#endif
for (i=1;i<len>>1;i++)
- x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
- x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+ x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+ x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
if (C==2)
{
for (i=1;i<len>>1;i++)
- x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
- x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+ x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+ x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
}
-
+#else
+ for (i=1;i<len>>1;i++)
+ x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+ x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+ if (C==2)
+ {
+ for (i=1;i<len>>1;i++)
+ x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+ x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+ }
+#endif
_celt_autocorr(x_lp, ac, NULL, 0,
4, len>>1, arch);
@@ -249,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 maxcorr=1;
#endif
celt_assert(max_pitch>0);
- celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+ celt_sig_assert(((size_t)_x&3)==0);
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
diff --git a/celt/rate.c b/celt/rate.c
index 465e1ba2..7f7ad3fa 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
else
depth_threshold = 0;
#ifdef FUZZING
+ (void)signalBandwidth;
+ (void)depth_threshold;
if ((rand()&0x1) == 0)
#else
if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 70f8f493..ae9a7b56 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch)
int main(int argc,char ** argv)
{
+ int arch;
ALLOC_STACK;
- int arch = opus_select_arch();
+ arch = opus_select_arch();
if (argc>1) {
int k;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index 7f674529..b1619b74 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@ int main(int _argc,char **_argv){
nbits=ec_tell_frac(&enc);
ec_enc_done(&enc);
fprintf(stderr,
- "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+ "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@ int main(int _argc,char **_argv){
nbits2=ec_tell_frac(&dec);
if(nbits!=nbits2){
fprintf(stderr,
- "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+ "Reported number of bits used was %0.2f, should be %0.2f.\n",
ldexp(nbits2,-3),ldexp(nbits,-3));
ret=-1;
}
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 4a563ccf..844c5b48 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch)
int main(int argc,char ** argv)
{
+ int arch;
ALLOC_STACK;
- int arch = opus_select_arch();
+ arch = opus_select_arch();
if (argc>1) {
int k;
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
#endif
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
void celt_fir_sse4_1(
const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
int arch);
#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
#define celt_fir(x, num, y, N, ord, arch) \
((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
-#else
+#elif defined(OPUS_HAVE_RTCD)
extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
int ord,
int arch);
+#define OVERRIDE_CELT_FIR
# define celt_fir(x, num, y, N, ord, arch) \
((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b6..964aef50 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
#define xcorr_kernel(x, y, sum, len, arch) \
((void)arch, xcorr_kernel_sse(x, y, sum, len))
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
((void)arch, celt_inner_prod_sse(x, y, N))
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
- (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..2bc57830 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
__m128i sum0, sum1, sum2, sum3, vecSum;
__m128i initSum;
+#ifdef OPUS_CHECK_ASM
+ opus_val32 sum_c[4];
+ for (j=0;j<4;j++) {
+ sum_c[j] = sum[j];
+ }
+ xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
celt_assert(len >= 3);
sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
vecSum = _mm_add_epi32(vecSum, sum2);
}
- for (;j<len;j++)
+ vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+ if (len - j == 3)
{
- vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
- vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xff);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+ else if (len - j == 2)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ }
+ else if (len - j == 1)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
vecSum = _mm_add_epi32(vecSum, sum0);
}
initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
initSum = _mm_add_epi32(initSum, vecSum);
_mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+ celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
}
#endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25e..6a1914de 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
#include "pitch.h"
#include "x86cpu.h"
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+ ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
(defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
(defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
- (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
#if defined(_MSC_VER)
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=r" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#else
__asm__ __volatile__ (
@@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
- "0" (InfoType)
+ /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+ "0" (InfoType), "2" (0)
);
#endif
#elif defined(CPU_INFO_BY_C)
- __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+ /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+ prior to v3.17.0.*/
+ if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+ /* Our function cannot fail, but __get_cpuid{_count} can.
+ Returning all zeroes will effectively disable all SIMD, which is
+ what we want on CPUs that don't support CPUID. */
+ CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+ }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
#endif
}
@@ -98,7 +110,7 @@ typedef struct CPU_Feature{
static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
{
- unsigned int info[4] = {0};
+ unsigned int info[4];
unsigned int nIds = 0;
cpuid(info, 0);
@@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
}
}
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
{
CPU_Feature cpu_feature;
int arch;
@@ -154,4 +166,13 @@ int opus_select_arch(void)
return arch;
}
+int opus_select_arch(void) {
+ int arch = opus_select_arch_impl();
+#ifdef FUZZING
+ /* Randomly downgrade the architecture. */
+ arch = rand()%(arch+1);
+#endif
+ return arch;
+}
+
#endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17b..04e80489 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,18 @@
int opus_select_arch(void);
# endif
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
- or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
- actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
- reference, these require 16-byte alignment and load a full 16 bytes (instead
- of 4 or 8), possibly reading out of bounds.
-
- We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
- _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
- reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
- optimize this out when optimizations ARE enabled.
-
- Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
- (which is fair, since technically the compiler is always allowed to do the
- dereference before invoking the function implementing the intrinsic).
- However, it is smart enough to eliminate the extra MOVD instruction.
- For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
- the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-# define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
-
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-# define OP_CVTEPI16_EPI32_M64(x) \
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+ and UBSan will report errors if we actually make unaligned accesses.
+ Use this to work around those restrictions (which should hopefully all get
+ optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+ (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+ *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+
+#define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+
+#define OP_CVTEPI16_EPI32_M64(x) \
(_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-# define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
#endif