diff options
author | Harish Mahendrakar <harish.mahendrakar@ittiam.com> | 2022-08-08 17:13:54 +0000 |
---|---|---|
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-08-08 17:13:54 +0000 |
commit | 0a0c005652f9b1c0f737643845591a2196b26538 (patch) | |
tree | c78adfd6bffe3bf8b0f2edb165f4687a16f9dad5 /celt | |
parent | 3eb6b26bc34ace509cd8d5842d276f2934cfa07a (diff) | |
parent | 170fd0b529b1871b9dbddae1c98233faad14cba7 (diff) | |
download | libopus-0a0c005652f9b1c0f737643845591a2196b26538.tar.gz |
Merge commit 'c9d5bea13e3cb7381bfa897a45d8bab4e7b767a7' into HEAD am: 170fd0b529
Original change: https://android-review.googlesource.com/c/platform/external/libopus/+/2164282
Change-Id: I8f395bfa68114aed5cc458fb6e424ff545c4b2c4
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
Diffstat (limited to 'celt')
-rw-r--r-- | celt/arm/armcpu.c | 9 | ||||
-rw-r--r-- | celt/arm/pitch_neon_intr.c | 51 | ||||
-rw-r--r-- | celt/bands.c | 9 | ||||
-rw-r--r-- | celt/celt_decoder.c | 49 | ||||
-rw-r--r-- | celt/celt_encoder.c | 14 | ||||
-rw-r--r-- | celt/celt_lpc.c | 62 | ||||
-rw-r--r-- | celt/cpu_support.h | 5 | ||||
-rw-r--r-- | celt/fixed_debug.h | 67 | ||||
-rw-r--r-- | celt/fixed_generic.h | 10 | ||||
-rw-r--r-- | celt/float_cast.h | 4 | ||||
-rw-r--r-- | celt/mathops.h | 2 | ||||
-rw-r--r-- | celt/meson.build | 8 | ||||
-rw-r--r-- | celt/modes.c | 3 | ||||
-rw-r--r-- | celt/pitch.c | 23 | ||||
-rw-r--r-- | celt/rate.c | 2 | ||||
-rw-r--r-- | celt/tests/test_unit_dft.c | 3 | ||||
-rw-r--r-- | celt/tests/test_unit_entropy.c | 4 | ||||
-rw-r--r-- | celt/tests/test_unit_mdct.c | 3 | ||||
-rw-r--r-- | celt/x86/celt_lpc_sse.h | 5 | ||||
-rw-r--r-- | celt/x86/pitch_sse.h | 6 | ||||
-rw-r--r-- | celt/x86/pitch_sse4_1.c | 51 | ||||
-rw-r--r-- | celt/x86/x86cpu.c | 37 | ||||
-rw-r--r-- | celt/x86/x86cpu.h | 46 |
23 files changed, 323 insertions, 150 deletions
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index cce3ae3a..c7d16e6d 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void) "your platform. Reconfigure with --disable-rtcd (or send patches)." #endif -int opus_select_arch(void) +static int opus_select_arch_impl(void) { opus_uint32 flags = opus_cpu_capabilities(); int arch = 0; @@ -184,4 +184,11 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + arch = rand()%(arch+1); +#endif + return arch; +} #endif diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c index 1ac38c43..35cc46e2 100644 --- a/celt/arm/pitch_neon_intr.c +++ b/celt/arm/pitch_neon_intr.c @@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */ /* operations of celt_inner_prod_neon(), and both functions should have bit */ /* exact output. */ -static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N) +static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N) { int i; + *err = 0; opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0; for (i = 0; i < N - 3; i += 4) { xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]); xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]); xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]); xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]); + *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3); } xy0 += xy2; xy1 += xy3; xy = xy0 + xy1; + *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy); for (; i < N; i++) { xy = MAC16_16(xy, x[i], y[i]); + *err += ABS32(xy); } + *err = *err*2e-7 + N*1e-37; return xy; } @@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c /* operations of dual_inner_prod_neon(), and both functions should have bit */ /* exact output. */ static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) + int N, opus_val32 *xy1, opus_val32 *xy2, float *err) { - int i; - opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0; - for (i = 0; i < N - 3; i += 4) { - xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]); - xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]); - xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]); - xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]); - xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]); - xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]); - xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]); - xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]); - } - xy01_0 += xy01_2; - xy02_0 += xy02_2; - xy01_1 += xy01_3; - xy02_1 += xy02_3; - xy01 = xy01_0 + xy01_1; - xy02 = xy02_0 + xy02_1; - for (; i < N; i++) { - xy01 = MAC16_16(xy01, x[i], y01[i]); - xy02 = MAC16_16(xy02, x[i], y02[i]); - } - *xy1 = xy01; - *xy2 = xy02; + *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N); + *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N); } #endif /* OPUS_CHECK_ASM */ @@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N) } #ifdef OPUS_CHECK_ASM - celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL); + { + float err, res; + res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N); + /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/ + celt_assert(ABS32(res - xy) <= err); + } #endif return xy; @@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus #ifdef OPUS_CHECK_ASM { opus_val32 xy1_c, xy2_c; - dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c); - celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL); - celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL); + float err[2]; + dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err); + /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]); + if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/ + celt_assert(ABS32(xy1_c - *xy1) <= err[0]); + celt_assert(ABS32(xy2_c - *xy2) <= err[1]); } #endif } diff --git a/celt/bands.c b/celt/bands.c index 2702963c..5320ffab 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx, sctx->itheta = itheta; sctx->qalloc = qalloc; } -static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b, +static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, celt_norm *lowband_out) { int c; @@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, sign = ec_dec_bits(ec, 1); } ctx->remaining_bits -= 1<<BITRES; - b-=1<<BITRES; } if (ctx->resynth) x[0] = sign ? -NORM_SCALING : NORM_SCALING; @@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X, /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, NULL, b, lowband_out); + return quant_band_n1(ctx, X, NULL, lowband_out); } if (tf_change>0) @@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm /* Special case for one sample */ if (N==1) { - return quant_band_n1(ctx, X, Y, b, lowband_out); + return quant_band_n1(ctx, X, Y, lowband_out); } orig_fill = fill; @@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm return cm; } +#ifndef DISABLE_UPDATE_DRAFT static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo) { int n1, n2; @@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm if (dual_stereo) OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1); } +#endif void quant_all_bands(int encode, const CELTMode *m, int start, int end, celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 74ca3b74..35a2073a 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -90,7 +90,7 @@ struct OpusCustomDecoder { opus_uint32 rng; int error; int last_pitch_index; - int loss_count; + int loss_duration; int skip_plc; int postfilter_period; int postfilter_period_old; @@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) int nbEBands; int overlap; int start; - int loss_count; + int loss_duration; int noise_based; const opus_int16 *eBands; SAVE_STACK; @@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) oldLogE2 = oldLogE + 2*nbEBands; backgroundLogE = oldLogE2 + 2*nbEBands; - loss_count = st->loss_count; + loss_duration = st->loss_duration; start = st->start; - noise_based = loss_count >= 5 || start != 0 || st->skip_plc; + noise_based = loss_duration >= 40 || start != 0 || st->skip_plc; if (noise_based) { /* Noise-based PLC/CNG */ @@ -557,9 +557,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) #else ALLOC(X, C*N, celt_norm); /**< Interleaved normalised MDCTs */ #endif + c=0; do { + OPUS_MOVE(decode_mem[c], decode_mem[c]+N, + DECODE_BUFFER_SIZE-N+(overlap>>1)); + } while (++c<C); /* Energy decay */ - decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); + decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT); c=0; do { for (i=start;i<end;i++) @@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } st->rng = seed; - c=0; do { - OPUS_MOVE(decode_mem[c], decode_mem[c]+N, - DECODE_BUFFER_SIZE-N+(overlap>>1)); - } while (++c<C); - celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch); } else { int exc_length; @@ -602,7 +601,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) VARDECL(opus_val16, _exc); VARDECL(opus_val16, fir_tmp); - if (loss_count == 0) + if (loss_duration == 0) { st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch); } else { @@ -632,7 +631,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) for (i=0;i<MAX_PERIOD+LPC_ORDER;i++) exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT); - if (loss_count == 0) + if (loss_duration == 0) { opus_val32 ac[LPC_ORDER+1]; /* Compute LPC coefficients for the last MAX_PERIOD samples before @@ -812,7 +811,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) } while (++c<C); } - st->loss_count = loss_count+1; + /* Saturate to soemthing large to avoid wrap-around. */ + st->loss_duration = IMIN(10000, loss_duration+(1<<LM)); RESTORE_STACK; } @@ -868,6 +868,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat int nbEBands; int overlap; const opus_int16 *eBands; + opus_val16 max_background_increase; ALLOC_STACK; VALIDATE_CELT_DECODER(st); @@ -942,7 +943,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat /* Check if there are at least two packets received consecutively before * turning on the pitch-based PLC */ - st->skip_plc = st->loss_count != 0; + st->skip_plc = st->loss_duration != 0; if (dec == NULL) { @@ -1140,25 +1141,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat if (C==1) OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands); - /* In case start or end were to change */ if (!isTransient) { - opus_val16 max_background_increase; OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands); OPUS_COPY(oldLogE, oldBandE, 2*nbEBands); - /* In normal circumstances, we only allow the noise floor to increase by - up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB - increase for each update.*/ - if (st->loss_count < 10) - max_background_increase = M*QCONST16(0.001f,DB_SHIFT); - else - max_background_increase = QCONST16(1.f,DB_SHIFT); - for (i=0;i<2*nbEBands;i++) - backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); } else { for (i=0;i<2*nbEBands;i++) oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]); } + /* In normal circumstances, we only allow the noise floor to increase by + up to 2.4 dB/second, but when we're in DTX we give the weight of + all missing packets to the update packet. */ + max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT); + for (i=0;i<2*nbEBands;i++) + backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]); + /* In case start or end were to change */ c=0; do { for (i=0;i<start;i++) @@ -1175,7 +1172,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat st->rng = dec->rng; deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum); - st->loss_count = 0; + st->loss_duration = 0; RESTORE_STACK; if (ec_tell(dec) > 8*len) return OPUS_INTERNAL_ERROR; diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index d6f8afc2..637d442c 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch); compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE2, C); - for (i=0;i<C*nbEBands;i++) - bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); + for (c=0;c<C;c++) + { + for (i=0;i<end;i++) + bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT)); + } } compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch); @@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch); amp2Log2(mode, effEnd, end, bandE, bandLogE, C); /* Compensate for the scaling of short vs long mdcts */ - for (i=0;i<C*nbEBands;i++) - bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT)); + for (c=0;c<C;c++) + { + for (i=0;i<end;i++) + bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT)); + } tf_estimate = QCONST16(.2f,14); } } diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index 8ecb693e..242e6df5 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -50,17 +50,21 @@ int p #endif OPUS_CLEAR(lpc, p); +#ifdef FIXED_POINT if (ac[0] != 0) +#else + if (ac[0] > 1e-10f) +#endif { for (i = 0; i < p; i++) { /* Sum up this iteration's reflection coefficient */ opus_val32 rr = 0; for (j = 0; j < i; j++) rr += MULT32_32_Q31(lpc[j],ac[i - j]); - rr += SHR32(ac[i + 1],3); - r = -frac_div32(SHL32(rr,3), error); + rr += SHR32(ac[i + 1],6); + r = -frac_div32(SHL32(rr,6), error); /* Update LPC coefficients and total error */ - lpc[i] = SHR32(r,3); + lpc[i] = SHR32(r,6); for (j = 0; j < (i+1)>>1; j++) { opus_val32 tmp1, tmp2; @@ -73,17 +77,61 @@ int p error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); /* Bail out once we get 30 dB gain */ #ifdef FIXED_POINT - if (error<SHR32(ac[0],10)) + if (error<=SHR32(ac[0],10)) break; #else - if (error<.001f*ac[0]) + if (error<=.001f*ac[0]) break; #endif } } #ifdef FIXED_POINT - for (i=0;i<p;i++) - _lpc[i] = ROUND16(lpc[i],16); + { + /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds. + This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug + fixes should also be applied there. */ + int iter, idx = 0; + opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16; + + for (iter = 0; iter < 10; iter++) { + maxabs = 0; + for (i = 0; i < p; i++) { + absval = ABS32(lpc[i]); + if (absval > maxabs) { + maxabs = absval; + idx = i; + } + } + maxabs = PSHR32(maxabs, 13); /* Q25->Q12 */ + + if (maxabs > 32767) { + maxabs = MIN32(maxabs, 163838); + chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14), + SHR32(MULT32_32_32(maxabs, idx + 1), 2)); + chirp_minus_one_Q16 = chirp_Q16 - 65536; + + /* Apply bandwidth expansion. */ + for (i = 0; i < p - 1; i++) { + lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]); + chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16); + } + lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]); + } else { + break; + } + } + + if (iter == 10) { + /* If the coeffs still do not fit into the 16 bit range after 10 iterations, + fall back to the A(z)=1 filter. */ + OPUS_CLEAR(lpc, p); + _lpc[0] = 4096; /* Q12 */ + } else { + for (i = 0; i < p; i++) { + _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13)); /* Q25->Q12 */ + } + } + } #endif } diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 68fc6067..7b5c56ca 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -43,10 +43,11 @@ */ #define OPUS_ARCHMASK 3 -#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#elif defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) #include "x86/x86cpu.h" /* We currently support 5 x86 variants: diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h index f4352952..c2cf5a83 100644 --- a/celt/fixed_debug.h +++ b/celt/fixed_debug.h @@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line) #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__) static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) { - int res; + opus_int32 res; if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift)) { fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line); @@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line) celt_assert(0); #endif } - res = a<<shift; + res = (opus_int32)((opus_uint32)a<<shift); if (!VERIFY_SHORT(res)) { fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line); @@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line) opus_int64 res; if (!VERIFY_INT(a) || !VERIFY_SHORT(shift)) { - fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line); + fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif } - res = a<<shift; + res = (opus_int64)((opus_uint64)a<<shift); if (!VERIFY_INT(res)) { - fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line); + fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) { - fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file res = a+b; if (!VERIFY_UINT(res)) { - fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line); + fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file opus_uint64 res; if (!VERIFY_UINT(a) || !VERIFY_UINT(b)) { - fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif } if (a<b) { - fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line); + fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file res = a-b; if (!VERIFY_UINT(res)) { - fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line); + fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line); #ifdef FIXED_DEBUG_ASSERT celt_assert(0); #endif @@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b) return res; } +/* result fits in 32 bits */ +static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = a*b; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=5; + return res; +} + +static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b) +{ + opus_int64 res; + if (!VERIFY_INT(a) || !VERIFY_INT(b)) + { + fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + res = ((opus_int64)(a)*(opus_int64)(b)) >> 16; + if (!VERIFY_INT(res)) + { + fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res); +#ifdef FIXED_DEBUG_ASSERT + celt_assert(0); +#endif + } + celt_mips+=5; + return res; +} + #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__) static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line) { @@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x) #undef PRINT_MIPS -#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0); +#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0); #endif diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index 0ecbb899..8f29d46b 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -57,6 +57,13 @@ #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15)) #endif +/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */ +#if OPUS_FAST_INT64 +#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16)) +#else +#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16))) +#endif + /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ #if OPUS_FAST_INT64 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31)) @@ -131,6 +138,9 @@ /** 16x16 multiplication where the result fits in 16 bits */ #define MULT16_16_16(a,b) ((((opus_val16)(a))*((opus_val16)(b)))) +/** 32x32 multiplication where the result fits in 32 bits */ +#define MULT32_32_32(a,b) ((((opus_val32)(a))*((opus_val32)(b)))) + /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */ /** 16x16 multiplication where the result fits in 32 bits */ #define MULT16_16(a,b) (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b))) diff --git a/celt/float_cast.h b/celt/float_cast.h index 9d34976e..8915a5fd 100644 --- a/celt/float_cast.h +++ b/celt/float_cast.h @@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s return intgr ; } -#elif defined(HAVE_LRINTF) +#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* These defines enable functionality introduced with the 1999 ISO C ** standard. They must be defined before the inclusion of math.h to @@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s #include <math.h> #define float2int(x) lrintf(x) -#elif (defined(HAVE_LRINT)) +#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define _ISOC9X_SOURCE 1 #define _ISOC99_SOURCE 1 diff --git a/celt/mathops.h b/celt/mathops.h index fe29dac1..478ac918 100644 --- a/celt/mathops.h +++ b/celt/mathops.h @@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x) float f; opus_uint32 i; } res; - integer = floor(x); + integer = (int)floor(x); if (integer < -50) return 0; frac = x-integer; diff --git a/celt/meson.build b/celt/meson.build index 370ea1fe..ad95d949 100644 --- a/celt/meson.build +++ b/celt/meson.build @@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR'] celt_static_libs = [] +if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD') + celt_sources += sources['CELT_SOURCES_X86_RTCD'] +endif + foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr'] have_intr = get_variable('have_' + intr_name) if not have_intr @@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio endif if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm - celt_sources += sources['CELT_SOURCES_ARM'] + if opus_conf.has('OPUS_HAVE_RTCD') + celt_sources += sources['CELT_SOURCES_ARM_RTCD'] + endif if have_arm_ne10 celt_sources += sources['CELT_SOURCES_ARM_NE10'] endif diff --git a/celt/modes.c b/celt/modes.c index 390c5e8a..23f7cde6 100644 --- a/celt/modes.c +++ b/celt/modes.c @@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode) mode->nbAllocVectors = BITALLOC_SIZE; allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands)); if (allocVectors==NULL) + { + mode->allocVectors = NULL; return; + } /* Check for standard mode */ if (mode->Fs == 400*(opus_int32)mode->shortMdctSize) diff --git a/celt/pitch.c b/celt/pitch.c index 872582a4..7998db41 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x shift=0; if (C==2) shift++; -#endif for (i=1;i<len>>1;i++) - x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift); - x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift); + x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1); + x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1); if (C==2) { for (i=1;i<len>>1;i++) - x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift); - x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift); + x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1); + x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1); } - +#else + for (i=1;i<len>>1;i++) + x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i]; + x_lp[0] = .25f*x[0][1] + .5f*x[0][0]; + if (C==2) + { + for (i=1;i<len>>1;i++) + x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i]; + x_lp[0] += .25f*x[1][1] + .5f*x[1][0]; + } +#endif _celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1, arch); @@ -249,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 maxcorr=1; #endif celt_assert(max_pitch>0); - celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); + celt_sig_assert(((size_t)_x&3)==0); for (i=0;i<max_pitch-3;i+=4) { opus_val32 sum[4]={0,0,0,0}; diff --git a/celt/rate.c b/celt/rate.c index 465e1ba2..7f7ad3fa 100644 --- a/celt/rate.c +++ b/celt/rate.c @@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, else depth_threshold = 0; #ifdef FUZZING + (void)signalBandwidth; + (void)depth_threshold; if ((rand()&0x1) == 0) #else if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth)) diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c index 70f8f493..ae9a7b56 100644 --- a/celt/tests/test_unit_dft.c +++ b/celt/tests/test_unit_dft.c @@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c index 7f674529..b1619b74 100644 --- a/celt/tests/test_unit_entropy.c +++ b/celt/tests/test_unit_entropy.c @@ -104,7 +104,7 @@ int main(int _argc,char **_argv){ nbits=ec_tell_frac(&enc); ec_enc_done(&enc); fprintf(stderr, - "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n", + "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n", entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits); fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc)); ec_dec_init(&dec,ptr,DATA_SIZE); @@ -129,7 +129,7 @@ int main(int _argc,char **_argv){ nbits2=ec_tell_frac(&dec); if(nbits!=nbits2){ fprintf(stderr, - "Reported number of bits used was %0.2lf, should be %0.2lf.\n", + "Reported number of bits used was %0.2f, should be %0.2f.\n", ldexp(nbits2,-3),ldexp(nbits,-3)); ret=-1; } diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c index 4a563ccf..844c5b48 100644 --- a/celt/tests/test_unit_mdct.c +++ b/celt/tests/test_unit_mdct.c @@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch) int main(int argc,char ** argv) { + int arch; ALLOC_STACK; - int arch = opus_select_arch(); + arch = opus_select_arch(); if (argc>1) { int k; diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index 7d1ecf75..90e69ecf 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -33,7 +33,6 @@ #endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) -#define OVERRIDE_CELT_FIR void celt_fir_sse4_1( const opus_val16 *x, @@ -44,10 +43,11 @@ void celt_fir_sse4_1( int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) +#define OVERRIDE_CELT_FIR #define celt_fir(x, num, y, N, ord, arch) \ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) -#else +#elif defined(OPUS_HAVE_RTCD) extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( int ord, int arch); +#define OVERRIDE_CELT_FIR # define celt_fir(x, num, y, N, ord, arch) \ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index f7a014b6..964aef50 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -63,7 +63,7 @@ void xcorr_kernel_sse( #define xcorr_kernel(x, y, sum, len, arch) \ ((void)arch, xcorr_kernel_sse(x, y, sum, len)) -#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, @@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse( ((void)arch, celt_inner_prod_sse(x, y, N)) -#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ - (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) +#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))) extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c index a092c68b..2bc57830 100644 --- a/celt/x86/pitch_sse4_1.c +++ b/celt/x86/pitch_sse4_1.c @@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 __m128i sum0, sum1, sum2, sum3, vecSum; __m128i initSum; +#ifdef OPUS_CHECK_ASM + opus_val32 sum_c[4]; + for (j=0;j<4;j++) { + sum_c[j] = sum[j]; + } + xcorr_kernel_c(x, y, sum_c, len); +#endif + celt_assert(len >= 3); sum0 = _mm_setzero_si128(); @@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 vecSum = _mm_add_epi32(vecSum, sum2); } - for (;j<len;j++) + vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]); + if (len - j == 3) { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX0 = _mm_shuffle_epi32(vecX, 0x55); + vecX1 = _mm_shuffle_epi32(vecX, 0xaa); + vecX2 = _mm_shuffle_epi32(vecX, 0xff); vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + vecSum = _mm_add_epi32(vecSum, sum2); + } + else if (len - j == 2) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xaa); + vecX1 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum1); + } + else if (len - j == 1) + { + vecX0 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); } initSum = _mm_loadu_si128((__m128i *)(&sum[0])); initSum = _mm_add_epi32(initSum, vecSum); _mm_storeu_si128((__m128i *)sum, initSum); + +#ifdef OPUS_CHECK_ASM + celt_assert(!memcmp(sum_c, sum, sizeof(sum_c))); +#endif } #endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c index 080eb25e..6a1914de 100644 --- a/celt/x86/x86cpu.c +++ b/celt/x86/x86cpu.c @@ -35,11 +35,11 @@ #include "pitch.h" #include "x86cpu.h" -#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ +#if defined(OPUS_HAVE_RTCD) && \ + ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ - (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) - + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))) #if defined(_MSC_VER) @@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #else __asm__ __volatile__ ( @@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3]) : - "0" (InfoType) + /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */ + "0" (InfoType), "2" (0) ); #endif #elif defined(CPU_INFO_BY_C) - __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); + /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive + prior to v3.17.0.*/ + if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) { + /* Our function cannot fail, but __get_cpuid{_count} can. + Returning all zeroes will effectively disable all SIMD, which is + what we want on CPUs that don't support CPUID. */ + CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0; + } +#else +# error "Configured to use x86 RTCD, but no CPU detection method available. " \ + "Reconfigure with --disable-rtcd (or send patches)." #endif } @@ -98,7 +110,7 @@ typedef struct CPU_Feature{ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) { - unsigned int info[4] = {0}; + unsigned int info[4]; unsigned int nIds = 0; cpuid(info, 0); @@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) } } -int opus_select_arch(void) +static int opus_select_arch_impl(void) { CPU_Feature cpu_feature; int arch; @@ -154,4 +166,13 @@ int opus_select_arch(void) return arch; } +int opus_select_arch(void) { + int arch = opus_select_arch_impl(); +#ifdef FUZZING + /* Randomly downgrade the architecture. */ + arch = rand()%(arch+1); +#endif + return arch; +} + #endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index 1e2bf17b..04e80489 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -56,40 +56,18 @@ int opus_select_arch(void); # endif -/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32() - or _mm_cvtepi16_epi32() when optimizations are disabled, even though the - actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory - reference, these require 16-byte alignment and load a full 16 bytes (instead - of 4 or 8), possibly reading out of bounds. - - We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or - _mm_loadl_epi64(), which should have the same semantics as an m32 or m64 - reference in the PMOVSXWD instruction itself, but gcc is not smart enough to - optimize this out when optimizations ARE enabled. - - Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 - (which is fair, since technically the compiler is always allowed to do the - dereference before invoking the function implementing the intrinsic). - However, it is smart enough to eliminate the extra MOVD instruction. - For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out - the extra MOVQ if it's specified explicitly */ - -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) -# else -# define OP_CVTEPI8_EPI32_M32(x) \ - (_mm_cvtepi8_epi32(*(__m128i *)(x))) -#endif - -/* similar reasoning about the instruction sequence as in the 32-bit macro above, - */ -# if defined(__clang__) || !defined(__OPTIMIZE__) -# define OP_CVTEPI16_EPI32_M64(x) \ +/*MOVD should not impose any alignment restrictions, but the C standard does, + and UBSan will report errors if we actually make unaligned accesses. + Use this to work around those restrictions (which should hopefully all get + optimized to a single MOVD instruction).*/ +#define OP_LOADU_EPI32(x) \ + (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\ + *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U)) + +#define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x)))) + +#define OP_CVTEPI16_EPI32_M64(x) \ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) -# else -# define OP_CVTEPI16_EPI32_M64(x) \ - (_mm_cvtepi16_epi32(*(__m128i *)(x))) -# endif #endif |