23 files changed, 323 insertions, 150 deletions
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index cce3ae3a..c7d16e6d 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -156,7 +156,7 @@ opus_uint32 opus_cpu_capabilities(void)
    "your platform.  Reconfigure with --disable-rtcd (or send patches)."
 #endif
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
   opus_uint32 flags = opus_cpu_capabilities();
   int arch = 0;
@@ -184,4 +184,11 @@ int opus_select_arch(void)
   return arch;
 }
 
+int opus_select_arch(void) {
+  int arch = opus_select_arch_impl();
+#ifdef FUZZING
+  arch = rand()%(arch+1);
+#endif
+  return arch;
+}
 #endif
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 1ac38c43..35cc46e2 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -137,22 +137,27 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
 /* operations of celt_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
 {
    int i;
+   *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
+   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    for (; i < N; i++) {
       xy = MAC16_16(xy, x[i], y[i]);
+      *err += ABS32(xy);
    }
+   *err = *err*2e-7 + N*1e-37;
    return xy;
 }
 
@@ -160,32 +165,10 @@ static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, c
 /* operations of dual_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
+      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
 {
-   int i;
-   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
-   for (i = 0; i < N - 3; i += 4) {
-      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
-      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
-      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
-      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
-      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
-      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
-      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
-      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
-   }
-   xy01_0 += xy01_2;
-   xy02_0 += xy02_2;
-   xy01_1 += xy01_3;
-   xy02_1 += xy02_3;
-   xy01 = xy01_0 + xy01_1;
-   xy02 = xy02_0 + xy02_1;
-   for (; i < N; i++) {
-      xy01 = MAC16_16(xy01, x[i], y01[i]);
-      xy02 = MAC16_16(xy02, x[i], y02[i]);
-   }
-   *xy1 = xy01;
-   *xy2 = xy02;
+   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
 }
 
 #endif /* OPUS_CHECK_ASM */
@@ -225,7 +208,12 @@ opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
     }
 
 #ifdef OPUS_CHECK_ASM
-    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+    {
+        float err, res;
+        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+        celt_assert(ABS32(res - xy) <= err);
+    }
 #endif
 
     return xy;
@@ -280,9 +268,12 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
 #ifdef OPUS_CHECK_ASM
     {
         opus_val32 xy1_c, xy2_c;
-        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
-        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
-        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+        float err[2];
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
     }
 #endif
 }
diff --git a/celt/bands.c b/celt/bands.c
index 2702963c..5320ffab 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -901,7 +901,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
    sctx->itheta = itheta;
    sctx->qalloc = qalloc;
 }
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
       celt_norm *lowband_out)
 {
    int c;
@@ -926,7 +926,6 @@ static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
             sign = ec_dec_bits(ec, 1);
          }
          ctx->remaining_bits -= 1<<BITRES;
-         b-=1<<BITRES;
       }
       if (ctx->resynth)
          x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@ static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+      return quant_band_n1(ctx, X, NULL, lowband_out);
    }
 
    if (tf_change>0)
@@ -1256,7 +1255,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, Y, b, lowband_out);
+      return quant_band_n1(ctx, X, Y, lowband_out);
    }
 
    orig_fill = fill;
@@ -1381,6 +1380,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
    return cm;
 }
 
+#ifndef DISABLE_UPDATE_DRAFT
 static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
 {
    int n1, n2;
@@ -1393,6 +1393,7 @@ static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm
    if (dual_stereo)
       OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
 }
+#endif
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 74ca3b74..35a2073a 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -90,7 +90,7 @@ struct OpusCustomDecoder {
    opus_uint32 rng;
    int error;
    int last_pitch_index;
-   int loss_count;
+   int loss_duration;
    int skip_plc;
    int postfilter_period;
    int postfilter_period_old;
@@ -512,7 +512,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    int nbEBands;
    int overlap;
    int start;
-   int loss_count;
+   int loss_duration;
    int noise_based;
    const opus_int16 *eBands;
    SAVE_STACK;
@@ -532,9 +532,9 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
 
-   loss_count = st->loss_count;
+   loss_duration = st->loss_duration;
    start = st->start;
-   noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+   noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
@@ -557,9 +557,13 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
 #else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
 #endif
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+(overlap>>1));
+      } while (++c<C);
 
       /* Energy decay */
-      decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+      decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
       c=0; do
       {
          for (i=start;i<end;i++)
@@ -585,11 +589,6 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       }
       st->rng = seed;
 
-      c=0; do {
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
-               DECODE_BUFFER_SIZE-N+(overlap>>1));
-      } while (++c<C);
-
       celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
    } else {
       int exc_length;
@@ -602,7 +601,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       VARDECL(opus_val16, _exc);
       VARDECL(opus_val16, fir_tmp);
 
-      if (loss_count == 0)
+      if (loss_duration == 0)
       {
          st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
@@ -632,7 +631,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
             exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
 
-         if (loss_count == 0)
+         if (loss_duration == 0)
          {
             opus_val32 ac[LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
@@ -812,7 +811,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       } while (++c<C);
    }
 
-   st->loss_count = loss_count+1;
+   /* Saturate to soemthing large to avoid wrap-around. */
+   st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
 
    RESTORE_STACK;
 }
@@ -868,6 +868,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    int nbEBands;
    int overlap;
    const opus_int16 *eBands;
+   opus_val16 max_background_increase;
    ALLOC_STACK;
 
    VALIDATE_CELT_DECODER(st);
@@ -942,7 +943,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
 
    /* Check if there are at least two packets received consecutively before
     * turning on the pitch-based PLC */
-   st->skip_plc = st->loss_count != 0;
+   st->skip_plc = st->loss_duration != 0;
 
    if (dec == NULL)
    {
@@ -1140,25 +1141,21 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    if (C==1)
       OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
 
-   /* In case start or end were to change */
    if (!isTransient)
    {
-      opus_val16 max_background_increase;
       OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
       OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
-      /* In normal circumstances, we only allow the noise floor to increase by
-         up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
-         increase for each update.*/
-      if (st->loss_count < 10)
-         max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
-      else
-         max_background_increase = QCONST16(1.f,DB_SHIFT);
-      for (i=0;i<2*nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
    } else {
       for (i=0;i<2*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
    }
+   /* In normal circumstances, we only allow the noise floor to increase by
+      up to 2.4 dB/second, but when we're in DTX we give the weight of
+      all missing packets to the update packet. */
+   max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+   for (i=0;i<2*nbEBands;i++)
+      backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+   /* In case start or end were to change */
    c=0; do
    {
       for (i=0;i<start;i++)
@@ -1175,7 +1172,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
    st->rng = dec->rng;
 
    deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
-   st->loss_count = 0;
+   st->loss_duration = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
       return OPUS_INTERNAL_ERROR;
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index d6f8afc2..637d442c 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1719,8 +1719,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
       compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
       compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
       amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
-      for (i=0;i<C*nbEBands;i++)
-         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+      for (c=0;c<C;c++)
+      {
+         for (i=0;i<end;i++)
+            bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+      }
    }
 
    compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1859,11 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
          compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
          amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
-         for (i=0;i<C*nbEBands;i++)
-            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         for (c=0;c<C;c++)
+         {
+            for (i=0;i<end;i++)
+               bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+         }
          tf_estimate = QCONST16(.2f,14);
       }
    }
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 8ecb693e..242e6df5 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -50,17 +50,21 @@ int          p
 #endif
 
    OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
    if (ac[0] != 0)
+#else
+   if (ac[0] > 1e-10f)
+#endif
    {
       for (i = 0; i < p; i++) {
          /* Sum up this iteration's reflection coefficient */
          opus_val32 rr = 0;
          for (j = 0; j < i; j++)
             rr += MULT32_32_Q31(lpc[j],ac[i - j]);
-         rr += SHR32(ac[i + 1],3);
-         r = -frac_div32(SHL32(rr,3), error);
+         rr += SHR32(ac[i + 1],6);
+         r = -frac_div32(SHL32(rr,6), error);
          /*  Update LPC coefficients and total error */
-         lpc[i] = SHR32(r,3);
+         lpc[i] = SHR32(r,6);
          for (j = 0; j < (i+1)>>1; j++)
          {
             opus_val32 tmp1, tmp2;
@@ -73,17 +77,61 @@ int          p
          error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
          /* Bail out once we get 30 dB gain */
 #ifdef FIXED_POINT
-         if (error<SHR32(ac[0],10))
+         if (error<=SHR32(ac[0],10))
             break;
 #else
-         if (error<.001f*ac[0])
+         if (error<=.001f*ac[0])
             break;
 #endif
       }
    }
 #ifdef FIXED_POINT
-   for (i=0;i<p;i++)
-      _lpc[i] = ROUND16(lpc[i],16);
+   {
+      /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+         This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+         fixes should also be applied there. */
+      int iter, idx = 0;
+      opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+      for (iter = 0; iter < 10; iter++) {
+         maxabs = 0;
+         for (i = 0; i < p; i++) {
+            absval = ABS32(lpc[i]);
+            if (absval > maxabs) {
+               maxabs = absval;
+               idx = i;
+            }
+         }
+         maxabs = PSHR32(maxabs, 13);  /* Q25->Q12 */
+
+         if (maxabs > 32767) {
+            maxabs = MIN32(maxabs, 163838);
+            chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+                                                    SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+            chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+            /* Apply bandwidth expansion. */
+            for (i = 0; i < p - 1; i++) {
+               lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+               chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+            }
+            lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+         } else {
+            break;
+         }
+      }
+
+      if (iter == 10) {
+         /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+            fall back to the A(z)=1 filter. */
+         OPUS_CLEAR(lpc, p);
+         _lpc[0] = 4096;  /* Q12 */
+      } else {
+         for (i = 0; i < p; i++) {
+            _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13));  /* Q25->Q12 */
+         }
+      }
+   }
 #endif
 }
 
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 68fc6067..7b5c56ca 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,10 +43,11 @@
  */
 #define OPUS_ARCHMASK 3
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #include "x86/x86cpu.h"
 /* We currently support 5 x86 variants:
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f4352952..c2cf5a83 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -167,7 +167,7 @@ static OPUS_INLINE short SHR16_(int a, int shift, char *file, int line)
 #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
 static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
 {
-   int res;
+   opus_int32 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
       fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@ static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int32)((opus_uint32)a<<shift);
    if (!VERIFY_SHORT(res))
    {
       fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -214,15 +214,15 @@ static OPUS_INLINE int SHL32_(opus_int64 a, int shift, char *file, int line)
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int64)((opus_uint64)a<<shift);
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -339,7 +339,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -347,7 +347,7 @@ static OPUS_INLINE unsigned int UADD32_(opus_uint64 a, opus_uint64 b, char *file
    res = a+b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -363,14 +363,14 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
    if (a<b)
    {
-      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -378,7 +378,7 @@ static OPUS_INLINE unsigned int USUB32_(opus_uint64 a, opus_uint64 b, char *file
    res = a-b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -410,6 +410,51 @@ static OPUS_INLINE short MULT16_16_16(int a, int b)
    return res;
 }
 
+/* result fits in 32 bits */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a*b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;
+   return res;
+}
+
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)(a)*(opus_int64)(b)) >> 16;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;
+   return res;
+}
+
 #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
 static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
 {
@@ -786,6 +831,6 @@ static OPUS_INLINE opus_val16 SIG2WORD16_generic(celt_sig x)
 
 
 #undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
 
 #endif
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 0ecbb899..8f29d46b 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -57,6 +57,13 @@
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+/** 32x32 multiplication, followed by a 16-bit shift right. Results fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #if OPUS_FAST_INT64
 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -131,6 +138,9 @@
 /** 16x16 multiplication where the result fits in 16 bits */
 #define MULT16_16_16(a,b)     ((((opus_val16)(a))*((opus_val16)(b))))
 
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b)     ((((opus_val32)(a))*((opus_val32)(b))))
+
 /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
 /** 16x16 multiplication where the result fits in 32 bits */
 #define MULT16_16(a,b)     (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 9d34976e..8915a5fd 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -99,7 +99,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
                 return intgr ;
         }
 
-#elif defined(HAVE_LRINTF)
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 /*      These defines enable functionality introduced with the 1999 ISO C
 **      standard. They must be defined before the inclusion of math.h to
@@ -117,7 +117,7 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
 #include <math.h>
 #define float2int(x) lrintf(x)
 
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 #define _ISOC9X_SOURCE 1
 #define _ISOC99_SOURCE 1
diff --git a/celt/mathops.h b/celt/mathops.h
index fe29dac1..478ac918 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -153,7 +153,7 @@ static OPUS_INLINE float celt_exp2(float x)
       float f;
       opus_uint32 i;
    } res;
-   integer = floor(x);
+   integer = (int)floor(x);
    if (integer < -50)
       return 0;
    frac = x-integer;
diff --git a/celt/meson.build b/celt/meson.build
index 370ea1fe..ad95d949 100644
--- a/celt/meson.build
+++ b/celt/meson.build
@@ -10,6 +10,10 @@ celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR']
 
 celt_static_libs = []
 
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  celt_sources +=  sources['CELT_SOURCES_X86_RTCD']
+endif
+
 foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr']
   have_intr = get_variable('have_' + intr_name)
   if not have_intr
@@ -30,7 +34,9 @@ if (intrinsics_support.length() + asm_optimization.length() + inline_optimizatio
 endif
 
 if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
-  celt_sources +=  sources['CELT_SOURCES_ARM']
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    celt_sources +=  sources['CELT_SOURCES_ARM_RTCD']
+  endif
   if have_arm_ne10
     celt_sources += sources['CELT_SOURCES_ARM_NE10']
   endif
diff --git a/celt/modes.c b/celt/modes.c
index 390c5e8a..23f7cde6 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -173,7 +173,10 @@ static void compute_allocation_table(CELTMode *mode)
    mode->nbAllocVectors = BITALLOC_SIZE;
    allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
    if (allocVectors==NULL)
+   {
+      mode->allocVectors = NULL;
       return;
+   }
 
    /* Check for standard mode */
    if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
diff --git a/celt/pitch.c b/celt/pitch.c
index 872582a4..7998db41 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -161,17 +161,26 @@ void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x
       shift=0;
    if (C==2)
       shift++;
-#endif
    for (i=1;i<len>>1;i++)
-      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
-   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+      x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+   x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
    if (C==2)
    {
       for (i=1;i<len>>1;i++)
-         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
-      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+         x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+      x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
    }
-
+#else
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+   x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+      x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+   }
+#endif
    _celt_autocorr(x_lp, ac, NULL, 0,
                   4, len>>1, arch);
 
@@ -249,7 +258,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
    opus_val32 maxcorr=1;
 #endif
    celt_assert(max_pitch>0);
-   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert(((size_t)_x&3)==0);
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
diff --git a/celt/rate.c b/celt/rate.c
index 465e1ba2..7f7ad3fa 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -356,6 +356,8 @@ static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end,
             else
                depth_threshold = 0;
 #ifdef FUZZING
+            (void)signalBandwidth;
+            (void)depth_threshold;
             if ((rand()&0x1) == 0)
 #else
             if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 70f8f493..ae9a7b56 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index 7f674529..b1619b74 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@ int main(int _argc,char **_argv){
   nbits=ec_tell_frac(&enc);
   ec_enc_done(&enc);
   fprintf(stderr,
-   "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+   "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
    entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
   fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
   ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@ int main(int _argc,char **_argv){
   nbits2=ec_tell_frac(&dec);
   if(nbits!=nbits2){
     fprintf(stderr,
-     "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+     "Reported number of bits used was %0.2f, should be %0.2f.\n",
      ldexp(nbits2,-3),ldexp(nbits,-3));
     ret=-1;
   }
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 4a563ccf..844c5b48 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@ void test1d(int nfft,int isinverse,int arch)
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf75..90e69ecf 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
 
 void celt_fir_sse4_1(
          const opus_val16 *x,
@@ -44,10 +43,11 @@ void celt_fir_sse4_1(
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
 #define celt_fir(x, num, y, N, ord, arch) \
     ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
@@ -57,6 +57,7 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          int ord,
          int arch);
 
+#define OVERRIDE_CELT_FIR
 #  define celt_fir(x, num, y, N, ord, arch) \
     ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b6..964aef50 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@ void xcorr_kernel_sse(
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((void)arch, xcorr_kernel_sse(x, y, sum, len))
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) &&  ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
@@ -115,8 +115,8 @@ opus_val32 celt_inner_prod_sse(
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
 
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
-    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..2bc57830 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
     __m128i sum0, sum1, sum2, sum3, vecSum;
     __m128i initSum;
 
+#ifdef OPUS_CHECK_ASM
+    opus_val32 sum_c[4];
+    for (j=0;j<4;j++) {
+      sum_c[j] = sum[j];
+    }
+    xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
     celt_assert(len >= 3);
 
     sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
         vecSum = _mm_add_epi32(vecSum, sum2);
     }
 
-    for (;j<len;j++)
+    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+    if (len - j == 3)
     {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xff);
 
         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
 
         sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+    else if (len - j == 2)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+    }
+    else if (len - j == 1)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
         vecSum = _mm_add_epi32(vecSum, sum0);
     }
 
     initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
     initSum = _mm_add_epi32(initSum, vecSum);
     _mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
 }
 #endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25e..6a1914de 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #if defined(_MSC_VER)
 
@@ -68,7 +68,8 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=r" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #else
     __asm__ __volatile__ (
@@ -77,11 +78,22 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
         "=b" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #endif
 #elif defined(CPU_INFO_BY_C)
-    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+    /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+        prior to v3.17.0.*/
+    if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+        /* Our function cannot fail, but __get_cpuid{_count} can.
+           Returning all zeroes will effectively disable all SIMD, which is
+            what we want on CPUs that don't support CPUID. */
+        CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+    }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
 #endif
 }
 
@@ -98,7 +110,7 @@ typedef struct CPU_Feature{
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
 {
-    unsigned int info[4] = {0};
+    unsigned int info[4];
     unsigned int nIds = 0;
 
     cpuid(info, 0);
@@ -119,7 +131,7 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
     }
 }
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
     CPU_Feature cpu_feature;
     int arch;
@@ -154,4 +166,13 @@ int opus_select_arch(void)
     return arch;
 }
 
+int opus_select_arch(void) {
+    int arch = opus_select_arch_impl();
+#ifdef FUZZING
+    /* Randomly downgrade the architecture. */
+    arch = rand()%(arch+1);
+#endif
+    return arch;
+}
+
 #endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17b..04e80489 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,18 @@
 int opus_select_arch(void);
 # endif
 
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
-  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
-  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
-  reference, these require 16-byte alignment and load a full 16 bytes (instead
-  of 4 or 8), possibly reading out of bounds.
-
-  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
-  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
-  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
-  optimize this out when optimizations ARE enabled.
-
-  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
-  (which is fair, since technically the compiler is always allowed to do the
-  dereference before invoking the function implementing the intrinsic).
-  However, it is smart enough to eliminate the extra MOVD instruction.
-  For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
-  the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
-
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI16_EPI32_M64(x) \
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+   and UBSan will report errors if we actually make unaligned accesses.
+  Use this to work around those restrictions (which should hopefully all get
+   optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+  (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+   *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
+
+#define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
+
+#define OP_CVTEPI16_EPI32_M64(x) \
  (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-#  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
 
 #endif