Proper SILK delay compensation for resampling

Adds SILK delay compensation that depends on encode and decode sampling rate, as well as SILK internal coding rate. This ensures that the SILK part of Opus is always in sync with the CELT part no matter what the sampling rates are. It also increases the resampling delay to 1.15 ms (was previously 0.48 ms).
author: Jean-Marc Valin <jmvalin@jmvalin.ca> 2011-10-07 08:38:27 -0400
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> 2011-10-07 11:46:01 -0400
commit: b5972388d7c9b0d5f5a780da23ba00dce6a2628d (patch)
tree: 674e0a7515fa77ae620d4cb7028f34d769e25c54
parent: a5e96b84302f2008ec32e8664356dc83efd72c17 (diff)
download: libopus-b5972388d7c9b0d5f5a780da23ba00dce6a2628d.tar.gz
7 files changed, 71 insertions, 22 deletions
diff --git a/silk/control_codec.c b/silk/control_codec.c
index b1d3bab9..64b10915 100644
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -37,6 +37,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #include "tuning_parameters.h"
 
+
+static const int enc_delay_matrix[3][5] = {
+/*SILK API 8  12  16  24  48 */
+/* 8 */   {5,  0,  3,  4,  8},
+/*12 */   {0,  6,  0,  0,  0},
+/*16 */   {4,  5, 11,  5, 18}
+};
+
 opus_int silk_setup_resamplers(
     silk_encoder_state_Fxx          *psEnc,             /* I/O                      */
     opus_int                         fs_kHz              /* I                        */
@@ -235,6 +243,9 @@ opus_int silk_setup_fs(
         psEnc->sCmn.TargetRate_bps = 0;         /* trigger new SNR computation */
     }
 
+    psEnc->sCmn.delay = enc_delay_matrix[rateID(fs_kHz*1000)][rateID(psEnc->sCmn.API_fs_Hz)];
+    silk_assert(psEnc->sCmn.delay <= MAX_ENCODER_DELAY);
+
     /* Set internal sampling frequency */
     silk_assert( fs_kHz == 8 || fs_kHz == 12 || fs_kHz == 16 );
     silk_assert( psEnc->sCmn.nb_subfr == 2 || psEnc->sCmn.nb_subfr == 4 );
diff --git a/silk/dec_API.c b/silk/dec_API.c
index 675bfb99..5e676932 100644
--- a/silk/dec_API.c
+++ b/silk/dec_API.c
@@ -31,6 +31,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "API.h"
 #include "main.h"
 
+static const int dec_delay_matrix[3][5] = {
+/*SILK API 8  12  16  24  48 */
+/* 8 */   {3, 0, 2, 0, 0},
+/*12 */   {0, 8, 5, 7, 5},
+/*16 */   {0, 0, 8, 5, 5}
+};
+
+
 /************************/
 /* Decoder Super Struct */
 /************************/
@@ -82,12 +90,15 @@ opus_int silk_Decode(
 {
     opus_int   i, n, prev_fs_kHz, decode_only_middle = 0, ret = SILK_NO_ERROR;
     opus_int32 nSamplesOutDec, LBRR_symbol;
-    opus_int16 samplesOut1_tmp[ 2 ][ MAX_FS_KHZ * MAX_FRAME_LENGTH_MS + 2 ];
+    opus_int16 samplesOut1_tmp[ 2 ][ MAX_FS_KHZ * MAX_FRAME_LENGTH_MS + 2 + MAX_DECODER_DELAY ];
     opus_int16 samplesOut2_tmp[ MAX_API_FS_KHZ * MAX_FRAME_LENGTH_MS ];
     opus_int32 MS_pred_Q13[ 2 ] = { 0 };
     opus_int16 *resample_out_ptr;
     silk_decoder *psDec = ( silk_decoder * )decState;
     silk_decoder_state *channel_state = psDec->channel_state;
+    int delay;
+
+    delay = channel_state[ 0 ].delay;
 
     /**********************************/
     /* Test if first frame in payload */
@@ -106,6 +117,7 @@ opus_int silk_Decode(
         ret += silk_init_decoder( &channel_state[ 1 ] );
         if( psDec->nChannelsAPI == 2 ) {
             silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) );
+            silk_memcpy( &channel_state[ 1 ].delayBuf, &channel_state[ 0 ].delayBuf, MAX_DECODER_DELAY*sizeof(opus_int16));
         }
     }
 
@@ -143,9 +155,12 @@ opus_int silk_Decode(
 
     /* Initialize resampler when switching internal or external sampling frequency */
     if( prev_fs_kHz != channel_state[ 0 ].fs_kHz || channel_state[ 0 ].prev_API_sampleRate != decControl->API_sampleRate ) {
+        channel_state[ 0 ].delay = dec_delay_matrix[rateID(silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ))][rateID(decControl->API_sampleRate)];
+        silk_assert(channel_state[ 0 ].delay <= MAX_DECODER_DELAY);
         ret = silk_resampler_init( &channel_state[ 0 ].resampler_state, silk_SMULBB( channel_state[ 0 ].fs_kHz, 1000 ), decControl->API_sampleRate );
         if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
             silk_memcpy( &channel_state[ 1 ].resampler_state, &channel_state[ 0 ].resampler_state, sizeof( silk_resampler_state_struct ) );
+            channel_state[ 1 ].delay = channel_state[ 0 ].delay;
         }
     }
     channel_state[ 0 ].prev_API_sampleRate = decControl->API_sampleRate;
@@ -230,19 +245,19 @@ opus_int silk_Decode(
     /* Call decoder for one frame */
     for( n = 0; n < decControl->nChannelsInternal; n++ ) {
         if( n == 0 || decode_only_middle == 0 ) {
-            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 ], &nSamplesOutDec, lostFlag );
+            ret += silk_decode_frame( &channel_state[ n ], psRangeDec, &samplesOut1_tmp[ n ][ 2 + delay ], &nSamplesOutDec, lostFlag );
         } else {
-            silk_memset( &samplesOut1_tmp[ n ][ 2 ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
+            silk_memset( &samplesOut1_tmp[ n ][ 2 + delay ], 0, nSamplesOutDec * sizeof( opus_int16 ) );
         }
     }
 
     if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
         /* Convert Mid/Side to Left/Right */
-        silk_stereo_MS_to_LR( &psDec->sStereo, samplesOut1_tmp[ 0 ], samplesOut1_tmp[ 1 ], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec );
+        silk_stereo_MS_to_LR( &psDec->sStereo, &samplesOut1_tmp[ 0 ][delay], &samplesOut1_tmp[ 1 ][delay], MS_pred_Q13, channel_state[ 0 ].fs_kHz, nSamplesOutDec );
     } else {
         /* Buffering */
-        silk_memcpy( samplesOut1_tmp[ 0 ], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) );
-        silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec ], 2 * sizeof( opus_int16 ) );
+        silk_memcpy( &samplesOut1_tmp[ 0 ][delay], psDec->sStereo.sMid, 2 * sizeof( opus_int16 ) );
+        silk_memcpy( psDec->sStereo.sMid, &samplesOut1_tmp[ 0 ][ nSamplesOutDec + delay ], 2 * sizeof( opus_int16 ) );
     }
 
     /* Number of output samples */
@@ -256,8 +271,11 @@ opus_int silk_Decode(
     }
 
     for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {
+
+        silk_memcpy(&samplesOut1_tmp[ n ][ 1 ], &channel_state[ n ].delayBuf[ MAX_DECODER_DELAY-delay ], delay*sizeof(opus_int16));
         /* Resample decoded signal to API_sampleRate */
         ret += silk_resampler( &channel_state[ n ].resampler_state, resample_out_ptr, &samplesOut1_tmp[ n ][ 1 ], nSamplesOutDec );
+        silk_memcpy(channel_state[ n ].delayBuf, &samplesOut1_tmp[ n ][ 1 + nSamplesOutDec + delay - MAX_DECODER_DELAY ], MAX_DECODER_DELAY*sizeof(opus_int16));
 
         /* Interleave if stereo output and stereo stream */
         if( decControl->nChannelsAPI == 2 && decControl->nChannelsInternal == 2 ) {
diff --git a/silk/define.h b/silk/define.h
index dcfeb1d3..c7cbdcf8 100644
--- a/silk/define.h
+++ b/silk/define.h
@@ -86,6 +86,9 @@ extern "C"
 #define MAX_FRAME_LENGTH_MS                     ( SUB_FRAME_LENGTH_MS * MAX_NB_SUBFR )
 #define MAX_FRAME_LENGTH                        ( MAX_FRAME_LENGTH_MS * MAX_FS_KHZ )
 
+#define MAX_ENCODER_DELAY                       18
+#define MAX_DECODER_DELAY                        8
+
 /* Milliseconds of lookahead for pitch analysis */
 #define LA_PITCH_MS                             2
 #define LA_PITCH_MAX                            ( LA_PITCH_MS * MAX_FS_KHZ )
diff --git a/silk/enc_API.c b/silk/enc_API.c
index 0fe945b6..403aecce 100644
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -138,8 +138,8 @@ opus_int silk_Encode(
     opus_int   speech_act_thr_for_switch_Q8;
     opus_int32 TargetRate_bps, MStargetRates_bps[ 2 ], channelRate_bps, LBRR_symbol;
     silk_encoder *psEnc = ( silk_encoder * )encState;
-    opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ ];
-    opus_int transition;
+    opus_int16 buf[ MAX_FRAME_LENGTH_MS * MAX_API_FS_KHZ + MAX_ENCODER_DELAY];
+    opus_int transition, delay;
 
     psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded = psEnc->state_Fxx[ 1 ].sCmn.nFramesEncoded = 0;
 
@@ -222,6 +222,7 @@ opus_int silk_Encode(
     }
     silk_assert( encControl->nChannelsInternal == 1 || psEnc->state_Fxx[ 0 ].sCmn.fs_kHz == psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
 
+    delay = psEnc->state_Fxx[ 0 ].sCmn.delay;
     /* Input buffering/resampling and encoding */
     while( 1 ) {
         nSamplesToBuffer  = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
@@ -231,12 +232,15 @@ opus_int silk_Encode(
         if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 2 ) {
             int id = psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded;
             for( n = 0; n < nSamplesFromInput; n++ ) {
-                buf[ n ] = samplesIn[ 2 * n ];
+                    buf[ n+delay ] = samplesIn[ 2 * n ];
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
             /* Making sure to start both resamplers from the same state when switching from mono to stereo */
             if(psEnc->nPrevChannelsInternal == 1 && id==0) {
-               silk_memcpy(&psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+               silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state, &psEnc->state_Fxx[ 0 ].sCmn.resampler_state, sizeof(psEnc->state_Fxx[ 1 ].sCmn.resampler_state));
+               silk_memcpy( &psEnc->state_Fxx[ 1 ].sCmn.delayBuf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf, MAX_ENCODER_DELAY*sizeof(opus_int16));
             }
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
 
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
@@ -245,23 +249,31 @@ opus_int silk_Encode(
             nSamplesToBuffer  = psEnc->state_Fxx[ 1 ].sCmn.frame_length - psEnc->state_Fxx[ 1 ].sCmn.inputBufIx;
             nSamplesToBuffer  = silk_min( nSamplesToBuffer, 10 * nBlocksOf10ms * psEnc->state_Fxx[ 1 ].sCmn.fs_kHz );
             for( n = 0; n < nSamplesFromInput; n++ ) {
-                buf[ n ] = samplesIn[ 2 * n + 1 ];
+                    buf[ n+delay ] = samplesIn[ 2 * n + 1 ];
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 1 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 1 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 1 ].sCmn.inputBuf[ psEnc->state_Fxx[ 1 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            silk_memcpy(psEnc->state_Fxx[ 1 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
+
             psEnc->state_Fxx[ 1 ].sCmn.inputBufIx += nSamplesToBuffer;
         } else if( encControl->nChannelsAPI == 2 && encControl->nChannelsInternal == 1 ) {
             /* Combine left and right channels before resampling */
             for( n = 0; n < nSamplesFromInput; n++ ) {
-                buf[ n ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ],  1 );
+                buf[ n+delay ] = (opus_int16)silk_RSHIFT_ROUND( samplesIn[ 2 * n ] + samplesIn[ 2 * n + 1 ],  1 );
             }
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
                 &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
             psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
         } else {
             silk_assert( encControl->nChannelsAPI == 1 && encControl->nChannelsInternal == 1 );
+            silk_memcpy(buf+delay, samplesIn, nSamplesFromInput*sizeof(opus_int16));
+            silk_memcpy(buf, &psEnc->state_Fxx[ 0 ].sCmn.delayBuf[MAX_ENCODER_DELAY-delay], delay*sizeof(opus_int16));
             ret += silk_resampler( &psEnc->state_Fxx[ 0 ].sCmn.resampler_state,
-                &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], samplesIn, nSamplesFromInput );
+                &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.inputBufIx + 2 ], buf, nSamplesFromInput );
+            silk_memcpy(psEnc->state_Fxx[ 0 ].sCmn.delayBuf, buf+nSamplesFromInput+delay-MAX_ENCODER_DELAY, MAX_ENCODER_DELAY*sizeof(opus_int16));
             psEnc->state_Fxx[ 0 ].sCmn.inputBufIx += nSamplesToBuffer;
         }
 
diff --git a/silk/main.h b/silk/main.h
index d7ed22ce..981c7cab 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -43,6 +43,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Uncomment the next line to force a fixed internal sampling rate (independent of what bitrate is used */
 /*#define FORCE_INTERNAL_FS_KHZ       16*/
 
+/* Simple way to make [8000, 12000, 16000, 24000, 48000] to [0,1,2,3,4] */
+#define rateID(R) ( ( ( ((R)>>12) - ((R)>16000) ) >> ((R)>24000) ) - 1 )
 
 /* Convert Left/Right stereo signal to adaptive Mid/Side representation */
 void silk_stereo_LR_to_MS(
diff --git a/silk/structs.h b/silk/structs.h
index eda8173b..70c81bae 100644
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -149,6 +149,7 @@ typedef struct {
     opus_int                         minInternal_fs_Hz;              /* Minimum internal sampling frequency (Hz)                             */
     opus_int                         desiredInternal_fs_Hz;          /* Soft request for internal sampling frequency (Hz)                    */
     opus_int                         fs_kHz;                         /* Internal sampling frequency (kHz)                                    */
+    opus_int                         delay;                          /* Number of samples of delay to apply */
     opus_int                         nb_subfr;                       /* Number of 5 ms subframes in a frame                                  */
     opus_int                         frame_length;                   /* Frame length (samples)                                               */
     opus_int                         subfr_length;                   /* Subframe length (samples)                                            */
@@ -192,6 +193,7 @@ typedef struct {
 
     /* Input/output buffering */
     opus_int16                       inputBuf[ MAX_FRAME_LENGTH + 2 ]; /* Buffer containing input signal                                   */
+    opus_int16                       delayBuf[MAX_ENCODER_DELAY];
     opus_int                         inputBufIx;
     opus_int                         nFramesPerPacket;
     opus_int                         nFramesEncoded;                 /* Number of frames analyzed in current packet                          */
@@ -257,6 +259,8 @@ typedef struct {
     opus_int32       sLPC_Q14[ MAX_SUB_FRAME_LENGTH + MAX_LPC_ORDER ];
     opus_int32       exc_Q10[ MAX_FRAME_LENGTH ];
     opus_int16       outBuf[ 2 * MAX_FRAME_LENGTH ];             /* Buffer for output signal                                             */
+    opus_int16       delayBuf[ MAX_DECODER_DELAY ];              /* Buffer for delaying the SILK output prior to resampling              */
+    opus_int         delay;                                      /* How much decoder delay to add                                        */
     opus_int         lagPrev;                                    /* Previous Lag                                                         */
     opus_int8        LastGainIndex;                              /* Previous gain index                                                  */
     opus_int         fs_kHz;                                     /* Sampling frequency in kHz                                            */
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index ad3279f8..1a6e38f1 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -123,6 +123,11 @@ static const opus_int32 mode_thresholds[2][2] = {
       {  48000,      24000}, /* mono */
       {  48000,      24000}, /* stereo */
 };
+
+static const int celt_delay_table[5] = {
+/* API 8  12  16  24  48 */
+      10, 16, 21, 27, 55
+};
 int opus_encoder_get_size(int channels)
 {
     int silkEncSizeBytes, celtEncSizeBytes;
@@ -202,14 +207,8 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat
     st->encoder_buffer = st->Fs/100;
 
     st->delay_compensation = st->Fs/400;
-    /* This part is meant to compensate for the resampler delay as a function
-       of the API sampling rate */
-    if (st->Fs == 48000)
-        st->delay_compensation += 23;
-    else if (st->Fs == 24000)
-       st->delay_compensation += 15;
-    else
-       st->delay_compensation += 2;
+
+    st->delay_compensation += celt_delay_table[rateID(st->Fs)];
 
     st->hybrid_stereo_width_Q14             = 1 << 14;
     st->variable_HP_smth2_Q15 = silk_LSHIFT( silk_lin2log( VARIABLE_HP_MIN_CUTOFF_HZ ), 8 );
@@ -486,7 +485,7 @@ int opus_encode_float(OpusEncoder *st, const opus_val16 *pcm, int frame_size,
     }
 #endif
 
-    if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0) 
+    if (st->stream_channels == 1 && st->prev_channels ==2 && st->silk_mode.toMono==0)
     {
        /* Delay stereo->mono transition by two frames so that SILK can do a smooth downmix */
        st->silk_mode.toMono = 1;
author	Jean-Marc Valin <jmvalin@jmvalin.ca>	2011-10-07 08:38:27 -0400
committer	Jean-Marc Valin <jmvalin@jmvalin.ca>	2011-10-07 11:46:01 -0400
commit	b5972388d7c9b0d5f5a780da23ba00dce6a2628d (patch)
tree	674e0a7515fa77ae620d4cb7028f34d769e25c54
parent	a5e96b84302f2008ec32e8664356dc83efd72c17 (diff)
download	libopus-b5972388d7c9b0d5f5a780da23ba00dce6a2628d.tar.gz