about | summary | refs | log | tree | commit | diff
path: root/src/analysis.c
diff options
context:
space:
mode:
author: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-07-12 16:55:28 -0400
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-10-05 17:40:27 -0400
commit: af93fbd55fd5c23a2492166816311d9f67df1b24 (patch)
tree: 7221fd8dd284dd593e4b3eb1a3ed9cee3b4fc926 /src/analysis.c
parent: f3cff05eeb83ec8c055b7331338d705af220358d (diff)
download: libopus-af93fbd55fd5c23a2492166816311d9f67df1b24.tar.gz
Add RNN for VAD and speech/music classification
Based on two dense layers with a GRU layer in the middle
Diffstat (limited to 'src/analysis.c')
-rw-r--r-- src/analysis.c 271
1 files changed, 118 insertions, 153 deletions
diff --git a/src/analysis.c b/src/analysis.c
index f4160e4b..1d6dd829 100644
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -50,6 +50,8 @@
#ifndef DISABLE_FLOAT_API
+#define TRANSITION_PENALTY 10
+
static const float dct_table[128] = {
0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
@@ -224,19 +226,22 @@ void tonality_analysis_reset(TonalityAnalysisState *tonal)
/* Clear non-reusable fields. */
char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
- tonal->music_confidence = .9f;
- tonal->speech_confidence = .1f;
}
void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
{
int pos;
int curr_lookahead;
- float psum;
float tonality_max;
float tonality_avg;
int tonality_count;
int i;
+ int pos0;
+ float prob_avg;
+ float prob_count;
+ float prob_min, prob_max;
+ float vad_prob;
+ int mpos, vpos;
pos = tonal->read_pos;
curr_lookahead = tonal->write_pos-tonal->read_pos;
@@ -254,6 +259,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
pos--;
if (pos<0)
pos = DETECT_SIZE-1;
+ pos0 = pos;
OPUS_COPY(info_out, &tonal->info[pos], 1);
tonality_max = tonality_avg = info_out->tonality;
tonality_count = 1;
@@ -270,6 +276,107 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
tonality_count++;
}
info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
+
+ mpos = vpos = pos0;
+ /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
+ ~1 frame delay in the VAD prob. */
+ if (curr_lookahead > 15)
+ {
+ mpos += 5;
+ if (mpos>=DETECT_SIZE)
+ mpos -= DETECT_SIZE;
+ vpos += 1;
+ if (vpos>=DETECT_SIZE)
+ vpos -= DETECT_SIZE;
+ }
+
+ /* The following calculations attempt to minimize a "badness function"
+ for the transition. When switching from speech to music, the badness
+ of switching at frame k is
+ b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
+ where
+ v_i is the activity probability (VAD) at frame i,
+ p_i is the music probability at frame i
+ T is the probability threshold for switching
+ S is the penalty for switching during active audio rather than silence
+ the current frame has index i=0
+
+ Rather than apply badness to directly decide when to switch, what we compute
+ instead is the threshold for which the optimal switching point is now. When
+ considering whether to switch now (frame 0) or at frame k, we have:
+ S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
+ which gives us:
+ T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
+ We take the min threshold across all positive values of k (up to the maximum
+ amount of lookahead we have) to give us the threshold for which the current
+ frame is the optimal switch point.
+
+ The last step is that we need to consider whether we want to switch at all.
+ For that we use the average of the music probability over the entire window.
+ If the threshold is higher than that average we're not going to
+ switch, so we compute a min with the average as well. The result of all these
+ min operations is music_prob_min, which gives the threshold for switching to music
+ if we're currently encoding for speech.
+
+ We do the exact opposite to compute music_prob_max which is used for switching
+ from music to speech.
+ */
+ prob_min = 1.f;
+ prob_max = 0.f;
+ vad_prob = tonal->info[vpos].activity_probability;
+ prob_count = MAX16(.1f, vad_prob);
+ prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
+ while (1)
+ {
+ float pos_vad;
+ mpos++;
+ if (mpos==DETECT_SIZE)
+ mpos = 0;
+ if (mpos == tonal->write_pos)
+ break;
+ vpos++;
+ if (vpos==DETECT_SIZE)
+ vpos = 0;
+ if (vpos == tonal->write_pos)
+ break;
+ pos_vad = tonal->info[vpos].activity_probability;
+ prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
+ prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
+ prob_count += MAX16(.1f, pos_vad);
+ prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
+ }
+ info_out->music_prob = prob_avg/prob_count;
+ prob_min = MIN16(prob_avg/prob_count, prob_min);
+ prob_max = MAX16(prob_avg/prob_count, prob_max);
+ prob_min = MAX16(prob_min, 0.f);
+ prob_max = MIN16(prob_max, 1.f);
+
+ /* If we don't have enough look-ahead, do our best to make a decent decision. */
+ if (curr_lookahead < 10)
+ {
+ float pmin, pmax;
+ pmin = prob_min;
+ pmax = prob_max;
+ pos = pos0;
+ /* Look for min/max in the past. */
+ for (i=0;i<IMIN(tonal->count-1, 15);i++)
+ {
+ pos--;
+ if (pos < 0)
+ pos = DETECT_SIZE-1;
+ pmin = MIN16(pmin, tonal->info[pos].music_prob);
+ pmax = MAX16(pmax, tonal->info[pos].music_prob);
+ }
+ /* Bias against switching on active audio. */
+ pmin = MAX16(0.f, pmin - .1f*vad_prob);
+ pmax = MIN16(1.f, pmax + .1f*vad_prob);
+ prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
+ prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
+ }
+ info_out->music_prob_min = prob_min;
+ info_out->music_prob_max = prob_max;
+
+ /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
tonal->read_subframe += len/(tonal->Fs/400);
while (tonal->read_subframe>=8)
{
@@ -278,21 +385,6 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int
}
if (tonal->read_pos>=DETECT_SIZE)
tonal->read_pos-=DETECT_SIZE;
-
- /* The -1 is to compensate for the delay in the features themselves. */
- curr_lookahead = IMAX(curr_lookahead-1, 0);
-
- psum=0;
- /* Summing the probability of transition patterns that involve music at
- time (DETECT_SIZE-curr_lookahead-1) */
- for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
- psum += tonal->pmusic[i];
- for (;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i];
- psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
- /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
-
- info_out->music_prob = psum;
}
static const float std_feature_bias[9] = {
@@ -352,6 +444,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
float band_log2[NB_TBANDS+1];
float leakage_from[NB_TBANDS+1];
float leakage_to[NB_TBANDS+1];
+ float layer_out[MAX_NEURONS];
SAVE_STACK;
alpha = 1.f/IMIN(10, 1+tonal->count);
@@ -368,12 +461,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
offset = 3*offset/2;
}
- if (tonal->count<4) {
- if (tonal->application == OPUS_APPLICATION_VOIP)
- tonal->music_prob = .1f;
- else
- tonal->music_prob = .625f;
- }
kfft = celt_mode->mdct.kfft[0];
if (tonal->count==0)
tonal->mem_fill = 240;
@@ -761,139 +848,17 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
features[23] = info->tonality_slope + 0.069216f;
features[24] = tonal->lowECount - 0.067930f;
- mlp_process(&net, features, frame_probs);
- frame_probs[0] = .5f*(frame_probs[0]+1);
- /* Curve fitting between the MLP probability and the actual probability */
- /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
- /* Probability of active audio (as opposed to silence) */
- frame_probs[1] = .5f*frame_probs[1]+.5f;
- frame_probs[1] *= frame_probs[1];
+ compute_dense(&layer0, layer_out, features);
+ compute_gru(&layer1, tonal->rnn_state, layer_out);
+ compute_dense(&layer2, frame_probs, tonal->rnn_state);
/* Probability of speech or music vs noise */
info->activity_probability = frame_probs[1];
+ /* It seems like the RNN tends to have a bias towards speech and this
+ warping of the probabilities compensates for it. */
+ info->music_prob = frame_probs[0] * (2 - frame_probs[0]);
- /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
- {
- /* Probability of state transition */
- float tau;
- /* Represents independence of the MLP probabilities, where
- beta=1 means fully independent. */
- float beta;
- /* Denormalized probability of speech (p0) and music (p1) after update */
- float p0, p1;
- /* Probabilities for "all speech" and "all music" */
- float s0, m0;
- /* Probability sum for renormalisation */
- float psum;
- /* Instantaneous probability of speech and music, with beta pre-applied. */
- float speech0;
- float music0;
- float p, q;
-
- /* More silence transitions for speech than for music. */
- tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
- p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
- q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
- beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
- /* p0 and p1 are the probabilities of speech and music at this frame
- using only information from previous frame and applying the
- state transition model */
- p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
- p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
- /* We apply the current probability with exponent beta to work around
- the fact that the probability estimates aren't independent. */
- p0 *= (float)pow(1-frame_probs[1], beta);
- p1 *= (float)pow(frame_probs[1], beta);
- /* Normalise the probabilities to get the Marokv probability of music. */
- tonal->vad_prob = p1/(p0+p1);
- info->vad_prob = tonal->vad_prob;
- /* Consider that silence has a 50-50 probability of being speech or music. */
- frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
-
- /* One transition every 3 minutes of active audio */
- tau = .0001f;
- /* Adapt beta based on how "unexpected" the new prob is */
- p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
- q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
- beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
- /* p0 and p1 are the probabilities of speech and music at this frame
- using only information from previous frame and applying the
- state transition model */
- p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
- p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
- /* We apply the current probability with exponent beta to work around
- the fact that the probability estimates aren't independent. */
- p0 *= (float)pow(1-frame_probs[0], beta);
- p1 *= (float)pow(frame_probs[0], beta);
- /* Normalise the probabilities to get the Marokv probability of music. */
- tonal->music_prob = p1/(p0+p1);
- info->music_prob = tonal->music_prob;
-
- /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
- /* This chunk of code deals with delayed decision. */
- psum=1e-20f;
- /* Instantaneous probability of speech and music, with beta pre-applied. */
- speech0 = (float)pow(1-frame_probs[0], beta);
- music0 = (float)pow(frame_probs[0], beta);
- if (tonal->count==1)
- {
- if (tonal->application == OPUS_APPLICATION_VOIP)
- tonal->pmusic[0] = .1f;
- else
- tonal->pmusic[0] = .625f;
- tonal->pspeech[0] = 1-tonal->pmusic[0];
- }
- /* Updated probability of having only speech (s0) or only music (m0),
- before considering the new observation. */
- s0 = tonal->pspeech[0] + tonal->pspeech[1];
- m0 = tonal->pmusic [0] + tonal->pmusic [1];
- /* Updates s0 and m0 with instantaneous probability. */
- tonal->pspeech[0] = s0*(1-tau)*speech0;
- tonal->pmusic [0] = m0*(1-tau)*music0;
- /* Propagate the transition probabilities */
- for (i=1;i<DETECT_SIZE-1;i++)
- {
- tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
- tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
- }
- /* Probability that the latest frame is speech, when all the previous ones were music. */
- tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
- /* Probability that the latest frame is music, when all the previous ones were speech. */
- tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
-
- /* Renormalise probabilities to 1 */
- for (i=0;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i] + tonal->pmusic[i];
- psum = 1.f/psum;
- for (i=0;i<DETECT_SIZE;i++)
- {
- tonal->pspeech[i] *= psum;
- tonal->pmusic [i] *= psum;
- }
- psum = tonal->pmusic[0];
- for (i=1;i<DETECT_SIZE;i++)
- psum += tonal->pspeech[i];
-
- /* Estimate our confidence in the speech/music decisions */
- if (frame_probs[1]>.75)
- {
- if (tonal->music_prob>.9)
- {
- float adapt;
- adapt = 1.f/(++tonal->music_confidence_count);
- tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
- tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
- }
- if (tonal->music_prob<.1)
- {
- float adapt;
- adapt = 1.f/(++tonal->speech_confidence_count);
- tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
- tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
- }
- }
- }
- tonal->last_music = tonal->music_prob>.5f;
+ /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
#ifdef MLP_TRAINING
for (i=0;i<25;i++)
printf("%f ", features[i]);