diff options
author | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-12-16 17:52:15 -0500 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2016-12-20 15:33:27 -0500 |
commit | cf9409fe51fa425d176e74c2e3bc4b7a9b5e9086 (patch) | |
tree | f7b9984bc1feb07b6fa76772c7b4f50294603ca2 /src/analysis.c | |
parent | 159bb6df002560749866587becf3dc33541fec1d (diff) | |
download | libopus-cf9409fe51fa425d176e74c2e3bc4b7a9b5e9086.tar.gz |
Makes analysis run at 24 kHz, with 20-ms frames
The change also makes the analysis run for sampling rates of 16 kHz and 24 kHz
since the features are only computed on the 0-8 kHz band. The longer time
window (20 ms instead of 10 ms) makes the tonality estimator more reliable
for low-pitch harmonics.
Diffstat (limited to 'src/analysis.c')
-rw-r--r-- | src/analysis.c | 311 |
1 file changed, 259 insertions, 52 deletions
diff --git a/src/analysis.c b/src/analysis.c index b704fb4c..3e8a23bd 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -42,6 +42,7 @@ #include "analysis.h" #include "mlp.h" #include "stack_alloc.h" +#include "float_cast.h" #ifndef M_PI #define M_PI 3.141592653 @@ -100,24 +101,118 @@ static const float analysis_window[240] = { }; static const int tbands[NB_TBANDS+1] = { - 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120 + 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240 }; -static const int extra_bands[NB_TOT_BANDS+1] = { - 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200 -}; +#define NB_TONAL_SKIP_BANDS 9 -/*static const float tweight[NB_TBANDS+1] = { - .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5 -};*/ +static opus_val32 silk_resampler_down2_hp( + opus_val32 *S, /* I/O State vector [ 2 ] */ + opus_val32 *out, /* O Output signal [ floor(len/2) ] */ + const opus_val32 *in, /* I Input signal [ len ] */ + int inLen /* I Number of input samples */ +) +{ + int k, len2 = inLen/2; + opus_val32 in32, out32, out32_hp, Y, X; + opus_val64 hp_ener = 0; + /* Internal variables and state are in Q10 format */ + for( k = 0; k < len2; k++ ) { + /* Convert to Q10 */ + in32 = in[ 2 * k ]; + + /* All-pass section for even input sample */ + Y = SUB32( in32, S[ 0 ] ); + X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y); + out32 = ADD32( S[ 0 ], X ); + S[ 0 ] = ADD32( in32, X ); + out32_hp = out32; + /* Convert to Q10 */ + in32 = in[ 2 * k + 1 ]; + + /* All-pass section for odd input sample, and add to output of previous section */ + Y = SUB32( in32, S[ 1 ] ); + X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); + out32 = ADD32( out32, S[ 1 ] ); + out32 = ADD32( out32, X ); + S[ 1 ] = ADD32( in32, X ); + + Y = SUB32( -in32, S[ 2 ] ); + X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); + out32_hp = ADD32( out32_hp, S[ 2 ] ); + out32_hp = ADD32( out32_hp, X ); + S[ 2 ] = ADD32( -in32, X 
); + + hp_ener += out32_hp*(opus_val64)out32_hp; + /* Add, convert back to int16 and store to output */ + out[ k ] = HALF32(out32); + } +#ifdef FIXED_POINT + /* len2 can be up to 480, so we shift by 8 more to make it fit. */ + hp_ener = hp_ener >> (2*SIG_SHIFT + 8); +#endif + return hp_ener; +} -#define NB_TONAL_SKIP_BANDS 9 +static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs) +{ + VARDECL(opus_val32, tmp); + opus_val32 scale; + int j; + opus_val32 ret = 0; + SAVE_STACK; + + if (subframe==0) return 0; + if (Fs == 48000) + { + subframe *= 2; + offset *= 2; + } else if (Fs == 16000) { + subframe = subframe*2/3; + offset = offset*2/3; + } + ALLOC(tmp, subframe, opus_val32); + downmix(_x, tmp, subframe, offset, c1, c2, C); +#ifdef FIXED_POINT + scale = (1<<SIG_SHIFT); +#else + scale = 1.f/32768; +#endif + if (c2==-2) + scale /= C; + else if (c2>-1) + scale /= 2; + for (j=0;j<subframe;j++) + tmp[j] *= scale; + if (Fs == 48000) + { + ret = silk_resampler_down2_hp(S, y, tmp, subframe); + } else if (Fs == 24000) { + OPUS_COPY(y, tmp, subframe); + } else if (Fs == 16000) { + VARDECL(opus_val32, tmp3x); + ALLOC(tmp3x, 3*subframe, opus_val32); + /* Don't do this at home! This resampler is horrible and it's only (barely) + usable for the purpose of the analysis because we don't care about all + the aliasing between 8 kHz and 12 kHz. */ + for (j=0;j<subframe;j++) + { + tmp3x[3*j] = tmp[j]; + tmp3x[3*j+1] = tmp[j]; + tmp3x[3*j+2] = tmp[j]; + } + silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); + } + RESTORE_STACK; + return ret; +} -void tonality_analysis_init(TonalityAnalysisState *tonal) +void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs) { /* Initialize reusable fields. */ tonal->arch = opus_select_arch(); + tonal->Fs = Fs; /* Clear remaining fields. 
*/ tonality_analysis_reset(tonal); } @@ -141,7 +236,8 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int if (curr_lookahead<0) curr_lookahead += DETECT_SIZE; - if (len > 480 && pos != tonal->write_pos) + /* On long frames, look at the second analysis window rather than the first. */ + if (len > tonal->Fs/50 && pos != tonal->write_pos) { pos++; if (pos==DETECT_SIZE) @@ -152,18 +248,27 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int if (pos<0) pos = DETECT_SIZE-1; OPUS_COPY(info_out, &tonal->info[pos], 1); - tonal->read_subframe += len/120; - while (tonal->read_subframe>=4) + /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */ + for (i=0;i<3;i++) + { + pos++; + if (pos==DETECT_SIZE) + pos = 0; + if (pos == tonal->write_pos) + break; + info_out->tonality = MAX32(0, -.03 + MAX32(info_out->tonality, tonal->info[pos].tonality-.05)); + } + tonal->read_subframe += len/(tonal->Fs/400); + while (tonal->read_subframe>=8) { - tonal->read_subframe -= 4; + tonal->read_subframe -= 8; tonal->read_pos++; } if (tonal->read_pos>=DETECT_SIZE) tonal->read_pos-=DETECT_SIZE; - /* Compensate for the delay in the features themselves. - FIXME: Need a better estimate the 10 I just made up */ - curr_lookahead = IMAX(curr_lookahead-10, 0); + /* The -1 is to compensate for the delay in the features themselves. 
*/ + curr_lookahead = IMAX(curr_lookahead-1, 0); psum=0; /* Summing the probability of transition patterns that involve music at @@ -173,7 +278,7 @@ void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int for (;i<DETECT_SIZE;i++) psum += tonal->pspeech[i]; psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; - /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/ + /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/ info_out->music_prob = psum; } @@ -216,19 +321,33 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt float noise_floor; int remaining; AnalysisInfo *info; + float hp_ener; + float tonality2[240]; + float midE[8]; + float spec_variability=0; SAVE_STACK; - tonal->last_transition++; - alpha = 1.f/IMIN(20, 1+tonal->count); - alphaE = 1.f/IMIN(50, 1+tonal->count); - alphaE2 = 1.f/IMIN(1000, 1+tonal->count); + alpha = 1.f/IMIN(10, 1+tonal->count); + alphaE = 1.f/IMIN(25, 1+tonal->count); + alphaE2 = 1.f/IMIN(500, 1+tonal->count); + + if (tonal->Fs == 48000) + { + /* len and offset are now at 24 kHz. 
*/ + len/= 2; + offset /= 2; + } else if (tonal->Fs == 16000) { + len = 3*len/2; + offset = 3*offset/2; + } if (tonal->count<4) tonal->music_prob = .5; kfft = celt_mode->mdct.kfft[0]; if (tonal->count==0) tonal->mem_fill = 240; - downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C); + tonal->hp_ener_accum += downmix_and_resample(downmix, x, &tonal->inmem[tonal->mem_fill], tonal->downmix_state, + IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs); if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) { tonal->mem_fill += len; @@ -236,6 +355,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt RESTORE_STACK; return; } + hp_ener = tonal->hp_ener_accum; info = &tonal->info[tonal->write_pos++]; if (tonal->write_pos>=DETECT_SIZE) tonal->write_pos-=DETECT_SIZE; @@ -254,7 +374,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt } OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); - downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C); + tonal->hp_ener_accum = downmix_and_resample(downmix, x, &tonal->inmem[240], tonal->downmix_state, + remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); tonal->mem_fill = 240 + remaining; opus_fft(kfft, in, out, tonal->arch); #ifndef FIXED_POINT @@ -286,24 +407,31 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt d_angle2 = angle2 - angle; d2_angle2 = d_angle2 - d_angle; - mod1 = d2_angle - (float)floor(.5+d2_angle); + mod1 = d2_angle - (float)float2int(d2_angle); noisiness[i] = ABS16(mod1); mod1 *= mod1; mod1 *= mod1; - mod2 = d2_angle2 - (float)floor(.5+d2_angle2); + mod2 = d2_angle2 - (float)float2int(d2_angle2); noisiness[i] += ABS16(mod2); mod2 *= mod2; mod2 *= mod2; - avg_mod = .25f*(d2A[i]+2.f*mod1+mod2); + avg_mod = 
.25f*(d2A[i]+mod1+2*mod2); + /* This introduces an extra delay of 2 frames in the detection. */ tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; + /* No delay on this detection, but it's less reliable. */ + tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f; A[i] = angle2; dA[i] = d_angle2; d2A[i] = mod2; } - + for (i=2;i<N2-1;i++) + { + float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1])); + tonality[i] = .9*MAX32(tonality[i], tt-.1); + } frame_tonality = 0; max_frame_tonality = 0; /*tw_sum = 0;*/ @@ -334,7 +462,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt binE *= 5.55e-17f; #endif E += binE; - tE += binE*tonality[i]; + tE += binE*MAX32(0, tonality[i]); nE += binE*2.f*(.5f-noisiness[i]); } #ifndef FIXED_POINT @@ -352,14 +480,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt frame_loudness += (float)sqrt(E+1e-10f); logE[b] = (float)log(E+1e-10f); - tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f); - tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f); - if (tonal->highE[b] < tonal->lowE[b]+1.f) + tonal->logE[tonal->E_count][b] = logE[b]; + if (tonal->count==0) + tonal->highE[b] = tonal->lowE[b] = logE[b]; + if (tonal->highE[b] > tonal->lowE[b] + 7.5) + { + if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) + tonal->highE[b] -= .01; + else + tonal->lowE[b] += .01; + } + if (logE[b] > tonal->highE[b]) + { + tonal->highE[b] = logE[b]; + tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); + } else if (logE[b] < tonal->lowE[b]) { - tonal->highE[b]+=.5f; - tonal->lowE[b]-=.5f; + tonal->lowE[b] = logE[b]; + tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); } - relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]); + relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b])); L1=L2=0; for (i=0;i<NB_FRAMES;i++) @@ -391,6 +531,26 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const 
CELTMode *celt tonal->prev_band_tonality[b] = band_tonality[b]; } + for (i=0;i<NB_FRAMES;i++) + { + int j; + float mindist = 1e15; + for (j=0;j<NB_FRAMES;j++) + { + int k; + float dist=0; + for (k=0;k<NB_TBANDS;k++) + { + float tmp; + tmp = tonal->logE[i][k] - tonal->logE[j][k]; + dist += tmp*tmp; + } + if (j!=i) + mindist = MIN32(mindist, dist); + } + spec_variability += mindist; + } + spec_variability = sqrt(spec_variability/NB_FRAMES/NB_TBANDS); bandwidth_mask = 0; bandwidth = 0; maxE = 0; @@ -399,13 +559,13 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt noise_floor *= 1<<(15+SIG_SHIFT); #endif noise_floor *= noise_floor; - for (b=0;b<NB_TOT_BANDS;b++) + for (b=0;b<NB_TBANDS;b++) { float E=0; int band_start, band_end; /* Keep a margin of 300 Hz for aliasing */ - band_start = extra_bands[b]; - band_end = extra_bands[b+1]; + band_start = tbands[b]; + band_end = tbands[b+1]; for (i=band_start;i<band_end;i++) { float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r @@ -422,14 +582,31 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt 2) less than 90 dB below the peak band (maximal masking possible considering both the ATH and the loudness-dependent slope of the spreading function) 3) above the PCM quantization noise floor + We use b+1 because the first CELT band isn't included in tbands[] */ if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start)) - bandwidth = b; + bandwidth = b+1; + } + /* Special case for the last two bands, for which we don't have spectrum but only + the energy above 12 kHz. */ + { + float E = hp_ener*(1./(240*240)); +#ifdef FIXED_POINT + /* silk_resampler_down2_hp() shifted right by an extra 8 bits. 
*/ + E *= ((opus_int32)1 << 2*SIG_SHIFT)*256.f; +#endif + maxE = MAX32(maxE, E); + tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); + E = MAX32(E, tonal->meanE[b]); + /* Use a simple follower with 13 dB/Bark slope for spreading function */ + bandwidth_mask = MAX32(.05f*bandwidth_mask, E); + if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160) + bandwidth = 20; } if (tonal->count<=2) bandwidth = 20; frame_loudness = 20*(float)log10(frame_loudness); - tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness); + tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); tonal->lowECount *= (1-alphaE); if (frame_loudness < tonal->Etracker-30) tonal->lowECount += alphaE; @@ -441,6 +618,13 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt sum += dct_table[i*16+b]*logE[b]; BFCC[i] = sum; } + for (i=0;i<8;i++) + { + float sum=0; + for (b=0;b<16;b++) + sum += dct_table[i*16+b]*.5*(tonal->highE[b]+tonal->lowE[b]); + midE[i] = sum; + } frame_stationarity /= NB_TBANDS; relativeE /= NB_TBANDS; @@ -460,7 +644,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt info->tonality_slope = slope; tonal->E_count = (tonal->E_count+1)%NB_FRAMES; - tonal->count++; + tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX); info->tonality = frame_tonality; for (i=0;i<4;i++) @@ -479,6 +663,8 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt for (i=0;i<9;i++) tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; } + for (i=0;i<4;i++) + features[i] = BFCC[i]-midE[i]; for (i=0;i<8;i++) { @@ -489,6 +675,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt } for (i=0;i<9;i++) features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; + features[18] = spec_variability-.78;; features[20] = info->tonality - 0.154723; features[21] = info->activity - 0.724643; features[22] = frame_stationarity - 0.743717; @@ -503,8 
+690,6 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt /* Probability of active audio (as opposed to silence) */ frame_probs[1] = .5f*frame_probs[1]+.5f; frame_probs[1] *= frame_probs[1]; - /* Consider that silence has a 50-50 probability. */ - frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f; /* Probability of speech or music vs noise */ info->activity_probability = frame_probs[1]; @@ -527,12 +712,32 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt float music0; float p, q; + /* More silence transitions for speech than for music. */ + tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob); + p = MAX16(.05f,MIN16(.95f,frame_probs[1])); + q = MAX16(.05f,MIN16(.95f,tonal->vad_prob)); + beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); + /* p0 and p1 are the probabilities of speech and music at this frame + using only information from previous frame and applying the + state transition model */ + p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau; + p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau; + /* We apply the current probability with exponent beta to work around + the fact that the probability estimates aren't independent. */ + p0 *= (float)pow(1-frame_probs[1], beta); + p1 *= (float)pow(frame_probs[1], beta); + /* Normalise the probabilities to get the Marokv probability of music. */ + tonal->vad_prob = p1/(p0+p1); + info->vad_prob = tonal->vad_prob; + /* Consider that silence has a 50-50 probability of being speech or music. 
*/ + frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f; + /* One transition every 3 minutes of active audio */ - tau = .00005f*frame_probs[1]; + tau = .0001f; /* Adapt beta based on how "unexpected" the new prob is */ p = MAX16(.05f,MIN16(.95f,frame_probs[0])); q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); - beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); + beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p)); /* p0 and p1 are the probabilities of speech and music at this frame using only information from previous frame and applying the state transition model */ @@ -546,6 +751,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt tonal->music_prob = p1/(p0+p1); info->music_prob = tonal->music_prob; + /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/ /* This chunk of code deals with delayed decision. */ psum=1e-20f; /* Instantaneous probability of speech and music, with beta pre-applied. */ @@ -611,15 +817,15 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt tonal->speech_confidence = .1f; } } - if (tonal->last_music != (tonal->music_prob>.5f)) - tonal->last_transition=0; tonal->last_music = tonal->music_prob>.5f; #else info->music_prob = 0; #endif - /*for (i=0;i<25;i++) +#ifdef MLP_TRAINING + for (i=0;i<25;i++) printf("%f ", features[i]); - printf("\n");*/ + printf("\n"); +#endif info->bandwidth = bandwidth; /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ @@ -635,17 +841,18 @@ void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, co int offset; int pcm_len; + analysis_frame_size -= analysis_frame_size&1; if (analysis_pcm != NULL) { /* Avoid overflow/wrap-around of the analysis buffer */ - analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size); + analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size); pcm_len = analysis_frame_size - analysis->analysis_offset; offset = 
analysis->analysis_offset; while (pcm_len>0) { - tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix); - offset += 480; - pcm_len -= 480; + tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix); + offset += Fs/50; + pcm_len -= Fs/50; } analysis->analysis_offset = analysis_frame_size; |