about | summary | refs | log | tree | commit | diff
path: root/src/analysis.h
diff options
context:
space:
mode:
author: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-07-12 16:55:28 -0400
committer: Jean-Marc Valin <jmvalin@jmvalin.ca> 2017-10-05 17:40:27 -0400
commit: af93fbd55fd5c23a2492166816311d9f67df1b24 (patch)
tree: 7221fd8dd284dd593e4b3eb1a3ed9cee3b4fc926 /src/analysis.h
parent: f3cff05eeb83ec8c055b7331338d705af220358d (diff)
download: libopus-af93fbd55fd5c23a2492166816311d9f67df1b24.tar.gz
Add RNN for VAD and speech/music classification
Based on two dense layers with a GRU layer in the middle
Diffstat (limited to 'src/analysis.h')
-rw-r--r-- src/analysis.h 15
1 files changed, 2 insertions, 13 deletions
diff --git a/src/analysis.h b/src/analysis.h
index cac51dfa..289c845e 100644
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -30,6 +30,7 @@
#include "celt.h"
#include "opus_private.h"
+#include "mlp.h"
#define NB_FRAMES 8
#define NB_TBANDS 18
@@ -64,28 +65,16 @@ typedef struct {
float mem[32];
float cmean[8];
float std[9];
- float music_prob;
- float vad_prob;
float Etracker;
float lowECount;
int E_count;
- int last_music;
int count;
int analysis_offset;
- /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).
- pspeech[0] is the probability that all frames in the window are speech. */
- float pspeech[DETECT_SIZE];
- /** Probability of having music for time i to DETECT_SIZE-1 (and speech before).
- pmusic[0] is the probability that all frames in the window are music. */
- float pmusic[DETECT_SIZE];
- float speech_confidence;
- float music_confidence;
- int speech_confidence_count;
- int music_confidence_count;
int write_pos;
int read_pos;
int read_subframe;
float hp_ener_accum;
+ float rnn_state[MAX_NEURONS];
opus_val32 downmix_state[3];
AnalysisInfo info[DETECT_SIZE];
} TonalityAnalysisState;