Skip to content

Commit 03c5b4d

Browse files
committed
Audio: MFCC: Add Voice Activity Detection based on Mel spectrum
This patch adds a new mfcc_vad module. It operates on the Mel log spectrum values produced by the MFCC component. The VAD is very simple and not very selective for voice vs. other signals. However, the continuously updated background noise estimate prevents stationary noises from triggering the VAD. The algorithm tracks a per-bin noise floor (instant-down, slow-rise) and computes an A-weighted energy delta. The weighting emphasizes speech frequencies. Speech is declared when the delta exceeds a threshold (0.35 in Q9.23), with a 20-frame hangover to prevent rapid toggling. The VAD flag is inserted into the output stream as the first value after the magic header word in all format paths (S16, S24, S32). A new Kconfig option CONFIG_COMP_MFCC_VAD (depends on COMP_MFCC, default y) gates compilation of the VAD code and the stream format change. The README.txt file is updated to show how to run the example Python script sof_mel_to_text_live_dsp_vad.py. The script uses the MFCC Mel spectrum data and the VAD flag stream as audio features for the Whisper speech-to-text model. The README formatting is changed to Markdown (md). Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent e35a7ef commit 03c5b4d

12 files changed

Lines changed: 1015 additions & 65 deletions

File tree

src/arch/host/configs/library_defconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y
1111
CONFIG_COMP_IGO_NR=y
1212
CONFIG_COMP_LEVEL_MULTIPLIER=y
1313
CONFIG_COMP_MFCC=y
14+
CONFIG_COMP_MFCC_VAD=y
1415
CONFIG_COMP_MODULE_ADAPTER=y
1516
CONFIG_COMP_MULTIBAND_DRC=y
1617
CONFIG_COMP_MUX=y

src/audio/mfcc/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
55
add_dependencies(app mfcc)
66
else()
77
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
8+
if(CONFIG_COMP_MFCC_VAD)
9+
add_local_sources(sof mfcc_vad.c)
10+
endif()
811
endif()

src/audio/mfcc/Kconfig

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,14 @@ config COMP_MFCC
2424
The characteristic of the audio features are defined in the binary
2525
control blob. Directory tools/tune/mfcc contains a tool to create
2626
the configurations.
27+
28+
config COMP_MFCC_VAD
29+
bool "MFCC Voice Activity Detection"
30+
depends on COMP_MFCC
31+
default y
32+
help
33+
This option enables a Voice Activity Detector (VAD) that operates
34+
on the Mel spectrum values produced by the MFCC component. The VAD
35+
flag is inserted into the output stream as the first int32_t value
36+
after the magic header word. The VAD tracks a per-bin noise floor
37+
and detects speech using a weighted energy delta with hangover.

src/audio/mfcc/mfcc_common.c

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
#include <stddef.h>
2222
#include <stdint.h>
2323

24+
#ifdef CONFIG_COMP_MFCC_VAD
25+
#include <sof/audio/mfcc/mfcc_vad.h>
26+
#endif
27+
2428
LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
2529

2630
/*
@@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
144148
sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
145149
}
146150

151+
#ifdef CONFIG_COMP_MFCC_VAD
152+
/* Run VAD on the mel log spectrum before further processing */
153+
state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32);
154+
#endif
147155
/* Store Q9.7 version in mel_spectra for s16 output mode */
148156
for (j = 0; j < state->dct.num_in; j++)
149157
state->mel_spectra->data[j] =
@@ -282,10 +290,14 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
282290

283291
/* If new output produced, set up pointer into scratch data and mark magic pending */
284292
if (num_ceps > 0) {
285-
if (state->mel_only)
293+
if (state->mel_only) {
286294
state->out_data_ptr = state->mel_spectra->data;
287-
else
295+
#ifdef CONFIG_COMP_MFCC_VAD
296+
state->vad_pending = true;
297+
#endif
298+
} else {
288299
state->out_data_ptr = state->cepstral_coef->data;
300+
}
289301

290302
state->out_remain = num_ceps;
291303
state->magic_pending = true;
@@ -301,6 +313,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
301313
state->magic_pending = false;
302314
}
303315

316+
#ifdef CONFIG_COMP_MFCC_VAD
317+
/* Write VAD flag as first value after magic (as two int16_t = one int32_t) */
318+
if (state->vad_pending && sink_samples >= 2) {
319+
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag);
320+
sink_samples -= 2;
321+
state->vad_pending = false;
322+
}
323+
#endif
324+
304325
/* Write cepstral/mel data from scratch buffer */
305326
to_copy = MIN(state->out_remain, sink_samples);
306327
if (to_copy > 0) {
@@ -386,6 +407,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
386407
state->mel_log_32[k] >>= 8;
387408

388409
state->out_data_ptr_32 = state->mel_log_32;
410+
#ifdef CONFIG_COMP_MFCC_VAD
411+
state->vad_pending = true;
412+
#endif
389413
} else {
390414
state->out_data_ptr = state->cepstral_coef->data;
391415
}
@@ -404,6 +428,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
404428
state->magic_pending = false;
405429
}
406430

431+
#ifdef CONFIG_COMP_MFCC_VAD
432+
/* Write VAD flag as first value after magic */
433+
if (state->vad_pending && sink_samples >= 1) {
434+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
435+
sink_samples -= 1;
436+
state->vad_pending = false;
437+
}
438+
#endif
439+
407440
if (state->mel_only) {
408441
/* Write 32-bit mel data Q9.15, one value per int32_t */
409442
to_copy = MIN(state->out_remain, sink_samples);
@@ -461,6 +494,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
461494
if (num_ceps > 0) {
462495
if (state->mel_only) {
463496
state->out_data_ptr_32 = state->mel_log_32;
497+
#ifdef CONFIG_COMP_MFCC_VAD
498+
state->vad_pending = true;
499+
#endif
464500
} else {
465501
state->out_data_ptr = state->cepstral_coef->data;
466502
}
@@ -479,6 +515,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
479515
state->magic_pending = false;
480516
}
481517

518+
#ifdef CONFIG_COMP_MFCC_VAD
519+
/* Write VAD flag as first value after magic */
520+
if (state->vad_pending && sink_samples >= 1) {
521+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
522+
sink_samples -= 1;
523+
state->vad_pending = false;
524+
}
525+
#endif
526+
482527
if (state->mel_only) {
483528
/* Write 32-bit mel data Q9.23, one value per int32_t */
484529
to_copy = MIN(state->out_remain, sink_samples);

src/audio/mfcc/mfcc_setup.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
#include <stddef.h>
1919
#include <stdint.h>
2020

21+
#ifdef CONFIG_COMP_MFCC_VAD
22+
#include <sof/audio/mfcc/mfcc_vad.h>
23+
#endif
24+
2125
/* Definitions for cepstral lifter */
2226
#define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
2327
#define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -346,10 +350,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
346350
state->waiting_fill = true;
347351
state->prev_samples_valid = false;
348352
state->magic_pending = false;
353+
#ifdef CONFIG_COMP_MFCC_VAD
354+
state->vad_pending = false;
355+
state->vad_flag = 0;
356+
#endif
349357
state->out_data_ptr = NULL;
350358
state->out_data_ptr_32 = NULL;
351359
state->out_remain = 0;
352360

361+
#ifdef CONFIG_COMP_MFCC_VAD
362+
ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
363+
if (ret < 0) {
364+
comp_err(dev, "Failed VAD init");
365+
goto free_lifter;
366+
}
367+
#endif
368+
353369
comp_dbg(dev, "done");
354370
return 0;
355371

@@ -389,4 +405,8 @@ void mfcc_free_buffers(struct processing_module *mod)
389405
mod_free(mod, cd->state.melfb.data);
390406
mod_free(mod, cd->state.dct.matrix);
391407
mod_free(mod, cd->state.lifter.matrix);
408+
#ifdef CONFIG_COMP_MFCC_VAD
409+
mod_free(mod, cd->vad.noise_floor);
410+
mod_free(mod, cd->vad.weights);
411+
#endif
392412
}

0 commit comments

Comments
 (0)