Skip to content

Commit 1a9c8fc

Browse files
committed
Audio: MFCC: Add Voice Activity Detection based on Mel spectrum
Add an mfcc_vad module with A-weighted, energy-based voice activity detection (VAD) that operates on the Mel log spectrum produced by the MFCC component. The algorithm tracks a per-bin noise floor with instant-down and slow-rise behavior, then computes a weighted energy delta above the floor. Speech is declared when the delta exceeds a threshold (0.35 in Q9.23), with a 20-frame hangover to prevent rapid toggling. The VAD is gated on the new enable_vad flag in sof_mfcc_config.

Add struct mfcc_data_header with six int32 fields (magic, frame_number, reserved, energy, noise_energy, vad_flag), which is prepended to every output frame in all format paths (S16, S24, S32). This replaces the previous magic-word-only header. The header carries the VAD decision and energy values from the DSP to downstream consumers.

Extend sof_mfcc_config in user/mfcc.h with reserved16[3] padding for 32-bit alignment and with the new boolean fields enable_vad, enable_dtx, update_controls, and reserved_bool[5]. The config blob size increases from 104 to 116 bytes.

Update the Matlab/Octave decode scripts (decode_mel.m, decode_ceps.m, decode_all.m) and setup_mfcc.m for the expanded header and config struct. Regenerate the topology2 configuration blobs (default.conf, mel80.conf) with the new blob size.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent e35a7ef commit 1a9c8fc

13 files changed

Lines changed: 550 additions & 72 deletions

File tree

src/audio/mfcc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
44
add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
55
add_dependencies(app mfcc)
66
else()
7-
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
7+
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
88
endif()

src/audio/mfcc/mfcc_common.c

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#include <stddef.h>
2222
#include <stdint.h>
2323

24+
#include <sof/audio/mfcc/mfcc_vad.h>
25+
2426
LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
2527

2628
/*
@@ -169,6 +171,22 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
169171

170172
cc_count += state->dct.num_out;
171173
}
174+
175+
/* Use hop counter for frame numbering (independent of VAD enable) */
176+
state->header.frame_number = state->hop_count;
177+
178+
/* Run VAD on the mel log spectrum (available in both modes) */
179+
if (config->enable_vad) {
180+
mfcc_vad_update(&cd->vad, state->mel_log_32);
181+
182+
/* Populate data header for this output frame */
183+
state->header.energy = cd->vad.energy;
184+
state->header.noise_energy = cd->vad.noise_energy;
185+
state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
186+
}
187+
188+
/* Increment hop counter at end of hop processing */
189+
state->hop_count++;
172190
}
173191

174192
return cc_count;
@@ -267,9 +285,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
267285
struct mfcc_comp_data *cd = module_get_private_data(mod);
268286
struct mfcc_state *state = &cd->state;
269287
struct mfcc_buffer *buf = &cd->state.buf;
270-
uint32_t magic = MFCC_MAGIC;
271288
int16_t *w_ptr = audio_stream_get_wptr(sink);
272-
const int num_magic = 2;
289+
const int num_header_s16 = sizeof(state->header) / sizeof(int16_t);
273290
int num_ceps;
274291
int sink_samples;
275292
int to_copy;
@@ -280,25 +297,27 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
280297
/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
281298
num_ceps = mfcc_stft_process(mod->dev, cd);
282299

283-
/* If new output produced, set up pointer into scratch data and mark magic pending */
300+
/* If new output produced, set up pointer into scratch data and mark header pending */
284301
if (num_ceps > 0) {
285-
if (state->mel_only)
302+
if (state->mel_only) {
286303
state->out_data_ptr = state->mel_spectra->data;
287-
else
304+
} else {
288305
state->out_data_ptr = state->cepstral_coef->data;
306+
}
289307

290308
state->out_remain = num_ceps;
291-
state->magic_pending = true;
309+
state->header_pending = true;
292310
}
293311

294312
/* Write to sink, limited by period size */
295313
sink_samples = frames * audio_stream_get_channels(sink);
296314

297-
/* Write magic word first if pending */
298-
if (state->magic_pending && sink_samples >= num_magic) {
299-
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
300-
sink_samples -= num_magic;
301-
state->magic_pending = false;
315+
/* Write data header first if pending */
316+
if (state->header_pending && sink_samples >= num_header_s16) {
317+
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16,
318+
(int16_t *)&state->header);
319+
sink_samples -= num_header_s16;
320+
state->header_pending = false;
302321
}
303322

304323
/* Write cepstral/mel data from scratch buffer */
@@ -363,9 +382,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
363382
struct mfcc_comp_data *cd = module_get_private_data(mod);
364383
struct mfcc_state *state = &cd->state;
365384
struct mfcc_buffer *buf = &cd->state.buf;
366-
uint32_t magic = MFCC_MAGIC;
367385
int32_t *w_ptr = audio_stream_get_wptr(sink);
368-
const int num_magic = 1; /* one int32_t word for magic */
386+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
369387
int num_ceps;
370388
int sink_samples;
371389
int remain_s32;
@@ -391,17 +409,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
391409
}
392410

393411
state->out_remain = num_ceps;
394-
state->magic_pending = true;
412+
state->header_pending = true;
395413
}
396414

397415
/* Write to sink, limited by period size */
398416
sink_samples = frames * audio_stream_get_channels(sink);
399417

400-
/* Write magic word first if pending */
401-
if (state->magic_pending && sink_samples >= num_magic) {
402-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
403-
sink_samples -= num_magic;
404-
state->magic_pending = false;
418+
/* Write data header first if pending */
419+
if (state->header_pending && sink_samples >= num_header_s32) {
420+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
421+
(int32_t *)&state->header);
422+
sink_samples -= num_header_s32;
423+
state->header_pending = false;
405424
}
406425

407426
if (state->mel_only) {
@@ -443,9 +462,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
443462
struct mfcc_comp_data *cd = module_get_private_data(mod);
444463
struct mfcc_state *state = &cd->state;
445464
struct mfcc_buffer *buf = &cd->state.buf;
446-
uint32_t magic = MFCC_MAGIC;
447465
int32_t *w_ptr = audio_stream_get_wptr(sink);
448-
const int num_magic = 1; /* one int32_t word for magic */
466+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
449467
int num_ceps;
450468
int sink_samples;
451469
int remain_s32;
@@ -466,17 +484,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
466484
}
467485

468486
state->out_remain = num_ceps;
469-
state->magic_pending = true;
487+
state->header_pending = true;
470488
}
471489

472490
/* Write to sink, limited by period size */
473491
sink_samples = frames * audio_stream_get_channels(sink);
474492

475-
/* Write magic word first if pending */
476-
if (state->magic_pending && sink_samples >= num_magic) {
477-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
478-
sink_samples -= num_magic;
479-
state->magic_pending = false;
493+
/* Write data header first if pending */
494+
if (state->header_pending && sink_samples >= num_header_s32) {
495+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
496+
(int32_t *)&state->header);
497+
sink_samples -= num_header_s32;
498+
state->header_pending = false;
480499
}
481500

482501
if (state->mel_only) {

src/audio/mfcc/mfcc_setup.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <stddef.h>
1919
#include <stdint.h>
2020

21+
#include <sof/audio/mfcc/mfcc_vad.h>
22+
2123
/* Definitions for cepstral lifter */
2224
#define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
2325
#define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -127,6 +129,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
127129
return -EINVAL;
128130
}
129131

132+
if (sample_rate > MFCC_MAX_SAMPLE_RATE) {
133+
comp_err(dev, "Sample rate %d exceeds max %d Hz", sample_rate, MFCC_MAX_SAMPLE_RATE);
134+
return -EINVAL;
135+
}
136+
130137
if (config->sample_frequency != sample_rate) {
131138
comp_err(dev, "Config sample_frequency does not match stream");
132139
return -EINVAL;
@@ -328,11 +335,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
328335

329336
/* Check that output data can be drained within the periods spanned by one
330337
* FFT hop. Each hop consumes fft_hop_size input samples and produces
331-
* max_out_per_hop + 2 (magic) int16_t output values. The sink provides at
332-
* least fft_hop_size * channels int16_t samples per hop (worst case s16).
338+
* max_out_per_hop + 12 (magic header) int16_t output values. The sink provides
339+
* at least fft_hop_size * channels int16_t samples per hop (worst case s16).
333340
* If output exceeds this, data accumulates and will eventually overflow.
334341
*/
335-
int out_per_hop = max_out_per_hop + 2;
342+
int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t);
336343
int sink_per_hop = fft->fft_hop_size * channels;
337344

338345
if (out_per_hop > sink_per_hop) {
@@ -345,11 +352,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
345352
/* Set initial state for STFT */
346353
state->waiting_fill = true;
347354
state->prev_samples_valid = false;
348-
state->magic_pending = false;
355+
state->header_pending = false;
356+
state->hop_count = 0;
357+
memset(&state->header, 0, sizeof(state->header));
358+
state->header.magic = MFCC_MAGIC;
349359
state->out_data_ptr = NULL;
350360
state->out_data_ptr_32 = NULL;
351361
state->out_remain = 0;
352362

363+
if (config->enable_vad) {
364+
ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
365+
if (ret < 0) {
366+
comp_err(dev, "Failed VAD init");
367+
goto free_lifter;
368+
}
369+
}
370+
353371
comp_dbg(dev, "done");
354372
return 0;
355373

@@ -389,4 +407,6 @@ void mfcc_free_buffers(struct processing_module *mod)
389407
mod_free(mod, cd->state.melfb.data);
390408
mod_free(mod, cd->state.dct.matrix);
391409
mod_free(mod, cd->state.lifter.matrix);
410+
mod_free(mod, cd->vad.noise_floor);
411+
mod_free(mod, cd->vad.weights);
392412
}

0 commit comments

Comments
 (0)