Skip to content

Commit 7d89764

Browse files
committed
Audio: MFCC: Add Voice Activity Detection based on Mel spectrum
Add mfcc_vad module with A-weighted energy-based voice activity detection that operates on the Mel log spectrum produced by the MFCC component. The algorithm tracks a per-bin noise floor with instant-down and slow-rise behavior, then computes a weighted energy delta above the floor. Speech is declared when the delta exceeds a threshold (0.35 in Q9.23) with a 20-frame hangover to prevent rapid toggling. The VAD is gated on the new enable_vad flag in sof_mfcc_config.

Add struct mfcc_data_header with six int32 fields (magic, frame_number, reserved, energy, noise_energy, vad_flag) prepended to every output frame in all format paths (S16, S24, S32). This replaces the previous magic-word-only header. The header carries the VAD decision and energy values from the DSP for downstream consumers.

Extend sof_mfcc_config in user/mfcc.h with reserved16[3] padding for 32-bit alignment, and new boolean fields enable_vad, enable_dtx, update_controls, and reserved_bool[5]. The config blob size increases from 104 to 116 bytes.

Update Matlab/Octave decode scripts (decode_mel.m, decode_ceps.m, decode_all.m) and setup_mfcc.m for the expanded header and config struct. Regenerate topology2 configuration blobs (default.conf, mel80.conf) with the new blob size.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent e35a7ef commit 7d89764

13 files changed

Lines changed: 559 additions & 73 deletions

File tree

src/audio/mfcc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
44
add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
55
add_dependencies(app mfcc)
66
else()
7-
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
7+
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
88
endif()

src/audio/mfcc/mfcc_common.c

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#include <stddef.h>
2222
#include <stdint.h>
2323

24+
#include <sof/audio/mfcc/mfcc_vad.h>
25+
2426
LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
2527

2628
/*
@@ -169,6 +171,18 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
169171

170172
cc_count += state->dct.num_out;
171173
}
174+
175+
/* Run VAD on the mel log spectrum (available in both modes) */
176+
if (config->enable_vad)
177+
mfcc_vad_update(&cd->vad, state->mel_log_32);
178+
179+
/* Populate data header for this output frame */
180+
state->header.magic = MFCC_MAGIC;
181+
state->header.frame_number = cd->vad.frame_count;
182+
state->header.reserved = 0;
183+
state->header.energy = cd->vad.energy;
184+
state->header.noise_energy = cd->vad.noise_energy;
185+
state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
172186
}
173187

174188
return cc_count;
@@ -267,9 +281,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
267281
struct mfcc_comp_data *cd = module_get_private_data(mod);
268282
struct mfcc_state *state = &cd->state;
269283
struct mfcc_buffer *buf = &cd->state.buf;
270-
uint32_t magic = MFCC_MAGIC;
271284
int16_t *w_ptr = audio_stream_get_wptr(sink);
272-
const int num_magic = 2;
285+
const int num_header_s16 = sizeof(state->header) / sizeof(int16_t);
273286
int num_ceps;
274287
int sink_samples;
275288
int to_copy;
@@ -280,25 +293,27 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
280293
/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
281294
num_ceps = mfcc_stft_process(mod->dev, cd);
282295

283-
/* If new output produced, set up pointer into scratch data and mark magic pending */
296+
/* If new output produced, set up pointer into scratch data and mark header pending */
284297
if (num_ceps > 0) {
285-
if (state->mel_only)
298+
if (state->mel_only) {
286299
state->out_data_ptr = state->mel_spectra->data;
287-
else
300+
} else {
288301
state->out_data_ptr = state->cepstral_coef->data;
302+
}
289303

290304
state->out_remain = num_ceps;
291-
state->magic_pending = true;
305+
state->header_pending = true;
292306
}
293307

294308
/* Write to sink, limited by period size */
295309
sink_samples = frames * audio_stream_get_channels(sink);
296310

297-
/* Write magic word first if pending */
298-
if (state->magic_pending && sink_samples >= num_magic) {
299-
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
300-
sink_samples -= num_magic;
301-
state->magic_pending = false;
311+
/* Write data header first if pending */
312+
if (state->header_pending && sink_samples >= num_header_s16) {
313+
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16,
314+
(int16_t *)&state->header);
315+
sink_samples -= num_header_s16;
316+
state->header_pending = false;
302317
}
303318

304319
/* Write cepstral/mel data from scratch buffer */
@@ -363,9 +378,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
363378
struct mfcc_comp_data *cd = module_get_private_data(mod);
364379
struct mfcc_state *state = &cd->state;
365380
struct mfcc_buffer *buf = &cd->state.buf;
366-
uint32_t magic = MFCC_MAGIC;
367381
int32_t *w_ptr = audio_stream_get_wptr(sink);
368-
const int num_magic = 1; /* one int32_t word for magic */
382+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
369383
int num_ceps;
370384
int sink_samples;
371385
int remain_s32;
@@ -391,17 +405,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
391405
}
392406

393407
state->out_remain = num_ceps;
394-
state->magic_pending = true;
408+
state->header_pending = true;
395409
}
396410

397411
/* Write to sink, limited by period size */
398412
sink_samples = frames * audio_stream_get_channels(sink);
399413

400-
/* Write magic word first if pending */
401-
if (state->magic_pending && sink_samples >= num_magic) {
402-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
403-
sink_samples -= num_magic;
404-
state->magic_pending = false;
414+
/* Write data header first if pending */
415+
if (state->header_pending && sink_samples >= num_header_s32) {
416+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
417+
(int32_t *)&state->header);
418+
sink_samples -= num_header_s32;
419+
state->header_pending = false;
405420
}
406421

407422
if (state->mel_only) {
@@ -443,9 +458,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
443458
struct mfcc_comp_data *cd = module_get_private_data(mod);
444459
struct mfcc_state *state = &cd->state;
445460
struct mfcc_buffer *buf = &cd->state.buf;
446-
uint32_t magic = MFCC_MAGIC;
447461
int32_t *w_ptr = audio_stream_get_wptr(sink);
448-
const int num_magic = 1; /* one int32_t word for magic */
462+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
449463
int num_ceps;
450464
int sink_samples;
451465
int remain_s32;
@@ -466,17 +480,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
466480
}
467481

468482
state->out_remain = num_ceps;
469-
state->magic_pending = true;
483+
state->header_pending = true;
470484
}
471485

472486
/* Write to sink, limited by period size */
473487
sink_samples = frames * audio_stream_get_channels(sink);
474488

475-
/* Write magic word first if pending */
476-
if (state->magic_pending && sink_samples >= num_magic) {
477-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
478-
sink_samples -= num_magic;
479-
state->magic_pending = false;
489+
/* Write data header first if pending */
490+
if (state->header_pending && sink_samples >= num_header_s32) {
491+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
492+
(int32_t *)&state->header);
493+
sink_samples -= num_header_s32;
494+
state->header_pending = false;
480495
}
481496

482497
if (state->mel_only) {

src/audio/mfcc/mfcc_setup.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <stddef.h>
1919
#include <stdint.h>
2020

21+
#include <sof/audio/mfcc/mfcc_vad.h>
22+
2123
/* Definitions for cepstral lifter */
2224
#define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
2325
#define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -332,7 +334,7 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
332334
* least fft_hop_size * channels int16_t samples per hop (worst case s16).
333335
* If output exceeds this, data accumulates and will eventually overflow.
334336
*/
335-
int out_per_hop = max_out_per_hop + 2;
337+
int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t);
336338
int sink_per_hop = fft->fft_hop_size * channels;
337339

338340
if (out_per_hop > sink_per_hop) {
@@ -345,11 +347,20 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
345347
/* Set initial state for STFT */
346348
state->waiting_fill = true;
347349
state->prev_samples_valid = false;
348-
state->magic_pending = false;
350+
state->header_pending = false;
351+
memset(&state->header, 0, sizeof(state->header));
349352
state->out_data_ptr = NULL;
350353
state->out_data_ptr_32 = NULL;
351354
state->out_remain = 0;
352355

356+
if (config->enable_vad) {
357+
ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
358+
if (ret < 0) {
359+
comp_err(dev, "Failed VAD init");
360+
goto free_lifter;
361+
}
362+
}
363+
353364
comp_dbg(dev, "done");
354365
return 0;
355366

@@ -389,4 +400,6 @@ void mfcc_free_buffers(struct processing_module *mod)
389400
mod_free(mod, cd->state.melfb.data);
390401
mod_free(mod, cd->state.dct.matrix);
391402
mod_free(mod, cd->state.lifter.matrix);
403+
mod_free(mod, cd->vad.noise_floor);
404+
mod_free(mod, cd->vad.weights);
392405
}

0 commit comments

Comments
 (0)