Skip to content

Commit 043a09b

Browse files
committed
Add streaming Silero VAD runner for real-time speech detection
Add a new `silero_vad_stream_runner` CLI that reads 16kHz mono float32 PCM from stdin and outputs per-frame speech probabilities via a simple line protocol (`PROB <time> <probability>`). This enables real-time VAD as a subprocess for apps like the Voxtral Realtime macOS dictation app. Changes: - Add `reset_stream()` and `process_frame()` to SileroVadRunner for stateful frame-by-frame inference with persistent LSTM state - Add `stream_main.cpp` as the streaming CLI entry point - Update CMakeLists.txt to build both `silero_vad_runner` (offline) and `silero_vad_stream_runner` (streaming) targets - Remove unnecessary `extension_llm_runner` dependency that caused build conflicts with sentencepiece headers - Update Makefile `silero-vad-cpu` target to build both runners with `-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=OFF` - Update README with streaming usage and architecture docs Authored with assistance from Claude. Made-with: Cursor
1 parent edb5e08 commit 043a09b

6 files changed

Lines changed: 262 additions & 98 deletions

File tree

Makefile

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -296,18 +296,21 @@ voxtral_realtime-cuda:
296296
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
297297

298298
# Build both Silero VAD runners (offline WAV runner + streaming stdin runner).
#
# ExecuTorch is configured with EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=OFF, then
# built and installed into cmake-out before the example is configured against it.
#
# The core build's job count is probed portably: `getconf _NPROCESSORS_ONLN`
# works on Linux and macOS, `sysctl -n hw.ncpu` covers BSD-style systems without
# it, and a fixed fallback keeps `--parallel` from receiving an empty argument.
silero-vad-cpu:
	@echo "==> Configuring and installing ExecuTorch (without LLM runner)..."
	cmake --preset llm-release -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=OFF
	cmake --build cmake-out --parallel "$$(getconf _NPROCESSORS_ONLN 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)"
	cmake --install cmake-out
	@echo "==> Building Silero VAD runners (CPU)..."
	cmake -DCMAKE_BUILD_TYPE=Release \
		-DCMAKE_FIND_ROOT_PATH=$(CURDIR)/cmake-out \
		-DCMAKE_PREFIX_PATH=$(CURDIR)/cmake-out \
		-S examples/models/silero_vad \
		-B cmake-out/examples/models/silero_vad
	cmake --build cmake-out/examples/models/silero_vad --target silero_vad_runner silero_vad_stream_runner
	@echo ""
	@echo "✓ Build complete!"
	@echo " Binary: cmake-out/examples/models/silero_vad/silero_vad_runner"
	@echo " Binary: cmake-out/examples/models/silero_vad/silero_vad_stream_runner"
311314

312315
llama-cpu:
313316
@echo "==> Building and installing ExecuTorch..."

examples/models/silero_vad/CMakeLists.txt

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ if(NOT _executorch_imported)
2929
executorch_target_link_options_shared_lib(executorch)
3030
endif()
3131

32-
set(link_libraries executorch gflags)
32+
set(common_link_libraries executorch gflags)
3333

3434
# Common ops for all builds
3535
if(TARGET optimized_native_cpu_ops_lib)
36-
list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
36+
list(APPEND common_link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
3737
get_target_property(_is_imported optimized_native_cpu_ops_lib IMPORTED)
3838
if(NOT _is_imported)
3939
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
@@ -46,7 +46,7 @@ if(TARGET xnnpack_backend)
4646
if(TARGET kleidiai)
4747
list(APPEND xnnpack_backend_libs kleidiai)
4848
endif()
49-
list(APPEND link_libraries ${xnnpack_backend_libs})
49+
list(APPEND common_link_libraries ${xnnpack_backend_libs})
5050
get_target_property(_xnnpack_imported xnnpack_backend IMPORTED)
5151
if(NOT _xnnpack_imported)
5252
executorch_target_link_options_shared_lib(xnnpack_backend)
@@ -55,14 +55,24 @@ endif()
5555

5656
# Needed for cpuinfo where it uses android specific log lib
5757
if(ANDROID)
58-
list(APPEND link_libraries log)
58+
list(APPEND common_link_libraries log)
5959
endif()
6060

6161
# Add the required ExecuTorch extensions
62+
set(silero_runner_link_libraries ${common_link_libraries})
6263
list(
6364
APPEND
64-
link_libraries
65-
extension_llm_runner
65+
silero_runner_link_libraries
66+
extension_module
67+
extension_data_loader
68+
extension_tensor
69+
extension_flat_tensor
70+
)
71+
72+
set(silero_stream_link_libraries ${common_link_libraries})
73+
list(
74+
APPEND
75+
silero_stream_link_libraries
6676
extension_module
6777
extension_data_loader
6878
extension_tensor
@@ -80,5 +90,23 @@ endif()
8090
target_include_directories(
8191
silero_vad_runner PUBLIC ${_common_include_directories}
8292
)
83-
target_link_libraries(silero_vad_runner PUBLIC ${link_libraries})
93+
target_link_libraries(silero_vad_runner PUBLIC ${silero_runner_link_libraries})
8494
target_compile_options(silero_vad_runner PUBLIC ${_common_compile_options})
95+
96+
# Streaming runner: shares silero_vad_runner.cpp with the offline runner and
# adds stream_main.cpp as its entry point.
add_executable(silero_vad_stream_runner stream_main.cpp silero_vad_runner.cpp)

# Non-Debug builds get the project's gc-sections link options (helper defined
# outside this file) and, on toolchains other than Apple/MSVC, pass `-s` to
# the linker.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_link_options_gc_sections(silero_vad_stream_runner)
  if(NOT APPLE AND NOT MSVC)
    target_link_options(silero_vad_stream_runner PRIVATE "LINKER:-s")
  endif()
endif()

# Same include directories, link set and compile options pattern as the
# offline silero_vad_runner target.
target_include_directories(silero_vad_stream_runner PUBLIC ${_common_include_directories})
target_link_libraries(silero_vad_stream_runner PUBLIC ${silero_stream_link_libraries})
target_compile_options(silero_vad_stream_runner PUBLIC ${_common_compile_options})

examples/models/silero_vad/README.md

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,17 @@ Voice activity detection answers "when is someone speaking" — the model output
99
```bash
1010
# Export to .pte
1111
cd examples/models/silero_vad
12-
python export_silero_vad.py --jit-model /path/to/silero_vad.jit
12+
python export_silero_vad.py \
13+
--jit-model /path/to/silero_vad.jit \
14+
--backend xnnpack \
15+
--output-dir ./silero_vad_xnnpack
1316

14-
# Build the C++ runner (from repo root)
17+
# Build the C++ runners (from repo root)
1518
make silero-vad-cpu
1619

17-
# Run VAD
20+
# Run WAV-based VAD
1821
./cmake-out/examples/models/silero_vad/silero_vad_runner \
19-
--model_path examples/models/silero_vad/silero_vad_exports/silero_vad.pte \
22+
--model_path examples/models/silero_vad/silero_vad_xnnpack/silero_vad.pte \
2023
--audio_path /path/to/audio.wav
2124
```
2225

@@ -67,9 +70,9 @@ python export_silero_vad.py --jit-model /path/to/silero-vad/src/silero_vad/data/
6770
| `--backend` | `portable` or `xnnpack` (default: `xnnpack`) |
6871
| `--output-dir` | Output directory (default: `./silero_vad_exports`) |
6972

70-
Output: `silero_vad_exports/silero_vad.pte` (~2 MB).
73+
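Output: `<output-dir>/silero_vad.pte` — for example `silero_vad_xnnpack/silero_vad.pte` when exported with `--output-dir ./silero_vad_xnnpack` as in the quick start above (~1.2 MB with XNNPACK; size may vary by export settings).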
7174

72-
## C++ Runner
75+
## C++ Runners
7376

7477
### Build
7578

@@ -79,16 +82,44 @@ From the repository root:
7982
make silero-vad-cpu
8083
```
8184

82-
Binary: `cmake-out/examples/models/silero_vad/silero_vad_runner`
85+
This builds:
86+
87+
- `cmake-out/examples/models/silero_vad/silero_vad_runner`
88+
- `cmake-out/examples/models/silero_vad/silero_vad_stream_runner`
89+
90+
The build configures and installs ExecuTorch first, then builds the two Silero VAD binaries from `examples/models/silero_vad/`.
8391

8492
### Arguments
8593

94+
#### `silero_vad_runner`
95+
8696
| Argument | Description |
8797
|----------|-------------|
8898
| `--model_path` | Path to `.pte` file (default: `silero_vad.pte`) |
8999
| `--audio_path` | Path to input WAV file (16kHz mono, required) |
90100
| `--threshold` | Speech probability threshold, 0.0–1.0 (default: `0.5`) |
91101

102+
#### `silero_vad_stream_runner`
103+
104+
| Argument | Description |
105+
|----------|-------------|
106+
| `--model_path` | Path to `.pte` file (default: `silero_vad.pte`) |
107+
108+
The stream runner reads raw (headerless) 16 kHz mono `float32` PCM from `stdin` and prints a simple line protocol, with one `PROB` line per processed frame:
109+
110+
```text
111+
READY
112+
PROB <time_seconds> <probability>
113+
```
114+
115+
Example:
116+
117+
```bash
118+
ffmpeg -i input.wav -ar 16000 -ac 1 -f f32le -nostats -loglevel error pipe:1 | \
119+
./cmake-out/examples/models/silero_vad/silero_vad_stream_runner \
120+
--model_path examples/models/silero_vad/silero_vad_xnnpack/silero_vad.pte
121+
```
122+
92123
### How It Works
93124

94125
The model processes audio in 512-sample chunks (32ms at 16kHz). Each chunk is prepended with 64 samples of context from the previous chunk, forming a 576-sample input. The model carries an LSTM hidden state across chunks and outputs a single speech probability per chunk.

examples/models/silero_vad/silero_vad_runner.cpp

Lines changed: 76 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -43,21 +43,88 @@ SileroVadRunner::SileroVadRunner(const std::string& model_path) {
4343
context_size_ = cs.ok() ? cs.get()[0].toInt() : 64;
4444
input_size_ = window_size_ + context_size_;
4545
frame_duration_ = static_cast<double>(window_size_) / sample_rate_;
46+
reset_stream();
47+
}
48+
49+
void SileroVadRunner::reset_stream() {
50+
stream_state_data_.assign(static_cast<size_t>(2 * kHiddenDim), 0.0f);
51+
stream_context_.assign(static_cast<size_t>(context_size_), 0.0f);
52+
stream_input_.assign(static_cast<size_t>(input_size_), 0.0f);
53+
stream_frame_index_ = 0;
54+
}
55+
56+
float SileroVadRunner::process_frame(
57+
const float* audio_data,
58+
int64_t num_samples) {
59+
int64_t chunk_len = std::min(window_size_, num_samples);
60+
61+
std::memcpy(
62+
stream_input_.data(),
63+
stream_context_.data(),
64+
static_cast<size_t>(context_size_) * sizeof(float));
65+
66+
if (chunk_len > 0) {
67+
std::memcpy(
68+
stream_input_.data() + context_size_,
69+
audio_data,
70+
static_cast<size_t>(chunk_len) * sizeof(float));
71+
}
72+
if (chunk_len < window_size_) {
73+
std::memset(
74+
stream_input_.data() + context_size_ + chunk_len,
75+
0,
76+
static_cast<size_t>(window_size_ - chunk_len) * sizeof(float));
77+
}
78+
79+
auto input_tensor = from_blob(
80+
stream_input_.data(),
81+
{1, static_cast<::executorch::aten::SizesType>(input_size_)},
82+
::executorch::aten::ScalarType::Float);
83+
auto state_tensor = from_blob(
84+
stream_state_data_.data(),
85+
{2, 1, static_cast<::executorch::aten::SizesType>(kHiddenDim)},
86+
::executorch::aten::ScalarType::Float);
87+
88+
auto result = model_->execute(
89+
"forward", std::vector<EValue>{input_tensor, state_tensor});
90+
ET_CHECK_MSG(result.ok(), "Silero VAD forward failed.");
91+
92+
auto& outputs = result.get();
93+
float prob = outputs[0].toTensor().const_data_ptr<float>()[0];
94+
95+
auto new_state = outputs[1].toTensor();
96+
std::memcpy(
97+
stream_state_data_.data(),
98+
new_state.const_data_ptr<float>(),
99+
static_cast<size_t>(2 * kHiddenDim) * sizeof(float));
100+
101+
if (chunk_len >= context_size_) {
102+
std::memcpy(
103+
stream_context_.data(),
104+
audio_data + chunk_len - context_size_,
105+
static_cast<size_t>(context_size_) * sizeof(float));
106+
} else if (chunk_len > 0) {
107+
int64_t keep = context_size_ - chunk_len;
108+
std::memmove(
109+
stream_context_.data(),
110+
stream_context_.data() + chunk_len,
111+
static_cast<size_t>(keep) * sizeof(float));
112+
std::memcpy(
113+
stream_context_.data() + keep,
114+
audio_data,
115+
static_cast<size_t>(chunk_len) * sizeof(float));
116+
}
117+
118+
stream_frame_index_++;
119+
return prob;
46120
}
47121

48122
SileroVadRunner::Result SileroVadRunner::detect(
49123
const float* audio_data,
50124
int64_t num_samples,
51125
float threshold,
52126
SegmentCallback segment_cb) {
53-
// LSTM state: (2, 1, 128) — [h, c]
54-
std::vector<float> state_data(static_cast<size_t>(2 * kHiddenDim), 0.0f);
55-
56-
// Context: previous chunk's last context_size_ samples
57-
std::vector<float> context(static_cast<size_t>(context_size_), 0.0f);
58-
59-
// Input buffer: [context | chunk] = input_size_ samples
60-
std::vector<float> input(static_cast<size_t>(input_size_));
127+
reset_stream();
61128

62129
bool speech_active = false;
63130
int64_t speech_start_frame = 0;
@@ -66,78 +133,7 @@ SileroVadRunner::Result SileroVadRunner::detect(
66133
int num_segments = 0;
67134

68135
for (int64_t offset = 0; offset < num_samples; offset += window_size_) {
69-
int64_t chunk_len = std::min(window_size_, num_samples - offset);
70-
71-
// Build input: [context | chunk]
72-
std::memcpy(
73-
input.data(),
74-
context.data(),
75-
static_cast<size_t>(context_size_) * sizeof(float));
76-
77-
if (chunk_len == window_size_) {
78-
std::memcpy(
79-
input.data() + context_size_,
80-
audio_data + offset,
81-
static_cast<size_t>(window_size_) * sizeof(float));
82-
} else {
83-
// Pad the last partial chunk with zeros
84-
std::memcpy(
85-
input.data() + context_size_,
86-
audio_data + offset,
87-
static_cast<size_t>(chunk_len) * sizeof(float));
88-
std::memset(
89-
input.data() + context_size_ + chunk_len,
90-
0,
91-
static_cast<size_t>(window_size_ - chunk_len) * sizeof(float));
92-
}
93-
94-
auto input_tensor = from_blob(
95-
input.data(),
96-
{1, static_cast<::executorch::aten::SizesType>(input_size_)},
97-
::executorch::aten::ScalarType::Float);
98-
auto state_tensor = from_blob(
99-
state_data.data(),
100-
{2, 1, static_cast<::executorch::aten::SizesType>(kHiddenDim)},
101-
::executorch::aten::ScalarType::Float);
102-
103-
auto result = model_->execute(
104-
"forward", std::vector<EValue>{input_tensor, state_tensor});
105-
if (!result.ok()) {
106-
ET_LOG(
107-
Error,
108-
"forward failed at offset %lld.",
109-
static_cast<long long>(offset));
110-
break;
111-
}
112-
113-
auto& outputs = result.get();
114-
float prob = outputs[0].toTensor().const_data_ptr<float>()[0];
115-
116-
// Update LSTM state
117-
auto new_state = outputs[1].toTensor();
118-
std::memcpy(
119-
state_data.data(),
120-
new_state.const_data_ptr<float>(),
121-
static_cast<size_t>(2 * kHiddenDim) * sizeof(float));
122-
123-
// Update context from current chunk
124-
if (chunk_len >= context_size_) {
125-
std::memcpy(
126-
context.data(),
127-
audio_data + offset + chunk_len - context_size_,
128-
static_cast<size_t>(context_size_) * sizeof(float));
129-
} else {
130-
// Shift existing context and append partial chunk
131-
int64_t keep = context_size_ - chunk_len;
132-
std::memmove(
133-
context.data(),
134-
context.data() + chunk_len,
135-
static_cast<size_t>(keep) * sizeof(float));
136-
std::memcpy(
137-
context.data() + keep,
138-
audio_data + offset,
139-
static_cast<size_t>(chunk_len) * sizeof(float));
140-
}
136+
float prob = process_frame(audio_data + offset, num_samples - offset);
141137

142138
// Threshold-based speech detection
143139
if (prob > threshold) {

0 commit comments

Comments
 (0)