Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ else()
add_subdirectory(server)
add_subdirectory(quantize)
add_subdirectory(vad-speech-segments)
add_subdirectory(stream-resumable)
if (WHISPER_SDL2)
add_subdirectory(stream)
add_subdirectory(command)
Expand Down
8 changes: 8 additions & 0 deletions examples/stream-resumable/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
set(TARGET whisper-stream-resumable)
add_executable(${TARGET} stream-resumable.cpp)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

install(TARGETS ${TARGET} RUNTIME)
71 changes: 71 additions & 0 deletions examples/stream-resumable/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# whisper.cpp/examples/stream-resumable

Reference for the **resumable / asynchronous streaming** API: feed audio
incrementally and transcribe it next to recording, at full quality, without the
sliding-window re-decoding (and output divergence) of `examples/stream`.

Each 30s window is decoded exactly once; the seek position and rolling text
context are persisted in the `whisper_state` across calls, so already-emitted
segments are never revised. The output is consistent with a single batch run.

## API

```c
whisper_resumable_reset_with_state(ctx, state);

// as audio arrives (16 kHz mono f32):
whisper_append_audio_with_state(ctx, state, pcm, n);
int n_new = whisper_full_resumable_with_state(ctx, state, params, /*finalize=*/false);
// ... consume the n_new newly produced segments ...

// when the stream ends, flush the trailing (< 30s) window:
whisper_full_resumable_with_state(ctx, state, params, /*finalize=*/true);
```

`whisper_full_resumable_with_state` decodes every **complete** 30s window
currently available and returns the number of new segments. When less than 30s
of undecoded audio is available (and `finalize == false`) it returns 0 without
decoding a partial window ("need more audio").

## Threading

The model weights (`whisper_context`) are shared read-only; each concurrent
stream uses its own `whisper_state` (allocated with `whisper_init_state`). This
example runs one **producer** thread (audio source) and one **worker** thread
(inference) connected by a PCM queue, so transcription is decoupled from
capture. In your application, replace the file-reading producer with your
microphone / network source.

## Mel normalization

`whisper_full_params.mel_norm_mode` selects how the log-mel is normalized:

- `WHISPER_MEL_NORM_GLOBAL` (default): normalize against the maximum seen across
all audio appended so far. This matches `whisper_full()` / batch behavior
exactly only when the whole signal is appended before the first decode; when
decoding incrementally, early windows use the running max rather than the
whole-signal max (the difference is negligible for typical speech).
- `WHISPER_MEL_NORM_WINDOW`: normalize each window against a reference level
with an envelope follower — instantaneous attack (so loud passages never
over-drive) and an exponential release with a half-life in audio seconds
(`mel_norm_half_life`), so a brief silence does not amplify background noise
and a steady background source that stops is forgotten only gradually. Useful
for live audio with varying levels.

## Usage

```bash
# build (no SDL required)
cmake -B build -DWHISPER_BUILD_EXAMPLES=ON
cmake --build build --target whisper-stream-resumable -j

# transcribe a 16 kHz mono WAV, simulating a live source in real time
./build/bin/whisper-stream-resumable \
-m models/ggml-base.en.bin -f samples/jfk.wav --realtime

# live mel normalization with a 2s release half-life
./build/bin/whisper-stream-resumable \
-m models/ggml-base.en.bin -f samples/jfk.wav --window-norm --half-life 2.0
```

Run with `--help` for all options.
229 changes: 229 additions & 0 deletions examples/stream-resumable/stream-resumable.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
// Resumable / asynchronous streaming transcription example.
//
// This demonstrates the resumable whisper API:
// - whisper_append_audio_with_state() feed PCM incrementally
// - whisper_full_resumable_with_state(.., false) decode complete 30s windows
// - whisper_full_resumable_with_state(.., true) flush the trailing window
//
// Unlike examples/stream (which repeatedly calls whisper_full() on a sliding
// window and therefore re-decodes overlapping audio, producing output that
// changes between iterations), this decodes each window exactly once, resumes
// the seek position and the rolling text context from the state, and never
// revises already-emitted segments. The result is consistent with a single
// batch run.
//
// The design that matters for a real application:
// - ONE producer thread captures audio and pushes PCM into a queue.
// - ONE worker thread owns a dedicated whisper_state and runs inference,
// decoupled from capture so transcription can run at full quality while
// recording continues.
// - The model weights (whisper_context) are shared read-only; each worker
// would use its own whisper_state.
//
// Here the "producer" reads a WAV file and (optionally) paces it in real time
// to simulate a live source. In your application, replace the producer with
// your microphone / network audio source and push 16 kHz mono f32 PCM.

#include "common.h"
#include "common-whisper.h"
#include "whisper.h"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

// -----------------------------------------------------------------------------
// A minimal thread-safe PCM queue (single producer, single consumer).
// -----------------------------------------------------------------------------
class pcm_queue {
public:
void push(const float * data, size_t n) {
{
std::lock_guard<std::mutex> lock(mtx_);
buf_.insert(buf_.end(), data, data + n);
}
cv_.notify_one();
}

// Signal that no more audio will arrive.
void close() {
{
std::lock_guard<std::mutex> lock(mtx_);
closed_ = true;
}
cv_.notify_all();
}

// Block until at least one sample is available or the queue is closed.
// Drains everything currently buffered into `out`. Returns false only when
// the queue is closed AND drained (i.e. the stream has ended).
bool pop_all(std::vector<float> & out) {
std::unique_lock<std::mutex> lock(mtx_);
cv_.wait(lock, [&] { return !buf_.empty() || closed_; });
out.assign(buf_.begin(), buf_.end());
buf_.clear();
return !out.empty() || !closed_;
}

private:
std::mutex mtx_;
std::condition_variable cv_;
std::deque<float> buf_;
bool closed_ = false;
};

// -----------------------------------------------------------------------------
struct cli_params {
std::string model = "models/ggml-base.en.bin";
std::string fname;
std::string language = "en";
int n_threads = std::min(4, (int) std::thread::hardware_concurrency());
int chunk_ms = 1000; // how much audio the producer emits per push
bool realtime = false; // pace the producer to wall-clock time
bool window_norm = false; // use the per-window (live) mel normalization
float half_life = 3.0f; // release half-life in seconds (window norm)
bool translate = false;
};

static void print_usage(const char * argv0) {
fprintf(stderr,
"usage: %s -m <model.bin> -f <audio.wav> [options]\n"
" -m, --model PATH model path (default: models/ggml-base.en.bin)\n"
" -f, --file PATH input WAV (16 kHz mono); required\n"
" -l, --language LANG spoken language or 'auto' (default: en)\n"
" -t, --threads N inference threads (default: %d)\n"
" --chunk-ms N producer chunk size in ms (default: 1000)\n"
" --realtime pace the producer to real time (simulate live)\n"
" --window-norm per-window mel normalization (live AGC) instead of global\n"
" --half-life S release half-life in seconds for --window-norm (default: 3.0)\n"
" --translate translate to English\n",
argv0, std::min(4, (int) std::thread::hardware_concurrency()));
}

static bool parse_args(int argc, char ** argv, cli_params & p) {
for (int i = 1; i < argc; i++) {
std::string a = argv[i];
auto next = [&](const char * name) -> std::string {
if (i + 1 >= argc) { fprintf(stderr, "missing value for %s\n", name); exit(1); }
return argv[++i];
};
if (a == "-m" || a == "--model") p.model = next("model");
else if (a == "-f" || a == "--file") p.fname = next("file");
else if (a == "-l" || a == "--language") p.language = next("language");
else if (a == "-t" || a == "--threads") p.n_threads = std::stoi(next("threads"));
else if (a == "--chunk-ms") p.chunk_ms = std::stoi(next("chunk-ms"));
else if (a == "--realtime") p.realtime = true;
else if (a == "--window-norm") p.window_norm = true;
else if (a == "--half-life") p.half_life = std::stof(next("half-life"));
else if (a == "--translate") p.translate = true;
else if (a == "-h" || a == "--help") { print_usage(argv[0]); exit(0); }
else { fprintf(stderr, "unknown argument: %s\n", a.c_str()); return false; }
}
if (p.fname.empty()) { print_usage(argv[0]); return false; }
return true;
}

// Print every segment that was produced since `already_printed`, return the new total.
static int print_new_segments(whisper_state * state, int already_printed) {
const int n = whisper_full_n_segments_from_state(state);
for (int i = already_printed; i < n; i++) {
const int64_t t0 = whisper_full_get_segment_t0_from_state(state, i);
const int64_t t1 = whisper_full_get_segment_t1_from_state(state, i);
const char * text = whisper_full_get_segment_text_from_state(state, i);
printf("[%s --> %s]%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
fflush(stdout);
return n;
}

int main(int argc, char ** argv) {
cli_params p;
if (!parse_args(argc, argv, p)) return 1;

// load the audio up front (the producer thread streams it out below)
std::vector<float> pcm;
std::vector<std::vector<float>> pcms;
if (!read_audio_data(p.fname, pcm, pcms, /*stereo=*/false)) {
fprintf(stderr, "error: failed to read audio '%s'\n", p.fname.c_str());
return 1;
}

// shared, read-only context (model weights)
whisper_context_params cparams = whisper_context_default_params();
whisper_context * ctx = whisper_init_from_file_with_params_no_state(p.model.c_str(), cparams);
if (!ctx) {
fprintf(stderr, "error: failed to load model '%s'\n", p.model.c_str());
return 1;
}

// dedicated inference state for the worker (one per concurrent stream)
whisper_state * state = whisper_init_state(ctx);
if (!state) {
fprintf(stderr, "error: failed to init state\n");
whisper_free(ctx);
return 1;
}

whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = p.translate;
wparams.language = p.language.c_str();
wparams.n_threads = p.n_threads;
wparams.no_context = false; // carry rolling context across windows
if (p.window_norm) {
wparams.mel_norm_mode = WHISPER_MEL_NORM_WINDOW;
wparams.mel_norm_half_life = p.half_life;
} else {
wparams.mel_norm_mode = WHISPER_MEL_NORM_GLOBAL;
}

whisper_resumable_reset_with_state(ctx, state);

pcm_queue queue;
std::atomic<bool> failed{false};

// ---- worker thread: append audio + decode complete windows as they arrive ----
std::thread worker([&] {
int printed = 0;
std::vector<float> chunk;
while (queue.pop_all(chunk)) {
if (chunk.empty()) continue; // woke up but nothing buffered yet
if (whisper_append_audio_with_state(ctx, state, chunk.data(), (int) chunk.size()) != 0) {
failed = true; return;
}
const int ret = whisper_full_resumable_with_state(ctx, state, wparams, /*finalize=*/false);
if (ret < 0) { failed = true; return; }
printed = print_new_segments(state, printed);
}
// stream ended: flush the trailing (< 30s) window
const int ret = whisper_full_resumable_with_state(ctx, state, wparams, /*finalize=*/true);
if (ret < 0) { failed = true; return; }
print_new_segments(state, printed);
});

// ---- producer: stream the file out in chunks (this is where a mic would feed in) ----
const int chunk_n = (p.chunk_ms * WHISPER_SAMPLE_RATE) / 1000;
for (size_t off = 0; off < pcm.size(); off += chunk_n) {
const size_t n = std::min((size_t) chunk_n, pcm.size() - off);
queue.push(pcm.data() + off, n);
if (p.realtime) {
std::this_thread::sleep_for(std::chrono::milliseconds((1000 * (int) n) / WHISPER_SAMPLE_RATE));
}
}
queue.close();

worker.join();

whisper_free_state(state);
whisper_free(ctx);

return failed ? 2 : 0;
}
Loading