Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in
# build
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)
option(WHISPER_DIARIZE "whisper: enable speaker diarization" OFF)

# sanitizers
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
Expand Down
21 changes: 17 additions & 4 deletions examples/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,11 @@ options:
-tpi, --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-di, --diarize [false ] enable speaker diarization
--diarize-model FNAME [ ] speaker embedding model path (GGML .bin)
--diarize-threshold N [0.50 ] clustering distance threshold
--diarize-speakers N [0 ] target speaker count (0 = auto)
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-otxt, --output-txt [false ] output result in a text file
-ovtt, --output-vtt [false ] output result in a vtt file
Expand Down Expand Up @@ -62,5 +65,15 @@ options:
--suppress-regex REGEX [ ] regular expression matching tokens to suppress
--grammar GRAMMAR [ ] GBNF grammar to guide decoding
--grammar-rule RULE [ ] top-level GBNF grammar rule name
--grammar-penalty N [100.0 ] scales down logits of nongrammar tokens
```
--grammar-penalty N [100.0 ] scales down logits of nongrammar tokens
```

Model-based diarization uses the ECAPA-TDNN speaker embedding model produced by
`models/convert-speaker-to-ggml.py`:

```
python models/convert-speaker-to-ggml.py --output models/ggml-speaker-ecapa-tdnn.bin
./build/bin/whisper-cli -m models/ggml-base.en.bin \
--diarize --diarize-model models/ggml-speaker-ecapa-tdnn.bin \
-f input.wav
```
157 changes: 98 additions & 59 deletions examples/cli/cli.cpp

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ options:
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-di, --diarize [false ] enable speaker diarization
--diarize-model FNAME [ ] speaker embedding model path (GGML .bin)
--diarize-threshold N [0.50 ] clustering distance threshold
--diarize-speakers N [0 ] target speaker count (0 = auto)
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-ps, --print-special [false ] print special tokens
Expand Down Expand Up @@ -69,6 +72,10 @@ Voice Activity Detection (VAD) options:
> [!WARNING]
> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**

When using diarization over HTTP, `diarize_model` is a form field whose value is a
path on the server host, not an uploaded model file. The speaker embedding model
must already exist on the machine running `whisper-server`.

## request examples

**/inference**
Expand All @@ -78,6 +85,8 @@ curl 127.0.0.1:8080/inference \
-F file="@<file-path>" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F diarize="true" \
-F diarize_model="/absolute/path/on/server/models/ggml-speaker-ecapa-tdnn.bin" \
-F response_format="json"
```

Expand Down
203 changes: 138 additions & 65 deletions examples/server/server.cpp

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions include/whisper-speaker.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#ifndef WHISPER_SPEAKER_H
#define WHISPER_SPEAKER_H

// C interface for loading a speaker embedding model stored as a GGML binary
// file and for inspecting the tensors it contains.
//
// NOTE(review): none of the prototypes below carry an export macro such as
// WHISPER_API — confirm the symbols are visible in shared-library builds.

// Needed for struct ggml_tensor, which the tensor accessors return by pointer.
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

// Opaque speaker model context.
// Only forward-declared here, so callers can hold it solely through a pointer.
struct whisper_speaker_model;

// Load speaker model from GGML binary file.
//   path_model: path to the model file
// NOTE(review): the failure return is not visible from this header —
// presumably NULL; confirm against the implementation.
struct whisper_speaker_model * whisper_speaker_load_from_file(const char * path_model);

// Free model resources.
// NOTE(review): whether a NULL model is accepted is not shown here — confirm.
void whisper_speaker_free(struct whisper_speaker_model * model);

// Print model structure info.
// NOTE(review): the name says "validate" but this description says "print",
// and the function returns void, so no status reaches the caller — confirm
// the intended behavior.
void whisper_speaker_validate(struct whisper_speaker_model * model);

// Get embedding dimension (192 for ECAPA-TDNN)
int whisper_speaker_get_embedding_dim(struct whisper_speaker_model * model);

// Get tensor count
int whisper_speaker_get_tensor_count(struct whisper_speaker_model * model);

// Get tensor by index.
//   idx: tensor index — presumably in [0, whisper_speaker_get_tensor_count(model));
//        out-of-range behavior is not shown here, TODO confirm.
// NOTE(review): the returned tensor is presumably owned by the model and must
// not be freed by the caller — confirm.
struct ggml_tensor * whisper_speaker_get_tensor(struct whisper_speaker_model * model, int idx);

// Find tensor by name (e.g. "mods.embedding_model.blocks.0.conv.conv.weight").
// NOTE(review): the return value when no tensor matches is not shown here —
// presumably NULL; confirm.
struct ggml_tensor * whisper_speaker_find_tensor(struct whisper_speaker_model * model, const char * name);

#ifdef __cplusplus
}
#endif

#endif // WHISPER_SPEAKER_H
19 changes: 19 additions & 0 deletions include/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,12 @@ extern "C" {
const char * vad_model_path; // Path to VAD model

whisper_vad_params vad_params;

// Speaker diarization params
bool diarize; // Enable speaker diarization (default: false)
const char * diarize_model_path; // Path to speaker embedding model file (GGML .bin format)
float diarize_threshold; // Distance threshold for clustering (default: 0.5f)
int diarize_speakers; // Target speaker count; 0 = auto-detect (default: 0)
};

// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
Expand Down Expand Up @@ -647,6 +653,19 @@ extern "C" {
WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);

// Speaker diarization accessor

// Get the speaker ID assigned to the given segment (result of diarization clustering)
//   ctx:       whisper context to read the segment from
//   i_segment: segment index — presumably 0-based like the other
//              whisper_full_get_segment_* accessors; TODO confirm
// Returns: 0-based speaker ID if diarization enabled, -1 if disabled or invalid segment
// Callers must treat -1 as "no speaker", never as a speaker index.
WHISPER_API int whisper_full_get_segment_speaker_id(
struct whisper_context * ctx,
int i_segment);

// Variant that works with whisper_state directly (for advanced use cases)
// NOTE(review): presumably follows the same return convention as the context
// variant (0-based ID, or -1) — confirm against the implementation.
WHISPER_API int whisper_full_get_segment_speaker_id_from_state(
struct whisper_state * state,
int i_segment);

// Get the text of the specified segment
WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment);
WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
Expand Down
Loading