ggml-org · MoonMao42 · Mar 30, 2026 · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -69,6 +69,7 @@ option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in
 # build
 option(WHISPER_FATAL_WARNINGS  "whisper: enable -Werror flag"               OFF)
 option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)
+option(WHISPER_DIARIZE        "whisper: enable speaker diarization"         OFF)
 
 # sanitizers
 option(WHISPER_SANITIZE_THREAD    "whisper: enable thread sanitizer"    OFF)

diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -30,8 +30,11 @@ options:
   -tpi,      --temperature-inc N [0.20   ] The increment of temperature, between 0 and 1
   -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
   -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
-  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
+  -di,       --diarize           [false  ] enable speaker diarization
+             --diarize-model FNAME [       ] speaker embedding model path (GGML .bin)
+             --diarize-threshold N [0.50   ] clustering distance threshold
+             --diarize-speakers N  [0      ] target speaker count (0 = auto)
+  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
   -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
   -otxt,     --output-txt        [false  ] output result in a text file
   -ovtt,     --output-vtt        [false  ] output result in a vtt file
@@ -62,5 +65,15 @@ options:
   --suppress-regex REGEX         [       ] regular expression matching tokens to suppress
   --grammar GRAMMAR              [       ] GBNF grammar to guide decoding
   --grammar-rule RULE            [       ] top-level GBNF grammar rule name
-  --grammar-penalty N            [100.0  ] scales down logits of nongrammar tokens
-```
+  --grammar-penalty N            [100.0  ] scales down logits of nongrammar tokens
+```
+
+Model-based diarization uses the ECAPA-TDNN speaker embedding model produced by
+`models/convert-speaker-to-ggml.py`:
+
+```
+python models/convert-speaker-to-ggml.py --output models/ggml-speaker-ecapa-tdnn.bin
+./build/bin/whisper-cli -m models/ggml-base.en.bin \
+  --diarize --diarize-model models/ggml-speaker-ecapa-tdnn.bin \
+  -f input.wav
+```
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -29,7 +29,10 @@ options:
   -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
   -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
   -tr,       --translate         [false  ] translate from source language to english
-  -di,       --diarize           [false  ] stereo audio diarization
+  -di,       --diarize           [false  ] enable speaker diarization
+             --diarize-model FNAME [       ] speaker embedding model path (GGML .bin)
+             --diarize-threshold N [0.50   ] clustering distance threshold
+             --diarize-speakers N  [0      ] target speaker count (0 = auto)
   -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
   -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
   -ps,       --print-special     [false  ] print special tokens
@@ -69,6 +72,10 @@ Voice Activity Detection (VAD) options:
 > [!WARNING]
 > **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
 
+When using diarization over HTTP, `diarize_model` is a form field whose value is a
+path on the server host, not an uploaded model file. The speaker embedding model
+must already exist on the machine running `whisper-server`.
+
 ## request examples
 
 **/inference**
@@ -78,6 +85,8 @@ curl 127.0.0.1:8080/inference \
 -F file="@<file-path>" \
 -F temperature="0.0" \
 -F temperature_inc="0.2" \
+-F diarize="true" \
+-F diarize_model="/absolute/path/on/server/models/ggml-speaker-ecapa-tdnn.bin" \
 -F response_format="json"
 ```
 

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
diff --git a/include/whisper-speaker.h b/include/whisper-speaker.h
@@ -0,0 +1,38 @@
+#ifndef WHISPER_SPEAKER_H
+#define WHISPER_SPEAKER_H
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque speaker model handle: only forward-declared here, so callers access it through the functions below
+struct whisper_speaker_model;
+
+// Load a speaker model from the GGML binary file at path_model; release the result with whisper_speaker_free()
+struct whisper_speaker_model * whisper_speaker_load_from_file(const char * path_model);
+
+// Free the resources held by a model obtained from whisper_speaker_load_from_file()
+void whisper_speaker_free(struct whisper_speaker_model * model);
+
+// Print model structure info (returns void: no validation status is reported back to the caller)
+void whisper_speaker_validate(struct whisper_speaker_model * model);
+
+// Get the speaker embedding dimension (192 for ECAPA-TDNN)
+int whisper_speaker_get_embedding_dim(struct whisper_speaker_model * model);
+
+// Get the number of tensors in the model
+int whisper_speaker_get_tensor_count(struct whisper_speaker_model * model);
+
+// Get a tensor by index (NOTE(review): valid range is presumably [0, tensor count) - confirm in the implementation)
+struct ggml_tensor * whisper_speaker_get_tensor(struct whisper_speaker_model * model, int idx);
+
+// Find a tensor by name, e.g. "mods.embedding_model.blocks.0.conv.conv.weight" (NOTE(review): result for an unknown name is not specified here - confirm)
+struct ggml_tensor * whisper_speaker_find_tensor(struct whisper_speaker_model * model, const char * name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // WHISPER_SPEAKER_H
diff --git a/include/whisper.h b/include/whisper.h
@@ -588,6 +588,12 @@ extern "C" {
         const char * vad_model_path;              // Path to VAD model
 
         whisper_vad_params vad_params;
+
+        // Speaker diarization params
+        bool         diarize;                      // Enable speaker diarization (default: false)
+        const char * diarize_model_path;           // Path to speaker embedding model file (GGML .bin format)
+        float        diarize_threshold;            // Distance threshold for clustering (default: 0.5f)
+        int          diarize_speakers;             // Target speaker count; 0 = auto-detect (default: 0)
     };
 
     // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
@@ -647,6 +653,19 @@ extern "C" {
     WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment);
     WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment);
 
+    // Speaker diarization accessors
+
+    // Get the speaker ID assigned to the given segment (result of diarization clustering)
+    // Returns the 0-based speaker ID when diarization is enabled, or -1 if diarization is disabled or the segment is invalid
+    WHISPER_API int whisper_full_get_segment_speaker_id(
+        struct whisper_context * ctx,
+        int i_segment);
+
+    // Variant that works with whisper_state directly (for advanced use cases)
+    WHISPER_API int whisper_full_get_segment_speaker_id_from_state(
+        struct whisper_state * state,
+        int i_segment);
+
     // Get the text of the specified segment
     WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
     WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);