From 80d59536b28a40b259b691bef8810e0cf2b8cab6 Mon Sep 17 00:00:00 2001 From: "rosetta-livekit-bot[bot]" <282703043+rosetta-livekit-bot[bot]@users.noreply.github.com> Date: Thu, 18 Jun 2026 23:24:44 +0000 Subject: [PATCH] feat(soniox): support stt-rt-v5 endpoint sensitivity --- .changeset/fresh-otters-share.md | 5 +++++ plugins/soniox/etc/agents-plugin-soniox.api.md | 1 + plugins/soniox/src/stt.ts | 12 ++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 .changeset/fresh-otters-share.md diff --git a/.changeset/fresh-otters-share.md b/.changeset/fresh-otters-share.md new file mode 100644 index 000000000..e12371dd3 --- /dev/null +++ b/.changeset/fresh-otters-share.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents-plugin-soniox": minor +--- + +Support Soniox stt-rt-v5 and endpointSensitivity. diff --git a/plugins/soniox/etc/agents-plugin-soniox.api.md b/plugins/soniox/etc/agents-plugin-soniox.api.md index 92bc97f76..202e6308a 100644 --- a/plugins/soniox/etc/agents-plugin-soniox.api.md +++ b/plugins/soniox/etc/agents-plugin-soniox.api.md @@ -72,6 +72,7 @@ export interface STTOptions { enableLanguageIdentification: boolean; // (undocumented) enableSpeakerDiarization: boolean; + endpointSensitivity?: number; // (undocumented) languageHints?: string[]; // (undocumented) diff --git a/plugins/soniox/src/stt.ts b/plugins/soniox/src/stt.ts index b26c9cf76..3627c82c0 100644 --- a/plugins/soniox/src/stt.ts +++ b/plugins/soniox/src/stt.ts @@ -74,6 +74,8 @@ export interface STTOptions { enableLanguageIdentification: boolean; /** Maximum delay in milliseconds between speech cessation and endpoint detection. */ maxEndpointDelayMs: number; + /** How readily the model emits speech endpoints. Range: -1.0 to 1.0. */ + endpointSensitivity?: number; clientReferenceId?: string; translation?: TranslationConfig; } @@ -81,13 +83,13 @@ export interface STTOptions { const defaultSTTOptions: STTOptions = { apiKey: process.env.SONIOX_API_KEY, baseUrl: BASE_URL, - model: 'stt-rt-v4', + model: 'stt-rt-v5', languageHintsStrict: false, numChannels: 1, sampleRate: 16000, enableSpeakerDiarization: false, enableLanguageIdentification: true, - maxEndpointDelayMs: 500, + maxEndpointDelayMs: 2000, }; /** @public */ @@ -103,6 +105,11 @@ export class STT extends stt.STT { if (merged.maxEndpointDelayMs < 500 || merged.maxEndpointDelayMs > 3000) { throw new Error('maxEndpointDelayMs must be between 500 and 3000'); } + if (merged.endpointSensitivity !== undefined) { + if (merged.endpointSensitivity < -1.0 || merged.endpointSensitivity > 1.0) { + throw new Error('endpointSensitivity must be between -1.0 and 1.0'); + } + } super({ streaming: true, @@ -207,6 +214,7 @@ export class SpeechStream extends stt.SpeechStream { enable_language_identification: this.#opts.enableLanguageIdentification, client_reference_id: this.#opts.clientReferenceId, max_endpoint_delay_ms: this.#opts.maxEndpointDelayMs, + endpoint_sensitivity: this.#opts.endpointSensitivity, }; if (this.#opts.translation) {