pragmatrix · pragmatrix · Jun 19, 2026 · Jun 19, 2026
diff --git a/CONTEXT.md b/CONTEXT.md
@@ -0,0 +1,25 @@
+# Context
+
+Glossary of canonical terms for context-switch. Keep this free of implementation
+detail — it is a glossary, not a spec.
+
+## Speech provider terms
+
+These three Microsoft offerings are distinct and must not be lumped together under the name "Azure".
+
+- **Azure Speech** — Microsoft's Azure AI Speech SDK service. In this repo it backs
+  the `azure` crate (`AzureTranscribe`, `AzureSynthesize`, ...). Classic
+  speech-to-text / text-to-speech.
+
+- **Azure OpenAI Realtime** — the Azure-hosted variant of the OpenAI Realtime API.
+  Reached through `openai-dialog` with `Protocol::Azure`. A realtime
+  speech-to-speech dialog protocol over a single WebSocket connection.
+
+- **Voice Live** — Microsoft Foundry's managed speech-to-speech API
+  (`/voice-live/realtime`). Wire-compatible with Azure OpenAI Realtime but adds
+  Azure-only capabilities (deep noise suppression, Azure semantic VAD, Azure
+  speech / MAI transcription models). Backed by the `microsoft-voice-live` crate.
+
+  Note: Voice Live and Azure OpenAI Realtime are expected to converge over time;
+  the `microsoft-voice-live` boundary is kept deliberately thin so the two can be
+  merged later without a rewrite.
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,6 +15,7 @@ members = [
     "services/elevenlabs",
     "services/google-dialog",
     "services/google-transcribe", 
+    "services/microsoft-voice-live",
     "services/openai-dialog", 
     "services/playback",
 ]
@@ -38,6 +39,7 @@ azure-speech = { workspace = true }
 aristech = { workspace = true }
 elevenlabs = { workspace = true }
 google-transcribe = { workspace = true }
+microsoft-voice-live = { workspace = true }
 
 # basic
 
@@ -104,6 +106,7 @@ aristech = { path = "services/aristech" }
 elevenlabs = { path = "services/elevenlabs" }
 google-transcribe = { path = "services/google-transcribe" }
 google-dialog = { path = "services/google-dialog" }
+microsoft-voice-live = { path = "services/microsoft-voice-live" }
 gemini-live = { path = "external/gemini-live-rs/crates/gemini-live" }
 
 # Dependencies required by `external/gemini-live-rs/crates/gemini-live`.

diff --git a/README.md b/README.md
@@ -69,6 +69,9 @@ cargo run --example transcribe -- elevenlabs
 # Run generic transcribe example with Aristech provider
 cargo run --example transcribe -- aristech
 
+# Run generic transcribe example with Microsoft Voice Live provider
+cargo run --example transcribe -- voice-live
+
 # Run Azure synthesize example
 cargo run --example azure-synthesize
 
@@ -105,6 +108,13 @@ AZURE_REGION=your_azure_region
 # ElevenLabs Configuration
 ELEVENLABS_API_KEY=your_elevenlabs_key
 
+# Microsoft Voice Live Configuration
+MICROSOFT_VOICE_LIVE_API_KEY=your_voice_live_key
+MICROSOFT_VOICE_LIVE_ENDPOINT=wss://your-resource.services.ai.azure.com/voice-live/realtime
+MICROSOFT_VOICE_LIVE_MODEL=gpt-4o-mini-realtime-preview
+MICROSOFT_VOICE_LIVE_API_VERSION=2026-06-01-preview
+MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL=azure-speech
+
 # Audio Knife Configuration
 AUDIO_KNIFE_ADDRESS=127.0.0.1:8123
 

diff --git a/examples/transcribe.rs b/examples/transcribe.rs
@@ -5,12 +5,17 @@ use std::time::Duration;
 use anyhow::{Context, Result, bail};
 use clap::{Parser, ValueEnum};
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use openai_api_rs::realtime::types::{
+    AzureSemanticVadConfig, EndOfUtteranceDetectionConfig, EndOfUtteranceDetectionModel,
+    EndOfUtteranceThresholdLevel, TurnDetection,
+};
 use rodio::DeviceSinkBuilder;
 use tokio::select;
 use tokio::sync::mpsc::{channel, unbounded_channel};
 
 use context_switch::services::{
     AristechTranscribe, AzureTranscribe, ElevenLabsTranscribe, GoogleTranscribe,
+    MicrosoftVoiceLiveTranscribe,
 };
 use context_switch::{AudioConsumer, InputModality, OutputModality};
 use context_switch_core::language::Languages;
@@ -44,6 +49,8 @@ enum Provider {
     Google,
     #[value(name = "aristech")]
     Aristech,
+    #[value(name = "voice-live")]
+    VoiceLive,
 }
 
 #[tokio::main]
@@ -343,5 +350,50 @@ async fn start_conversation(
             };
             AristechTranscribe.conversation(params, conversation).await
         }
+        Provider::VoiceLive => {
+            if diarization {
+                bail!("--diarization is only supported for the azure provider");
+            }
+            if region.is_some() {
+                bail!("--region is only supported for the google provider");
+            }
+
+            let language = Some(
+                languages
+                    .single()
+                    .context("Voice Live provider supports exactly one --language value")?
+                    .clone(),
+            );
+
+            let params = microsoft_voice_live::Params {
+                api_key: env::var("MICROSOFT_VOICE_LIVE_API_KEY")
+                    .expect("MICROSOFT_VOICE_LIVE_API_KEY undefined"),
+                endpoint: env::var("MICROSOFT_VOICE_LIVE_ENDPOINT")
+                    .expect("MICROSOFT_VOICE_LIVE_ENDPOINT undefined (must be wss://...)"),
+                model: model.map(str::to_owned).unwrap_or_else(|| {
+                    env::var("MICROSOFT_VOICE_LIVE_MODEL").unwrap_or_else(|_| "gpt-4.1".to_owned())
+                }),
+                api_version: env::var("MICROSOFT_VOICE_LIVE_API_VERSION").ok(),
+                transcription_model: env::var("MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL")
+                    .unwrap_or_else(|_| "azure-speech".to_owned()),
+                language,
+                noise_reduction: None,
+                turn_detection: Some(TurnDetection::AzureSemanticVadMultilingual(
+                    AzureSemanticVadConfig {
+                        end_of_utterance_detection: Some(EndOfUtteranceDetectionConfig {
+                            model: EndOfUtteranceDetectionModel::SmartEndOfTurnDetection,
+                            threshold_level: Some(EndOfUtteranceThresholdLevel::Low),
+                            timeout_ms: Some(5000),
+                        }),
+                        // remove_filler_words: Some(true),
+                        languages: Some(vec!["de-DE".to_owned()]),
+                        ..Default::default()
+                    },
+                )),
+            };
+            MicrosoftVoiceLiveTranscribe
+                .conversation(params, conversation)
+                .await
+        }
     }
 }
diff --git a/external/openai-api-rs b/external/openai-api-rs
diff --git a/justfile b/justfile
@@ -25,6 +25,9 @@ transcribe-google-latest-short:
 transcribe-google-latest-long:
     cargo run --example transcribe -- google --language de-DE --model latest_long --region eu
 
+transcribe-voice-live-de:
+    cargo run --example transcribe -- voice-live --language de-DE
+
 transcribe-google-diarization:
     cargo run --example transcribe -- google --diarization --language de-DE --model chirp_3 --region eu
 

diff --git a/services/microsoft-voice-live/Cargo.toml b/services/microsoft-voice-live/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "microsoft-voice-live"
+version = "0.1.0"
+edition.workspace = true
+
+[dependencies]
+context-switch-core = { workspace = true }
+
+openai-api-rs = { workspace = true }
+
+tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] }
+
+tracing = { workspace = true }
+
+anyhow = { workspace = true }
+futures = { workspace = true }
+tokio = { workspace = true, features = ["net"] }
+serde_json = { workspace = true }
+base64 = { workspace = true }
+serde = { workspace = true, features = ["derive"] }
+async-trait = { workspace = true }
+url = { workspace = true }
+1 −1		Cargo.toml
+2 −1		src/realtime/client_event.rs
+34 −0		src/realtime/server_event.rs
+150 −0		src/realtime/types.rs