diff --git a/CONTEXT.md b/CONTEXT.md
new file mode 100644
index 0000000..fa94750
--- /dev/null
+++ b/CONTEXT.md
@@ -0,0 +1,25 @@
+# Context
+
+Glossary of canonical terms for context-switch. Keep this free of implementation
+detail — it is a glossary, not a spec.
+
+## Speech provider terms
+
+These three Microsoft offerings are distinct and must not all be called "Azure".
+
+- **Azure Speech** — Microsoft's Azure AI Speech SDK service. In this repo it backs
+  the `azure` crate (`AzureTranscribe`, `AzureSynthesize`, ...). Classic
+  speech-to-text / text-to-speech.
+
+- **Azure OpenAI Realtime** — the Azure-hosted variant of the OpenAI Realtime API.
+  Reached through `openai-dialog` with `Protocol::Azure`. A realtime
+  speech-to-speech dialog protocol over a single websocket.
+
+- **Voice Live** — Microsoft Foundry's managed speech-to-speech API
+  (`/voice-live/realtime`). Wire-compatible with Azure OpenAI Realtime but adds
+  Azure-only capabilities (deep noise suppression, Azure semantic VAD, Azure
+  speech / MAI transcription models). Backed by the `microsoft-voice-live` crate.
+
+  Note: Voice Live and Azure OpenAI Realtime are expected to converge over time;
+  the `microsoft-voice-live` boundary is kept deliberately thin so the two can be
+  merged later without a rewrite.
diff --git a/Cargo.toml b/Cargo.toml
index 1e102d9..25ecdf5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ members = [
     "services/elevenlabs",
     "services/google-dialog",
     "services/google-transcribe", 
+    "services/microsoft-voice-live",
     "services/openai-dialog", 
     "services/playback",
 ]
@@ -38,6 +39,7 @@ azure-speech = { workspace = true }
 aristech = { workspace = true }
 elevenlabs = { workspace = true }
 google-transcribe = { workspace = true }
+microsoft-voice-live = { workspace = true }
 
 # basic
 
@@ -104,6 +106,7 @@ aristech = { path = "services/aristech" }
 elevenlabs = { path = "services/elevenlabs" }
 google-transcribe = { path = "services/google-transcribe" }
 google-dialog = { path = "services/google-dialog" }
+microsoft-voice-live = { path = "services/microsoft-voice-live" }
 gemini-live = { path = "external/gemini-live-rs/crates/gemini-live" }
 
 # Dependencies required by `external/gemini-live-rs/crates/gemini-live`.
diff --git a/README.md b/README.md
index d207158..d2c9871 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,9 @@ cargo run --example transcribe -- elevenlabs
 # Run generic transcribe example with Aristech provider
 cargo run --example transcribe -- aristech
 
+# Run generic transcribe example with Microsoft Voice Live provider
+cargo run --example transcribe -- voice-live
+
 # Run Azure synthesize example
 cargo run --example azure-synthesize
 
@@ -105,6 +108,13 @@ AZURE_REGION=your_azure_region
 # ElevenLabs Configuration
 ELEVENLABS_API_KEY=your_elevenlabs_key
 
+# Microsoft Voice Live Configuration
+MICROSOFT_VOICE_LIVE_API_KEY=your_voice_live_key
+MICROSOFT_VOICE_LIVE_ENDPOINT=wss://your-resource.services.ai.azure.com/voice-live/realtime
+MICROSOFT_VOICE_LIVE_MODEL=gpt-4o-mini-realtime-preview
+MICROSOFT_VOICE_LIVE_API_VERSION=2026-06-01-preview
+MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL=azure-speech
+
 # Audio Knife Configuration
 AUDIO_KNIFE_ADDRESS=127.0.0.1:8123
 
diff --git a/examples/transcribe.rs b/examples/transcribe.rs
index de0ab89..d57d892 100644
--- a/examples/transcribe.rs
+++ b/examples/transcribe.rs
@@ -5,12 +5,17 @@ use std::time::Duration;
 use anyhow::{Context, Result, bail};
 use clap::{Parser, ValueEnum};
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use openai_api_rs::realtime::types::{
+    AzureSemanticVadConfig, EndOfUtteranceDetectionConfig, EndOfUtteranceDetectionModel,
+    EndOfUtteranceThresholdLevel, TurnDetection,
+};
 use rodio::DeviceSinkBuilder;
 use tokio::select;
 use tokio::sync::mpsc::{channel, unbounded_channel};
 
 use context_switch::services::{
     AristechTranscribe, AzureTranscribe, ElevenLabsTranscribe, GoogleTranscribe,
+    MicrosoftVoiceLiveTranscribe,
 };
 use context_switch::{AudioConsumer, InputModality, OutputModality};
 use context_switch_core::language::Languages;
@@ -44,6 +49,8 @@ enum Provider {
     Google,
     #[value(name = "aristech")]
     Aristech,
+    #[value(name = "voice-live")]
+    VoiceLive,
 }
 
 #[tokio::main]
@@ -343,5 +350,50 @@ async fn start_conversation(
             };
             AristechTranscribe.conversation(params, conversation).await
         }
+        Provider::VoiceLive => {
+            if diarization {
+                bail!("--diarization is only supported for the azure provider");
+            }
+            if region.is_some() {
+                bail!("--region is only supported for the google provider");
+            }
+
+            let language = Some(
+                languages
+                    .single()
+                    .context("Voice Live provider supports exactly one --language value")?
+                    .clone(),
+            );
+
+            let params = microsoft_voice_live::Params {
+                api_key: env::var("MICROSOFT_VOICE_LIVE_API_KEY")
+                    .expect("MICROSOFT_VOICE_LIVE_API_KEY undefined"),
+                endpoint: env::var("MICROSOFT_VOICE_LIVE_ENDPOINT")
+                    .expect("MICROSOFT_VOICE_LIVE_ENDPOINT undefined (must be wss://...)"),
+                model: model.map(str::to_owned).unwrap_or_else(|| {
+                    env::var("MICROSOFT_VOICE_LIVE_MODEL").unwrap_or_else(|_| "gpt-4.1".to_owned())
+                }),
+                api_version: env::var("MICROSOFT_VOICE_LIVE_API_VERSION").ok(),
+                transcription_model: env::var("MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL")
+                    .unwrap_or_else(|_| "azure-speech".to_owned()),
+                language: language.clone(),
+                noise_reduction: None,
+                turn_detection: Some(TurnDetection::AzureSemanticVadMultilingual(
+                    AzureSemanticVadConfig {
+                        end_of_utterance_detection: Some(EndOfUtteranceDetectionConfig {
+                            model: EndOfUtteranceDetectionModel::SmartEndOfTurnDetection,
+                            threshold_level: Some(EndOfUtteranceThresholdLevel::Low),
+                            timeout_ms: Some(5000),
+                        }),
+                        // Keep the VAD language list in sync with --language (was fixed to de-DE).
+                        languages: language.map(|code| vec![code]),
+                        ..Default::default()
+                    },
+                )),
+            };
+            MicrosoftVoiceLiveTranscribe
+                .conversation(params, conversation)
+                .await
+        }
     }
 }
diff --git a/external/openai-api-rs b/external/openai-api-rs
index b07e84f..65afe2b 160000
--- a/external/openai-api-rs
+++ b/external/openai-api-rs
@@ -1 +1 @@
-Subproject commit b07e84f8598568b032134f4139804636bab8529e
+Subproject commit 65afe2b1f02636a51896fe57678046a26ed6a9a8
diff --git a/justfile b/justfile
index e43db93..6a026aa 100644
--- a/justfile
+++ b/justfile
@@ -25,6 +25,9 @@ transcribe-google-latest-short:
 transcribe-google-latest-long:
     cargo run --example transcribe -- google --language de-DE --model latest_long --region eu
 
+transcribe-voice-live-de:
+    cargo run --example transcribe -- voice-live --language de-DE
+
 transcribe-google-diarization:
     cargo run --example transcribe -- google --diarization --language de-DE --model chirp_3 --region eu
 
diff --git a/services/microsoft-voice-live/Cargo.toml b/services/microsoft-voice-live/Cargo.toml
new file mode 100644
index 0000000..b2f2acf
--- /dev/null
+++ b/services/microsoft-voice-live/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "microsoft-voice-live"
+version = "0.1.0"
+edition.workspace = true
+
+[dependencies]
+context-switch-core = { workspace = true }
+
+openai-api-rs = { workspace = true }
+
+tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] }
+
+tracing = { workspace = true }
+
+anyhow = { workspace = true }
+futures = { workspace = true }
+tokio = { workspace = true, features = ["net"] }
+serde_json = { workspace = true }
+base64 = { workspace = true }
+serde = { workspace = true, features = ["derive"] }
+async-trait = { workspace = true }
+url = { workspace = true }
diff --git a/services/microsoft-voice-live/src/client.rs b/services/microsoft-voice-live/src/client.rs
new file mode 100644
index 0000000..c874111
--- /dev/null
+++ b/services/microsoft-voice-live/src/client.rs
@@ -0,0 +1,322 @@
+use std::collections::HashMap;
+
+use anyhow::{Context, Result, bail};
+use base64::prelude::*;
+use futures::stream::{SplitSink, SplitStream};
+use futures::{SinkExt, StreamExt};
+use openai_api_rs::realtime::client_event::{self, ClientEvent};
+use openai_api_rs::realtime::server_event::ServerEvent;
+use openai_api_rs::realtime::types::{self, AzureSemanticVadConfig, TurnDetection};
+use tokio::{net::TcpStream, select};
+use tokio_tungstenite::tungstenite::{Bytes, protocol::Message};
+use tokio_tungstenite::{MaybeTlsStream, WebSocketStream};
+use tracing::{debug, info, trace, warn};
+
+use context_switch_core::{
+    AudioFormat, AudioFrame, BillingRecord, BillingSchedule, ConversationInput, ConversationOutput,
+    Input, OutputPath, audio,
+};
+
+use crate::transcribe::{Params, ServiceOutputEvent};
+use crate::transcription::TranscriptionState;
+
+/// Voice Live transcription session driven over the two halves of a connected websocket.
+pub struct Client {
+    read: SplitStream<WebSocketStream<MaybeTlsStream<TcpStream>>>,
+    write: SplitSink<WebSocketStream<MaybeTlsStream<TcpStream>>, Message>,
+    transcription_state: TranscriptionState,
+    /// Latest segment speaker per item id; final transcripts carry none, so it is applied there.
+    segment_speakers: HashMap<String, String>,
+}
+
+impl Client {
+    pub(crate) fn new(
+        read: SplitStream<WebSocketStream<MaybeTlsStream<TcpStream>>>,
+        write: SplitSink<WebSocketStream<MaybeTlsStream<TcpStream>>, Message>,
+    ) -> Self {
+        Self {
+            read,
+            write,
+            transcription_state: TranscriptionState::default(), // no buffered deltas yet
+            segment_speakers: HashMap::new(), // no speakers reported yet
+        }
+    }
+
+    /// Runs one transcription session: configures it, then streams input audio to the service
+    /// and handles incoming websocket messages until the input or the websocket ends.
+    pub async fn transcribe(
+        &mut self,
+        input_format: AudioFormat,
+        params: Params,
+        mut input: ConversationInput,
+        output: ConversationOutput,
+    ) -> Result<()> {
+        let expected_format = AudioFormat::new(1, 24000);
+        if input_format != expected_format {
+            bail!(
+                "Audio input has the wrong format {input_format:?}, expected: {expected_format:?}"
+            );
+        }
+
+        // The first message must be `session.created`; only then is the session configured.
+        let message = self.read.next().await;
+        Self::verify_session_created_event(message)?;
+        debug!("Session created");
+
+        self.send_session_update(&params).await?;
+        debug!("Session updated");
+        let language = params.language.clone();
+
+        loop {
+            select! {
+                input = input.recv() => {
+                    match input {
+                        Some(Input::Audio { frame }) => {
+                            let duration = frame.duration();
+                            self.send_frame(frame).await?;
+                            output.billing_records(
+                                None,
+                                None,
+                                [BillingRecord::duration("input:audio", duration)],
+                                BillingSchedule::Now,
+                            )?;
+                        }
+                        Some(_) => warn!("Unexpected non-audio input"),
+                        // Input channel closed: end the session.
+                        None => break,
+                    }
+                }
+
+                message = self.read.next() => {
+                    match message {
+                        Some(Ok(message)) => {
+                            match self.process_message(message, &output, language.as_deref()).await? {
+                                FlowControl::End => break,
+                                FlowControl::PongAndContinue(payload) => {
+                                    self.write.send(Message::Pong(payload)).await?;
+                                }
+                                FlowControl::Continue => {}
+                            }
+                        }
+                        Some(Err(e)) => bail!(e),
+                        // The websocket stream ended.
+                        None => break,
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Sends the `session.update` that configures input transcription for this session.
+    async fn send_session_update(&mut self, params: &Params) -> Result<()> {
+        let transcription = types::TranscriptionConfig {
+            language: params.language.clone(),
+            model: params.transcription_model.clone(),
+            prompt: None,
+        };
+        let session = types::VoiceLiveSession {
+            input_audio_sampling_rate: None,
+            input_audio_noise_reduction: params.noise_reduction.clone(),
+            input_audio_echo_cancellation: None,
+            input_audio_transcription: Some(transcription),
+            turn_detection: Some(transcription_turn_detection(params.turn_detection.clone())),
+        };
+        log_requested_session_update(&session);
+        let update = ClientEvent::SessionUpdate(client_event::SessionUpdate {
+            event_id: None,
+            session: client_event::SessionUpdatePayload::VoiceLive(session),
+        });
+        self.send_client_event(update).await
+    }
+
+    async fn process_message(
+        &mut self,
+        message: Message,
+        output: &ConversationOutput,
+        language: Option<&str>,
+    ) -> Result<FlowControl> {
+        let text = match message {
+            Message::Text(text) => text,
+            Message::Ping(data) => return Ok(FlowControl::PongAndContinue(data)),
+            Message::Pong(_) => return Ok(FlowControl::Continue), // unsolicited pongs are legal
+            Message::Close(_) => return Ok(FlowControl::End),
+            msg => bail!("Unhandled websocket message: {msg:?}"),
+        };
+        let event = serde_json::from_str(&text)
+            .with_context(|| format!("Deserialization failed: `{text}`"))?;
+        self.handle_server_event(event, output, language, Some(&text))
+            .await?;
+        Ok(FlowControl::Continue)
+    }
+
+    /// Maps one server event to conversation output; server-reported errors abort the session.
+    async fn handle_server_event(
+        &mut self,
+        event: ServerEvent,
+        output: &ConversationOutput,
+        language: Option<&str>,
+        raw_message: Option<&str>,
+    ) -> Result<()> {
+        match event {
+            ServerEvent::SessionUpdated(e) => {
+                if let Some(message) = raw_message {
+                    info!(session_updated_raw = %message, "Raw session.updated from server");
+                }
+                log_confirmed_session_from_server(&e.session);
+                debug!("Session update acknowledged");
+            }
+            // Turn-detector signals are forwarded as service events on the control path.
+            ServerEvent::InputAudioBufferSpeechStarted(e) => {
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::SpeechStarted {
+                        audio_start_ms: e.audio_start_ms,
+                    },
+                )?;
+            }
+            ServerEvent::InputAudioBufferSpeechStopped(e) => {
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::SpeechStopped {
+                        audio_end_ms: e.audio_end_ms,
+                    },
+                )?;
+            }
+            ServerEvent::InputAudioBufferCommited(e) => {
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::SpeechCommitted { item_id: e.item_id },
+                )?;
+            }
+            ServerEvent::InputAudioBufferTimeoutTriggered(e) => {
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::SpeechTimeout {
+                        audio_start_ms: e.audio_start_ms,
+                        audio_end_ms: e.audio_end_ms,
+                    },
+                )?;
+            }
+            // Deltas emit the text accumulated so far; completion emits the finished transcript.
+            ServerEvent::ConversationItemInputAudioTranscriptionDelta(e) => {
+                let text =
+                    self.transcription_state
+                        .apply_input_delta(e.item_id, e.content_index, e.delta);
+                output.text(false, text, language.map(str::to_string), None)?;
+            }
+            ServerEvent::ConversationItemInputAudioTranscriptionCompleted(e) => {
+                let speaker = self.segment_speakers.remove(&e.item_id);
+                if let Some(text) = self.transcription_state.complete_input_transcription(
+                    e.item_id,
+                    e.content_index,
+                    e.transcript,
+                ) {
+                    output.text(true, text, language.map(str::to_string), speaker)?;
+                }
+            }
+            ServerEvent::ConversationItemInputAudioTranscriptionSegment(e) => {
+                if let Some(speaker) = &e.speaker {
+                    self.segment_speakers.insert(e.item_id, speaker.clone());
+                }
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::Segment {
+                        start: e.start,
+                        end: e.end,
+                        text: e.text,
+                        speaker: e.speaker,
+                    },
+                )?;
+            }
+            ServerEvent::ConversationItemInputAudioTranscriptionFailed(e) => {
+                bail!("Input audio transcription failed: {}", e.error.message);
+            }
+
+            ServerEvent::Error(e) => bail!("Voice Live error: {}", e.error.message),
+
+            other => trace!("Ignoring server event: {other:?}"),
+        }
+        Ok(())
+    }
+
+    /// Appends one audio frame to the input buffer as base64-encoded little-endian mono samples.
+    async fn send_frame(&mut self, frame: AudioFrame) -> Result<()> {
+        let pcm_le = audio::to_le_bytes(frame.into_mono().samples);
+
+        // Audio travels as base64 text inside the JSON client event.
+        let append = ClientEvent::InputAudioBufferAppend(client_event::InputAudioBufferAppend {
+            event_id: None,
+            audio: BASE64_STANDARD.encode(pcm_le),
+        });
+        self.send_client_event(append).await
+    }
+
+    /// Serializes `client_event` to JSON and sends it as a single websocket text message.
+    async fn send_client_event(&mut self, client_event: ClientEvent) -> Result<()> {
+        let json = serde_json::to_string(&client_event)?;
+        Ok(self.write.send(Message::Text(json.into())).await?)
+    }
+
+    fn verify_session_created_event(
+        message: Option<Result<Message, tokio_tungstenite::tungstenite::Error>>,
+    ) -> Result<()> {
+        let Some(message) = message else {
+            bail!("Failed to receive the initial message");
+        };
+        let Message::Text(message) = message? else {
+            bail!("Expected a text message for session creation");
+        };
+
+        match serde_json::from_str(&message)? {
+            ServerEvent::SessionCreated(_) => Ok(()),
+            ServerEvent::Error(e) => bail!("Failed to create the session: {}", e.error.message),
+            other => bail!("Unexpected event in response to session creation: {other:?}"),
+        }
+    }
+}
+
+/// Builds the turn-detection settings sent with the session update. Whatever the caller
+/// configured, `create_response` is forced off because this service only transcribes; when
+/// nothing is configured, Azure semantic VAD with default settings is used.
+fn transcription_turn_detection(configured: Option<TurnDetection>) -> TurnDetection {
+    let mut turn_detection = match configured {
+        Some(turn_detection) => turn_detection,
+        None => TurnDetection::AzureSemanticVad(AzureSemanticVadConfig::default()),
+    };
+
+    // `create_response` is a plain `bool` on the server/semantic VAD variants and an
+    // `Option<bool>` on the Azure ones.
+    match &mut turn_detection {
+        TurnDetection::ServerVAD(vad) => vad.create_response = false,
+        TurnDetection::SemanticVAD(vad) => vad.create_response = false,
+        TurnDetection::AzureSemanticVad(vad) => vad.create_response = Some(false),
+        TurnDetection::AzureSemanticVadMultilingual(vad) => vad.create_response = Some(false),
+    }
+
+    turn_detection
+}
+
+enum FlowControl { // outcome of processing one websocket message
+    Continue, // keep processing messages
+    End, // leave the transcription loop
+    PongAndContinue(Bytes), // reply with a pong carrying this payload, then keep going
+}
+
+/// Logs the session confirmed by the server as pretty-printed JSON.
+fn log_confirmed_session_from_server(session: &types::UntaggedSession) {
+    let serialized = serde_json::to_string_pretty(session);
+    match serialized {
+        Ok(session_confirmed) => info!(%session_confirmed, "Confirmed session from server"),
+        Err(error) => warn!(?error, "Failed to serialize confirmed session from server"),
+    }
+}
+
+/// Logs the session configuration requested from the server, serialized as compact JSON.
+fn log_requested_session_update(session: &types::VoiceLiveSession) {
+    let serialized = serde_json::to_string(session);
+    match serialized {
+        Ok(json) => info!(session_requested = %json, "Requested session update sent to server"),
+        Err(error) => warn!(?error, "Failed to serialize requested session update"),
+    }
+}
diff --git a/services/microsoft-voice-live/src/host.rs b/services/microsoft-voice-live/src/host.rs
new file mode 100644
index 0000000..aa5d056
--- /dev/null
+++ b/services/microsoft-voice-live/src/host.rs
@@ -0,0 +1,50 @@
+use anyhow::{Context, Result, anyhow};
+use openai_api_rs::realtime::api::{RealtimeClient, RealtimeProtocol};
+use url::Url;
+
+use crate::client::Client;
+
+pub struct Host {
+    client: RealtimeClient, // configured with the complete Voice Live URL and the API key
+}
+
+impl Host {
+    /// Prepares a realtime client for the Voice Live endpoint without connecting yet. Fails when
+    /// `endpoint` is not a valid `wss://` URL.
+    pub fn new(endpoint: &str, api_key: &str, model: &str, api_version: &str) -> Result<Self> {
+        // The full URL, including the Voice Live path and `api-version`, is computed up front.
+        // The Azure realtime auth behavior is reused (api-key query, no bearer header), and an
+        // empty model keeps the client from appending a second `model` query parameter.
+        let client = RealtimeClient::new_with_endpoint_and_protocol(
+            build_voice_live_url(endpoint, model, api_version)?,
+            api_key.into(),
+            String::new(),
+            RealtimeProtocol::Azure,
+        );
+        Ok(Self { client })
+    }
+
+    /// Opens the websocket connection and hands both halves to a new [`Client`].
+    pub async fn connect(&self) -> Result<Client> {
+        let connection = self.client.connect().await;
+        // Only the error's text is carried over into the `anyhow` error.
+        let (write, read) = connection.map_err(|e| anyhow!(e.to_string()))?;
+        Ok(Client::new(read, write))
+    }
+}
+
+/// Validates the `wss://` endpoint and returns it with the query replaced by `api-version` and
+/// `model`; the path stays as provided.
+fn build_voice_live_url(endpoint: &str, model: &str, api_version: &str) -> Result<String> {
+    let mut url = Url::parse(endpoint.trim())
+        .with_context(|| format!("Invalid Voice Live endpoint URL: {endpoint}"))?;
+    let scheme = url.scheme();
+    if scheme != "wss" {
+        anyhow::bail!("Unsupported Voice Live endpoint URL scheme: {scheme}. Use wss://");
+    }
+    url.set_query(None);
+    url.query_pairs_mut()
+        .append_pair("api-version", api_version)
+        .append_pair("model", model);
+    Ok(url.to_string())
+}
diff --git a/services/microsoft-voice-live/src/lib.rs b/services/microsoft-voice-live/src/lib.rs
new file mode 100644
index 0000000..cb4b533
--- /dev/null
+++ b/services/microsoft-voice-live/src/lib.rs
@@ -0,0 +1,6 @@
+mod client;
+mod host;
+mod transcribe;
+mod transcription;
+
+pub use transcribe::{MicrosoftVoiceLiveTranscribe, Params, ServiceOutputEvent};
diff --git a/services/microsoft-voice-live/src/transcribe.rs b/services/microsoft-voice-live/src/transcribe.rs
new file mode 100644
index 0000000..cd225dd
--- /dev/null
+++ b/services/microsoft-voice-live/src/transcribe.rs
@@ -0,0 +1,90 @@
+use anyhow::Result;
+use async_trait::async_trait;
+use openai_api_rs::realtime::types::{NoiseReduction, TurnDetection};
+use serde::{Deserialize, Serialize};
+
+use context_switch_core::{Conversation, Service};
+
+use crate::host::Host;
+
+/// Default Voice Live API version. Newer resources require an explicit `api-version`.
+const DEFAULT_API_VERSION: &str = "2026-06-01-preview";
+/// Default transcription model. `azure-speech` is Azure's native speech-to-text engine.
+const DEFAULT_TRANSCRIPTION_MODEL: &str = "azure-speech";
+
+/// Service parameters for Voice Live transcription, deserialized from camelCase keys.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct Params {
+    /// API key for the resource. NOTE(review): the derived `Debug` prints it verbatim.
+    pub api_key: String,
+    /// Full `wss://` resource URL; its path is used as provided, its query is replaced.
+    pub endpoint: String,
+    /// Realtime model used for the Voice Live session (URL `model` query parameter).
+    pub model: String,
+    /// Value of the `api-version` query parameter; a built-in default is used when omitted.
+    pub api_version: Option<String>,
+    /// Transcription model for the session's input audio; `azure-speech` when omitted.
+    #[serde(default = "default_transcription_model")]
+    pub transcription_model: String,
+    /// Input audio language hint, e.g. `en`. NOTE(review): the example passes `de-DE` — confirm.
+    pub language: Option<String>,
+    /// Input-audio noise reduction (Azure deep noise suppression, near/far field).
+    pub noise_reduction: Option<NoiseReduction>,
+    /// Turn detection; Azure semantic VAD when omitted. Responses are always suppressed.
+    pub turn_detection: Option<TurnDetection>,
+}
+
+/// Service that transcribes conversation audio through Microsoft Voice Live.
+#[derive(Debug)]
+pub struct MicrosoftVoiceLiveTranscribe;
+
+#[async_trait]
+impl Service for MicrosoftVoiceLiveTranscribe {
+    type Params = Params;
+
+    async fn conversation(&self, params: Params, conversation: Conversation) -> Result<()> {
+        let input_format = conversation.require_audio_input()?;
+        conversation.require_text_output(true)?;
+        let host = Host::new(
+            &params.endpoint,
+            &params.api_key,
+            &params.model,
+            params.api_version.as_deref().unwrap_or(DEFAULT_API_VERSION),
+        )?;
+        let mut client = host.connect().await?;
+        // The conversation is started only after the websocket connection is established.
+        let (input, output) = conversation.start()?;
+        client.transcribe(input_format, params, input, output).await
+    }
+}
+
+/// Turn-detection and segmentation signals emitted on the control output path.
+/// NOTE(review): enum-level `rename_all` renames variants only; fields stay snake_case — intended?
+#[derive(Debug, Serialize)]
+#[serde(tag = "type", rename_all = "camelCase")]
+pub enum ServiceOutputEvent {
+    SpeechStarted {
+        audio_start_ms: u32,
+    },
+    SpeechStopped {
+        audio_end_ms: u32,
+    },
+    SpeechCommitted {
+        item_id: String,
+    },
+    SpeechTimeout {
+        audio_start_ms: u32,
+        audio_end_ms: u32,
+    },
+    Segment {
+        start: f64,
+        end: f64,
+        text: String,
+        speaker: Option<String>,
+    },
+}
+
+fn default_transcription_model() -> String {
+    DEFAULT_TRANSCRIPTION_MODEL.to_owned() // serde default for `Params::transcription_model`
+}
diff --git a/services/microsoft-voice-live/src/transcription.rs b/services/microsoft-voice-live/src/transcription.rs
new file mode 100644
index 0000000..51b579f
--- /dev/null
+++ b/services/microsoft-voice-live/src/transcription.rs
@@ -0,0 +1,57 @@
+// TODO(merge): extract to openai-realtime-core, shared with `openai-dialog`'s transcription state.
+// Voice Live is transcription-only, so only the input-side buffers are duplicated here.
+
+use std::collections::HashMap;
+
+/// Collects input-transcription deltas per item and content index until completion.
+#[derive(Debug, Default)]
+pub struct TranscriptionState {
+    input_transcription_buffers: HashMap<InputTranscriptionKey, String>,
+}
+
+impl TranscriptionState {
+    /// Appends `delta` to the matching buffer and returns everything accumulated so far.
+    pub fn apply_input_delta(
+        &mut self,
+        item_id: String,
+        content_index: u32,
+        delta: String,
+    ) -> String {
+        let key = InputTranscriptionKey::new(item_id, content_index);
+        let buffer = self.input_transcription_buffers.entry(key).or_default();
+        *buffer += delta.as_str();
+        buffer.clone()
+    }
+
+    /// Discards the buffer and returns `transcript`, or the buffered deltas when `transcript`
+    /// is empty. Yields `None` when the resulting text is empty.
+    pub fn complete_input_transcription(
+        &mut self,
+        item_id: String,
+        content_index: u32,
+        transcript: String,
+    ) -> Option<String> {
+        let key = InputTranscriptionKey::new(item_id, content_index);
+        let buffered = self.input_transcription_buffers.remove(&key);
+        // A non-empty final transcript takes precedence over the buffered deltas.
+        Some(transcript)
+            .filter(|text| !text.is_empty())
+            .or(buffered)
+            .filter(|text| !text.is_empty())
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+struct InputTranscriptionKey { // hash-map key identifying one transcription buffer
+    item_id: String, // item id reported by the transcription event
+    content_index: u32, // content index reported by the same event
+}
+
+impl InputTranscriptionKey {
+    fn new(item_id: String, content_index: u32) -> Self {
+        Self {
+            item_id,
+            content_index,
+        }
+    }
+}
diff --git a/services/openai-dialog/Cargo.toml b/services/openai-dialog/Cargo.toml
index dbff691..9bcca22 100644
--- a/services/openai-dialog/Cargo.toml
+++ b/services/openai-dialog/Cargo.toml
@@ -12,7 +12,7 @@ context-switch-core = { workspace = true }
 
 openai-api-rs = { workspace = true }
 
-tokio-tungstenite = { version = "0.28.0", features = ["connect", "native-tls"] }
+tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] }
 
 tracing = { workspace = true }
 
diff --git a/src/context_switch.rs b/src/context_switch.rs
index a2abd40..776d192 100644
--- a/src/context_switch.rs
+++ b/src/context_switch.rs
@@ -42,6 +42,10 @@ pub fn registry() -> Registry {
         .add_service("azure-translate", azure::AzureTranslate)
         .add_service("elevenlabs-transcribe", elevenlabs::ElevenLabsTranscribe)
         .add_service("google-transcribe", google_transcribe::GoogleTranscribe)
+        .add_service(
+            "microsoft-voice-live-transcribe",
+            microsoft_voice_live::MicrosoftVoiceLiveTranscribe,
+        )
         .add_service("openai-dialog", openai_dialog::OpenAIDialog)
         .add_service("google-dialog", google_dialog::GoogleDialog)
         .add_service("aristech-transcribe", aristech::AristechTranscribe)
diff --git a/src/lib.rs b/src/lib.rs
index 51e3862..a749929 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,4 +17,5 @@ pub mod services {
     pub use elevenlabs::ElevenLabsTranscribe;
     pub use google_dialog::GoogleDialog;
     pub use google_transcribe::GoogleTranscribe;
+    pub use microsoft_voice_live::MicrosoftVoiceLiveTranscribe;
 }