Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CONTEXT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Context

Glossary of canonical terms for context-switch. Keep this free of implementation
detail — it is a glossary, not a spec.

## Speech provider terms

These three Microsoft offerings are distinct. Never refer to any of them as just "Azure"; always use the specific term below.

- **Azure Speech** — Microsoft's Azure AI Speech SDK service. In this repo it backs
the `azure` crate (`AzureTranscribe`, `AzureSynthesize`, ...). Classic
speech-to-text / text-to-speech.

- **Azure OpenAI Realtime** — the Azure-hosted variant of the OpenAI Realtime API.
Reached through `openai-dialog` with `Protocol::Azure`. A realtime
speech-to-speech dialog protocol over a single WebSocket.

- **Voice Live** — Microsoft Foundry's managed speech-to-speech API
(`/voice-live/realtime`). Wire-compatible with Azure OpenAI Realtime but adds
Azure-only capabilities (deep noise suppression, Azure semantic VAD, Azure
Speech / MAI transcription models). Backed by the `microsoft-voice-live` crate.

Note: Voice Live and Azure OpenAI Realtime are expected to converge over time;
the `microsoft-voice-live` boundary is kept deliberately thin so the two can be
merged later without a rewrite.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ members = [
"services/elevenlabs",
"services/google-dialog",
"services/google-transcribe",
"services/microsoft-voice-live",
"services/openai-dialog",
"services/playback",
]
Expand All @@ -38,6 +39,7 @@ azure-speech = { workspace = true }
aristech = { workspace = true }
elevenlabs = { workspace = true }
google-transcribe = { workspace = true }
microsoft-voice-live = { workspace = true }

# basic

Expand Down Expand Up @@ -104,6 +106,7 @@ aristech = { path = "services/aristech" }
elevenlabs = { path = "services/elevenlabs" }
google-transcribe = { path = "services/google-transcribe" }
google-dialog = { path = "services/google-dialog" }
microsoft-voice-live = { path = "services/microsoft-voice-live" }
gemini-live = { path = "external/gemini-live-rs/crates/gemini-live" }

# Dependencies required by `external/gemini-live-rs/crates/gemini-live`.
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ cargo run --example transcribe -- elevenlabs
# Run generic transcribe example with Aristech provider
cargo run --example transcribe -- aristech

# Run generic transcribe example with Microsoft Voice Live provider
cargo run --example transcribe -- voice-live

# Run Azure synthesize example
cargo run --example azure-synthesize

Expand Down Expand Up @@ -105,6 +108,13 @@ AZURE_REGION=your_azure_region
# ElevenLabs Configuration
ELEVENLABS_API_KEY=your_elevenlabs_key

# Microsoft Voice Live Configuration
MICROSOFT_VOICE_LIVE_API_KEY=your_voice_live_key
MICROSOFT_VOICE_LIVE_ENDPOINT=wss://your-resource.services.ai.azure.com/voice-live/realtime
MICROSOFT_VOICE_LIVE_MODEL=gpt-4o-mini-realtime-preview
MICROSOFT_VOICE_LIVE_API_VERSION=2026-06-01-preview
MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL=azure-speech

# Audio Knife Configuration
AUDIO_KNIFE_ADDRESS=127.0.0.1:8123

Expand Down
52 changes: 52 additions & 0 deletions examples/transcribe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@ use std::time::Duration;
use anyhow::{Context, Result, bail};
use clap::{Parser, ValueEnum};
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use openai_api_rs::realtime::types::{
AzureSemanticVadConfig, EndOfUtteranceDetectionConfig, EndOfUtteranceDetectionModel,
EndOfUtteranceThresholdLevel, TurnDetection,
};
use rodio::DeviceSinkBuilder;
use tokio::select;
use tokio::sync::mpsc::{channel, unbounded_channel};

use context_switch::services::{
AristechTranscribe, AzureTranscribe, ElevenLabsTranscribe, GoogleTranscribe,
MicrosoftVoiceLiveTranscribe,
};
use context_switch::{AudioConsumer, InputModality, OutputModality};
use context_switch_core::language::Languages;
Expand Down Expand Up @@ -44,6 +49,8 @@ enum Provider {
Google,
#[value(name = "aristech")]
Aristech,
#[value(name = "voice-live")]
VoiceLive,
}

#[tokio::main]
Expand Down Expand Up @@ -343,5 +350,50 @@ async fn start_conversation(
};
AristechTranscribe.conversation(params, conversation).await
}
Provider::VoiceLive => {
if diarization {
bail!("--diarization is only supported for the azure provider");
}
if region.is_some() {
bail!("--region is only supported for the google provider");
}

let language = Some(
languages
.single()
.context("Voice Live provider supports exactly one --language value")?
.clone(),
);

let params = microsoft_voice_live::Params {
api_key: env::var("MICROSOFT_VOICE_LIVE_API_KEY")
.expect("MICROSOFT_VOICE_LIVE_API_KEY undefined"),
endpoint: env::var("MICROSOFT_VOICE_LIVE_ENDPOINT")
.expect("MICROSOFT_VOICE_LIVE_ENDPOINT undefined (must be wss://...)"),
model: model.map(str::to_owned).unwrap_or_else(|| {
env::var("MICROSOFT_VOICE_LIVE_MODEL").unwrap_or_else(|_| "gpt-4.1".to_owned())
}),
api_version: env::var("MICROSOFT_VOICE_LIVE_API_VERSION").ok(),
transcription_model: env::var("MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL")
.unwrap_or_else(|_| "azure-speech".to_owned()),
language,
noise_reduction: None,
turn_detection: Some(TurnDetection::AzureSemanticVadMultilingual(
AzureSemanticVadConfig {
end_of_utterance_detection: Some(EndOfUtteranceDetectionConfig {
model: EndOfUtteranceDetectionModel::SmartEndOfTurnDetection,
threshold_level: Some(EndOfUtteranceThresholdLevel::Low),
timeout_ms: Some(5000),
}),
// remove_filler_words: Some(true),
languages: Some(vec!["de-DE".to_owned()]),
..Default::default()
},
)),
};
MicrosoftVoiceLiveTranscribe
.conversation(params, conversation)
.await
}
}
}
3 changes: 3 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ transcribe-google-latest-short:
transcribe-google-latest-long:
cargo run --example transcribe -- google --language de-DE --model latest_long --region eu

transcribe-voice-live-de:
cargo run --example transcribe -- voice-live --language de-DE

transcribe-google-diarization:
cargo run --example transcribe -- google --diarization --language de-DE --model chirp_3 --region eu

Expand Down
22 changes: 22 additions & 0 deletions services/microsoft-voice-live/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "microsoft-voice-live"
version = "0.1.0"
edition.workspace = true

[dependencies]
context-switch-core = { workspace = true }

openai-api-rs = { workspace = true }

tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] }

tracing = { workspace = true }

anyhow = { workspace = true }
futures = { workspace = true }
tokio = { workspace = true, features = ["net"] }
serde_json = { workspace = true }
base64 = { workspace = true }
serde = { workspace = true, features = ["derive"] }
async-trait = { workspace = true }
url = { workspace = true }
Loading
Loading