diff --git a/CONTEXT.md b/CONTEXT.md new file mode 100644 index 0000000..fa94750 --- /dev/null +++ b/CONTEXT.md @@ -0,0 +1,25 @@ +# Context + +Glossary of canonical terms for context-switch. Keep this free of implementation +detail — it is a glossary, not a spec. + +## Speech provider terms + +These three Microsoft offerings are distinct and must not all be called "Azure". + +- **Azure Speech** — Microsoft's Azure AI Speech SDK service. In this repo it backs + the `azure` crate (`AzureTranscribe`, `AzureSynthesize`, ...). Classic + speech-to-text / text-to-speech. + +- **Azure OpenAI Realtime** — the Azure-hosted variant of the OpenAI Realtime API. + Reached through `openai-dialog` with `Protocol::Azure`. A realtime + speech-to-speech dialog protocol over a single websocket. + +- **Voice Live** — Microsoft Foundry's managed speech-to-speech API + (`/voice-live/realtime`). Wire-compatible with Azure OpenAI Realtime but adds + Azure-only capabilities (deep noise suppression, Azure semantic VAD, Azure + speech / MAI transcription models). Backed by the `microsoft-voice-live` crate. + + Note: Voice Live and Azure OpenAI Realtime are expected to converge over time; + the `microsoft-voice-live` boundary is kept deliberately thin so the two can be + merged later without a rewrite. 
diff --git a/Cargo.toml b/Cargo.toml index 1e102d9..25ecdf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ members = [ "services/elevenlabs", "services/google-dialog", "services/google-transcribe", + "services/microsoft-voice-live", "services/openai-dialog", "services/playback", ] @@ -38,6 +39,7 @@ azure-speech = { workspace = true } aristech = { workspace = true } elevenlabs = { workspace = true } google-transcribe = { workspace = true } +microsoft-voice-live = { workspace = true } # basic @@ -104,6 +106,7 @@ aristech = { path = "services/aristech" } elevenlabs = { path = "services/elevenlabs" } google-transcribe = { path = "services/google-transcribe" } google-dialog = { path = "services/google-dialog" } +microsoft-voice-live = { path = "services/microsoft-voice-live" } gemini-live = { path = "external/gemini-live-rs/crates/gemini-live" } # Dependencies required by `external/gemini-live-rs/crates/gemini-live`. diff --git a/README.md b/README.md index d207158..d2c9871 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,9 @@ cargo run --example transcribe -- elevenlabs # Run generic transcribe example with Aristech provider cargo run --example transcribe -- aristech +# Run generic transcribe example with Microsoft Voice Live provider +cargo run --example transcribe -- voice-live + # Run Azure synthesize example cargo run --example azure-synthesize @@ -105,6 +108,13 @@ AZURE_REGION=your_azure_region # ElevenLabs Configuration ELEVENLABS_API_KEY=your_elevenlabs_key +# Microsoft Voice Live Configuration +MICROSOFT_VOICE_LIVE_API_KEY=your_voice_live_key +MICROSOFT_VOICE_LIVE_ENDPOINT=wss://your-resource.services.ai.azure.com/voice-live/realtime +MICROSOFT_VOICE_LIVE_MODEL=gpt-4o-mini-realtime-preview +MICROSOFT_VOICE_LIVE_API_VERSION=2026-06-01-preview +MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL=azure-speech + # Audio Knife Configuration AUDIO_KNIFE_ADDRESS=127.0.0.1:8123 diff --git a/examples/transcribe.rs b/examples/transcribe.rs index de0ab89..d57d892 
100644
--- a/examples/transcribe.rs
+++ b/examples/transcribe.rs
@@ -5,12 +5,17 @@ use std::time::Duration;
 use anyhow::{Context, Result, bail};
 use clap::{Parser, ValueEnum};
 use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use openai_api_rs::realtime::types::{
+    AzureSemanticVadConfig, EndOfUtteranceDetectionConfig, EndOfUtteranceDetectionModel,
+    EndOfUtteranceThresholdLevel, TurnDetection,
+};
 use rodio::DeviceSinkBuilder;
 use tokio::select;
 use tokio::sync::mpsc::{channel, unbounded_channel};
 
 use context_switch::services::{
     AristechTranscribe, AzureTranscribe, ElevenLabsTranscribe, GoogleTranscribe,
+    MicrosoftVoiceLiveTranscribe,
 };
 use context_switch::{AudioConsumer, InputModality, OutputModality};
 use context_switch_core::language::Languages;
@@ -44,6 +49,8 @@ enum Provider {
     Google,
     #[value(name = "aristech")]
     Aristech,
+    #[value(name = "voice-live")]
+    VoiceLive,
 }
 
 #[tokio::main]
@@ -343,5 +350,50 @@ async fn start_conversation(
             };
             AristechTranscribe.conversation(params, conversation).await
         }
+        Provider::VoiceLive => {
+            if diarization {
+                bail!("--diarization is not supported for the voice-live provider");
+            }
+            if region.is_some() {
+                bail!("--region is only supported for the google provider");
+            }
+
+            let language = languages
+                .single()
+                .context("Voice Live provider supports exactly one --language value")?
+                .clone();
+
+            // Connection settings come from the MICROSOFT_VOICE_LIVE_* environment variables.
+            let params = microsoft_voice_live::Params {
+                api_key: env::var("MICROSOFT_VOICE_LIVE_API_KEY")
+                    .expect("MICROSOFT_VOICE_LIVE_API_KEY undefined"),
+                endpoint: env::var("MICROSOFT_VOICE_LIVE_ENDPOINT")
+                    .expect("MICROSOFT_VOICE_LIVE_ENDPOINT undefined (must be wss://...)"),
+                model: model.map(str::to_owned).unwrap_or_else(|| {
+                    env::var("MICROSOFT_VOICE_LIVE_MODEL").unwrap_or_else(|_| "gpt-4.1".to_owned())
+                }),
+                api_version: env::var("MICROSOFT_VOICE_LIVE_API_VERSION").ok(),
+                transcription_model: env::var("MICROSOFT_VOICE_LIVE_TRANSCRIPTION_MODEL")
+                    .unwrap_or_else(|_| "azure-speech".to_owned()),
+                language: Some(language.clone()),
+                noise_reduction: None,
+                turn_detection: Some(TurnDetection::AzureSemanticVadMultilingual(
+                    AzureSemanticVadConfig {
+                        end_of_utterance_detection: Some(EndOfUtteranceDetectionConfig {
+                            model: EndOfUtteranceDetectionModel::SmartEndOfTurnDetection,
+                            threshold_level: Some(EndOfUtteranceThresholdLevel::Low),
+                            timeout_ms: Some(5000),
+                        }),
+                        // Follow the --language flag instead of pinning the turn detector to
+                        // one hard-coded locale.
+                        languages: Some(vec![language]),
+                        ..Default::default()
+                    },
+                )),
+            };
+            MicrosoftVoiceLiveTranscribe
+                .conversation(params, conversation)
+                .await
+        }
     }
 }
diff --git a/external/openai-api-rs b/external/openai-api-rs
index b07e84f..65afe2b 160000
--- a/external/openai-api-rs
+++ b/external/openai-api-rs
@@ -1 +1 @@
-Subproject commit b07e84f8598568b032134f4139804636bab8529e
+Subproject commit 65afe2b1f02636a51896fe57678046a26ed6a9a8
diff --git a/justfile b/justfile
index e43db93..6a026aa 100644
--- a/justfile
+++ b/justfile
@@ -25,6 +25,9 @@ transcribe-google-latest-short:
 transcribe-google-latest-long:
     cargo run --example transcribe -- google --language de-DE --model latest_long --region eu
 
+transcribe-voice-live-de:
+    cargo run --example transcribe -- voice-live --language de-DE
+
 transcribe-google-diarization:
     cargo run --example transcribe -- google --diarization --language de-DE --model chirp_3 --region eu
 
diff --git
a/services/microsoft-voice-live/Cargo.toml b/services/microsoft-voice-live/Cargo.toml
new file mode 100644
index 0000000..b2f2acf
--- /dev/null
+++ b/services/microsoft-voice-live/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "microsoft-voice-live"
+version = "0.1.0"
+edition.workspace = true
+
+[dependencies]
+context-switch-core = { workspace = true }
+
+openai-api-rs = { workspace = true }
+
+tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] }
+
+tracing = { workspace = true }
+
+anyhow = { workspace = true }
+futures = { workspace = true }
+tokio = { workspace = true, features = ["net"] }
+serde_json = { workspace = true }
+base64 = { workspace = true }
+serde = { workspace = true, features = ["derive"] }
+async-trait = { workspace = true }
+url = { workspace = true }
diff --git a/services/microsoft-voice-live/src/client.rs b/services/microsoft-voice-live/src/client.rs
new file mode 100644
index 0000000..c874111
--- /dev/null
+++ b/services/microsoft-voice-live/src/client.rs
@@ -0,0 +1,359 @@
+use std::collections::HashMap;
+
+use anyhow::{Context, Result, bail};
+use base64::prelude::*;
+use futures::stream::{SplitSink, SplitStream};
+use futures::{SinkExt, StreamExt};
+use openai_api_rs::realtime::client_event::{self, ClientEvent};
+use openai_api_rs::realtime::server_event::ServerEvent;
+use openai_api_rs::realtime::types::{self, AzureSemanticVadConfig, TurnDetection};
+use tokio::{net::TcpStream, select};
+use tokio_tungstenite::tungstenite::{Bytes, protocol::Message};
+use tokio_tungstenite::{MaybeTlsStream, WebSocketStream};
+use tracing::{debug, info, trace, warn};
+
+use context_switch_core::{
+    AudioFormat, AudioFrame, BillingRecord, BillingSchedule, ConversationInput, ConversationOutput,
+    Input, OutputPath, audio,
+};
+
+use crate::transcribe::{Params, ServiceOutputEvent};
+use crate::transcription::TranscriptionState;
+
+/// Voice Live websocket session: streams input audio to the service and forwards the
+/// transcription text and turn-detection events it reports to the conversation output.
+pub struct Client {
+    /// Receiving half of the split websocket.
+    read: SplitStream<WebSocketStream<MaybeTlsStream<TcpStream>>>,
+    /// Sending half of the split websocket.
+    write: SplitSink<WebSocketStream<MaybeTlsStream<TcpStream>>, Message>,
+    /// Accumulates partial input transcripts until the server completes them.
+    transcription_state: TranscriptionState,
+    /// Most recent speaker reported per item via transcription segments. Applied to the final
+    /// transcript on completion, which does not carry speaker attribution itself.
+    segment_speakers: HashMap<String, String>,
+}
+
+impl Client {
+    /// Wraps the two halves of an already connected websocket.
+    pub(crate) fn new(
+        read: SplitStream<WebSocketStream<MaybeTlsStream<TcpStream>>>,
+        write: SplitSink<WebSocketStream<MaybeTlsStream<TcpStream>>, Message>,
+    ) -> Self {
+        Self {
+            read,
+            write,
+            transcription_state: TranscriptionState::default(),
+            segment_speakers: HashMap::new(),
+        }
+    }
+
+    /// Runs a transcription session until the input channel closes, the server closes the
+    /// connection, or an error occurs.
+    ///
+    /// Waits for the server's session-created event, sends the session configuration, then
+    /// forwards input audio frames (one `input:audio` billing record per frame) while
+    /// processing server messages.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `input_format` differs from `AudioFormat::new(1, 24000)`, if the websocket
+    /// fails, or if the server reports an error.
+    pub async fn transcribe(
+        &mut self,
+        input_format: AudioFormat,
+        params: Params,
+        mut input: ConversationInput,
+        output: ConversationOutput,
+    ) -> Result<()> {
+        let expected_format = AudioFormat::new(1, 24000);
+        if input_format != expected_format {
+            bail!(
+                "Audio input has the wrong format {input_format:?}, expected: {expected_format:?}"
+            );
+        }
+
+        // Wait for the created event before configuring the session.
+        let message = self.read.next().await;
+        Self::verify_session_created_event(message)?;
+        debug!("Session created");
+
+        self.send_session_update(&params).await?;
+        debug!("Session updated");
+
+        let language = params.language.clone();
+
+        loop {
+            select! {
+                input = input.recv() => {
+                    match input {
+                        Some(Input::Audio { frame }) => {
+                            let duration = frame.duration();
+                            self.send_frame(frame).await?;
+                            output.billing_records(
+                                None,
+                                None,
+                                [BillingRecord::duration("input:audio", duration)],
+                                BillingSchedule::Now,
+                            )?;
+                        }
+                        Some(_) => warn!("Unexpected non-audio input"),
+                        // Input channel closed: end the session.
+                        None => break,
+                    }
+                }
+
+                message = self.read.next() => {
+                    match message {
+                        Some(Ok(message)) => {
+                            match self.process_message(message, &output, language.as_deref()).await? {
+                                FlowControl::End => break,
+                                FlowControl::PongAndContinue(payload) => {
+                                    self.write.send(Message::Pong(payload)).await?;
+                                }
+                                FlowControl::Continue => {}
+                            }
+                        }
+                        Some(Err(e)) => bail!(e),
+                        // End of stream.
+                        None => break,
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Builds the Voice Live session configuration from `params`, logs it, and sends it as a
+    /// session-update event. Turn detection is routed through `transcription_turn_detection`.
+    async fn send_session_update(&mut self, params: &Params) -> Result<()> {
+        let session = types::VoiceLiveSession {
+            input_audio_sampling_rate: None,
+            input_audio_noise_reduction: params.noise_reduction.clone(),
+            input_audio_echo_cancellation: None,
+            input_audio_transcription: Some(types::TranscriptionConfig {
+                language: params.language.clone(),
+                model: params.transcription_model.clone(),
+                prompt: None,
+            }),
+            turn_detection: Some(transcription_turn_detection(params.turn_detection.clone())),
+        };
+
+        log_requested_session_update(&session);
+
+        self.send_client_event(ClientEvent::SessionUpdate(client_event::SessionUpdate {
+            event_id: None,
+            session: client_event::SessionUpdatePayload::VoiceLive(session),
+        }))
+        .await
+    }
+
+    /// Handles one websocket message and tells the caller how to proceed.
+    ///
+    /// Text messages are deserialized and passed to `handle_server_event`, pings ask the caller
+    /// to reply with a pong, pongs are ignored, and a close frame ends the session.
+    ///
+    /// # Errors
+    ///
+    /// Fails on text that does not deserialize, on errors from `handle_server_event`, and on
+    /// any other message kind (for example binary frames).
+    async fn process_message(
+        &mut self,
+        message: Message,
+        output: &ConversationOutput,
+        language: Option<&str>,
+    ) -> Result<FlowControl> {
+        match message {
+            Message::Text(str) => {
+                let event = serde_json::from_str(&str)
+                    .with_context(|| format!("Deserialization failed: `{str}`"))?;
+                self.handle_server_event(event, output, language, Some(&str))
+                    .await?;
+                Ok(FlowControl::Continue)
+            }
+            Message::Ping(data) => Ok(FlowControl::PongAndContinue(data)),
+            // Unsolicited pongs are permitted keep-alive traffic (RFC 6455, section 5.5.3), so
+            // they must not tear down the session.
+            Message::Pong(_) => Ok(FlowControl::Continue),
+            Message::Close(_) => Ok(FlowControl::End),
+            msg => bail!("Unhandled websocket message: {msg:?}"),
+        }
+    }
+
+    /// Translates one server event into conversation output.
+    ///
+    /// Speech and segment events are emitted on the control path as `ServiceOutputEvent`s,
+    /// transcription deltas and completions become text output, and failures reported by the
+    /// server abort the session with an error. `raw_message`, when present, is only used to
+    /// log the raw `session.updated` payload.
+    async fn handle_server_event(
+        &mut self,
+        event: ServerEvent,
+        output: &ConversationOutput,
+        language: Option<&str>,
+        raw_message: Option<&str>,
+    ) -> Result<()> {
+        match event {
+            ServerEvent::SessionUpdated(e) => {
+                if let Some(message) = raw_message {
+                    info!(session_updated_raw = %message, "Raw session.updated from server");
+                }
+                log_confirmed_session_from_server(&e.session);
+                debug!("Session update acknowledged");
+            }
+
+            ServerEvent::InputAudioBufferSpeechStarted(e) => {
+                output.service_event(
+                    OutputPath::Control,
+                    ServiceOutputEvent::SpeechStarted {
+                        audio_start_ms: e.audio_start_ms,
+                    },
+                )?;
+            }
+
ServerEvent::InputAudioBufferSpeechStopped(e) => { + output.service_event( + OutputPath::Control, + ServiceOutputEvent::SpeechStopped { + audio_end_ms: e.audio_end_ms, + }, + )?; + } + ServerEvent::InputAudioBufferCommited(e) => { + output.service_event( + OutputPath::Control, + ServiceOutputEvent::SpeechCommitted { item_id: e.item_id }, + )?; + } + ServerEvent::InputAudioBufferTimeoutTriggered(e) => { + output.service_event( + OutputPath::Control, + ServiceOutputEvent::SpeechTimeout { + audio_start_ms: e.audio_start_ms, + audio_end_ms: e.audio_end_ms, + }, + )?; + } + + ServerEvent::ConversationItemInputAudioTranscriptionDelta(e) => { + let text = + self.transcription_state + .apply_input_delta(e.item_id, e.content_index, e.delta); + output.text(false, text, language.map(str::to_string), None)?; + } + ServerEvent::ConversationItemInputAudioTranscriptionCompleted(e) => { + let speaker = self.segment_speakers.remove(&e.item_id); + if let Some(text) = self.transcription_state.complete_input_transcription( + e.item_id, + e.content_index, + e.transcript, + ) { + output.text(true, text, language.map(str::to_string), speaker)?; + } + } + ServerEvent::ConversationItemInputAudioTranscriptionSegment(e) => { + if let Some(speaker) = &e.speaker { + self.segment_speakers.insert(e.item_id, speaker.clone()); + } + output.service_event( + OutputPath::Control, + ServiceOutputEvent::Segment { + start: e.start, + end: e.end, + text: e.text, + speaker: e.speaker, + }, + )?; + } + ServerEvent::ConversationItemInputAudioTranscriptionFailed(e) => { + bail!("Input audio transcription failed: {}", e.error.message); + } + + ServerEvent::Error(e) => bail!("Voice Live error: {}", e.error.message), + + other => trace!("Ignoring server event: {other:?}"), + } + + Ok(()) + } + + async fn send_frame(&mut self, frame: AudioFrame) -> Result<()> { + let mono = frame.into_mono(); + let samples_le = audio::to_le_bytes(mono.samples); + + let event = client_event::InputAudioBufferAppend { + event_id: 
None, + audio: BASE64_STANDARD.encode(samples_le), + }; + self.send_client_event(ClientEvent::InputAudioBufferAppend(event)) + .await + } + + async fn send_client_event(&mut self, client_event: ClientEvent) -> Result<()> { + let json = serde_json::to_string(&client_event)?; + self.write.send(Message::Text(json.into())).await?; + Ok(()) + } + + fn verify_session_created_event( + message: Option>, + ) -> Result<()> { + let Some(message) = message else { + bail!("Failed to receive the initial message"); + }; + let Message::Text(message) = message? else { + bail!("Expected a text message for session creation"); + }; + + match serde_json::from_str(&message)? { + ServerEvent::SessionCreated(_) => Ok(()), + ServerEvent::Error(e) => bail!("Failed to create the session: {}", e.error.message), + other => bail!("Unexpected event in response to session creation: {other:?}"), + } + } +} + +/// Produces the turn-detection configuration for transcription. Responses are always suppressed +/// (`create_response = false`) because this service only transcribes; a missing configuration +/// defaults to Azure semantic VAD. 
+fn transcription_turn_detection(configured: Option) -> TurnDetection { + let mut detection = configured + .unwrap_or_else(|| TurnDetection::AzureSemanticVad(AzureSemanticVadConfig::default())); + match &mut detection { + TurnDetection::ServerVAD(config) => { + config.create_response = false; + } + TurnDetection::SemanticVAD(config) => config.create_response = false, + TurnDetection::AzureSemanticVad(config) => { + config.create_response = Some(false); + } + TurnDetection::AzureSemanticVadMultilingual(config) => { + config.create_response = Some(false); + } + } + detection +} + +enum FlowControl { + Continue, + End, + PongAndContinue(Bytes), +} + +fn log_confirmed_session_from_server(session: &types::UntaggedSession) { + match serde_json::to_string_pretty(session) { + Ok(session_json) => { + info!(session_confirmed = %session_json, "Confirmed session from server") + } + Err(error) => warn!(?error, "Failed to serialize confirmed session from server"), + } +} + +fn log_requested_session_update(session: &types::VoiceLiveSession) { + match serde_json::to_string(session) { + Ok(session_json) => { + info!(session_requested = %session_json, "Requested session update sent to server") + } + Err(error) => warn!(?error, "Failed to serialize requested session update"), + } +} diff --git a/services/microsoft-voice-live/src/host.rs b/services/microsoft-voice-live/src/host.rs new file mode 100644 index 0000000..aa5d056 --- /dev/null +++ b/services/microsoft-voice-live/src/host.rs @@ -0,0 +1,50 @@ +use anyhow::{Context, Result, anyhow}; +use openai_api_rs::realtime::api::{RealtimeClient, RealtimeProtocol}; +use url::Url; + +use crate::client::Client; + +pub struct Host { + client: RealtimeClient, +} + +impl Host { + pub fn new(endpoint: &str, api_key: &str, model: &str, api_version: &str) -> Result { + let wss_url = build_voice_live_url(endpoint, model, api_version)?; + // Reuse the Azure realtime auth behavior (api-key query, no bearer header). 
The full URL, + // including the Voice Live path and `api-version`, is precomputed here, so an empty model + // is passed to keep the client from appending a second `model` query parameter. + let client = RealtimeClient::new_with_endpoint_and_protocol( + wss_url, + api_key.into(), + String::new(), + RealtimeProtocol::Azure, + ); + Ok(Self { client }) + } + + pub async fn connect(&self) -> Result { + let (write, read) = self + .client + .connect() + .await + .map_err(|e| anyhow!(e.to_string()))?; + Ok(Client::new(read, write)) + } +} + +fn build_voice_live_url(endpoint: &str, model: &str, api_version: &str) -> Result { + let mut url = Url::parse(endpoint.trim()) + .with_context(|| format!("Invalid Voice Live endpoint URL: {endpoint}"))?; + + match url.scheme() { + "wss" => {} + scheme => anyhow::bail!("Unsupported Voice Live endpoint URL scheme: {scheme}. Use wss://"), + } + + url.set_query(None); + url.query_pairs_mut() + .append_pair("api-version", api_version) + .append_pair("model", model); + Ok(url.to_string()) +} diff --git a/services/microsoft-voice-live/src/lib.rs b/services/microsoft-voice-live/src/lib.rs new file mode 100644 index 0000000..cb4b533 --- /dev/null +++ b/services/microsoft-voice-live/src/lib.rs @@ -0,0 +1,6 @@ +mod client; +mod host; +mod transcribe; +mod transcription; + +pub use transcribe::{MicrosoftVoiceLiveTranscribe, Params, ServiceOutputEvent}; diff --git a/services/microsoft-voice-live/src/transcribe.rs b/services/microsoft-voice-live/src/transcribe.rs new file mode 100644 index 0000000..cd225dd --- /dev/null +++ b/services/microsoft-voice-live/src/transcribe.rs @@ -0,0 +1,90 @@ +use anyhow::Result; +use async_trait::async_trait; +use openai_api_rs::realtime::types::{NoiseReduction, TurnDetection}; +use serde::{Deserialize, Serialize}; + +use context_switch_core::{Conversation, Service}; + +use crate::host::Host; + +/// Default Voice Live API version. Newer resources require an explicit `api-version`. 
+const DEFAULT_API_VERSION: &str = "2026-06-01-preview"; +/// Default transcription model. `azure-speech` is Azure's native speech-to-text engine. +const DEFAULT_TRANSCRIPTION_MODEL: &str = "azure-speech"; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Params { + pub api_key: String, + /// Resource endpoint URL. Must be a full `wss://` URL. + /// The path is used exactly as provided; only the `api-version` and `model` query + /// parameters are set. + pub endpoint: String, + /// Realtime model used for the Voice Live session (URL `model` query parameter). + pub model: String, + pub api_version: Option, + /// Transcription model set in `audio.input.transcription.model`. + #[serde(default = "default_transcription_model")] + pub transcription_model: String, + /// Input audio language hint in ISO-639-1 form (e.g. `en`). + pub language: Option, + /// Input-audio noise reduction (Azure deep noise suppression, near/far field). + pub noise_reduction: Option, + /// Turn-detection configuration (Azure semantic VAD, server VAD, ...). Defaults to Azure + /// semantic VAD with responses suppressed when omitted. + pub turn_detection: Option, +} + +#[derive(Debug)] +pub struct MicrosoftVoiceLiveTranscribe; + +#[async_trait] +impl Service for MicrosoftVoiceLiveTranscribe { + type Params = Params; + + async fn conversation(&self, params: Params, conversation: Conversation) -> Result<()> { + let input_format = conversation.require_audio_input()?; + conversation.require_text_output(true)?; + + let host = Host::new( + ¶ms.endpoint, + ¶ms.api_key, + ¶ms.model, + params.api_version.as_deref().unwrap_or(DEFAULT_API_VERSION), + )?; + let mut client = host.connect().await?; + + let (input, output) = conversation.start()?; + client.transcribe(input_format, params, input, output).await + } +} + +/// Turn-detection and segmentation signals surfaced on the control output path. 
These give the +/// caller full visibility into what the turn detector reports, beyond the final transcript text. +#[derive(Debug, Serialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum ServiceOutputEvent { + SpeechStarted { + audio_start_ms: u32, + }, + SpeechStopped { + audio_end_ms: u32, + }, + SpeechCommitted { + item_id: String, + }, + SpeechTimeout { + audio_start_ms: u32, + audio_end_ms: u32, + }, + Segment { + start: f64, + end: f64, + text: String, + speaker: Option, + }, +} + +fn default_transcription_model() -> String { + DEFAULT_TRANSCRIPTION_MODEL.to_string() +} diff --git a/services/microsoft-voice-live/src/transcription.rs b/services/microsoft-voice-live/src/transcription.rs new file mode 100644 index 0000000..51b579f --- /dev/null +++ b/services/microsoft-voice-live/src/transcription.rs @@ -0,0 +1,57 @@ +// TODO(merge): extract to openai-realtime-core, shared with `openai-dialog`'s transcription state. +// Voice Live is transcription-only, so only the input-side buffers are duplicated here. 
+ +use std::collections::HashMap; + +#[derive(Debug, Default)] +pub struct TranscriptionState { + input_transcription_buffers: HashMap, +} + +impl TranscriptionState { + pub fn apply_input_delta( + &mut self, + item_id: String, + content_index: u32, + delta: String, + ) -> String { + let key = InputTranscriptionKey::new(item_id, content_index); + let entry = self.input_transcription_buffers.entry(key).or_default(); + entry.push_str(&delta); + entry.clone() + } + + pub fn complete_input_transcription( + &mut self, + item_id: String, + content_index: u32, + transcript: String, + ) -> Option { + let key = InputTranscriptionKey::new(item_id, content_index); + let text = if transcript.is_empty() { + self.input_transcription_buffers + .remove(&key) + .unwrap_or_default() + } else { + self.input_transcription_buffers.remove(&key); + transcript + }; + + (!text.is_empty()).then_some(text) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct InputTranscriptionKey { + item_id: String, + content_index: u32, +} + +impl InputTranscriptionKey { + fn new(item_id: String, content_index: u32) -> Self { + Self { + item_id, + content_index, + } + } +} diff --git a/services/openai-dialog/Cargo.toml b/services/openai-dialog/Cargo.toml index dbff691..9bcca22 100644 --- a/services/openai-dialog/Cargo.toml +++ b/services/openai-dialog/Cargo.toml @@ -12,7 +12,7 @@ context-switch-core = { workspace = true } openai-api-rs = { workspace = true } -tokio-tungstenite = { version = "0.28.0", features = ["connect", "native-tls"] } +tokio-tungstenite = { version = "0.29.0", features = ["connect", "native-tls"] } tracing = { workspace = true } diff --git a/src/context_switch.rs b/src/context_switch.rs index a2abd40..776d192 100644 --- a/src/context_switch.rs +++ b/src/context_switch.rs @@ -42,6 +42,10 @@ pub fn registry() -> Registry { .add_service("azure-translate", azure::AzureTranslate) .add_service("elevenlabs-transcribe", elevenlabs::ElevenLabsTranscribe) 
.add_service("google-transcribe", google_transcribe::GoogleTranscribe) + .add_service( + "microsoft-voice-live-transcribe", + microsoft_voice_live::MicrosoftVoiceLiveTranscribe, + ) .add_service("openai-dialog", openai_dialog::OpenAIDialog) .add_service("google-dialog", google_dialog::GoogleDialog) .add_service("aristech-transcribe", aristech::AristechTranscribe) diff --git a/src/lib.rs b/src/lib.rs index 51e3862..a749929 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,4 +17,5 @@ pub mod services { pub use elevenlabs::ElevenLabsTranscribe; pub use google_dialog::GoogleDialog; pub use google_transcribe::GoogleTranscribe; + pub use microsoft_voice_live::MicrosoftVoiceLiveTranscribe; }