Skip to content

Commit ddef790

Browse files
authored
Merge pull request #83 from pragmatrix/deepgram-transcribe
Deepgram transcribe
2 parents 65a4a60 + 7fc6319 commit ddef790

12 files changed

Lines changed: 427 additions & 28 deletions

File tree

.github/copilot-instructions.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
## Control Flow Style
2525
- Prefer exhaustive `match` statements for enum-based control flow instead of `if matches!(...)` shortcuts.
2626

27+
## Streaming Services
28+
- Streaming transcribe/translate services drive the provider response stream and audio forwarding (plus billing) in a single `select!` loop — never a detached forwarder task — so termination and billing stay deterministic.
29+
2730
## Memory Promotion
2831
- When a durable repository-specific preference is learned during a session, write it into this file as a concise bullet if it can help future sessions.
2932
- Keep additions short, actionable, and scoped to coding behavior in this repository.

CONTEXT.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,21 @@ These three Microsoft offerings are distinct and must not all be called "Azure".
2323
Note: Voice Live and Azure OpenAI Realtime are expected to converge over time;
2424
the `microsoft-voice-live` boundary is kept deliberately thin so the two can be
2525
merged later without a rewrite.
26+
27+
- **Deepgram Flux** — Deepgram's turn-based conversational speech-to-text API
28+
(`listen` v2). Unlike classic streaming recognition, it emits per-turn events
29+
rather than a continuous interim/final stream. Backs the `deepgram-service`
30+
crate (`DeepgramTranscribe`, registered as `deepgram-transcribe`).
31+
32+
- **Turn** — a single contiguous span of one speaker talking, as detected by
33+
Flux. A turn accumulates transcript across `Update` events and is closed by an
34+
`EndOfTurn`.
35+
36+
- **End of Turn (EOT)** — Flux's decision that the speaker has finished the
37+
current turn. Governed by `eotThreshold` (confidence to close a turn) and
38+
`eotTimeoutMs` (silence after which a turn is force-closed).
39+
40+
- **Eager End of Turn** — an early, lower-confidence EOT signal (enabled by
41+
`eagerEotThreshold`) that lets a downstream agent begin preparing a reply
42+
before the turn is confirmed. May be retracted by a `TurnResumed` event if the
43+
speaker continues.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ members = [
1212
"filter-test",
1313
"services/aristech",
1414
"services/azure",
15+
"services/deepgram",
1516
"services/elevenlabs",
1617
"services/google-dialog",
1718
"services/google-transcribe",
@@ -37,6 +38,7 @@ google-dialog = { workspace = true }
3738
azure = { workspace = true }
3839
azure-speech = { workspace = true }
3940
aristech = { workspace = true }
41+
deepgram-service = { workspace = true }
4042
elevenlabs = { workspace = true }
4143
google-transcribe = { workspace = true }
4244
microsoft-voice-live = { workspace = true }
@@ -103,6 +105,7 @@ context-switch-core = { path = "core" }
103105
azure = { path = "services/azure" }
104106
playback = { path = "services/playback" }
105107
aristech = { path = "services/aristech" }
108+
deepgram-service = { path = "services/deepgram" }
106109
elevenlabs = { path = "services/elevenlabs" }
107110
google-transcribe = { path = "services/google-transcribe" }
108111
google-dialog = { path = "services/google-dialog" }

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ cargo run --example openai-dialog
6363
# Run generic transcribe example with Azure provider
6464
cargo run --example transcribe -- azure
6565

66+
# Run generic transcribe example with Deepgram provider
67+
cargo run --example transcribe -- deepgram
68+
6669
# Run generic transcribe example with ElevenLabs provider
6770
cargo run --example transcribe -- elevenlabs
6871

@@ -108,6 +111,10 @@ AZURE_REGION=your_azure_region
108111
# ElevenLabs Configuration
109112
ELEVENLABS_API_KEY=your_elevenlabs_key
110113
114+
# Deepgram Configuration
115+
DEEPGRAM_API_KEY=your_deepgram_key
116+
DEEPGRAM_ENDPOINT=wss://api.deepgram.com/v2/listen
117+
111118
# Microsoft Voice Live Configuration
112119
MICROSOFT_VOICE_LIVE_API_KEY=your_voice_live_key
113120
MICROSOFT_VOICE_LIVE_ENDPOINT=wss://your-resource.services.ai.azure.com/voice-live/realtime

examples/transcribe.rs

Lines changed: 96 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,19 @@ use std::time::Duration;
44

55
use anyhow::{Context, Result, bail};
66
use clap::{Parser, ValueEnum};
7+
use tokio::select;
8+
use tokio::sync::mpsc::{channel, unbounded_channel};
9+
710
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
811
use openai_api_rs::realtime::types::{
912
AzureSemanticVadConfig, EndOfUtteranceDetectionConfig, EndOfUtteranceDetectionModel,
1013
EndOfUtteranceThresholdLevel, TurnDetection,
1114
};
1215
use rodio::DeviceSinkBuilder;
13-
use tokio::select;
14-
use tokio::sync::mpsc::{channel, unbounded_channel};
1516

1617
use context_switch::services::{
17-
AristechTranscribe, AzureTranscribe, ElevenLabsTranscribe, GoogleTranscribe,
18-
MicrosoftVoiceLiveTranscribe,
18+
AristechTranscribe, AzureTranscribe, DeepgramTranscribe, ElevenLabsTranscribe,
19+
GoogleTranscribe, MicrosoftVoiceLiveTranscribe,
1920
};
2021
use context_switch::{AudioConsumer, InputModality, OutputModality};
2122
use context_switch_core::language::Languages;
@@ -51,6 +52,8 @@ enum Provider {
5152
Aristech,
5253
#[value(name = "voice-live")]
5354
VoiceLive,
55+
#[value(name = "deepgram")]
56+
Deepgram,
5457
}
5558

5659
#[tokio::main]
@@ -240,11 +243,10 @@ async fn start_conversation(
240243
diarization: bool,
241244
conversation: Conversation,
242245
) -> Result<()> {
246+
validate_provider_args(provider, model, region, diarization)?;
247+
243248
match provider {
244249
Provider::Azure => {
245-
if region.is_some() {
246-
bail!("--region is only supported for the google provider");
247-
}
248250
let params = azure::transcribe::Params {
249251
endpoint: env::var("AZURE_ENDPOINT")
250252
.ok()
@@ -259,12 +261,6 @@ async fn start_conversation(
259261
AzureTranscribe.conversation(params, conversation).await
260262
}
261263
Provider::Elevenlabs => {
262-
if diarization {
263-
bail!("--diarization is only supported for the azure provider");
264-
}
265-
if region.is_some() {
266-
bail!("--region is only supported for the google provider");
267-
}
268264
let language = Some(
269265
languages
270266
.single()
@@ -317,12 +313,6 @@ async fn start_conversation(
317313
GoogleTranscribe.conversation(params, conversation).await
318314
}
319315
Provider::Aristech => {
320-
if diarization {
321-
bail!("--diarization is only supported for the azure provider");
322-
}
323-
if region.is_some() {
324-
bail!("--region is only supported for the google provider");
325-
}
326316
let language = languages
327317
.single()
328318
.context("Aristech provider supports exactly one --language value")?
@@ -351,13 +341,6 @@ async fn start_conversation(
351341
AristechTranscribe.conversation(params, conversation).await
352342
}
353343
Provider::VoiceLive => {
354-
if diarization {
355-
bail!("--diarization is only supported for the azure provider");
356-
}
357-
if region.is_some() {
358-
bail!("--region is only supported for the google provider");
359-
}
360-
361344
let language = Some(
362345
languages
363346
.single()
@@ -383,7 +366,7 @@ async fn start_conversation(
383366
AzureSemanticVadConfig {
384367
end_of_utterance_detection: Some(EndOfUtteranceDetectionConfig {
385368
model: EndOfUtteranceDetectionModel::SmartEndOfTurnDetection,
386-
threshold_level: Some(EndOfUtteranceThresholdLevel::Low),
369+
threshold_level: Some(EndOfUtteranceThresholdLevel::Default),
387370
timeout_ms: Some(5000),
388371
}),
389372
// remove_filler_words: Some(true),
@@ -396,5 +379,91 @@ async fn start_conversation(
396379
.conversation(params, conversation)
397380
.await
398381
}
382+
Provider::Deepgram => {
383+
let params = deepgram_service::transcribe::Params {
384+
api_key: env::var("DEEPGRAM_API_KEY").expect("DEEPGRAM_API_KEY undefined"),
385+
endpoint: env::var("DEEPGRAM_ENDPOINT").expect("DEEPGRAM_ENDPOINT undefined"),
386+
language: languages.join_csv(),
387+
profanity_filter: false,
388+
keyterm: vec![],
389+
turn_detection: deepgram_service::transcribe::TurnDetection::default(),
390+
};
391+
392+
DeepgramTranscribe.conversation(params, conversation).await
393+
}
394+
}
395+
}
396+
397+
/// Optional command-line arguments that a transcription provider accepts.
///
/// Every flag is `true` when the matching option may be supplied for the
/// provider. The derived `Default` leaves all flags `false`, i.e. a provider
/// that accepts none of the optional arguments.
#[derive(Clone, Copy, Debug, Default)]
struct ProviderCapabilities {
    /// Whether `--region` may be supplied.
    region: bool,
    /// Whether `--diarization` may be supplied.
    diarization: bool,
    /// Whether `--model` may be supplied.
    model: bool,
}
403+
404+
impl Provider {
405+
fn capabilities(self) -> ProviderCapabilities {
406+
let mut capabilities = ProviderCapabilities::default();
407+
408+
match self {
409+
Provider::Azure => {
410+
capabilities.diarization = true;
411+
capabilities.model = true;
412+
}
413+
Provider::Deepgram => {}
414+
Provider::Elevenlabs => {
415+
capabilities.model = true;
416+
}
417+
Provider::Google => {
418+
capabilities.region = true;
419+
capabilities.diarization = true;
420+
capabilities.model = true;
421+
}
422+
Provider::Aristech => {
423+
capabilities.model = true;
424+
}
425+
Provider::VoiceLive => {
426+
capabilities.model = true;
427+
}
428+
}
429+
430+
capabilities
399431
}
400432
}
433+
434+
fn validate_capability(
435+
option_name: &str,
436+
is_used: bool,
437+
capability: bool,
438+
provider: Provider,
439+
) -> Result<()> {
440+
if !is_used || capability {
441+
return Ok(());
442+
}
443+
444+
bail!(
445+
"{option_name} is unsupported for provider '{}'",
446+
provider
447+
.to_possible_value()
448+
.expect("Provider has a possible value")
449+
.get_name()
450+
)
451+
}
452+
453+
fn validate_provider_args(
454+
provider: Provider,
455+
model: Option<&str>,
456+
region: Option<&str>,
457+
diarization: bool,
458+
) -> Result<()> {
459+
let capabilities = provider.capabilities();
460+
461+
validate_capability("--model", model.is_some(), capabilities.model, provider)?;
462+
validate_capability("--region", region.is_some(), capabilities.region, provider)?;
463+
validate_capability(
464+
"--diarization",
465+
diarization,
466+
capabilities.diarization,
467+
provider,
468+
)
469+
}

justfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ transcribe-google-latest-long:
2828
transcribe-voice-live-de:
2929
cargo run --example transcribe -- voice-live --language de-DE
3030

31+
transcribe-deepgram-de:
32+
cargo run --example transcribe -- deepgram --language de-DE
33+
3134
transcribe-google-diarization:
3235
cargo run --example transcribe -- google --diarization --language de-DE --model chirp_3 --region eu
3336

services/deepgram/Cargo.toml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
[package]
2+
name = "deepgram-service"
3+
version = "0.1.0"
4+
edition.workspace = true
5+
6+
[dependencies]
7+
context-switch-core = { workspace = true }
8+
9+
deepgram = "0.10.0"
10+
11+
anyhow = { workspace = true }
12+
async-trait = { workspace = true }
13+
bytes = { workspace = true }
14+
futures = { workspace = true }
15+
serde = { workspace = true, features = ["derive"] }
16+
tokio = { workspace = true }
17+
tracing = { workspace = true }

services/deepgram/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
//! A Deepgram Flux speech-to-text service.
2+
3+
pub mod transcribe;
4+
pub use transcribe::DeepgramTranscribe;

0 commit comments

Comments
 (0)