Skip to content

Commit f5411ff

Browse files
committed
feat(agent): finish Phase 1 with examples and integration tests
Wraps up Phase 1 of the spec-coverage rollout: - Agent::start_at_url(&str) — public URL override on the agent sub-client. start() now delegates to it. Used by self-hosted deployments and the new integration tests. - ThinkSettings and ThinkFunction gain builder methods so external consumers can construct them around the #[non_exhaustive] marker: ThinkSettings::{with_endpoint, with_functions, with_function, with_prompt, with_context_length} and ThinkFunction::{new, with_endpoint}. - Three new examples in examples/agent/websocket/: * function_calling.rs — client-side function definition, FunctionCallRequest dispatch, FunctionCallResponse reply. * dynamic_provider_swap.rs — UpdateSpeak mid-session to swap the Aura voice while the connection stays open. * microphone_agent.rs — full two-way audio session via cpal (capture) and rodio (playback). Audio I/O patterns borrowed from microphone_flux.rs and text_to_speech_to_stream.rs. - tests/agent_integration.rs uses a tokio-tungstenite mock server to exercise the full session lifecycle end-to-end: Welcome → client Settings → SettingsApplied → ConversationText → binary audio → AgentAudioDone → client close. The mock injects dg-request-id on the upgrade response and asserts on the serialized Settings the client sends. A second test verifies that unknown server JSON events round-trip through the AgentResponse::Unknown(Value) forward-compat variant. 136 tests pass (lib + doctest + integration), all 4 examples build, both with default features and with --no-default-features --features agent.
1 parent 32bfafa commit f5411ff

7 files changed

Lines changed: 1004 additions & 2 deletions

File tree

Cargo.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,18 @@ required-features = ["speak"]
134134
name = "agent_simple"
135135
path = "examples/agent/websocket/simple_agent.rs"
136136
required-features = ["agent"]
137+
138+
[[example]]
139+
name = "agent_function_calling"
140+
path = "examples/agent/websocket/function_calling.rs"
141+
required-features = ["agent"]
142+
143+
[[example]]
144+
name = "agent_dynamic_provider_swap"
145+
path = "examples/agent/websocket/dynamic_provider_swap.rs"
146+
required-features = ["agent"]
147+
148+
[[example]]
149+
name = "agent_microphone"
150+
path = "examples/agent/websocket/microphone_agent.rs"
151+
required-features = ["agent"]
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/* Expected result from running this example program.
2+
Connected. dg-request-id: Some(<uuid>)
3+
Welcome request_id: <uuid>
4+
Settings applied
5+
Conversation (Assistant): I'm using my first voice.
6+
[5s elapsed] Sending UpdateSpeak to switch to a different Aura-2 voice...
7+
SpeakUpdated
8+
Conversation (Assistant): Now my voice has changed.
9+
*/
10+
11+
//! Dynamic provider swap example.
12+
//!
13+
//! Connects with one Speak provider (`aura-2-thalia-en`), then 5 seconds
14+
//! after the agent's initial greeting, sends an `UpdateSpeak` message to
15+
//! swap the voice to `aura-2-zeus-en`. Demonstrates that providers can
16+
//! be changed mid-session without dropping the WebSocket.
17+
//!
18+
//! No real audio I/O — the example surfaces the JSON message round-trip
19+
//! and the `SpeakUpdated` confirmation event.
20+
//!
21+
//! Run with:
22+
//!
23+
//! ```bash
24+
//! DEEPGRAM_API_KEY=<your-key> \
25+
//! cargo run --features agent --example agent_dynamic_provider_swap
26+
//! ```
27+
28+
use std::env;
29+
use std::time::Duration;
30+
31+
use futures::stream::StreamExt;
32+
33+
use deepgram::agent::{
34+
audio::{AudioConfig, AudioInput, AudioInputEncoding},
35+
listen::{AgentListenProvider, AgentListenSettings, DeepgramListenV2Provider},
36+
messages::UpdateSpeakMessage,
37+
settings::{AgentConfig, InlineAgentConfig, SettingsMessage},
38+
speak::{DeepgramSpeakModel, DeepgramSpeakProvider, SpeakProvider, SpeakSettings},
39+
think::{OpenAiModel, OpenAiThinkProvider, ThinkProvider, ThinkSettings},
40+
AgentEvent, AgentResponse,
41+
};
42+
use deepgram::{Deepgram, DeepgramError};
43+
44+
static SESSION_DURATION: Duration = Duration::from_secs(60);
45+
static SWAP_AFTER: Duration = Duration::from_secs(5);
46+
47+
#[tokio::main]
48+
async fn main() -> Result<(), DeepgramError> {
49+
let api_key = env::var("DEEPGRAM_API_KEY").expect("DEEPGRAM_API_KEY environment variable");
50+
51+
let dg = Deepgram::new(&api_key)?;
52+
let (mut handle, mut events) = dg.agent().start().await?;
53+
54+
println!("Connected. dg-request-id: {:?}", handle.request_id());
55+
56+
let settings = SettingsMessage::new(
57+
AudioConfig::new(
58+
Some(AudioInput::new(AudioInputEncoding::Linear16, 16_000)),
59+
None,
60+
),
61+
AgentConfig::inline(
62+
InlineAgentConfig::from_parts(
63+
AgentListenSettings::new(AgentListenProvider::DeepgramV2(
64+
DeepgramListenV2Provider::new("flux-general-en"),
65+
)),
66+
ThinkSettings::new(ThinkProvider::OpenAi(OpenAiThinkProvider::new(
67+
OpenAiModel::Gpt4oMini,
68+
))),
69+
SpeakSettings::new(SpeakProvider::Deepgram(DeepgramSpeakProvider::new(
70+
DeepgramSpeakModel::Aura2ThaliaEn,
71+
))),
72+
)
73+
.with_greeting("I'm using my first voice."),
74+
),
75+
);
76+
handle.send_settings(settings).await?;
77+
78+
let timeout = tokio::time::sleep(SESSION_DURATION);
79+
tokio::pin!(timeout);
80+
let swap_timer = tokio::time::sleep(SWAP_AFTER);
81+
tokio::pin!(swap_timer);
82+
let mut swapped = false;
83+
84+
loop {
85+
tokio::select! {
86+
_ = &mut timeout => {
87+
println!("\nSession duration reached, closing.");
88+
break;
89+
}
90+
_ = &mut swap_timer, if !swapped => {
91+
println!(
92+
"\n[{}s elapsed] Sending UpdateSpeak to switch to a different Aura-2 voice...",
93+
SWAP_AFTER.as_secs()
94+
);
95+
let new_speak = SpeakSettings::new(SpeakProvider::Deepgram(
96+
DeepgramSpeakProvider::new(DeepgramSpeakModel::Aura2ZeusEn),
97+
));
98+
handle
99+
.send_update_speak(UpdateSpeakMessage::one(new_speak))
100+
.await?;
101+
// Optional follow-up: inject an utterance so the agent
102+
// speaks again with the new voice. Without this the user
103+
// would have to talk for the swap to be audible.
104+
handle
105+
.send_inject_agent_message(
106+
deepgram::agent::messages::InjectAgentMessageMessage::new(
107+
"Now my voice has changed.",
108+
),
109+
)
110+
.await?;
111+
swapped = true;
112+
}
113+
event = events.next() => {
114+
match event {
115+
Some(Ok(AgentEvent::Json(response))) => match response {
116+
AgentResponse::Welcome(w) => {
117+
println!("Welcome request_id: {}", w.request_id);
118+
}
119+
AgentResponse::SettingsApplied(_) => {
120+
println!("Settings applied");
121+
}
122+
AgentResponse::SpeakUpdated(_) => {
123+
println!("SpeakUpdated");
124+
}
125+
AgentResponse::ConversationText(c) => {
126+
println!("Conversation ({:?}): {}", c.role, c.content);
127+
}
128+
AgentResponse::Error(e) => {
129+
eprintln!("Error [{}]: {}", e.code, e.description);
130+
break;
131+
}
132+
AgentResponse::Warning(w) => {
133+
println!("Warning [{}]: {}", w.code, w.description);
134+
}
135+
_ => {}
136+
},
137+
Some(Ok(AgentEvent::Audio(_))) => {
138+
// Discard audio for brevity; the playback path is
139+
// demonstrated in the microphone example.
140+
}
141+
Some(Ok(_)) => {} // AgentEvent #[non_exhaustive]
142+
Some(Err(err)) => {
143+
eprintln!("Stream error: {err}");
144+
break;
145+
}
146+
None => {
147+
println!("Server closed connection.");
148+
break;
149+
}
150+
}
151+
}
152+
}
153+
}
154+
155+
handle.close().await?;
156+
Ok(())
157+
}
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
/* Expected result from running this example program.
2+
Connected. dg-request-id: Some(<uuid>)
3+
Welcome request_id: <uuid>
4+
Settings applied
5+
Conversation (Assistant): Hi! Ask me about the weather.
6+
... (after a user injection asking about NYC)
7+
FunctionCallRequest: get_weather (id=fc_1, client_side=true)
8+
arguments: {"city":"New York"}
9+
→ responding with: {"temperature":72,"condition":"sunny"}
10+
Conversation (Assistant): It's 72 and sunny in New York.
11+
*/
12+
13+
//! Function-calling Voice Agent example.
14+
//!
15+
//! Configures an agent with a single client-side function (`get_weather`),
16+
//! injects a synthetic user utterance asking about the weather, and
17+
//! responds to the resulting [`FunctionCallRequest`] with a canned
18+
//! `FunctionCallResponse`.
19+
//!
20+
//! No real audio I/O — the example exercises the JSON message round-trip
21+
//! and the function-call protocol on its own.
22+
//!
23+
//! Run with:
24+
//!
25+
//! ```bash
26+
//! DEEPGRAM_API_KEY=<your-key> \
27+
//! cargo run --features agent --example agent_function_calling
28+
//! ```
29+
30+
use std::env;
31+
use std::time::Duration;
32+
33+
use futures::stream::StreamExt;
34+
use serde_json::json;
35+
36+
use deepgram::agent::messages::FunctionCallResponseMessage;
37+
use deepgram::agent::{
38+
audio::{AudioConfig, AudioInput, AudioInputEncoding},
39+
listen::{AgentListenProvider, AgentListenSettings, DeepgramListenV2Provider},
40+
settings::{AgentConfig, InlineAgentConfig, SettingsMessage},
41+
speak::{DeepgramSpeakModel, DeepgramSpeakProvider, SpeakProvider, SpeakSettings},
42+
think::{OpenAiModel, OpenAiThinkProvider, ThinkFunction, ThinkProvider, ThinkSettings},
43+
AgentEvent, AgentResponse,
44+
};
45+
use deepgram::{Deepgram, DeepgramError};
46+
47+
static SESSION_DURATION: Duration = Duration::from_secs(60);
48+
49+
#[tokio::main]
50+
async fn main() -> Result<(), DeepgramError> {
51+
let api_key = env::var("DEEPGRAM_API_KEY").expect("DEEPGRAM_API_KEY environment variable");
52+
53+
let dg = Deepgram::new(&api_key)?;
54+
let (mut handle, mut events) = dg.agent().start().await?;
55+
56+
println!("Connected. dg-request-id: {:?}", handle.request_id());
57+
58+
// No `endpoint` → executed client-side. The server emits a
59+
// FunctionCallRequest and waits for our FunctionCallResponse.
60+
let weather_function = ThinkFunction::new(
61+
"get_weather",
62+
"Look up the current weather for a city.",
63+
json!({
64+
"type": "object",
65+
"properties": {
66+
"city": {
67+
"type": "string",
68+
"description": "City name, e.g. \"New York\""
69+
}
70+
},
71+
"required": ["city"]
72+
}),
73+
);
74+
75+
let think = ThinkSettings::new(ThinkProvider::OpenAi(OpenAiThinkProvider::new(
76+
OpenAiModel::Gpt4oMini,
77+
)))
78+
.with_function(weather_function)
79+
.with_prompt(
80+
"You are a helpful weather assistant. Use the get_weather function \
81+
when the user asks about weather.",
82+
);
83+
84+
let settings = SettingsMessage::new(
85+
AudioConfig::new(
86+
Some(AudioInput::new(AudioInputEncoding::Linear16, 16_000)),
87+
None,
88+
),
89+
AgentConfig::inline(
90+
InlineAgentConfig::from_parts(
91+
AgentListenSettings::new(AgentListenProvider::DeepgramV2(
92+
DeepgramListenV2Provider::new("flux-general-en"),
93+
)),
94+
think,
95+
SpeakSettings::new(SpeakProvider::Deepgram(DeepgramSpeakProvider::new(
96+
DeepgramSpeakModel::Aura2ThaliaEn,
97+
))),
98+
)
99+
.with_greeting("Hi! Ask me about the weather."),
100+
),
101+
);
102+
handle.send_settings(settings).await?;
103+
104+
// Wait for SettingsApplied, then inject a synthetic user message
105+
// so the agent has something to respond to without us needing a mic.
106+
let mut injected = false;
107+
108+
let timeout = tokio::time::sleep(SESSION_DURATION);
109+
tokio::pin!(timeout);
110+
111+
loop {
112+
tokio::select! {
113+
_ = &mut timeout => {
114+
println!("\nSession duration reached, closing.");
115+
break;
116+
}
117+
event = events.next() => {
118+
match event {
119+
Some(Ok(AgentEvent::Json(response))) => match response {
120+
AgentResponse::Welcome(w) => {
121+
println!("Welcome request_id: {}", w.request_id);
122+
}
123+
AgentResponse::SettingsApplied(_) => {
124+
println!("Settings applied");
125+
if !injected {
126+
injected = true;
127+
handle
128+
.send_inject_user_message(
129+
deepgram::agent::messages::InjectUserMessageMessage::new(
130+
"What's the weather in New York?",
131+
),
132+
)
133+
.await?;
134+
}
135+
}
136+
AgentResponse::ConversationText(c) => {
137+
println!("Conversation ({:?}): {}", c.role, c.content);
138+
}
139+
AgentResponse::FunctionCallRequest(req) => {
140+
for call in &req.functions {
141+
println!(
142+
"FunctionCallRequest: {} (id={}, client_side={})",
143+
call.name, call.id, call.client_side
144+
);
145+
println!(" arguments: {}", call.arguments);
146+
147+
if call.client_side && call.name == "get_weather" {
148+
// Canned response — in a real app, parse
149+
// call.arguments and dispatch to your
150+
// function implementation.
151+
let result =
152+
json!({"temperature": 72, "condition": "sunny"});
153+
println!(" → responding with: {result}");
154+
handle
155+
.send_function_call_response(
156+
FunctionCallResponseMessage::with_id(
157+
call.id.clone(),
158+
call.name.clone(),
159+
result.to_string(),
160+
),
161+
)
162+
.await?;
163+
}
164+
}
165+
}
166+
AgentResponse::AgentAudioDone(_) => {
167+
// After the agent finishes its audio response we
168+
// could end the demo. For brevity we just log.
169+
println!("Agent audio done");
170+
}
171+
AgentResponse::Error(e) => {
172+
eprintln!("Error [{}]: {}", e.code, e.description);
173+
break;
174+
}
175+
AgentResponse::Warning(w) => {
176+
println!("Warning [{}]: {}", w.code, w.description);
177+
}
178+
_ => {}
179+
},
180+
Some(Ok(AgentEvent::Audio(_))) => {
181+
// Audio chunks arrive between AgentStartedSpeaking and
182+
// AgentAudioDone. Discarded here.
183+
}
184+
Some(Ok(_)) => {} // AgentEvent #[non_exhaustive]
185+
Some(Err(err)) => {
186+
eprintln!("Stream error: {err}");
187+
break;
188+
}
189+
None => {
190+
println!("Server closed connection.");
191+
break;
192+
}
193+
}
194+
}
195+
}
196+
}
197+
198+
handle.close().await?;
199+
Ok(())
200+
}

0 commit comments

Comments
 (0)