Skip to content

Commit 26374e2

Browse files
committed
feat(agent): add audio, listen, and Settings message types
Adds the audio I/O block, agent.listen provider config, and the top-level SettingsMessage that ties together the previously-landed think/speak/history types.

Audio (mirrors AgentV1SettingsMessage.audio):
- AudioConfig with optional input/output sub-configs.
- AudioInput defaults to linear16 at 24kHz per spec.
- AudioInputEncoding (11 values: linear16, linear32, flac, alaw, mulaw, amr-nb, amr-wb, opus, ogg-opus, speex, g729) + AudioOutputEncoding (linear16, mulaw, alaw, mp3, opus, flac, aac).
- AudioContainer (none, wav, ogg).

Listen (mirrors agent.listen on the spec):
- AgentListenProvider with V1 and V2 (Flux) Deepgram variants.
- Discriminator is the `version` field (v1 default, v2 explicit) via a custom Deserialize. V2 requires `model`; `language_hint` accepts a single string or array on the wire and is modeled as Vec<String>.

SettingsMessage:
- Top-level fields: type ("Settings"), tags, experimental, flags.history, mip_opt_out, audio, agent.
- AgentConfig is a oneOf: Inline(InlineAgentConfig) or Saved(Uuid), with a UUID-string ↔ inline-object discriminator implemented by hand. InlineAgentConfig.language carries #[deprecated] mirroring the spec.
- think and speak fields use one-or-many serde helpers so a single provider serializes as a scalar object and an array deserializes cleanly.

88 round-trip serde tests pass (cargo test --features agent).

The remaining client-to-server messages (UpdateSpeak, UpdateThink, UpdatePrompt, InjectUserMessage, InjectAgentMessage, FunctionCallResponse, KeepAlive) and the unified ClientMessage enum follow in the next commit.
1 parent 6b306e2 commit 26374e2

4 files changed

Lines changed: 1383 additions & 2 deletions

File tree

src/agent/audio.rs

Lines changed: 378 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,378 @@
1+
//! Audio I/O configuration for the Voice Agent `Settings` message.
2+
//!
3+
//! Mirrors the `audio` block on `AgentV1SettingsMessage` in
4+
//! `asyncapi/schemas/schemas.agent.v1.yml`.
5+
//!
6+
//! Note the encoding lists here are agent-specific and intentionally
7+
//! distinct from [`crate::common::options::Encoding`] (STT) and
8+
//! `crate::speak::options::Encoding` (TTS REST). The agent input
9+
//! encoding adds `linear32`, `alaw`, and `ogg-opus` over what STT
10+
//! exposes today; the agent output encoding includes `aac` like the
11+
//! Speak REST API.
12+
13+
use serde::{Deserialize, Deserializer, Serialize, Serializer};
14+
15+
/// Audio configuration block on `AgentV1SettingsMessage.audio`.
16+
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
17+
#[non_exhaustive]
18+
pub struct AudioConfig {
19+
/// Inbound audio (client → agent) configuration.
20+
#[serde(default, skip_serializing_if = "Option::is_none")]
21+
pub input: Option<AudioInput>,
22+
23+
/// Outbound audio (agent → client) configuration.
24+
#[serde(default, skip_serializing_if = "Option::is_none")]
25+
pub output: Option<AudioOutput>,
26+
}
27+
28+
impl AudioConfig {
29+
/// Construct with the given input and output sub-configs.
30+
pub fn new(input: Option<AudioInput>, output: Option<AudioOutput>) -> Self {
31+
Self { input, output }
32+
}
33+
}
34+
35+
/// Inbound audio configuration. Spec defaults to `linear16` at `24000` Hz.
36+
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
37+
#[non_exhaustive]
38+
pub struct AudioInput {
39+
/// Audio encoding format.
40+
pub encoding: AudioInputEncoding,
41+
42+
/// Sample rate in Hz. Common values: 16000, 24000, 44100, 48000.
43+
pub sample_rate: u32,
44+
}
45+
46+
impl AudioInput {
47+
/// Construct with explicit encoding and sample rate.
48+
pub fn new(encoding: AudioInputEncoding, sample_rate: u32) -> Self {
49+
Self {
50+
encoding,
51+
sample_rate,
52+
}
53+
}
54+
}
55+
56+
impl Default for AudioInput {
57+
/// Spec defaults: `linear16` at 24000 Hz.
58+
fn default() -> Self {
59+
Self {
60+
encoding: AudioInputEncoding::Linear16,
61+
sample_rate: 24_000,
62+
}
63+
}
64+
}
65+
66+
/// Outbound audio configuration. All fields optional per spec.
67+
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
68+
#[non_exhaustive]
69+
pub struct AudioOutput {
70+
/// Audio encoding format. Spec default is `linear16`.
71+
#[serde(default, skip_serializing_if = "Option::is_none")]
72+
pub encoding: Option<AudioOutputEncoding>,
73+
74+
/// Sample rate in Hz.
75+
#[serde(default, skip_serializing_if = "Option::is_none")]
76+
pub sample_rate: Option<u32>,
77+
78+
/// Bitrate in bits per second.
79+
#[serde(default, skip_serializing_if = "Option::is_none")]
80+
pub bitrate: Option<u32>,
81+
82+
/// Container format. Spec default is `none`.
83+
#[serde(default, skip_serializing_if = "Option::is_none")]
84+
pub container: Option<AudioContainer>,
85+
}
86+
87+
impl AudioOutput {
88+
/// Construct an empty output config; all fields default to `None`.
89+
pub fn new() -> Self {
90+
Self::default()
91+
}
92+
93+
#[allow(missing_docs)]
94+
pub fn with_encoding(mut self, encoding: AudioOutputEncoding) -> Self {
95+
self.encoding = Some(encoding);
96+
self
97+
}
98+
99+
#[allow(missing_docs)]
100+
pub fn with_sample_rate(mut self, sample_rate: u32) -> Self {
101+
self.sample_rate = Some(sample_rate);
102+
self
103+
}
104+
105+
#[allow(missing_docs)]
106+
pub fn with_bitrate(mut self, bitrate: u32) -> Self {
107+
self.bitrate = Some(bitrate);
108+
self
109+
}
110+
111+
#[allow(missing_docs)]
112+
pub fn with_container(mut self, container: AudioContainer) -> Self {
113+
self.container = Some(container);
114+
self
115+
}
116+
}
117+
118+
/// Inbound audio encoding for the Voice Agent. Use [`AudioInputEncoding::Other`] for
/// values not yet enumerated by this SDK.
///
/// Converts from both `String` and `&str`; a value that matches no enumerated
/// variant is kept verbatim in `Other`. `Display` prints the wire string.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum AudioInputEncoding {
    /// Wire value `linear16`.
    Linear16,
    /// Wire value `linear32`.
    Linear32,
    /// Wire value `flac`.
    Flac,
    /// Wire value `alaw`.
    Alaw,
    /// Wire value `mulaw`.
    Mulaw,
    /// Wire value `amr-nb`.
    AmrNb,
    /// Wire value `amr-wb`.
    AmrWb,
    /// Wire value `opus`.
    Opus,
    /// Wire value `ogg-opus`.
    OggOpus,
    /// Wire value `speex`.
    Speex,
    /// Wire value `g729`.
    G729,
    /// Forward-compatibility escape: carries a wire value this SDK does not enumerate.
    Other(String),
}

impl AudioInputEncoding {
    /// Wire string representation.
    ///
    /// For [`AudioInputEncoding::Other`] this is the carried string, unchanged.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Linear16 => "linear16",
            Self::Linear32 => "linear32",
            Self::Flac => "flac",
            Self::Alaw => "alaw",
            Self::Mulaw => "mulaw",
            Self::AmrNb => "amr-nb",
            Self::AmrWb => "amr-wb",
            Self::Opus => "opus",
            Self::OggOpus => "ogg-opus",
            Self::Speex => "speex",
            Self::G729 => "g729",
            Self::Other(s) => s,
        }
    }

    /// Maps a wire string to its enumerated variant, or `None` when the
    /// value is not one this SDK knows about. Matching is exact and case-sensitive.
    fn from_known(wire: &str) -> Option<Self> {
        Some(match wire {
            "linear16" => Self::Linear16,
            "linear32" => Self::Linear32,
            "flac" => Self::Flac,
            "alaw" => Self::Alaw,
            "mulaw" => Self::Mulaw,
            "amr-nb" => Self::AmrNb,
            "amr-wb" => Self::AmrWb,
            "opus" => Self::Opus,
            "ogg-opus" => Self::OggOpus,
            "speex" => Self::Speex,
            "g729" => Self::G729,
            _ => return None,
        })
    }
}

impl From<String> for AudioInputEncoding {
    /// Converts an owned wire string. Unrecognized values are moved into
    /// `Other` without re-allocating.
    fn from(value: String) -> Self {
        match Self::from_known(&value) {
            Some(known) => known,
            None => Self::Other(value),
        }
    }
}

impl From<&str> for AudioInputEncoding {
    /// Converts a borrowed wire string. Only unrecognized values allocate,
    /// to populate `Other`.
    fn from(value: &str) -> Self {
        Self::from_known(value).unwrap_or_else(|| Self::Other(value.to_owned()))
    }
}

impl std::fmt::Display for AudioInputEncoding {
    /// Writes the wire string, identical to [`AudioInputEncoding::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
177+
178+
impl Serialize for AudioInputEncoding {
179+
fn serialize<S: Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
180+
ser.serialize_str(self.as_str())
181+
}
182+
}
183+
184+
impl<'de> Deserialize<'de> for AudioInputEncoding {
185+
fn deserialize<D: Deserializer<'de>>(de: D) -> Result<Self, D::Error> {
186+
Ok(Self::from(String::deserialize(de)?))
187+
}
188+
}
189+
190+
/// Outbound audio encoding for the Voice Agent. Use [`AudioOutputEncoding::Other`]
/// for values not yet enumerated by this SDK.
///
/// Converts from both `String` and `&str`; a value that matches no enumerated
/// variant is kept verbatim in `Other`. `Display` prints the wire string.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum AudioOutputEncoding {
    /// Wire value `linear16`.
    Linear16,
    /// Wire value `mulaw`.
    Mulaw,
    /// Wire value `alaw`.
    Alaw,
    /// Wire value `mp3`.
    Mp3,
    /// Wire value `opus`.
    Opus,
    /// Wire value `flac`.
    Flac,
    /// Wire value `aac`.
    Aac,
    /// Forward-compatibility escape: carries a wire value this SDK does not enumerate.
    Other(String),
}

impl AudioOutputEncoding {
    /// Wire string representation.
    ///
    /// For [`AudioOutputEncoding::Other`] this is the carried string, unchanged.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Linear16 => "linear16",
            Self::Mulaw => "mulaw",
            Self::Alaw => "alaw",
            Self::Mp3 => "mp3",
            Self::Opus => "opus",
            Self::Flac => "flac",
            Self::Aac => "aac",
            Self::Other(s) => s,
        }
    }

    /// Maps a wire string to its enumerated variant, or `None` when the
    /// value is not one this SDK knows about. Matching is exact and case-sensitive.
    fn from_known(wire: &str) -> Option<Self> {
        Some(match wire {
            "linear16" => Self::Linear16,
            "mulaw" => Self::Mulaw,
            "alaw" => Self::Alaw,
            "mp3" => Self::Mp3,
            "opus" => Self::Opus,
            "flac" => Self::Flac,
            "aac" => Self::Aac,
            _ => return None,
        })
    }
}

impl From<String> for AudioOutputEncoding {
    /// Converts an owned wire string. Unrecognized values are moved into
    /// `Other` without re-allocating.
    fn from(value: String) -> Self {
        match Self::from_known(&value) {
            Some(known) => known,
            None => Self::Other(value),
        }
    }
}

impl From<&str> for AudioOutputEncoding {
    /// Converts a borrowed wire string. Only unrecognized values allocate,
    /// to populate `Other`.
    fn from(value: &str) -> Self {
        Self::from_known(value).unwrap_or_else(|| Self::Other(value.to_owned()))
    }
}

impl std::fmt::Display for AudioOutputEncoding {
    /// Writes the wire string, identical to [`AudioOutputEncoding::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
236+
237+
impl Serialize for AudioOutputEncoding {
238+
fn serialize<S: Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
239+
ser.serialize_str(self.as_str())
240+
}
241+
}
242+
243+
impl<'de> Deserialize<'de> for AudioOutputEncoding {
244+
fn deserialize<D: Deserializer<'de>>(de: D) -> Result<Self, D::Error> {
245+
Ok(Self::from(String::deserialize(de)?))
246+
}
247+
}
248+
249+
/// Audio container format for outbound audio.
250+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
251+
#[serde(rename_all = "lowercase")]
252+
#[non_exhaustive]
253+
pub enum AudioContainer {
254+
/// No container (`none`).
255+
None,
256+
/// WAV.
257+
Wav,
258+
/// Ogg.
259+
Ogg,
260+
}
261+
262+
#[cfg(test)]
mod tests {
    //! Round-trip serde tests for the audio configuration types.
    //!
    //! Each test pins the JSON wire shape: known encodings map to their
    //! variants, unknown encodings are preserved through `Other`, and `None`
    //! fields are omitted from serialized output.

    use super::*;
    use serde_json::json;

    #[test]
    fn input_round_trip_default_encoding() {
        let raw = json!({ "encoding": "linear16", "sample_rate": 24000 });
        let input: AudioInput = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(input.encoding, AudioInputEncoding::Linear16);
        assert_eq!(input.sample_rate, 24_000);
        assert_eq!(serde_json::to_value(&input).unwrap(), raw);
    }

    #[test]
    fn input_round_trip_ogg_opus() {
        let raw = json!({ "encoding": "ogg-opus", "sample_rate": 48000 });
        let input: AudioInput = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(input.encoding, AudioInputEncoding::OggOpus);
        assert_eq!(serde_json::to_value(&input).unwrap(), raw);
    }

    #[test]
    fn input_unknown_encoding_falls_back_to_other() {
        let raw = json!({ "encoding": "future-codec", "sample_rate": 16000 });
        let input: AudioInput = serde_json::from_value(raw).unwrap();
        assert_eq!(
            input.encoding,
            AudioInputEncoding::Other("future-codec".into())
        );
    }

    /// An unknown input encoding must survive a full round trip unchanged.
    #[test]
    fn input_other_encoding_round_trips() {
        let raw = json!({ "encoding": "future-codec", "sample_rate": 16000 });
        let input: AudioInput = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(serde_json::to_value(&input).unwrap(), raw);
    }

    #[test]
    fn output_round_trip_full() {
        let raw = json!({
            "encoding": "mp3",
            "sample_rate": 22050,
            "bitrate": 48000,
            "container": "none"
        });
        let output: AudioOutput = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(output.encoding, Some(AudioOutputEncoding::Mp3));
        assert_eq!(output.container, Some(AudioContainer::None));
        assert_eq!(serde_json::to_value(&output).unwrap(), raw);
    }

    /// The output encoding has the same `Other` escape as the input encoding.
    #[test]
    fn output_unknown_encoding_falls_back_to_other() {
        let raw = json!({ "encoding": "future-codec" });
        let output: AudioOutput = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(
            output.encoding,
            Some(AudioOutputEncoding::Other("future-codec".into()))
        );
        assert_eq!(serde_json::to_value(&output).unwrap(), raw);
    }

    #[test]
    fn output_default_serializes_empty() {
        let output = AudioOutput::default();
        let value = serde_json::to_value(&output).unwrap();
        assert_eq!(value, json!({}));
    }

    #[test]
    fn output_builder_chain() {
        let output = AudioOutput::new()
            .with_encoding(AudioOutputEncoding::Aac)
            .with_sample_rate(48000)
            .with_bitrate(192_000)
            .with_container(AudioContainer::Wav);
        assert_eq!(output.encoding, Some(AudioOutputEncoding::Aac));
        assert_eq!(output.sample_rate, Some(48000));
        assert_eq!(output.bitrate, Some(192_000));
        assert_eq!(output.container, Some(AudioContainer::Wav));
    }

    #[test]
    fn audio_config_round_trip() {
        let raw = json!({
            "input": { "encoding": "alaw", "sample_rate": 8000 },
            "output": {
                "encoding": "opus",
                "sample_rate": 48000,
                "container": "ogg"
            }
        });
        let config: AudioConfig = serde_json::from_value(raw.clone()).unwrap();
        assert!(matches!(
            config.input.as_ref().unwrap().encoding,
            AudioInputEncoding::Alaw
        ));
        assert_eq!(
            config.output.as_ref().unwrap().container,
            Some(AudioContainer::Ogg)
        );
        assert_eq!(serde_json::to_value(&config).unwrap(), raw);
    }

    /// An empty object is a valid audio block: both sides deserialize to `None`.
    #[test]
    fn audio_config_empty_round_trip() {
        let config: AudioConfig = serde_json::from_value(json!({})).unwrap();
        assert!(config.input.is_none());
        assert!(config.output.is_none());
        assert_eq!(serde_json::to_value(&config).unwrap(), json!({}));
    }

    #[test]
    fn audio_config_omits_none_fields() {
        let config = AudioConfig::new(Some(AudioInput::default()), None);
        let value = serde_json::to_value(&config).unwrap();
        assert_eq!(
            value,
            json!({ "input": { "encoding": "linear16", "sample_rate": 24000 } })
        );
    }

    #[test]
    fn audio_input_default_helper() {
        let input = AudioInput::default();
        assert_eq!(input.encoding, AudioInputEncoding::Linear16);
        assert_eq!(input.sample_rate, 24_000);
    }

    #[test]
    fn output_container_serialization() {
        for (container, wire) in [
            (AudioContainer::None, "none"),
            (AudioContainer::Wav, "wav"),
            (AudioContainer::Ogg, "ogg"),
        ] {
            let serialized = serde_json::to_value(container).unwrap();
            assert_eq!(serialized, json!(wire));
            let back: AudioContainer = serde_json::from_value(json!(wire)).unwrap();
            assert_eq!(back, container);
        }
    }

    /// `AudioContainer` has no fallback variant, so unknown values are rejected.
    #[test]
    fn container_rejects_unknown_value() {
        let parsed = serde_json::from_value::<AudioContainer>(json!("mp4"));
        assert!(parsed.is_err());
    }
}

0 commit comments

Comments
 (0)