Skip to content

Commit e3292b1

Browse files
committed
feat(common): backfill missing leaf fields on Listen responses
Phase 7 of the spec-coverage rollout — adds the 12 fields the SDK was silently dropping on Listen REST and streaming responses.

batch_response.rs (ListenMetadata):
- models, model_info, tags
- summary_info, sentiment_info, topics_info, intents_info (all using a new shared TokenInfo struct)
- language field documented as pending removal in Phase 8e

batch_response.rs (other):
- ResultAlternative.summaries (Vec<ChannelSummary>)
- ResultAlternative.topics (Vec<ChannelTopic>)
- Word.speaker_confidence
- Entity.raw_value
- Paragraph.speaker

stream_response.rs:
- TranscriptResponse.entities (Vec<EntityHit>) — only present on is_final messages when detect_entities is enabled
- TerminalResponse.transaction_key
- TerminalResponse.sha256

read::response::TokenInfo is now a re-export of common::batch_response::TokenInfo so the two response surfaces share the same shape without duplication.

All new fields are #[serde(default, skip_serializing_if = ...)] optional — payloads without them deserialize cleanly (back-compat sanity test included).

9 new lib tests pass (232 total).
1 parent 57a2741 commit e3292b1

3 files changed

Lines changed: 382 additions & 18 deletions

File tree

src/common/batch_response.rs

Lines changed: 243 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
//!
55
//! [api]: https://developers.deepgram.com/api-reference/#transcription-prerecorded-responses
66
7+
use std::collections::HashMap;
8+
79
use serde::{Deserialize, Serialize};
810
use uuid::Uuid;
911

@@ -60,8 +62,70 @@ pub struct ListenMetadata {
6062
#[allow(missing_docs)]
6163
pub channels: usize,
6264

63-
#[allow(missing_docs)]
65+
/// Top-level language. Not in the current
66+
/// `ListenV1ResponseMetadata` schema (the language is on each
67+
/// channel via `ChannelResult.detected_language`); kept for
68+
/// backward compatibility, will be removed in 0.10.0 (Phase 8e).
6469
pub language: Option<String>,
70+
71+
/// Model UUIDs that served the request.
72+
#[serde(default, skip_serializing_if = "Option::is_none")]
73+
pub models: Option<Vec<String>>,
74+
75+
/// Per-model metadata, keyed by model UUID.
76+
#[serde(default, skip_serializing_if = "Option::is_none")]
77+
pub model_info: Option<HashMap<String, ModelInfoEntry>>,
78+
79+
/// Token usage for the summarization step (when `summarize` was set).
80+
#[serde(default, skip_serializing_if = "Option::is_none")]
81+
pub summary_info: Option<TokenInfo>,
82+
83+
/// Token usage for the sentiment-analysis step.
84+
#[serde(default, skip_serializing_if = "Option::is_none")]
85+
pub sentiment_info: Option<TokenInfo>,
86+
87+
/// Token usage for the topic-detection step.
88+
#[serde(default, skip_serializing_if = "Option::is_none")]
89+
pub topics_info: Option<TokenInfo>,
90+
91+
/// Token usage for the intent-detection step.
92+
#[serde(default, skip_serializing_if = "Option::is_none")]
93+
pub intents_info: Option<TokenInfo>,
94+
95+
/// Tags echoed back from the request's `tag` query param(s).
96+
#[serde(default, skip_serializing_if = "Option::is_none")]
97+
pub tags: Option<Vec<String>>,
98+
}
99+
100+
/// One value of the [`ListenMetadata::model_info`] map, describing a single model. All three fields are required when deserializing.
101+
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
102+
#[non_exhaustive]
103+
pub struct ModelInfoEntry {
104+
/// Display name of the model (e.g. `2-general-nova`).
105+
pub name: String,
106+
/// Version string of the model (e.g. `2024-01-09.29447`).
107+
pub version: String,
108+
/// Model architecture (e.g. `nova-2`).
109+
pub arch: String,
110+
}
111+
112+
/// Token usage and model identifier for one analytics feature
113+
/// (summarize / sentiment / topics / intents). Re-exported as
114+
/// [`crate::read::response::TokenInfo`]. Every field is optional and is omitted from serialized output when `None`.
115+
#[derive(Debug, PartialEq, Eq, Clone, Default, Serialize, Deserialize)]
116+
#[non_exhaustive]
117+
pub struct TokenInfo {
118+
/// UUID of the model that produced this output; `None` when the payload omits it.
119+
#[serde(default, skip_serializing_if = "Option::is_none")]
120+
pub model_uuid: Option<String>,
121+
122+
/// Number of input tokens consumed; `None` when the payload omits it.
123+
#[serde(default, skip_serializing_if = "Option::is_none")]
124+
pub input_tokens: Option<u64>,
125+
126+
/// Number of output tokens produced; `None` when the payload omits it.
127+
#[serde(default, skip_serializing_if = "Option::is_none")]
128+
pub output_tokens: Option<u64>,
65129
}
66130

67131
/// Transcription results.
@@ -187,6 +251,9 @@ pub struct Paragraph {
187251
num_words: usize,
188252
start: f64,
189253
end: f64,
254+
/// Speaker label when diarization is enabled. None otherwise.
255+
#[serde(default, skip_serializing_if = "Option::is_none")]
256+
pub speaker: Option<usize>,
190257
}
191258

192259
/// Paragraph results.
@@ -216,6 +283,9 @@ pub struct Entity {
216283
confidence: f64,
217284
start_word: usize,
218285
end_word: usize,
286+
/// Original spoken text of the entity, present when smart formatting is enabled.
287+
#[serde(default, skip_serializing_if = "Option::is_none")]
288+
pub raw_value: Option<String>,
219289
}
220290

221291
/// Intent
@@ -343,6 +413,43 @@ pub struct ResultAlternative {
343413
#[allow(missing_docs)]
344414
#[serde(default)]
345415
pub languages: Vec<String>,
416+
417+
/// Channel-level summaries (when `summarize` was set). Distinct
418+
/// from [`ListenResults::summary`] (document-level).
419+
#[serde(default, skip_serializing_if = "Option::is_none")]
420+
pub summaries: Option<Vec<ChannelSummary>>,
421+
422+
/// Channel-level topic detections (when `topics` was set).
423+
/// Distinct from [`ListenResults::topics`] (document-level).
424+
#[serde(default, skip_serializing_if = "Option::is_none")]
425+
pub topics: Option<Vec<ChannelTopic>>,
426+
}
427+
428+
/// One element of [`ResultAlternative::summaries`]: a channel-level summary. All fields are required when deserializing.
429+
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
430+
#[non_exhaustive]
431+
pub struct ChannelSummary {
432+
/// Summary text.
433+
pub summary: String,
434+
/// Index of the first word covered by this summary. Typed `f64` here, whereas `Entity` uses `usize` for its word indices — TODO confirm against the API schema.
435+
pub start_word: f64,
436+
/// Index of the last word covered by this summary. Typed `f64` like `start_word` — TODO confirm against the API schema.
437+
pub end_word: f64,
438+
}
439+
440+
/// One element of [`ResultAlternative::topics`]: a classified snippet of text and the topic labels detected on it.
441+
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)]
442+
#[non_exhaustive]
443+
pub struct ChannelTopic {
444+
/// Snippet of text that was classified.
445+
pub text: String,
446+
/// Index of the first word in the snippet. Typed `f64` here, whereas `Entity` uses `usize` for its word indices — TODO confirm against the API schema.
447+
pub start_word: f64,
448+
/// Index of the last word in the snippet. Typed `f64` like `start_word` — TODO confirm against the API schema.
449+
pub end_word: f64,
450+
/// Topic labels detected on this snippet; an empty `Vec` when the payload omits the key.
451+
#[serde(default)]
452+
pub topics: Vec<String>,
346453
}
347454

348455
/// A single transcribed word.
@@ -370,6 +477,11 @@ pub struct Word {
370477
/// [docs]: https://developers.deepgram.com/documentation/features/diarize/
371478
pub speaker: Option<usize>,
372479

480+
/// Confidence of the [`speaker`](Word::speaker) assignment, when
481+
/// diarization is enabled.
482+
#[serde(default, skip_serializing_if = "Option::is_none")]
483+
pub speaker_confidence: Option<f64>,
484+
373485
/// [`None`] unless the [Punctuation feature][docs] is set.
374486
///
375487
/// [docs]: https://developers.deepgram.com/documentation/features/punctuate/
@@ -398,3 +510,133 @@ pub struct Hit {
398510
#[allow(missing_docs)]
399511
pub snippet: String,
400512
}
513+
514+
#[cfg(test)]
515+
mod tests {
516+
use super::*;
517+
use serde_json::json;
518+
519+
// The metadata and alternative tests below assert deserialization shape
520+
// only; the `*_round_trip` tests also compare the re-serialized JSON with
521+
// the input. Several pre-existing optional fields on batch_response types
522+
// serialize `None` as `null` rather than omitting them — normalizing that
523+
// wire behavior is a Phase 8 cleanup, not Phase 7.
524+
// A payload carrying `models`, `model_info`, `summary_info` and `tags` populates the matching `ListenMetadata` fields.
525+
#[test]
526+
fn metadata_with_model_info_and_token_info() {
527+
let raw = json!({
528+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
529+
"transaction_key": "deprecated",
530+
"sha256": "abc",
531+
"created": "2026-05-08T12:00:00Z",
532+
"duration": 12.5,
533+
"channels": 1,
534+
"models": ["30089e05-99d1-4376-b32e-c263170674af"],
535+
"model_info": {
536+
"30089e05-99d1-4376-b32e-c263170674af": {
537+
"name": "2-general-nova",
538+
"version": "2024-01-09.29447",
539+
"arch": "nova-2"
540+
}
541+
},
542+
"summary_info": {
543+
"model_uuid": "67875a7f-c9c4-48a0-aa55-5bdb8a91c34a",
544+
"input_tokens": 95,
545+
"output_tokens": 63
546+
},
547+
"tags": ["staging"]
548+
});
549+
let m: ListenMetadata = serde_json::from_value(raw).unwrap();
550+
assert_eq!(m.models.as_ref().unwrap().len(), 1);
551+
let info = m.model_info.as_ref().unwrap();
552+
assert_eq!(info["30089e05-99d1-4376-b32e-c263170674af"].arch, "nova-2");
553+
assert_eq!(m.summary_info.as_ref().unwrap().input_tokens, Some(95));
554+
assert_eq!(m.tags.as_deref().unwrap(), &["staging".to_string()]);
555+
}
556+
// Back-compat: a payload without the optional metadata keys still deserializes, leaving `models`, `summary_info` and `tags` as `None`.
557+
#[test]
558+
fn metadata_minimal_deserializes_without_new_fields() {
559+
let raw = json!({
560+
"request_id": "550e8400-e29b-41d4-a716-446655440000",
561+
"transaction_key": "deprecated",
562+
"sha256": "abc",
563+
"created": "2026-05-08T12:00:00Z",
564+
"duration": 12.5,
565+
"channels": 1
566+
});
567+
let m: ListenMetadata = serde_json::from_value(raw).unwrap();
568+
assert!(m.models.is_none());
569+
assert!(m.summary_info.is_none());
570+
assert!(m.tags.is_none());
571+
}
572+
// `speaker_confidence` is read into `Word`, and re-serializing the value reproduces the input JSON.
573+
#[test]
574+
fn word_speaker_confidence_round_trip() {
575+
let raw = json!({
576+
"word": "hello",
577+
"start": 0.0,
578+
"end": 0.5,
579+
"confidence": 0.95,
580+
"speaker": 0,
581+
"speaker_confidence": 0.88,
582+
"punctuated_word": "Hello,"
583+
});
584+
let w: Word = serde_json::from_value(raw.clone()).unwrap();
585+
assert_eq!(w.speaker_confidence, Some(0.88));
586+
assert_eq!(serde_json::to_value(&w).unwrap(), raw);
587+
}
588+
// `raw_value` is read into `Entity`, and re-serializing the value reproduces the input JSON.
589+
#[test]
590+
fn entity_raw_value_round_trip() {
591+
let raw = json!({
592+
"label": "PHONE_NUMBER",
593+
"value": "555-1234",
594+
"raw_value": "five five five one two three four",
595+
"confidence": 0.91,
596+
"start_word": 3,
597+
"end_word": 6
598+
});
599+
let e: Entity = serde_json::from_value(raw.clone()).unwrap();
600+
assert_eq!(
601+
e.raw_value.as_deref(),
602+
Some("five five five one two three four")
603+
);
604+
assert_eq!(serde_json::to_value(&e).unwrap(), raw);
605+
}
606+
// `speaker` is read into `Paragraph`, and re-serializing the value reproduces the input JSON.
607+
#[test]
608+
fn paragraph_speaker_round_trip() {
609+
let raw = json!({
610+
"sentences": [{"text": "Hi.", "start": 0.0, "end": 0.5}],
611+
"num_words": 1,
612+
"start": 0.0,
613+
"end": 0.5,
614+
"speaker": 2
615+
});
616+
let p: Paragraph = serde_json::from_value(raw.clone()).unwrap();
617+
assert_eq!(p.speaker, Some(2));
618+
assert_eq!(serde_json::to_value(&p).unwrap(), raw);
619+
}
620+
// Channel-level `summaries` and `topics` arrays deserialize into `ResultAlternative` (shape check only, no re-serialization).
621+
#[test]
622+
fn channel_summaries_and_topics_deserialize() {
623+
let raw = json!({
624+
"transcript": "Hello world",
625+
"confidence": 0.97,
626+
"words": [],
627+
"summaries": [
628+
{"summary": "A greeting.", "start_word": 0.0, "end_word": 1.0}
629+
],
630+
"topics": [
631+
{"text": "Hello world", "start_word": 0.0, "end_word": 1.0,
632+
"topics": ["greeting"]}
633+
]
634+
});
635+
let alt: ResultAlternative = serde_json::from_value(raw).unwrap();
636+
assert_eq!(alt.summaries.as_ref().unwrap().len(), 1);
637+
assert_eq!(
638+
alt.topics.as_ref().unwrap()[0].topics,
639+
vec!["greeting".to_string()]
640+
);
641+
}
642+
}

0 commit comments

Comments
 (0)