|
4 | 4 | //! |
5 | 5 | //! [api]: https://developers.deepgram.com/api-reference/#transcription-prerecorded-responses |
6 | 6 |
|
| 7 | +use std::collections::HashMap; |
| 8 | + |
7 | 9 | use serde::{Deserialize, Serialize}; |
8 | 10 | use uuid::Uuid; |
9 | 11 |
|
@@ -60,8 +62,70 @@ pub struct ListenMetadata { |
60 | 62 | #[allow(missing_docs)] |
61 | 63 | pub channels: usize, |
62 | 64 |
|
63 | | - #[allow(missing_docs)] |
| 65 | + /// Top-level language. Not in the current |
| 66 | + /// `ListenV1ResponseMetadata` schema (the language is on each |
| 67 | + /// channel via `ChannelResult.detected_language`); kept for |
| 68 | + /// backward compatibility, will be removed in 0.10.0 (Phase 8e). |
64 | 69 | pub language: Option<String>, |
| 70 | + |
| 71 | + /// Model UUIDs that served the request. |
| 72 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 73 | + pub models: Option<Vec<String>>, |
| 74 | + |
| 75 | + /// Per-model metadata, keyed by model UUID. |
| 76 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 77 | + pub model_info: Option<HashMap<String, ModelInfoEntry>>, |
| 78 | + |
| 79 | + /// Token usage for the summarization step (when `summarize` was set). |
| 80 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 81 | + pub summary_info: Option<TokenInfo>, |
| 82 | + |
| 83 | + /// Token usage for the sentiment-analysis step. |
| 84 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 85 | + pub sentiment_info: Option<TokenInfo>, |
| 86 | + |
| 87 | + /// Token usage for the topic-detection step. |
| 88 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 89 | + pub topics_info: Option<TokenInfo>, |
| 90 | + |
| 91 | + /// Token usage for the intent-detection step. |
| 92 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 93 | + pub intents_info: Option<TokenInfo>, |
| 94 | + |
| 95 | + /// Tags echoed back from the request's `tag` query param(s). |
| 96 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 97 | + pub tags: Option<Vec<String>>, |
| 98 | +} |
| 99 | + |
| 100 | +/// Per-model metadata entry inside [`ListenMetadata::model_info`]. |
| 101 | +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] |
| 102 | +#[non_exhaustive] |
| 103 | +pub struct ModelInfoEntry { |
| 104 | + /// Display name of the model. |
| 105 | + pub name: String, |
| 106 | + /// Version string. |
| 107 | + pub version: String, |
| 108 | + /// Model architecture (e.g. `nova-2`). |
| 109 | + pub arch: String, |
| 110 | +} |
| 111 | + |
| 112 | +/// Token usage and model identifier for one analytics feature |
| 113 | +/// (summarize / sentiment / topics / intents). Shared with |
| 114 | +/// [`crate::read::response::TokenInfo`]. |
| 115 | +#[derive(Debug, PartialEq, Eq, Clone, Default, Serialize, Deserialize)] |
| 116 | +#[non_exhaustive] |
| 117 | +pub struct TokenInfo { |
| 118 | + /// UUID of the model that produced this output. |
| 119 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 120 | + pub model_uuid: Option<String>, |
| 121 | + |
| 122 | + /// Number of input tokens consumed. |
| 123 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 124 | + pub input_tokens: Option<u64>, |
| 125 | + |
| 126 | + /// Number of output tokens produced. |
| 127 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 128 | + pub output_tokens: Option<u64>, |
65 | 129 | } |
66 | 130 |
|
67 | 131 | /// Transcription results. |
@@ -187,6 +251,9 @@ pub struct Paragraph { |
187 | 251 | num_words: usize, |
188 | 252 | start: f64, |
189 | 253 | end: f64, |
| 254 | + /// Speaker label when diarization is enabled. None otherwise. |
| 255 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 256 | + pub speaker: Option<usize>, |
190 | 257 | } |
191 | 258 |
|
192 | 259 | /// Paragraph results. |
@@ -216,6 +283,9 @@ pub struct Entity { |
216 | 283 | confidence: f64, |
217 | 284 | start_word: usize, |
218 | 285 | end_word: usize, |
| 286 | + /// Original spoken text of the entity, present when smart formatting is enabled. |
| 287 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 288 | + pub raw_value: Option<String>, |
219 | 289 | } |
220 | 290 |
|
221 | 291 | /// Intent |
@@ -343,6 +413,43 @@ pub struct ResultAlternative { |
343 | 413 | #[allow(missing_docs)] |
344 | 414 | #[serde(default)] |
345 | 415 | pub languages: Vec<String>, |
| 416 | + |
| 417 | + /// Channel-level summaries (when `summarize` was set). Distinct |
| 418 | + /// from [`ListenResults::summary`] (document-level). |
| 419 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 420 | + pub summaries: Option<Vec<ChannelSummary>>, |
| 421 | + |
| 422 | + /// Channel-level topic detections (when `topics` was set). |
| 423 | + /// Distinct from [`ListenResults::topics`] (document-level). |
| 424 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 425 | + pub topics: Option<Vec<ChannelTopic>>, |
| 426 | +} |
| 427 | + |
| 428 | +/// One channel-level summary entry on a [`ResultAlternative`]. |
| 429 | +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] |
| 430 | +#[non_exhaustive] |
| 431 | +pub struct ChannelSummary { |
| 432 | + /// Summary text. |
| 433 | + pub summary: String, |
| 434 | + /// Index of the first word covered by this summary. |
| 435 | + pub start_word: f64, |
| 436 | + /// Index of the last word covered by this summary. |
| 437 | + pub end_word: f64, |
| 438 | +} |
| 439 | + |
| 440 | +/// One channel-level topic entry on a [`ResultAlternative`]. |
| 441 | +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] |
| 442 | +#[non_exhaustive] |
| 443 | +pub struct ChannelTopic { |
| 444 | + /// Snippet of text that was classified. |
| 445 | + pub text: String, |
| 446 | + /// Index of the first word in the snippet. |
| 447 | + pub start_word: f64, |
| 448 | + /// Index of the last word in the snippet. |
| 449 | + pub end_word: f64, |
| 450 | + /// Topic labels detected on this snippet. |
| 451 | + #[serde(default)] |
| 452 | + pub topics: Vec<String>, |
346 | 453 | } |
347 | 454 |
|
348 | 455 | /// A single transcribed word. |
@@ -370,6 +477,11 @@ pub struct Word { |
370 | 477 | /// [docs]: https://developers.deepgram.com/documentation/features/diarize/ |
371 | 478 | pub speaker: Option<usize>, |
372 | 479 |
|
| 480 | + /// Confidence of the [`speaker`](Word::speaker) assignment, when |
| 481 | + /// diarization is enabled. |
| 482 | + #[serde(default, skip_serializing_if = "Option::is_none")] |
| 483 | + pub speaker_confidence: Option<f64>, |
| 484 | + |
373 | 485 | /// [`None`] unless the [Punctuation feature][docs] is set. |
374 | 486 | /// |
375 | 487 | /// [docs]: https://developers.deepgram.com/documentation/features/punctuate/ |
@@ -398,3 +510,133 @@ pub struct Hit { |
398 | 510 | #[allow(missing_docs)] |
399 | 511 | pub snippet: String, |
400 | 512 | } |
| 513 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Most tests below assert deserialization shape only. Several
    // pre-existing optional fields on batch_response types serialize
    // `None` as `null` rather than omitting them, so strict JSON
    // round-trip equality does not hold for arbitrary fixtures —
    // normalizing that wire behavior is a Phase 8 cleanup, not Phase 7.
    //
    // The `*_round_trip` tests are the exception: they additionally assert
    // that serializing the parsed value reproduces the input exactly.
    // NOTE(review): that only works while each fixture sets every field
    // that would otherwise serialize as `null`; extend the fixture when
    // such a field is added to the type under test.

    /// Full metadata payload: every populated new field is parsed and the
    /// ones absent from the payload stay `None`.
    #[test]
    fn metadata_with_model_info_and_token_info() {
        let raw = json!({
            "request_id": "550e8400-e29b-41d4-a716-446655440000",
            "transaction_key": "deprecated",
            "sha256": "abc",
            "created": "2026-05-08T12:00:00Z",
            "duration": 12.5,
            "channels": 1,
            "models": ["30089e05-99d1-4376-b32e-c263170674af"],
            "model_info": {
                "30089e05-99d1-4376-b32e-c263170674af": {
                    "name": "2-general-nova",
                    "version": "2024-01-09.29447",
                    "arch": "nova-2"
                }
            },
            "summary_info": {
                "model_uuid": "67875a7f-c9c4-48a0-aa55-5bdb8a91c34a",
                "input_tokens": 95,
                "output_tokens": 63
            },
            "tags": ["staging"]
        });
        let m: ListenMetadata = serde_json::from_value(raw).unwrap();
        assert_eq!(m.models.as_ref().unwrap().len(), 1);

        let info = m.model_info.as_ref().unwrap();
        let entry = &info["30089e05-99d1-4376-b32e-c263170674af"];
        assert_eq!(entry.name, "2-general-nova");
        assert_eq!(entry.version, "2024-01-09.29447");
        assert_eq!(entry.arch, "nova-2");

        let summary = m.summary_info.as_ref().unwrap();
        assert_eq!(
            summary.model_uuid.as_deref(),
            Some("67875a7f-c9c4-48a0-aa55-5bdb8a91c34a")
        );
        assert_eq!(summary.input_tokens, Some(95));
        assert_eq!(summary.output_tokens, Some(63));

        assert_eq!(m.tags.as_deref().unwrap(), &["staging".to_string()]);

        // Token-info blocks that were not in the payload must stay unset.
        assert!(m.sentiment_info.is_none());
        assert!(m.topics_info.is_none());
        assert!(m.intents_info.is_none());
    }

    /// A payload without any of the new keys still parses, and all of the
    /// new optional fields come back as `None`.
    #[test]
    fn metadata_minimal_deserializes_without_new_fields() {
        let raw = json!({
            "request_id": "550e8400-e29b-41d4-a716-446655440000",
            "transaction_key": "deprecated",
            "sha256": "abc",
            "created": "2026-05-08T12:00:00Z",
            "duration": 12.5,
            "channels": 1
        });
        let m: ListenMetadata = serde_json::from_value(raw).unwrap();
        assert!(m.models.is_none());
        assert!(m.model_info.is_none());
        assert!(m.summary_info.is_none());
        assert!(m.sentiment_info.is_none());
        assert!(m.topics_info.is_none());
        assert!(m.intents_info.is_none());
        assert!(m.tags.is_none());
    }

    /// `TokenInfo` accepts a partial object, omits unset fields when
    /// serialized, and its `Default` serializes to an empty object.
    #[test]
    fn token_info_partial_and_default() {
        let raw = json!({"input_tokens": 7});
        let t: TokenInfo = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(t.model_uuid, None);
        assert_eq!(t.input_tokens, Some(7));
        assert_eq!(t.output_tokens, None);
        assert_eq!(serde_json::to_value(&t).unwrap(), raw);

        assert_eq!(
            serde_json::to_value(TokenInfo::default()).unwrap(),
            json!({})
        );
    }

    /// `ModelInfoEntry` parses all three fields and serializes back to the
    /// same JSON object.
    #[test]
    fn model_info_entry_round_trip() {
        let raw = json!({
            "name": "2-general-nova",
            "version": "2024-01-09.29447",
            "arch": "nova-2"
        });
        let e: ModelInfoEntry = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(e.name, "2-general-nova");
        assert_eq!(e.version, "2024-01-09.29447");
        assert_eq!(e.arch, "nova-2");
        assert_eq!(serde_json::to_value(&e).unwrap(), raw);
    }

    #[test]
    fn word_speaker_confidence_round_trip() {
        let raw = json!({
            "word": "hello",
            "start": 0.0,
            "end": 0.5,
            "confidence": 0.95,
            "speaker": 0,
            "speaker_confidence": 0.88,
            "punctuated_word": "Hello,"
        });
        let w: Word = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(w.speaker_confidence, Some(0.88));
        assert_eq!(serde_json::to_value(&w).unwrap(), raw);
    }

    #[test]
    fn entity_raw_value_round_trip() {
        let raw = json!({
            "label": "PHONE_NUMBER",
            "value": "555-1234",
            "raw_value": "five five five one two three four",
            "confidence": 0.91,
            "start_word": 3,
            "end_word": 6
        });
        let e: Entity = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(
            e.raw_value.as_deref(),
            Some("five five five one two three four")
        );
        assert_eq!(serde_json::to_value(&e).unwrap(), raw);
    }

    #[test]
    fn paragraph_speaker_round_trip() {
        let raw = json!({
            "sentences": [{"text": "Hi.", "start": 0.0, "end": 0.5}],
            "num_words": 1,
            "start": 0.0,
            "end": 0.5,
            "speaker": 2
        });
        let p: Paragraph = serde_json::from_value(raw.clone()).unwrap();
        assert_eq!(p.speaker, Some(2));
        assert_eq!(serde_json::to_value(&p).unwrap(), raw);
    }

    /// Channel-level `summaries` and `topics` on an alternative are parsed
    /// with their text, word indices and labels intact.
    #[test]
    fn channel_summaries_and_topics_deserialize() {
        let raw = json!({
            "transcript": "Hello world",
            "confidence": 0.97,
            "words": [],
            "summaries": [
                {"summary": "A greeting.", "start_word": 0.0, "end_word": 1.0}
            ],
            "topics": [
                {"text": "Hello world", "start_word": 0.0, "end_word": 1.0,
                 "topics": ["greeting"]}
            ]
        });
        let alt: ResultAlternative = serde_json::from_value(raw).unwrap();

        let summaries = alt.summaries.as_ref().unwrap();
        assert_eq!(summaries.len(), 1);
        assert_eq!(summaries[0].summary, "A greeting.");
        assert_eq!(summaries[0].start_word, 0.0);
        assert_eq!(summaries[0].end_word, 1.0);

        let topics = alt.topics.as_ref().unwrap();
        assert_eq!(topics.len(), 1);
        assert_eq!(topics[0].text, "Hello world");
        assert_eq!(topics[0].start_word, 0.0);
        assert_eq!(topics[0].end_word, 1.0);
        assert_eq!(topics[0].topics, vec!["greeting".to_string()]);
    }
}
0 commit comments