@@ -16,6 +16,12 @@ import {
1616} from '../stt/index.js' ;
1717import { type APIConnectOptions , DEFAULT_API_CONNECT_OPTIONS } from '../types.js' ;
1818import { type AudioBuffer , Event , Task , cancelAndWait , shortuuid , waitForAbort } from '../utils.js' ;
19+ import type { TimedString } from '../voice/io.js' ;
20+ import {
21+ type SttServerEvent ,
22+ type SttTranscriptEvent ,
23+ sttServerEventSchema ,
24+ } from './api_protos.js' ;
1925import { type AnyString , connectWs , createAccessToken } from './utils.js' ;
2026
2127export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
122128 apiSecret ?: string ;
123129 modelOptions ?: STTOptions < TModel > ;
124130 } ) {
125- super ( { streaming : true , interimResults : true } ) ;
131+ super ( { streaming : true , interimResults : true , alignedTranscript : 'word' } ) ;
126132
127133 const {
128134 model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
271277 let closing = false ;
272278 let finalReceived = false ;
273279
274- type SttServerEvent = Record < string , any > ;
275280 const eventChannel = createStreamChannel < SttServerEvent > ( ) ;
276281
277282 const resourceCleanup = ( ) => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
380385 if ( signal . aborted ) return ;
381386 if ( result . done ) return ;
382387
383- const json = result . value ;
384- const type = json . type as string | undefined ;
388+ // Parse and validate with Zod schema
389+ const parseResult = await sttServerEventSchema . safeParseAsync ( result . value ) ;
390+ if ( ! parseResult . success ) {
391+ this . #logger. warn (
392+ { error : parseResult . error , rawData : result . value } ,
393+ 'Failed to parse STT server event' ,
394+ ) ;
395+ continue ;
396+ }
397+
398+ const event : SttServerEvent = parseResult . data ;
385399
386- switch ( type ) {
400+ switch ( event . type ) {
387401 case 'session.created' :
388402 case 'session.finalized' :
389403 break ;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
392406 resourceCleanup ( ) ;
393407 break ;
394408 case 'interim_transcript' :
395- this . processTranscript ( json , false ) ;
409+ this . processTranscript ( event , false ) ;
396410 break ;
397411 case 'final_transcript' :
398- this . processTranscript ( json , true ) ;
412+ this . processTranscript ( event , true ) ;
399413 break ;
400414 case 'error' :
401- this . #logger. error ( { error : json } , 'Received error from LiveKit STT' ) ;
415+ this . #logger. error ( { error : event } , 'Received error from LiveKit STT' ) ;
402416 resourceCleanup ( ) ;
403- throw new APIError ( `LiveKit STT returned error: ${ JSON . stringify ( json ) } ` ) ;
404- default :
405- this . #logger. warn (
406- { message : json } ,
407- 'Received unexpected message from LiveKit STT' ,
408- ) ;
409- break ;
417+ throw new APIError ( `LiveKit STT returned error: ${ JSON . stringify ( event ) } ` ) ;
410418 }
411419 }
412420 } finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
457465 }
458466 }
459467
460- private processTranscript ( data : Record < string , any > , isFinal : boolean ) {
468+ private processTranscript ( data : SttTranscriptEvent , isFinal : boolean ) {
461469 // Check if queue is closed to avoid race condition during disconnect
462470 if ( this . queue . closed ) return ;
463471
464- const requestId = data . request_id ?? this . requestId ;
465- const text = data . transcript ?? '' ;
466- const language = data . language ?? this . opts . language ?? 'en' ;
472+ const requestId = data . session_id || this . requestId ;
473+ const text = data . transcript ;
474+ const language = data . language || this . opts . language || 'en' ;
467475
468476 if ( ! text && ! isFinal ) return ;
469477
@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
476484
477485 const speechData : SpeechData = {
478486 language,
479- startTime : data . start ?? 0 ,
480- endTime : data . duration ?? 0 ,
481- confidence : data . confidence ?? 1.0 ,
487+ startTime : this . startTimeOffset + data . start ,
488+ endTime : this . startTimeOffset + data . start + data . duration ,
489+ confidence : data . confidence ,
482490 text,
491+ words : data . words . map (
492+ ( word ) : TimedString => ( {
493+ text : word . word ,
494+ startTime : word . start + this . startTimeOffset ,
495+ endTime : word . end + this . startTimeOffset ,
496+ startTimeOffset : this . startTimeOffset ,
497+ confidence : word . confidence ,
498+ } ) ,
499+ ) ,
483500 } ;
484501
485502 if ( isFinal ) {
0 commit comments