Skip to content

Commit 989f16a

Browse files
authored
Add aligned transcript support with word-level timing (#984)
1 parent 30c45e7 commit 989f16a

11 files changed

Lines changed: 251 additions & 59 deletions

File tree

.changeset/tender-glasses-burn.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
---
2+
'@livekit/agents-plugin-deepgram': patch
3+
'@livekit/agents-plugin-baseten': patch
4+
'@livekit/agents-plugin-openai': patch
5+
'@livekit/agents': patch
6+
---
7+
8+
Add aligned transcript support with word-level timing for STT plugins

agents/src/inference/api_protos.ts

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,86 @@ export type TtsSessionClosedEvent = z.infer<typeof ttsSessionClosedEventSchema>;
8080
export type TtsErrorEvent = z.infer<typeof ttsErrorEventSchema>;
8181
export type TtsClientEvent = z.infer<typeof ttsClientEventSchema>;
8282
export type TtsServerEvent = z.infer<typeof ttsServerEventSchema>;
83+
84+
// ============================================================================
85+
// STT Schemas
86+
// ============================================================================
87+
88+
// Word-level timing data
89+
export const sttWordSchema = z.object({
90+
word: z.string().optional().default(''),
91+
start: z.number().optional().default(0),
92+
end: z.number().optional().default(0),
93+
confidence: z.number().optional().default(0.0),
94+
extra: z.unknown().nullable().optional(),
95+
});
96+
97+
// Interim transcript event
98+
export const sttInterimTranscriptEventSchema = z.object({
99+
type: z.literal('interim_transcript'),
100+
session_id: z.string().optional(),
101+
transcript: z.string().optional().default(''),
102+
language: z.string().optional().default(''),
103+
start: z.number().optional().default(0),
104+
duration: z.number().optional().default(0),
105+
confidence: z.number().optional().default(1.0),
106+
words: z.array(sttWordSchema).optional().default([]),
107+
extra: z.unknown().nullable().optional(),
108+
});
109+
110+
// Final transcript event
111+
export const sttFinalTranscriptEventSchema = z.object({
112+
type: z.literal('final_transcript'),
113+
session_id: z.string().optional(),
114+
transcript: z.string().optional().default(''),
115+
language: z.string().optional().default(''),
116+
start: z.number().optional().default(0),
117+
duration: z.number().optional().default(0),
118+
confidence: z.number().optional().default(1.0),
119+
words: z.array(sttWordSchema).optional().default([]),
120+
extra: z.unknown().nullable().optional(),
121+
});
122+
123+
// Session created event
124+
export const sttSessionCreatedEventSchema = z.object({
125+
type: z.literal('session.created'),
126+
session_id: z.string().optional(),
127+
});
128+
129+
// Session finalized event
130+
export const sttSessionFinalizedEventSchema = z.object({
131+
type: z.literal('session.finalized'),
132+
});
133+
134+
// Session closed event
135+
export const sttSessionClosedEventSchema = z.object({
136+
type: z.literal('session.closed'),
137+
});
138+
139+
// Error event
140+
export const sttErrorEventSchema = z.object({
141+
type: z.literal('error'),
142+
message: z.string().optional(),
143+
code: z.string().optional(),
144+
});
145+
146+
// Discriminated union for all STT server events
147+
export const sttServerEventSchema = z.discriminatedUnion('type', [
148+
sttSessionCreatedEventSchema,
149+
sttSessionFinalizedEventSchema,
150+
sttSessionClosedEventSchema,
151+
sttInterimTranscriptEventSchema,
152+
sttFinalTranscriptEventSchema,
153+
sttErrorEventSchema,
154+
]);
155+
156+
// Type exports for STT
157+
export type SttWord = z.infer<typeof sttWordSchema>;
158+
export type SttInterimTranscriptEvent = z.infer<typeof sttInterimTranscriptEventSchema>;
159+
export type SttFinalTranscriptEvent = z.infer<typeof sttFinalTranscriptEventSchema>;
160+
export type SttTranscriptEvent = SttInterimTranscriptEvent | SttFinalTranscriptEvent;
161+
export type SttSessionCreatedEvent = z.infer<typeof sttSessionCreatedEventSchema>;
162+
export type SttSessionFinalizedEvent = z.infer<typeof sttSessionFinalizedEventSchema>;
163+
export type SttSessionClosedEvent = z.infer<typeof sttSessionClosedEventSchema>;
164+
export type SttErrorEvent = z.infer<typeof sttErrorEventSchema>;
165+
export type SttServerEvent = z.infer<typeof sttServerEventSchema>;

agents/src/inference/stt.ts

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ import {
1616
} from '../stt/index.js';
1717
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
1818
import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
19+
import type { TimedString } from '../voice/io.js';
20+
import {
21+
type SttServerEvent,
22+
type SttTranscriptEvent,
23+
sttServerEventSchema,
24+
} from './api_protos.js';
1925
import { type AnyString, connectWs, createAccessToken } from './utils.js';
2026

2127
export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
122128
apiSecret?: string;
123129
modelOptions?: STTOptions<TModel>;
124130
}) {
125-
super({ streaming: true, interimResults: true });
131+
super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
126132

127133
const {
128134
model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
271277
let closing = false;
272278
let finalReceived = false;
273279

274-
type SttServerEvent = Record<string, any>;
275280
const eventChannel = createStreamChannel<SttServerEvent>();
276281

277282
const resourceCleanup = () => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
380385
if (signal.aborted) return;
381386
if (result.done) return;
382387

383-
const json = result.value;
384-
const type = json.type as string | undefined;
388+
// Parse and validate with Zod schema
389+
const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
390+
if (!parseResult.success) {
391+
this.#logger.warn(
392+
{ error: parseResult.error, rawData: result.value },
393+
'Failed to parse STT server event',
394+
);
395+
continue;
396+
}
397+
398+
const event: SttServerEvent = parseResult.data;
385399

386-
switch (type) {
400+
switch (event.type) {
387401
case 'session.created':
388402
case 'session.finalized':
389403
break;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
392406
resourceCleanup();
393407
break;
394408
case 'interim_transcript':
395-
this.processTranscript(json, false);
409+
this.processTranscript(event, false);
396410
break;
397411
case 'final_transcript':
398-
this.processTranscript(json, true);
412+
this.processTranscript(event, true);
399413
break;
400414
case 'error':
401-
this.#logger.error({ error: json }, 'Received error from LiveKit STT');
415+
this.#logger.error({ error: event }, 'Received error from LiveKit STT');
402416
resourceCleanup();
403-
throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
404-
default:
405-
this.#logger.warn(
406-
{ message: json },
407-
'Received unexpected message from LiveKit STT',
408-
);
409-
break;
417+
throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
410418
}
411419
}
412420
} finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
457465
}
458466
}
459467

460-
private processTranscript(data: Record<string, any>, isFinal: boolean) {
468+
private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
461469
// Check if queue is closed to avoid race condition during disconnect
462470
if (this.queue.closed) return;
463471

464-
const requestId = data.request_id ?? this.requestId;
465-
const text = data.transcript ?? '';
466-
const language = data.language ?? this.opts.language ?? 'en';
472+
const requestId = data.session_id || this.requestId;
473+
const text = data.transcript;
474+
const language = data.language || this.opts.language || 'en';
467475

468476
if (!text && !isFinal) return;
469477

@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
476484

477485
const speechData: SpeechData = {
478486
language,
479-
startTime: data.start ?? 0,
480-
endTime: data.duration ?? 0,
481-
confidence: data.confidence ?? 1.0,
487+
startTime: this.startTimeOffset + data.start,
488+
endTime: this.startTimeOffset + data.start + data.duration,
489+
confidence: data.confidence,
482490
text,
491+
words: data.words.map(
492+
(word): TimedString => ({
493+
text: word.word,
494+
startTime: word.start + this.startTimeOffset,
495+
endTime: word.end + this.startTimeOffset,
496+
startTimeOffset: this.startTimeOffset,
497+
confidence: word.confidence,
498+
}),
499+
),
483500
};
484501

485502
if (isFinal) {

agents/src/stt/stt.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
1313
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
1414
import type { AudioBuffer } from '../utils.js';
1515
import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
16+
import type { TimedString } from '../voice/index.js';
1617

1718
/** Indicates start/middle/end of speech */
1819
export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
5354
startTime: number;
5455
endTime: number;
5556
confidence: number;
57+
words?: TimedString[];
5658
}
5759

5860
export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
7678
export interface STTCapabilities {
7779
streaming: boolean;
7880
interimResults: boolean;
81+
/**
82+
* Whether this STT supports aligned transcripts with word/chunk timestamps.
83+
* - 'word': Provider returns word-level timestamps
84+
* - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
85+
* - false: Provider does not support aligned transcripts
86+
*/
87+
alignedTranscript?: 'word' | 'chunk' | false;
7988
}
8089

8190
export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
176185
private deferredInputStream: DeferredReadableStream<AudioFrame>;
177186
private logger = log();
178187
private _connOptions: APIConnectOptions;
188+
private _startTimeOffset: number = 0;
179189

180190
protected abortController = new AbortController();
181191

@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
300310
return this.abortController.signal;
301311
}
302312

313+
get startTimeOffset(): number {
314+
return this._startTimeOffset;
315+
}
316+
317+
set startTimeOffset(value: number) {
318+
if (value < 0) {
319+
throw new Error('startTimeOffset must be non-negative');
320+
}
321+
this._startTimeOffset = value;
322+
}
323+
303324
updateInputStream(audioStream: ReadableStream<AudioFrame>) {
304325
this.deferredInputStream.setSource(audioStream);
305326
}

agents/src/voice/agent.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {
271271

272272
const connOptions = activity.agentSession.connOptions.sttConnOptions;
273273
const stream = wrapped_stt.stream({ connOptions });
274+
275+
// Set startTimeOffset to provide linear timestamps across reconnections
276+
const audioInputStartedAt =
277+
activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
278+
activity.agentSession._startedAt ?? // Fallback to session start time
279+
Date.now(); // Fallback to current time
280+
281+
stream.startTimeOffset = (Date.now() - audioInputStartedAt) / 1000;
282+
274283
stream.updateInputStream(audio);
275284

276285
let cleaned = false;

agents/src/voice/io.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,14 @@ export type TTSNode = (
3030
) => Promise<ReadableStream<AudioFrame> | null>;
3131

3232
/**
33-
* A string with timing information for word-level alignment.
33+
*A string with optional start and end timestamps for word-level alignment.
3434
*/
3535
export interface TimedString {
3636
text: string;
3737
startTime?: number; // seconds
3838
endTime?: number; // seconds
39+
confidence?: number;
40+
startTimeOffset?: number;
3941
}
4042

4143
export interface AudioOutputCapabilities {

0 commit comments

Comments
 (0)