livekit
diff --git a/‎.changeset/tender-glasses-burn.md‎
Lines changed: 8 additions & 0 deletions b/‎.changeset/tender-glasses-burn.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎agents/src/inference/api_protos.ts‎
Lines changed: 83 additions & 0 deletions b/‎agents/src/inference/api_protos.ts‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎agents/src/inference/stt.ts‎
Lines changed: 39 additions & 22 deletions b/‎agents/src/inference/stt.ts‎
Lines changed: 39 additions & 22 deletions
diff --git a/‎agents/src/stt/stt.ts‎
Lines changed: 21 additions & 0 deletions b/‎agents/src/stt/stt.ts‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎agents/src/voice/agent.ts‎
Lines changed: 9 additions & 0 deletions b/‎agents/src/voice/agent.ts‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎agents/src/voice/io.ts‎
Lines changed: 3 additions & 1 deletion b/‎agents/src/voice/io.ts‎
Lines changed: 3 additions & 1 deletion
@@ -0,0 +1,8 @@
+---
+'@livekit/agents-plugin-deepgram': patch
+'@livekit/agents-plugin-baseten': patch
+'@livekit/agents-plugin-openai': patch
+'@livekit/agents': patch
+---
+
+Add aligned transcript support with word-level timing for STT plugins
@@ -80,3 +80,86 @@ export type TtsSessionClosedEvent = z.infer<typeof ttsSessionClosedEventSchema>;
 export type TtsErrorEvent = z.infer<typeof ttsErrorEventSchema>;
 export type TtsClientEvent = z.infer<typeof ttsClientEventSchema>;
 export type TtsServerEvent = z.infer<typeof ttsServerEventSchema>;
+
+// ============================================================================
+// STT Schemas
+// ============================================================================
+
+// Word-level timing data; missing word/start/end/confidence fall back to defaults on parse
+export const sttWordSchema = z.object({
+  word: z.string().optional().default(''),
+  start: z.number().optional().default(0), // presumably seconds — confirm with the API
+  end: z.number().optional().default(0),
+  confidence: z.number().optional().default(0.0), // transcript-level confidence defaults to 1.0
+  extra: z.unknown().nullable().optional(), // unvalidated; may be null or absent
+});
+
+// Interim transcript event (same fields as final_transcript; only the type tag differs)
+export const sttInterimTranscriptEventSchema = z.object({
+  type: z.literal('interim_transcript'),
+  session_id: z.string().optional(), // used as the request id when non-empty
+  transcript: z.string().optional().default(''), // empty interim transcripts are dropped
+  language: z.string().optional().default(''), // '' falls back to the configured language
+  start: z.number().optional().default(0),
+  duration: z.number().optional().default(0), // a length; end = start + duration
+  confidence: z.number().optional().default(1.0),
+  words: z.array(sttWordSchema).optional().default([]),
+  extra: z.unknown().nullable().optional(),
+});
+
+// Final transcript event (same fields as interim_transcript; only the type tag differs)
+export const sttFinalTranscriptEventSchema = z.object({
+  type: z.literal('final_transcript'),
+  session_id: z.string().optional(), // used as the request id when non-empty
+  transcript: z.string().optional().default(''),
+  language: z.string().optional().default(''), // '' falls back to the configured language
+  start: z.number().optional().default(0),
+  duration: z.number().optional().default(0), // a length; end = start + duration
+  confidence: z.number().optional().default(1.0),
+  words: z.array(sttWordSchema).optional().default([]),
+  extra: z.unknown().nullable().optional(),
+});
+
+// Session created event (currently ignored by the inference STT stream)
+export const sttSessionCreatedEventSchema = z.object({
+  type: z.literal('session.created'),
+  session_id: z.string().optional(),
+});
+
+// Session finalized event (currently ignored by the inference STT stream)
+export const sttSessionFinalizedEventSchema = z.object({
+  type: z.literal('session.finalized'),
+});
+
+// Session closed event (carries only the type tag)
+export const sttSessionClosedEventSchema = z.object({
+  type: z.literal('session.closed'),
+});
+
+// Error event; the inference STT stream logs it, cleans up, and throws an APIError
+export const sttErrorEventSchema = z.object({
+  type: z.literal('error'),
+  message: z.string().optional(),
+  code: z.string().optional(),
+});
+
+// Discriminated union for all STT server events; an unlisted `type` fails validation
+export const sttServerEventSchema = z.discriminatedUnion('type', [
+  sttSessionCreatedEventSchema,
+  sttSessionFinalizedEventSchema,
+  sttSessionClosedEventSchema,
+  sttInterimTranscriptEventSchema,
+  sttFinalTranscriptEventSchema,
+  sttErrorEventSchema,
+]);
+
+// Type exports for STT (parsed output types: defaulted fields are non-optional)
+export type SttWord = z.infer<typeof sttWordSchema>;
+export type SttInterimTranscriptEvent = z.infer<typeof sttInterimTranscriptEventSchema>;
+export type SttFinalTranscriptEvent = z.infer<typeof sttFinalTranscriptEventSchema>;
+export type SttTranscriptEvent = SttInterimTranscriptEvent | SttFinalTranscriptEvent;
+export type SttSessionCreatedEvent = z.infer<typeof sttSessionCreatedEventSchema>;
+export type SttSessionFinalizedEvent = z.infer<typeof sttSessionFinalizedEventSchema>;
+export type SttSessionClosedEvent = z.infer<typeof sttSessionClosedEventSchema>;
+export type SttErrorEvent = z.infer<typeof sttErrorEventSchema>;
+export type SttServerEvent = z.infer<typeof sttServerEventSchema>;
@@ -16,6 +16,12 @@ import {
 } from '../stt/index.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
 import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
+import type { TimedString } from '../voice/io.js';
+import {
+  type SttServerEvent,
+  type SttTranscriptEvent,
+  sttServerEventSchema,
+} from './api_protos.js';
 import { type AnyString, connectWs, createAccessToken } from './utils.js';
 
 export type DeepgramModels =
@@ -122,7 +128,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
     apiSecret?: string;
     modelOptions?: STTOptions<TModel>;
   }) {
-    super({ streaming: true, interimResults: true });
+    super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
 
     const {
       model,
@@ -271,7 +277,6 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
       let closing = false;
       let finalReceived = false;
 
-      type SttServerEvent = Record<string, any>;
       const eventChannel = createStreamChannel<SttServerEvent>();
 
       const resourceCleanup = () => {
@@ -380,10 +385,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
             if (signal.aborted) return;
             if (result.done) return;
 
-            const json = result.value;
-            const type = json.type as string | undefined;
+            // Parse and validate with Zod schema
+            const parseResult = await sttServerEventSchema.safeParseAsync(result.value);
+            if (!parseResult.success) {
+              this.#logger.warn(
+                { error: parseResult.error, rawData: result.value },
+                'Failed to parse STT server event',
+              );
+              continue;
+            }
+
+            const event: SttServerEvent = parseResult.data;
 
-            switch (type) {
+            switch (event.type) {
               case 'session.created':
               case 'session.finalized':
                 break;
@@ -392,21 +406,15 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
                 resourceCleanup();
                 break;
               case 'interim_transcript':
-                this.processTranscript(json, false);
+                this.processTranscript(event, false);
                 break;
               case 'final_transcript':
-                this.processTranscript(json, true);
+                this.processTranscript(event, true);
                 break;
               case 'error':
-                this.#logger.error({ error: json }, 'Received error from LiveKit STT');
+                this.#logger.error({ error: event }, 'Received error from LiveKit STT');
                 resourceCleanup();
-                throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
-              default:
-                this.#logger.warn(
-                  { message: json },
-                  'Received unexpected message from LiveKit STT',
-                );
-                break;
+                throw new APIError(`LiveKit STT returned error: ${JSON.stringify(event)}`);
             }
           }
         } finally {
@@ -457,13 +465,13 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
     }
   }
 
-  private processTranscript(data: Record<string, any>, isFinal: boolean) {
+  private processTranscript(data: SttTranscriptEvent, isFinal: boolean) {
     // Check if queue is closed to avoid race condition during disconnect
     if (this.queue.closed) return;
 
-    const requestId = data.request_id ?? this.requestId;
-    const text = data.transcript ?? '';
-    const language = data.language ?? this.opts.language ?? 'en';
+    const requestId = data.session_id || this.requestId;
+    const text = data.transcript;
+    const language = data.language || this.opts.language || 'en';
 
     if (!text && !isFinal) return;
 
@@ -476,10 +484,19 @@ export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
 
       const speechData: SpeechData = {
         language,
-        startTime: data.start ?? 0,
-        endTime: data.duration ?? 0,
-        confidence: data.confidence ?? 1.0,
+        startTime: this.startTimeOffset + data.start,
+        endTime: this.startTimeOffset + data.start + data.duration,
+        confidence: data.confidence,
         text,
+        words: data.words.map(
+          (word): TimedString => ({
+            text: word.word,
+            startTime: word.start + this.startTimeOffset,
+            endTime: word.end + this.startTimeOffset,
+            startTimeOffset: this.startTimeOffset,
+            confidence: word.confidence,
+          }),
+        ),
       };
 
       if (isFinal) {
 
@@ -13,6 +13,7 @@ import { DeferredReadableStream } from '../stream/deferred_stream.js';
 import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS, intervalForRetry } from '../types.js';
 import type { AudioBuffer } from '../utils.js';
 import { AsyncIterableQueue, delay, startSoon, toError } from '../utils.js';
+import type { TimedString } from '../voice/index.js';
 
 /** Indicates start/middle/end of speech */
 export enum SpeechEventType {
@@ -53,6 +54,7 @@ export interface SpeechData {
   startTime: number;
   endTime: number;
   confidence: number;
+  words?: TimedString[];
 }
 
 export interface RecognitionUsage {
@@ -76,6 +78,13 @@ export interface SpeechEvent {
 export interface STTCapabilities {
   streaming: boolean;
   interimResults: boolean;
+  /**
+   * Whether this STT supports aligned transcripts with word/chunk timestamps.
+   * - 'word': Provider returns word-level timestamps
+   * - 'chunk': Provider returns chunk-level timestamps (e.g., sentence/phrase boundaries)
+   * - false: No aligned transcripts (an omitted value presumably means the same — confirm)
+   */
+  alignedTranscript?: 'word' | 'chunk' | false;
 }
 
 export interface STTError {
@@ -176,6 +185,7 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
   private deferredInputStream: DeferredReadableStream<AudioFrame>;
   private logger = log();
   private _connOptions: APIConnectOptions;
+  private _startTimeOffset: number = 0;
 
   protected abortController = new AbortController();
 
@@ -300,6 +310,17 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
     return this.abortController.signal;
   }
 
+  get startTimeOffset(): number {
+    return this._startTimeOffset; // offset that stream implementations add to provider timestamps
+  }
+
+  set startTimeOffset(value: number) {
+    if (!Number.isFinite(value) || value < 0) { // NaN would slip past a bare `< 0` check
+      throw new Error('startTimeOffset must be a non-negative finite number');
+    }
+    this._startTimeOffset = value;
+  }
+
   updateInputStream(audioStream: ReadableStream<AudioFrame>) {
     this.deferredInputStream.setSource(audioStream);
   }
 
@@ -271,6 +271,15 @@ export class Agent<UserData = any> {
 
       const connOptions = activity.agentSession.connOptions.sttConnOptions;
       const stream = wrapped_stt.stream({ connOptions });
+
+      // Set startTimeOffset (in seconds) to provide linear timestamps across reconnections
+      const audioInputStartedAt =
+        activity.agentSession._recorderIO?.recordingStartedAt ?? // Use recording start time if available
+        activity.agentSession._startedAt ?? // Fallback to session start time
+        Date.now(); // Fallback to current time
+      // Clamp at 0: wall-clock time can step backwards and the setter throws on negative values
+      stream.startTimeOffset = Math.max(0, (Date.now() - audioInputStartedAt) / 1000);
+
       stream.updateInputStream(audio);
 
       let cleaned = false;
 
@@ -30,12 +30,14 @@ export type TTSNode = (
 ) => Promise<ReadableStream<AudioFrame> | null>;
 
 /**
- * A string with timing information for word-level alignment.
+ * A string with optional start and end timestamps for word-level alignment.
  */
 export interface TimedString {
   text: string;
   startTime?: number; // seconds
   endTime?: number; // seconds
+  confidence?: number; // confidence reported for this string, when the producer supplies one
+  startTimeOffset?: number; // seconds; offset the inference STT already folds into start/endTime
 }
 
 export interface AudioOutputCapabilities {