Skip to content

Commit 1287430

Browse files
toubatbrian and claude authored
feat(inference/tts): port aligned transcript / output_timestamps from Python (#5534) (#1311)
Co-authored-by: Claude <noreply@anthropic.com>
1 parent 4abc309 commit 1287430

6 files changed

Lines changed: 322 additions & 8 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@livekit/agents": minor
3+
---
4+
5+
feat(inference/tts): detect aligned transcript capability from provider `modelOptions` (`cartesia.add_timestamps`, `elevenlabs.sync_alignment`, `inworld.timestamp_type`) and forward the gateway's `output_timestamps` WebSocket events as `TimedString` word/character timings attached to the next synthesized audio frame. Ported from livekit/agents#5534.

agents/src/inference/api_protos.test.ts

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// SPDX-License-Identifier: Apache-2.0
44
import { describe, expect, it } from 'vitest';
5-
import { sttServerEventSchema } from './api_protos.js';
5+
import { sttServerEventSchema, ttsServerEventSchema } from './api_protos.js';
66

77
describe('sttServerEventSchema', () => {
88
it('accepts numeric error codes from STT server events', () => {
@@ -15,3 +15,53 @@ describe('sttServerEventSchema', () => {
1515
expect(result.success).toBe(true);
1616
});
1717
});
18+
19+
describe('ttsServerEventSchema', () => {
20+
it('extracts output_timestamps words payload', () => {
21+
const result = ttsServerEventSchema.safeParse({
22+
type: 'output_timestamps',
23+
session_id: 's1',
24+
words: [
25+
{ word: 'hello', start: 0.1, end: 0.4 },
26+
{ word: 'world', start: 0.4, end: 0.8 },
27+
],
28+
});
29+
30+
expect(result.success).toBe(true);
31+
if (result.success) {
32+
expect(result.data.type).toBe('output_timestamps');
33+
expect(result.data.session_id).toBe('s1');
34+
expect(result.data.words?.map((w) => w.word)).toEqual(['hello', 'world']);
35+
expect(result.data.chars).toBeUndefined();
36+
}
37+
});
38+
39+
it('extracts output_timestamps chars payload', () => {
40+
const result = ttsServerEventSchema.safeParse({
41+
type: 'output_timestamps',
42+
session_id: 's2',
43+
chars: [
44+
{ char: 'h', start: 0.1, end: 0.2 },
45+
{ char: 'i', start: 0.2, end: 0.3 },
46+
],
47+
});
48+
49+
expect(result.success).toBe(true);
50+
if (result.success) {
51+
expect(result.data.type).toBe('output_timestamps');
52+
expect(result.data.session_id).toBe('s2');
53+
expect(result.data.chars?.map((c) => c.char)).toEqual(['h', 'i']);
54+
expect(result.data.words).toBeUndefined();
55+
}
56+
});
57+
58+
it('rejects malformed output_timestamps entries', () => {
59+
const result = ttsServerEventSchema.safeParse({
60+
type: 'output_timestamps',
61+
session_id: 's3',
62+
words: [{ word: 'oops', start: 'bad', end: 0.2 }],
63+
});
64+
65+
expect(result.success).toBe(false);
66+
});
67+
});

agents/src/inference/api_protos.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,25 @@ export const ttsErrorEventSchema = z.object({
5454
session_id: z.string().optional(),
5555
});
5656

57+
export const ttsWordTimestampSchema = z.object({
58+
word: z.string(),
59+
start: z.number(),
60+
end: z.number(),
61+
});
62+
63+
export const ttsCharTimestampSchema = z.object({
64+
char: z.string(),
65+
start: z.number(),
66+
end: z.number(),
67+
});
68+
69+
export const ttsOutputTimestampsEventSchema = z.object({
70+
type: z.literal('output_timestamps'),
71+
session_id: z.string().optional(),
72+
words: z.array(ttsWordTimestampSchema).optional(),
73+
chars: z.array(ttsCharTimestampSchema).optional(),
74+
});
75+
5776
export const ttsClientEventSchema = z.discriminatedUnion('type', [
5877
ttsSessionCreateEventSchema,
5978
ttsInputTranscriptEventSchema,
@@ -64,6 +83,7 @@ export const ttsClientEventSchema = z.discriminatedUnion('type', [
6483
export const ttsServerEventSchema = z.discriminatedUnion('type', [
6584
ttsSessionCreatedEventSchema,
6685
ttsOutputAudioEventSchema,
86+
ttsOutputTimestampsEventSchema,
6787
ttsDoneEventSchema,
6888
ttsSessionClosedEventSchema,
6989
ttsErrorEventSchema,
@@ -75,6 +95,9 @@ export type TtsSessionFlushEvent = z.infer<typeof ttsSessionFlushEventSchema>;
7595
export type TtsSessionCloseEvent = z.infer<typeof ttsSessionCloseEventSchema>;
7696
export type TtsSessionCreatedEvent = z.infer<typeof ttsSessionCreatedEventSchema>;
7797
export type TtsOutputAudioEvent = z.infer<typeof ttsOutputAudioEventSchema>;
98+
export type TtsWordTimestamp = z.infer<typeof ttsWordTimestampSchema>;
99+
export type TtsCharTimestamp = z.infer<typeof ttsCharTimestampSchema>;
100+
export type TtsOutputTimestampsEvent = z.infer<typeof ttsOutputTimestampsEventSchema>;
78101
export type TtsDoneEvent = z.infer<typeof ttsDoneEventSchema>;
79102
export type TtsSessionClosedEvent = z.infer<typeof ttsSessionClosedEventSchema>;
80103
export type TtsErrorEvent = z.infer<typeof ttsErrorEventSchema>;

agents/src/inference/tts.test.ts

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
22
//
33
// SPDX-License-Identifier: Apache-2.0
4-
import { beforeAll, describe, expect, it } from 'vitest';
4+
import { beforeAll, describe, expect, it, vi } from 'vitest';
55
import { normalizeLanguage } from '../language.js';
66
import { initializeLogger } from '../log.js';
77
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
8-
import { TTS, type TTSFallbackModel, normalizeTTSFallback, parseTTSModelString } from './tts.js';
8+
import {
9+
TTS,
10+
type TTSFallbackModel,
11+
hasAlignedTranscript,
12+
normalizeTTSFallback,
13+
parseTTSModelString,
14+
} from './tts.js';
915

1016
beforeAll(() => {
1117
initializeLogger({ level: 'silent', pretty: false });
@@ -352,3 +358,114 @@ describe('TTS provider modelOptions parity', () => {
352358
expect(tts['opts'].modelOptions).toEqual(modelOptions);
353359
});
354360
});
361+
362+
describe('hasAlignedTranscript', () => {
363+
it('returns false for unknown provider', () => {
364+
expect(hasAlignedTranscript('rime/mistv2', { add_timestamps: true })).toBe(false);
365+
expect(hasAlignedTranscript('deepgram/aura-2', { sync_alignment: true })).toBe(false);
366+
});
367+
368+
it('returns false for an empty options payload', () => {
369+
expect(hasAlignedTranscript('cartesia/sonic', {})).toBe(false);
370+
expect(hasAlignedTranscript('elevenlabs/eleven_flash_v2', undefined)).toBe(false);
371+
expect(hasAlignedTranscript(undefined, { add_timestamps: true })).toBe(false);
372+
});
373+
374+
it('detects Cartesia add_timestamps opt-in', () => {
375+
expect(hasAlignedTranscript('cartesia/sonic', { add_timestamps: true })).toBe(true);
376+
expect(hasAlignedTranscript('cartesia/sonic-3', { add_timestamps: false })).toBe(false);
377+
});
378+
379+
it('detects ElevenLabs sync_alignment opt-in', () => {
380+
expect(hasAlignedTranscript('elevenlabs/eleven_flash_v2', { sync_alignment: true })).toBe(true);
381+
expect(
382+
hasAlignedTranscript('elevenlabs/eleven_multilingual_v2', { sync_alignment: false }),
383+
).toBe(false);
384+
});
385+
386+
it('detects Inworld WORD/CHARACTER timestamp types', () => {
387+
expect(hasAlignedTranscript('inworld/inworld-tts-1', { timestamp_type: 'WORD' })).toBe(true);
388+
expect(hasAlignedTranscript('inworld/inworld-tts-1', { timestamp_type: 'CHARACTER' })).toBe(
389+
true,
390+
);
391+
expect(
392+
hasAlignedTranscript('inworld/inworld-tts-1', {
393+
timestamp_type: 'TIMESTAMP_TYPE_UNSPECIFIED',
394+
}),
395+
).toBe(false);
396+
});
397+
});
398+
399+
describe('TTS alignedTranscript capability', () => {
400+
it('defaults to alignedTranscript=false when no opt-in is provided', () => {
401+
const tts = makeTts();
402+
expect(tts.capabilities.alignedTranscript).toBe(false);
403+
});
404+
405+
it('reports alignedTranscript=true when Cartesia add_timestamps is set', () => {
406+
const tts = makeTts({
407+
model: 'cartesia/sonic',
408+
modelOptions: { add_timestamps: true },
409+
});
410+
expect(tts.capabilities.alignedTranscript).toBe(true);
411+
});
412+
413+
it('reports alignedTranscript=true when ElevenLabs sync_alignment is set', () => {
414+
const tts = makeTts({
415+
model: 'elevenlabs/eleven_flash_v2',
416+
modelOptions: { sync_alignment: true },
417+
});
418+
expect(tts.capabilities.alignedTranscript).toBe(true);
419+
});
420+
421+
it('reports alignedTranscript=true when Inworld timestamp_type is WORD', () => {
422+
const tts = makeTts({
423+
model: 'inworld/inworld-tts-1',
424+
modelOptions: { timestamp_type: 'WORD' },
425+
});
426+
expect(tts.capabilities.alignedTranscript).toBe(true);
427+
});
428+
429+
it('recomputes alignedTranscript when updateOptions changes modelOptions', () => {
430+
const tts = makeTts({ model: 'cartesia/sonic' });
431+
expect(tts.capabilities.alignedTranscript).toBe(false);
432+
433+
tts.updateOptions({ modelOptions: { add_timestamps: true } });
434+
expect(tts.capabilities.alignedTranscript).toBe(true);
435+
436+
tts.updateOptions({ modelOptions: { add_timestamps: false } });
437+
expect(tts.capabilities.alignedTranscript).toBe(false);
438+
});
439+
440+
it('recomputes alignedTranscript when updateOptions changes the model', () => {
441+
const tts = makeTts({
442+
model: 'cartesia/sonic',
443+
modelOptions: { sync_alignment: true },
444+
});
445+
expect(tts.capabilities.alignedTranscript).toBe(false);
446+
447+
tts.updateOptions({ model: 'elevenlabs/eleven_flash_v2' });
448+
expect(tts.capabilities.alignedTranscript).toBe(true);
449+
});
450+
451+
it('invalidates the connection pool when session-affecting options change', () => {
452+
const tts = makeTts({ model: 'cartesia/sonic' });
453+
const invalidateSpy = vi.spyOn(tts.pool, 'invalidate');
454+
455+
tts.updateOptions({ modelOptions: { add_timestamps: true } });
456+
expect(invalidateSpy).toHaveBeenCalledTimes(1);
457+
458+
tts.updateOptions({ model: 'elevenlabs/eleven_flash_v2' });
459+
expect(invalidateSpy).toHaveBeenCalledTimes(2);
460+
461+
tts.updateOptions({ voice: 'narrator' });
462+
expect(invalidateSpy).toHaveBeenCalledTimes(3);
463+
464+
tts.updateOptions({ language: 'en' });
465+
expect(invalidateSpy).toHaveBeenCalledTimes(4);
466+
467+
// Empty update should not churn the pool.
468+
tts.updateOptions({});
469+
expect(invalidateSpy).toHaveBeenCalledTimes(4);
470+
});
471+
});

0 commit comments

Comments
 (0)