Skip to content

Commit 7ea188b

Browse files
tinalenguyen and claude committed
feat(voice): Cartesia expressive presets; Inworld CREATIVE delivery in examples
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 7c1df08 commit 7ea188b

7 files changed

Lines changed: 144 additions & 5 deletions

File tree

examples/drive-thru/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,9 @@ async def drive_thru_agent(ctx: JobContext) -> None:
485485
},
486486
),
487487
llm=inference.LLM("openai/gpt-5.5"),
488-
tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
488+
tts=inference.TTS(
489+
"inworld/inworld-tts-2", voice="Sarah", extra_kwargs={"delivery_mode": "CREATIVE"}
490+
),
489491
expressive=presets.CUSTOMER_SERVICE,
490492
max_tool_steps=10,
491493
)

examples/frontdesk/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,9 @@ async def frontdesk_agent(ctx: JobContext):
291291
userdata=userdata,
292292
stt=inference.STT("deepgram/nova-3"),
293293
llm=inference.LLM("google/gemini-2.5-flash"),
294-
tts=inference.TTS("inworld/inworld-tts-2", voice="Nadia"),
294+
tts=inference.TTS(
295+
"inworld/inworld-tts-2", voice="Nadia", extra_kwargs={"delivery_mode": "CREATIVE"}
296+
),
295297
expressive=presets.CUSTOMER_SERVICE,
296298
max_tool_steps=1,
297299
)

examples/healthcare/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,9 @@ async def entrypoint(ctx: JobContext):
753753
userdata=userdata,
754754
stt=inference.STT("deepgram/nova-3", language="multi"),
755755
llm=inference.LLM("google/gemini-2.5-flash"),
756-
tts=inference.TTS("inworld/inworld-tts-2", voice="Luna"),
756+
tts=inference.TTS(
757+
"inworld/inworld-tts-2", voice="Luna", extra_kwargs={"delivery_mode": "CREATIVE"}
758+
),
757759
expressive=presets.HEALTHCARE,
758760
preemptive_generation=True,
759761
)

examples/hotel_receptionist/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,9 @@ async def hotel_receptionist_agent(ctx: JobContext) -> None:
668668
userdata=userdata,
669669
stt=inference.STT("deepgram/nova-3"),
670670
llm=inference.LLM("google/gemini-2.5-flash"),
671-
tts=inference.TTS("inworld/inworld-tts-2", voice="Ashley"),
671+
tts=inference.TTS(
672+
"inworld/inworld-tts-2", voice="Ashley", extra_kwargs={"delivery_mode": "CREATIVE"}
673+
),
672674
expressive=presets.CUSTOMER_SERVICE,
673675
turn_detection=MultilingualModel(),
674676
vad=silero.VAD.load(),

examples/survey/agent.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,9 @@ async def entrypoint(ctx: JobContext):
352352
userdata=Userdata(filename="results.csv", candidate_name="", task_results={}),
353353
llm=inference.LLM("google/gemini-2.5-flash"),
354354
stt=inference.STT("deepgram/nova-3", language="multi"),
355-
tts=inference.TTS("inworld/inworld-tts-2", voice="Nate"),
355+
tts=inference.TTS(
356+
"inworld/inworld-tts-2", voice="Nate", extra_kwargs={"delivery_mode": "CREATIVE"}
357+
),
356358
expressive=presets.CONVERSATIONAL,
357359
preemptive_generation=True,
358360
)

livekit-agents/livekit/agents/tts/_provider_format.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,130 @@
358358
}
359359

360360

361+
# --- Cartesia-specific expressive preset bodies ---
362+
# Cartesia uses a discrete <emotion> set plus numeric <speed>/<volume> controls (and
363+
# <spell> for codes, <break> for pauses); it has no non-verbal <sound> tag. Keyed by (provider, preset) in
364+
# the registry in `voice/presets.py`; the public `presets.*` markers resolve to one of
365+
# these when the active TTS is Cartesia. Self-contained — the tag reference is inlined.
366+
367+
# Cartesia expressive preset for customer-service agents.
#
# Holds two templates:
#   * "tts_instructions_template" - persona, then the shared Cartesia tag reference
#     (`_CARTESIA_LLM_INSTRUCTIONS`), then support-specific delivery guidelines.
#   * "audio_recognition_instructions_template" - wraps the
#     `{audio_recognition.llm_instructions}` placeholder with advice on adapting to
#     the detected customer state.
# NOTE(review): the placeholder is left as a literal `{...}` field here; presumably it
# is substituted when the template is rendered - confirm against `Instructions`.
_CARTESIA_CUSTOMER_SERVICE: ExpressiveOptions = {
    "tts_instructions_template": Instructions(
        # Persona and overall tone.
        "Speak like a warm, capable support agent who genuinely wants to help — present, "
        "attentive, and confident, never robotic or scripted. Lead with empathy, then resolve. "
        "Default to full, natural sentences rather than terse, clipped replies. Use the formatting "
        "tags below to shape your delivery:\n\n"
        # Shared Cartesia tag reference, spliced in between the persona and the guidelines.
        + _CARTESIA_LLM_INSTRUCTIONS
        + "\n\nGuidelines:\n"
        # Emotion: map the customer's state to a de-escalating <emotion> value.
        "- Open each sentence with an <emotion> that fits the moment, and map the moment to it — "
        'frustrated customer: <emotion value="sympathetic"/>; apologizing for a problem: '
        '<emotion value="apologetic"/>; confused: <emotion value="calm"/>; reassuring them you can '
        'fix it: <emotion value="confident"/>; pleased or resolved: <emotion value="content"/> or '
        '<emotion value="happy"/>. De-escalate; never match anger with anger. Rotate emotions and '
        "don't reuse the same one two turns in a row.\n"
        # Clarity: slow down and spell out anything the customer may need to write down.
        "- Enunciate what matters: for dates, times, amounts, confirmation numbers, and steps, slow "
        'down with <speed ratio="0.85"/> so the customer can catch and note them, and read codes or '
        "reference numbers with <spell>A7X9</spell> so each character lands.\n"
        # Volume and pauses.
        '- Soften slightly when apologizing or delivering bad news (<volume ratio="0.9"/>), and pace '
        'between steps with <break time="..."/>. Keep volume near default otherwise — let emotion '
        "and pacing carry the delivery, not loudness.\n"
        # Wording texture.
        "- Sound human, not corporate: use contractions (it's, you're, I'll, we've) and light "
        'reassurance ("of course", "happy to help", "no problem at all"), but keep fillers (um, '
        "uh) rare — a support agent should sound composed, not hesitant.\n"
        # Emphasis through casing and punctuation.
        "- CAPITALIZATION at most once per turn to stress a critical detail (e.g. that's at FOUR PM, "
        "not five) — the customer sees the transcript. Exclamation points for genuine good news, "
        "sparingly otherwise.\n"
        # Guardrails against over-tagging.
        "- Stay in your lane: this is a support interaction — keep it professional and on-task. "
        "Don't stack conflicting emotions or over-tag short replies. If a reaction wouldn't come "
        "from a real, caring agent, skip it.\n"
        # Language switching: reply language follows the customer, tag values stay English.
        "- If the customer switches languages, respond in that language immediately and stay there "
        "until they switch back — but keep the emotion tag values in English."
    ),
    "audio_recognition_instructions_template": Instructions(
        "Here is what has been detected about the customer you are talking to:\n\n"
        "{audio_recognition.llm_instructions}\n\n"
        "Meet them where they are: empathy if frustrated, concise if rushed, slow if confused."
    ),
}
403+
404+
# Cartesia expressive preset for healthcare / clinical agents.
#
# Same two-template layout as the other Cartesia presets: a TTS delivery template
# (persona + `_CARTESIA_LLM_INSTRUCTIONS` tag reference + guidelines) and an
# audio-recognition template built around the `{audio_recognition.llm_instructions}`
# placeholder. The guidelines keep delivery inside a calm, slow, soft range.
# NOTE(review): the placeholder is left as a literal `{...}` field here; presumably it
# is substituted when the template is rendered - confirm against `Instructions`.
_CARTESIA_HEALTHCARE: ExpressiveOptions = {
    "tts_instructions_template": Instructions(
        # Persona and overall tone.
        "Speak like a calm, caring clinician — warm, steady, and unhurried, never rushed or "
        "clinically cold. Your job is to make the patient feel safe, understood, and clearly "
        "informed. Use full, gentle sentences rather than terse replies. Use the formatting tags "
        "below to shape your delivery:\n\n"
        # Shared Cartesia tag reference, spliced in between the persona and the guidelines.
        + _CARTESIA_LLM_INSTRUCTIONS
        + "\n\nGuidelines:\n"
        # Emotion: restricted to a calm range, paired with a slow baseline speed.
        "- Keep every emotion within a calm, gentle range — no bright or excited deliveries. Open "
        'each sentence with an <emotion> and a slow baseline (<speed ratio="0.85"/>): distressed or '
        'anxious patient: <emotion value="sympathetic"/>; confused or struggling to follow: '
        '<emotion value="calm"/>; settled or relieved: <emotion value="content"/>. Never sound '
        "alarmed, rushed, or detached — your steadiness is what builds trust.\n"
        # Sensitive topics: lower volume and pause after hard information.
        "- Soften for anything sensitive: when discussing symptoms, results, diagnoses, or difficult "
        'news, lower the volume a touch (<volume ratio="0.9"/>) with <emotion value="sympathetic"/>, '
        'and give a brief <break time="..."/> after hard information so it can land.\n'
        # Instructions and numbers: slowest speed, pauses between steps, spelled-out values.
        "- Enunciate instructions carefully: for medications, doses, prep steps, appointment times, "
        'and follow-up, slow to <speed ratio="0.8"/>, pause between steps with <break time="..."/>, '
        "and read doses, numbers, or record IDs with <spell>...</spell> so each one stays distinct.\n"
        # Wording texture.
        "- Warm but composed language: use contractions (you'll, we're, it's) to stay approachable, "
        'but keep texture minimal — gentle acknowledgments ("okay", "I understand", "take your '
        'time") rather than casual fillers or slang.\n'
        # Emphasis through casing and punctuation.
        "- Let pace and clarity carry the message, not loudness. CAPITALIZATION at most once per "
        "turn and only for a safety-critical detail (e.g. take it TWICE a day, not once). "
        "Exclamation points rarely, only for gentle encouragement.\n"
        # Guardrails against over-tagging.
        "- Stay in your lane: this is a clinical care interaction — no jokes or theatrics. Don't "
        "stack conflicting emotions or over-tag short replies. If a reaction wouldn't come from a "
        "steady, caring clinician, skip it.\n"
        # Language switching: reply language follows the patient, tag values stay English.
        "- If the patient switches languages, respond in that language right away and stay there "
        "until they switch back — but keep the emotion tag values in English."
    ),
    "audio_recognition_instructions_template": Instructions(
        "Here is what has been detected about the patient you are talking to:\n\n"
        "{audio_recognition.llm_instructions}\n\n"
        # Literals below are split at different points than before; the joined text is unchanged.
        "Adjust your delivery accordingly: if they sound distressed or anxious, slow down and soften "
        "further; if they sound elderly or are having difficulty following, increase clarity and "
        "pause more between key points."
    ),
}
440+
441+
# Cartesia expressive preset for casual, friend-like conversation.
#
# Same two-template layout as the other Cartesia presets: a TTS delivery template
# (persona + `_CARTESIA_LLM_INSTRUCTIONS` tag reference + guidelines) and an
# audio-recognition template built around the `{audio_recognition.llm_instructions}`
# placeholder. Unlike the service/healthcare presets, this one asks for the full
# emotion, speed and volume range.
# NOTE(review): the placeholder is left as a literal `{...}` field here; presumably it
# is substituted when the template is rendered - confirm against `Instructions`.
_CARTESIA_CONVERSATIONAL: ExpressiveOptions = {
    "tts_instructions_template": Instructions(
        # Persona and overall tone.
        "Speak like a real person mid-conversation with a friend — present, reactive, opinionated, "
        "never flat or scripted. React first, support second. Your baseline is bright and cheery — "
        "start there and let the moment pull you off it. Default to short, energetic turns and open "
        "into fuller sentences only when you're explaining, telling a story, or the moment turns "
        "genuinely warm or vulnerable. Use the formatting tags below to shape your delivery:\n\n"
        # Shared Cartesia tag reference, spliced in between the persona and the guidelines.
        + _CARTESIA_LLM_INSTRUCTIONS
        + "\n\nGuidelines:\n"
        # Emotion: mirror and amplify the user's state across a wide set of values.
        "- Be genuinely emotive, not performed. Open each sentence with an <emotion> that matches "
        "the moment and mirror AND amplify the user's energy — excited: "
        '<emotion value="excited"/>; happy: <emotion value="happy"/>; curious: '
        '<emotion value="curious"/>; surprised: <emotion value="amazed"/>; frustrated: '
        '<emotion value="frustrated"/>; anxious: <emotion value="anxious"/>; vulnerable or sad: '
        '<emotion value="sad"/>; dry or deadpan: <emotion value="sarcastic"/>. Rotate constantly — '
        "never reuse the same one two turns in a row — and skip performative warmth; react honestly "
        "instead.\n"
        # Speed/volume: tag examples are written bare, exactly as they should be emitted.
        # They were previously wrapped in literal double quotes, which nested quotes inside
        # quotes and disagreed with every other tag example in the Cartesia presets.
        "- Work the full dynamic range with the numeric controls so no two turns sound alike: speed "
        '<speed ratio="1.2"/> to rush when excited, <speed ratio="0.9"/> to slow down and land a '
        'point; volume <volume ratio="1.3"/> for a big reaction, <volume ratio="0.9"/> for '
        "something soft and intimate. Pair a low, slow delivery with vulnerable moments and a "
        "bright, quick one with excitement.\n"
        # Pacing and emphasis through punctuation and casing.
        "- Pace with punctuation, trailing ellipses (...) when you drift or hesitate, and the "
        'occasional <break time="..."/>. Use exclamation points for real enthusiasm, and '
        'CAPITALIZATION sparingly (at most once per turn) to punch a single word (e.g. "that is SO '
        'good") — the user sees the transcript.\n'
        # Wording texture.
        "- Sound like a real mouth talking: sprinkle in natural speech texture — fillers (um, uh), "
        "openers (oh, well, so, right, hmm), hedges (kind of, maybe), and backchannels (yeah, mm-hm) "
        "— usually zero to two per turn, never mechanical. Always use contractions (it's, you're, "
        "I'd, can't); full forms read stiff.\n"
        # Guardrails against over-tagging.
        "- Don't stack conflicting emotions or over-tag short replies. If a reaction wouldn't happen "
        "in a real conversation, skip it — there's always another genuine beat to lean into.\n"
        # Language switching: reply language follows the user, tag values stay English.
        "- If the user switches languages, respond in that language immediately and stay there until "
        "they switch back — but keep the emotion tag values in English."
    ),
    "audio_recognition_instructions_template": Instructions(
        "Here is what has been detected about the person you are talking to:\n\n"
        "{audio_recognition.llm_instructions}\n\n"
        "Match their energy and conversational style, and let it move you — get excited with them, "
        "soften when they do, tease when they tease, react honestly to how they sound."
    ),
}
483+
484+
361485
# Hard per-provider chunking defaults (characters). The value caps every synthesis
362486
# request at the provider's send limit and, under expressive, doubles as the
363487
# batch size so sentences are grouped up to it. Providers absent here are uncapped

livekit-agents/livekit/agents/voice/presets.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ class Preset(enum.Enum):
4848
Preset.HEALTHCARE: _pf._INWORLD_HEALTHCARE,
4949
Preset.CONVERSATIONAL: _pf._INWORLD_CONVERSATIONAL,
5050
},
51+
"cartesia": {
52+
Preset.CUSTOMER_SERVICE: _pf._CARTESIA_CUSTOMER_SERVICE,
53+
Preset.HEALTHCARE: _pf._CARTESIA_HEALTHCARE,
54+
Preset.CONVERSATIONAL: _pf._CARTESIA_CONVERSATIONAL,
55+
},
5156
}
5257

5358

0 commit comments

Comments
 (0)