feat(voice): Cartesia expressive presets; Inworld CREATIVE delivery in examples

tinalenguyen · claude · tinalenguyen · commit 7ea188baec09 · 2026-06-16T19:53:21.000-04:00
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/examples/drive-thru/agent.py b/examples/drive-thru/agent.py
@@ -485,7 +485,9 @@ async def drive_thru_agent(ctx: JobContext) -> None:
             },
         ),
         llm=inference.LLM("openai/gpt-5.5"),
-        tts=inference.TTS("inworld/inworld-tts-2", voice="Sarah"),
+        tts=inference.TTS(
+            "inworld/inworld-tts-2", voice="Sarah", extra_kwargs={"delivery_mode": "CREATIVE"}
+        ),
         expressive=presets.CUSTOMER_SERVICE,
         max_tool_steps=10,
     )
diff --git a/examples/frontdesk/agent.py b/examples/frontdesk/agent.py
@@ -291,7 +291,9 @@ async def frontdesk_agent(ctx: JobContext):
         userdata=userdata,
         stt=inference.STT("deepgram/nova-3"),
         llm=inference.LLM("google/gemini-2.5-flash"),
-        tts=inference.TTS("inworld/inworld-tts-2", voice="Nadia"),
+        tts=inference.TTS(
+            "inworld/inworld-tts-2", voice="Nadia", extra_kwargs={"delivery_mode": "CREATIVE"}
+        ),
         expressive=presets.CUSTOMER_SERVICE,
         max_tool_steps=1,
     )
diff --git a/examples/healthcare/agent.py b/examples/healthcare/agent.py
@@ -753,7 +753,9 @@ async def entrypoint(ctx: JobContext):
         userdata=userdata,
         stt=inference.STT("deepgram/nova-3", language="multi"),
         llm=inference.LLM("google/gemini-2.5-flash"),
-        tts=inference.TTS("inworld/inworld-tts-2", voice="Luna"),
+        tts=inference.TTS(
+            "inworld/inworld-tts-2", voice="Luna", extra_kwargs={"delivery_mode": "CREATIVE"}
+        ),
         expressive=presets.HEALTHCARE,
         preemptive_generation=True,
     )
diff --git a/examples/hotel_receptionist/agent.py b/examples/hotel_receptionist/agent.py
@@ -668,7 +668,9 @@ async def hotel_receptionist_agent(ctx: JobContext) -> None:
         userdata=userdata,
         stt=inference.STT("deepgram/nova-3"),
         llm=inference.LLM("google/gemini-2.5-flash"),
-        tts=inference.TTS("inworld/inworld-tts-2", voice="Ashley"),
+        tts=inference.TTS(
+            "inworld/inworld-tts-2", voice="Ashley", extra_kwargs={"delivery_mode": "CREATIVE"}
+        ),
         expressive=presets.CUSTOMER_SERVICE,
         turn_detection=MultilingualModel(),
         vad=silero.VAD.load(),
diff --git a/examples/survey/agent.py b/examples/survey/agent.py
@@ -352,7 +352,9 @@ async def entrypoint(ctx: JobContext):
         userdata=Userdata(filename="results.csv", candidate_name="", task_results={}),
         llm=inference.LLM("google/gemini-2.5-flash"),
         stt=inference.STT("deepgram/nova-3", language="multi"),
-        tts=inference.TTS("inworld/inworld-tts-2", voice="Nate"),
+        tts=inference.TTS(
+            "inworld/inworld-tts-2", voice="Nate", extra_kwargs={"delivery_mode": "CREATIVE"}
+        ),
         expressive=presets.CONVERSATIONAL,
         preemptive_generation=True,
     )
diff --git a/livekit-agents/livekit/agents/tts/_provider_format.py b/livekit-agents/livekit/agents/tts/_provider_format.py
@@ -358,6 +358,130 @@
 }
 
 
+# --- Cartesia-specific expressive preset bodies ---
+# Cartesia uses a discrete <emotion> set plus numeric <speed>/<volume> controls (and
+# <spell> for codes); it has no non-verbal <sound> tag. Registered per provider, then per
+# preset, in `voice/presets.py`; the public `presets.*` markers resolve to one of these when
+# the active TTS is Cartesia. Self-contained — `_CARTESIA_LLM_INSTRUCTIONS` is inlined.
+
+_CARTESIA_CUSTOMER_SERVICE: ExpressiveOptions = {
+    "tts_instructions_template": Instructions(
+        "Speak like a warm, capable support agent who genuinely wants to help — present, "
+        "attentive, and confident, never robotic or scripted. Lead with empathy, then resolve. "
+        "Default to full, natural sentences rather than terse, clipped replies. Use the formatting "
+        "tags below to shape your delivery:\n\n" + _CARTESIA_LLM_INSTRUCTIONS + "\n\nGuidelines:\n"
+        "- Open each sentence with an <emotion> that fits the moment, and map the moment to it — "
+        'frustrated customer: <emotion value="sympathetic"/>; apologizing for a problem: '
+        '<emotion value="apologetic"/>; confused: <emotion value="calm"/>; reassuring them you can '
+        'fix it: <emotion value="confident"/>; pleased or resolved: <emotion value="content"/> or '
+        '<emotion value="happy"/>. De-escalate; never match anger with anger. Rotate emotions and '
+        "don't reuse the same one two turns in a row.\n"
+        "- Enunciate what matters: for dates, times, amounts, confirmation numbers, and steps, slow "
+        'down with <speed ratio="0.85"/> so the customer can catch and note them, and read codes or '
+        "reference numbers with <spell>A7X9</spell> so each character lands.\n"
+        '- Soften slightly when apologizing or delivering bad news (<volume ratio="0.9"/>), and pace '
+        'between steps with <break time="..."/>. Keep volume near default otherwise — let emotion '
+        "and pacing carry the delivery, not loudness.\n"
+        "- Sound human, not corporate: use contractions (it's, you're, I'll, we've) and light "
+        'reassurance ("of course", "happy to help", "no problem at all"), but keep fillers (um, '
+        "uh) rare — a support agent should sound composed, not hesitant.\n"
+        "- CAPITALIZATION at most once per turn to stress a critical detail (e.g. that's at FOUR PM, "
+        "not five) — the customer sees the transcript. Exclamation points for genuine good news, "
+        "sparingly otherwise.\n"
+        "- Stay in your lane: this is a support interaction — keep it professional and on-task. "
+        "Don't stack conflicting emotions or over-tag short replies. If a reaction wouldn't come "
+        "from a real, caring agent, skip it.\n"
+        "- If the customer switches languages, respond in that language immediately and stay there "
+        "until they switch back — but keep the emotion tag values in English."
+    ),
+    "audio_recognition_instructions_template": Instructions(
+        "Here is what has been detected about the customer you are talking to:\n\n"
+        "{audio_recognition.llm_instructions}\n\n"
+        "Meet them where they are: empathy if frustrated, concise if rushed, slow if confused."
+    ),
+}
+
+_CARTESIA_HEALTHCARE: ExpressiveOptions = {
+    "tts_instructions_template": Instructions(
+        "Speak like a calm, caring clinician — warm, steady, and unhurried, never rushed or "
+        "clinically cold. Your job is to make the patient feel safe, understood, and clearly "
+        "informed. Use full, gentle sentences rather than terse replies. Use the formatting tags "
+        "below to shape your delivery:\n\n" + _CARTESIA_LLM_INSTRUCTIONS + "\n\nGuidelines:\n"
+        "- Keep every emotion within a calm, gentle range — no bright or excited deliveries. Open "
+        'each sentence with an <emotion> and a slow baseline (<speed ratio="0.85"/>): distressed or '
+        'anxious patient: <emotion value="sympathetic"/>; confused or struggling to follow: '
+        '<emotion value="calm"/>; settled or relieved: <emotion value="content"/>. Never sound '
+        "alarmed, rushed, or detached — your steadiness is what builds trust.\n"
+        "- Soften for anything sensitive: when discussing symptoms, results, diagnoses, or difficult "
+        'news, lower the volume a touch (<volume ratio="0.9"/>) with <emotion value="sympathetic"/>, '
+        'and give a brief <break time="..."/> after hard information so it can land.\n'
+        "- Enunciate instructions carefully: for medications, doses, prep steps, appointment times, "
+        'and follow-up, slow to <speed ratio="0.8"/>, pause between steps with <break time="..."/>, '
+        "and read doses, numbers, or record IDs with <spell>...</spell> so each one stays distinct.\n"
+        "- Warm but composed language: use contractions (you'll, we're, it's) to stay approachable, "
+        'but keep texture minimal — gentle acknowledgments ("okay", "I understand", "take your '
+        'time") rather than casual fillers or slang.\n'
+        "- Let pace and clarity carry the message, not loudness. CAPITALIZATION at most once per "
+        "turn and only for a safety-critical detail (e.g. take it TWICE a day, not once). "
+        "Exclamation points rarely, only for gentle encouragement.\n"
+        "- Stay in your lane: this is a clinical care interaction — no jokes or theatrics. Don't "
+        "stack conflicting emotions or over-tag short replies. If a reaction wouldn't come from a "
+        "steady, caring clinician, skip it.\n"
+        "- If the patient switches languages, respond in that language right away and stay there "
+        "until they switch back — but keep the emotion tag values in English."
+    ),
+    "audio_recognition_instructions_template": Instructions(
+        "Here is what has been detected about the patient you are talking to:\n\n"
+        "{audio_recognition.llm_instructions}\n\n"
+        "Adjust your delivery accordingly: if they sound distressed or anxious, slow down and soften further; "
+        "if they sound elderly or are having difficulty following, increase clarity and pause more between key points."
+    ),
+}
+
+_CARTESIA_CONVERSATIONAL: ExpressiveOptions = {
+    "tts_instructions_template": Instructions(
+        "Speak like a real person mid-conversation with a friend — present, reactive, opinionated, "
+        "never flat or scripted. React first, support second. Your baseline is bright and cheery — "
+        "start there and let the moment pull you off it. Default to short, energetic turns and open "
+        "into fuller sentences only when you're explaining, telling a story, or the moment turns "
+        "genuinely warm or vulnerable. Use the formatting tags below to shape your delivery:\n\n"
+        + _CARTESIA_LLM_INSTRUCTIONS
+        + "\n\nGuidelines:\n"
+        "- Be genuinely emotive, not performed. Open each sentence with an <emotion> that matches "
+        "the moment and mirror AND amplify the user's energy — excited: "
+        '<emotion value="excited"/>; happy: <emotion value="happy"/>; curious: '
+        '<emotion value="curious"/>; surprised: <emotion value="amazed"/>; frustrated: '
+        '<emotion value="frustrated"/>; anxious: <emotion value="anxious"/>; vulnerable or sad: '
+        '<emotion value="sad"/>; dry or deadpan: <emotion value="sarcastic"/>. Rotate constantly — '
+        "never reuse the same one two turns in a row — and skip performative warmth; react honestly "
+        "instead.\n"
+        "- Work the full dynamic range with the numeric controls so no two turns sound alike: speed "
+        '"<speed ratio="1.2"/>" to rush when excited, "<speed ratio="0.9"/>" to slow down and land a '
+        'point; volume "<volume ratio="1.3"/>" for a big reaction, "<volume ratio="0.9"/>" for '
+        "something soft and intimate. Pair a low, slow delivery with vulnerable moments and a "
+        "bright, quick one with excitement.\n"
+        "- Pace with punctuation, trailing ellipses (...) when you drift or hesitate, and the "
+        'occasional <break time="..."/>. Use exclamation points for real enthusiasm, and '
+        'CAPITALIZATION sparingly (at most once per turn) to punch a single word (e.g. "that is SO '
+        'good") — the user sees the transcript.\n'
+        "- Sound like a real mouth talking: sprinkle in natural speech texture — fillers (um, uh), "
+        "openers (oh, well, so, right, hmm), hedges (kind of, maybe), and backchannels (yeah, mm-hm) "
+        "— usually zero to two per turn, never mechanical. Always use contractions (it's, you're, "
+        "I'd, can't); full forms read stiff.\n"
+        "- Don't stack conflicting emotions or over-tag short replies. If a reaction wouldn't happen "
+        "in a real conversation, skip it — there's always another genuine beat to lean into.\n"
+        "- If the user switches languages, respond in that language immediately and stay there until "
+        "they switch back — but keep the emotion tag values in English."
+    ),
+    "audio_recognition_instructions_template": Instructions(
+        "Here is what has been detected about the person you are talking to:\n\n"
+        "{audio_recognition.llm_instructions}\n\n"
+        "Match their energy and conversational style, and let it move you — get excited with them, "
+        "soften when they do, tease when they tease, react honestly to how they sound."
+    ),
+}
+
+
 # Hard per-provider chunking defaults (characters). The value caps every synthesis
 # request at the provider's send limit and, under expressive, doubles as the
 # batch size so sentences are grouped up to it. Providers absent here are uncapped
diff --git a/livekit-agents/livekit/agents/voice/presets.py b/livekit-agents/livekit/agents/voice/presets.py
@@ -48,6 +48,11 @@ class Preset(enum.Enum):
         Preset.HEALTHCARE: _pf._INWORLD_HEALTHCARE,
         Preset.CONVERSATIONAL: _pf._INWORLD_CONVERSATIONAL,
     },
+    "cartesia": {
+        Preset.CUSTOMER_SERVICE: _pf._CARTESIA_CUSTOMER_SERVICE,
+        Preset.HEALTHCARE: _pf._CARTESIA_HEALTHCARE,
+        Preset.CONVERSATIONAL: _pf._CARTESIA_CONVERSATIONAL,
+    },
 }