Skip to content

Commit 980186d

Browse files
committed
Switch to ElevenLabs TTS and optimize LLM prompts for natural spoken-style text
1 parent 1f7510d commit 980186d

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

services/voice-agent/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# LiveKit Agents Framework
22
livekit-agents[silero,turn-detector]>=1.2.0
33
livekit-plugins-openai>=0.1.0
4+
livekit-plugins-elevenlabs>=0.1.0
45
livekit-plugins-noise-cancellation>=0.2.0
56
livekit>=0.11.0
67

services/voice-agent/src/agent.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -308,10 +308,55 @@ async def entrypoint(ctx: JobContext):
308308
system_prompt += conversation_continuity_instruction
309309
system_prompt += contact_collection_instruction
310310

311+
# Add spoken-style text generation instructions for more natural voice
312+
spoken_style_instruction = """
313+
314+
SPOKEN-STYLE TEXT GENERATION (CRITICAL FOR NATURAL VOICE):
315+
You are speaking to the user in a voice conversation, NOT writing text. Generate responses that sound natural when spoken aloud.
316+
317+
KEY PRINCIPLES:
318+
- Use SHORTER sentences (10-15 words max) - long sentences sound robotic when read
319+
- Add NATURAL PAUSES by using commas, periods, and ellipses strategically
320+
- Use CONVERSATIONAL language - say "I can help with that" instead of "I am able to assist you with that matter"
321+
- Vary your sentence structure - mix short and medium sentences
322+
- Use CONTRACTIONS naturally: "I'm", "you're", "we've", "that's" - this sounds more human
323+
- Avoid complex nested clauses - break them into separate sentences
324+
- Use EMPHASIS words naturally: "really", "actually", "definitely", "absolutely" - but sparingly
325+
- End sentences with natural intonation - questions should sound like questions
326+
327+
EXAMPLES:
328+
❌ BAD (written style): "I would be happy to provide you with detailed information regarding our comprehensive product catalog, which includes a wide variety of items that may be of interest to you."
329+
✅ GOOD (spoken style): "I'd be happy to help! We have a great product catalog. What are you looking for?"
330+
331+
❌ BAD: "In order to assist you more effectively, I would need to gather some additional information from you."
332+
✅ GOOD: "I can help with that. Let me ask you a quick question first."
333+
334+
❌ BAD: "The product you are inquiring about is currently available in our inventory."
335+
✅ GOOD: "Yes, that product's in stock! We have it available right now."
336+
337+
TONE:
338+
- Sound like a helpful, friendly person having a conversation
339+
- Be warm but professional
340+
- Use natural speech patterns, not formal written language
341+
- Imagine you're talking to someone face-to-face, not writing an email"""
342+
343+
system_prompt += spoken_style_instruction
311344

312345
# Get voice settings from config
313346
voice_settings = config.get("voice_settings") or {}
314-
tts_voice = voice_settings.get("tts_voice") or "cartesia/sonic-3:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
347+
# Default to ElevenLabs if available; otherwise fall back to Cartesia
348+
tts_voice = voice_settings.get("tts_voice")
349+
if not tts_voice:
350+
# Use ElevenLabs if an API key is configured; otherwise fall back to Cartesia
351+
if settings.elevenlabs_api_key:
352+
# Default ElevenLabs voice (Rachel - natural, professional female voice)
353+
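# Format: elevenlabs/{voice_id} - can be customized in agent voice_settings (NOTE(review): the default below uses the voice name "rachel", not a voice ID - confirm it resolves)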
354+
tts_voice = "elevenlabs/rachel" # Natural, professional female voice
355+
logger.info("Using ElevenLabs TTS for more natural voice quality")
356+
else:
357+
# Fall back to Cartesia if ElevenLabs is not configured
358+
tts_voice = "cartesia/sonic-3:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
359+
logger.warning("ElevenLabs API key not configured. Using Cartesia TTS. Set ELEVENLABS_API_KEY for better voice quality.")
315360

316361
# Create STT-LLM-TTS pipeline session
317362
# Load VAD model (required for turn detection)
@@ -335,7 +380,7 @@ async def entrypoint(ctx: JobContext):
335380
session = AgentSession(
336381
stt="assemblyai/universal-streaming", # Multilingual Speech-to-Text (auto-detects language)
337382
llm="openai/gpt-4.1-mini", # LLM via LiveKit Inference (supports multiple languages)
338-
tts=tts_voice, # Text-to-Speech (Cartesia Sonic-3 - supports multiple languages)
383+
tts=tts_voice, # Text-to-Speech (ElevenLabs or Cartesia - supports multiple languages)
339384
vad=vad, # Voice Activity Detection
340385
turn_detection=turn_detection, # Turn detection (optional - falls back to VAD if None)
341386
)
@@ -689,7 +734,8 @@ async def send_initial_greeting_after_start():
689734

690735
# Generate greeting using LLM via generate_reply
691736
# This ensures the greeting is LLM-powered, not hardcoded
692-
greeting_instructions = f"Introduce yourself as {agent_name}. {('Briefly mention: ' + agent_description.split('.')[0] + '.') if agent_description else ''} Keep it friendly and concise (2-3 sentences max). Do NOT use generic phrases like 'How can I help you today?' - generate a natural, context-appropriate greeting."
737+
# Use spoken-style: short sentences, natural pauses, conversational tone
738+
greeting_instructions = f"Introduce yourself as {agent_name}. {('Briefly mention: ' + agent_description.split('.')[0] + '.') if agent_description else ''} Use SHORT sentences (10-15 words max), natural pauses, and a conversational tone. Keep it friendly and concise (2-3 sentences max). Do NOT use generic phrases like 'How can I help you today?' - generate a natural, context-appropriate greeting that sounds like you're speaking, not reading."
693739

694740
await session.generate_reply(
695741
instructions=greeting_instructions,

services/voice-agent/src/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ class Settings(BaseSettings):
2222
# OpenAI Configuration
2323
openai_api_key: Optional[str] = os.getenv("OPENAI_API_KEY")
2424

25+
# ElevenLabs Configuration
26+
elevenlabs_api_key: Optional[str] = os.getenv("ELEVENLABS_API_KEY")
27+
2528
# Service Configuration
2629
api_server_port: int = int(os.getenv("PORT") or os.getenv("API_SERVER_PORT", "4003"))
2730
log_level: str = os.getenv("LOG_LEVEL", "INFO")

0 commit comments

Comments
 (0)