diff --git a/Releases/v4.0.1/.claude/VoiceServer/server.ts b/Releases/v4.0.1/.claude/VoiceServer/server.ts index 9f5dec95c..d64e54389 100644 --- a/Releases/v4.0.1/.claude/VoiceServer/server.ts +++ b/Releases/v4.0.1/.claude/VoiceServer/server.ts @@ -1,6 +1,6 @@ #!/usr/bin/env bun /** - * Voice Server - Personal AI Voice notification server using ElevenLabs TTS + * Voice Server - Personal AI Voice notification server with pluggable TTS providers (ElevenLabs, kokoro-fastapi) * * Architecture: Pure pass-through. All voice config comes from settings.json. * The server has zero hardcoded voice parameters. @@ -102,6 +102,13 @@ function applyPronunciations(text: string): string { return result; } +// Apply pronunciations and log any changes — shared by all TTS providers +function preprocessForTTS(text: string): string { + const pronouncedText = applyPronunciations(text); + if (pronouncedText !== text) console.log(`📖 Pronunciation: "${text}" → "${pronouncedText}"`); + return pronouncedText; +} + // Load pronunciations at startup loadPronunciations(); @@ -136,6 +143,9 @@ interface LoadedVoiceConfig { voices: Record; // keyed by name ("main", "algorithm") voicesByVoiceId: Record; // keyed by voiceId for lookup desktopNotifications: boolean; // whether to show macOS notification banners + ttsProvider: 'elevenlabs' | 'kokoro'; // voiceServer.tts_provider in settings.json + kokoroUrl: string; // voiceServer.kokoro_url in settings.json + kokoroVoice: string; // voiceServer.kokoro_voice in settings.json } // Last-resort defaults if settings.json is entirely missing or unparseable @@ -155,7 +165,7 @@ function loadVoiceConfig(): LoadedVoiceConfig { try { if (!existsSync(settingsPath)) { console.warn('⚠️ settings.json not found — using fallback voice defaults'); - return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true }; + return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', kokoroUrl: 
'http://localhost:8880', kokoroVoice: 'af_sky' }; } const content = readFileSync(settingsPath, 'utf-8'); @@ -163,6 +173,10 @@ function loadVoiceConfig(): LoadedVoiceConfig { const daidentity = settings.daidentity || {}; const voicesSection = daidentity.voices || {}; const desktopNotifications = settings.notifications?.desktop?.enabled !== false; + const voiceServer = settings.voiceServer || {}; + const ttsProvider: 'elevenlabs' | 'kokoro' = voiceServer.tts_provider === 'kokoro' ? 'kokoro' : 'elevenlabs'; + const kokoroUrl: string = voiceServer.kokoro_url || 'http://localhost:8880'; + const kokoroVoice: string = voiceServer.kokoro_voice || 'af_sky'; // Build lookup maps const voices: Record = {}; @@ -195,10 +209,10 @@ function loadVoiceConfig(): LoadedVoiceConfig { console.log(` ${name}: ${entry.voiceName || entry.voiceId} (speed: ${entry.speed}, stability: ${entry.stability})`); } - return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications }; + return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications, ttsProvider, kokoroUrl, kokoroVoice }; } catch (error) { console.error('⚠️ Failed to load settings.json voice config:', error); - return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true }; + return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', kokoroUrl: 'http://localhost:8880', kokoroVoice: 'af_sky' }; } } @@ -339,12 +353,7 @@ async function generateSpeech( throw new Error('ElevenLabs API key not configured'); } - // Apply pronunciation replacements before sending to TTS - const pronouncedText = applyPronunciations(text); - if (pronouncedText !== text) { - console.log(`📖 Pronunciation: "${text}" → "${pronouncedText}"`); - } - + const pronouncedText = preprocessForTTS(text); const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`; const response = await fetch(url, { @@ -369,6 +378,30 @@ async function generateSpeech( return await 
/**
 * Build the kokoro-fastapi speech endpoint from the configured base URL.
 *
 * Trailing slashes on the base are dropped so that a setting such as
 * "http://localhost:8880/" does not produce "//v1/audio/speech".
 *
 * @param baseUrl - Base URL of the kokoro-fastapi server.
 * @returns Absolute URL of the speech synthesis endpoint.
 */
function kokoroSpeechEndpoint(baseUrl: string): string {
  return `${baseUrl.replace(/\/+$/, '')}/v1/audio/speech`;
}

/**
 * Generate speech using kokoro-fastapi (OpenAI-compatible local TTS).
 *
 * The text is run through the shared pronunciation preprocessing first, then
 * POSTed as JSON to the server configured in `voiceConfig.kokoroUrl` using the
 * voice from `voiceConfig.kokoroVoice`. Audio is requested as mp3.
 *
 * @param text - Message to synthesise.
 * @returns The raw audio bytes returned by the server.
 * @throws Error when the server answers with a non-2xx status (the status and
 *   response body are included in the message) or with an empty body.
 */
async function generateKokoroSpeech(text: string): Promise<ArrayBuffer> {
  const pronouncedText = preprocessForTTS(text);
  const url = kokoroSpeechEndpoint(voiceConfig.kokoroUrl);

  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'kokoro',
      input: pronouncedText,
      voice: voiceConfig.kokoroVoice,
      response_format: 'mp3',
    }),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Kokoro API error: ${response.status} - ${errorText}`);
  }

  const audio = await response.arrayBuffer();
  // Fail here with a clear message instead of handing an empty file to the player.
  if (audio.byteLength === 0) {
    throw new Error('Kokoro API error: empty audio response');
  }
  return audio;
}
FALLBACK_VOICE_SETTINGS.similarity_boost, - style: callerVoiceSettings.style ?? FALLBACK_VOICE_SETTINGS.style, - speed: callerVoiceSettings.speed ?? FALLBACK_VOICE_SETTINGS.speed, - use_speaker_boost: callerVoiceSettings.use_speaker_boost ?? FALLBACK_VOICE_SETTINGS.use_speaker_boost, - }; - resolvedVolume = callerVolume ?? FALLBACK_VOLUME; - console.log(`🔗 Voice settings: pass-through from caller`); - } else { - // Tier 2/3: Look up by voiceId, fall back to main - const voiceEntry = lookupVoiceByVoiceId(voice) || voiceConfig.voices.main; - if (voiceEntry) { - resolvedSettings = voiceEntryToSettings(voiceEntry); - resolvedVolume = callerVolume ?? voiceEntry.volume ?? FALLBACK_VOLUME; - console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`); - } else { - resolvedSettings = { ...FALLBACK_VOICE_SETTINGS }; + if (voiceEnabled) { + const provider = voiceConfig.ttsProvider; + + if (provider === 'kokoro') { + // Kokoro-fastapi: self-hosted OpenAI-compatible neural TTS + try { + const resolvedVolume = callerVolume ?? voiceConfig.voices.main?.volume ?? FALLBACK_VOLUME; + console.log(`🍃 Kokoro TTS: voice=${voiceConfig.kokoroVoice}, url=${voiceConfig.kokoroUrl}`); + const audioBuffer = await generateKokoroSpeech(safeMessage); + await playAudio(audioBuffer, resolvedVolume); + voicePlayed = true; + } catch (error: any) { + console.error("Kokoro TTS failed:", error); + voiceError = error.message || "Kokoro TTS failed"; + } + } else if (ELEVENLABS_API_KEY) { + // ElevenLabs cloud TTS + try { + const voice = voiceId || DEFAULT_VOICE_ID; + + // 3-tier voice settings resolution + let resolvedSettings: ElevenLabsVoiceSettings; + let resolvedVolume: number; + + if (callerVoiceSettings && Object.keys(callerVoiceSettings).length > 0) { + // Tier 1: Caller provided explicit voice_settings → pass through + resolvedSettings = { + stability: callerVoiceSettings.stability ?? 
FALLBACK_VOICE_SETTINGS.stability, + similarity_boost: callerVoiceSettings.similarity_boost ?? FALLBACK_VOICE_SETTINGS.similarity_boost, + style: callerVoiceSettings.style ?? FALLBACK_VOICE_SETTINGS.style, + speed: callerVoiceSettings.speed ?? FALLBACK_VOICE_SETTINGS.speed, + use_speaker_boost: callerVoiceSettings.use_speaker_boost ?? FALLBACK_VOICE_SETTINGS.use_speaker_boost, + }; resolvedVolume = callerVolume ?? FALLBACK_VOLUME; - console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`); + console.log(`🔗 Voice settings: pass-through from caller`); + } else { + // Tier 2/3: Look up by voiceId, fall back to main + const voiceEntry = lookupVoiceByVoiceId(voice) || voiceConfig.voices.main; + if (voiceEntry) { + resolvedSettings = voiceEntryToSettings(voiceEntry); + resolvedVolume = callerVolume ?? voiceEntry.volume ?? FALLBACK_VOLUME; + console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`); + } else { + resolvedSettings = { ...FALLBACK_VOICE_SETTINGS }; + resolvedVolume = callerVolume ?? 
FALLBACK_VOLUME; + console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`); + } } - } - // Emotional preset overlay — modifies stability + similarity_boost only - if (emotion && EMOTIONAL_PRESETS[emotion]) { - resolvedSettings = { - ...resolvedSettings, - stability: EMOTIONAL_PRESETS[emotion].stability, - similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost, - }; - console.log(`🎭 Emotion overlay: ${emotion}`); - } + // Emotional preset overlay — modifies stability + similarity_boost only + if (emotion && EMOTIONAL_PRESETS[emotion]) { + resolvedSettings = { + ...resolvedSettings, + stability: EMOTIONAL_PRESETS[emotion].stability, + similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost, + }; + console.log(`🎭 Emotion overlay: ${emotion}`); + } - console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`); + console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`); - const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings); - await playAudio(audioBuffer, resolvedVolume); - voicePlayed = true; - } catch (error: any) { - console.error("Failed to generate/play speech:", error); - voiceError = error.message || "TTS generation failed"; + const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings); + await playAudio(audioBuffer, resolvedVolume); + voicePlayed = true; + } catch (error: any) { + console.error("Failed to generate/play speech:", error); + voiceError = error.message || "TTS generation failed"; + } } } @@ -688,11 +739,13 @@ const server = serve({ JSON.stringify({ status: "healthy", port: PORT, - voice_system: "ElevenLabs", + 
tts_provider: voiceConfig.ttsProvider, default_voice_id: DEFAULT_VOICE_ID, - api_key_configured: !!ELEVENLABS_API_KEY, + elevenlabs_api_key_configured: !!ELEVENLABS_API_KEY, pronunciation_rules: pronunciationRules.length, configured_voices: Object.keys(voiceConfig.voices), + kokoro_url: voiceConfig.kokoroUrl, + kokoro_voice: voiceConfig.kokoroVoice, }), { headers: { ...corsHeaders, "Content-Type": "application/json" }, @@ -709,8 +762,8 @@ const server = serve({ }); console.log(`🚀 Voice Server running on port ${PORT}`); -console.log(`🎙️ Using ElevenLabs TTS (default voice: ${DEFAULT_VOICE_ID})`); +console.log(`🔊 TTS: ${voiceConfig.ttsProvider === 'kokoro' ? `kokoro-fastapi (${voiceConfig.kokoroVoice} @ ${voiceConfig.kokoroUrl})` : ELEVENLABS_API_KEY ? `ElevenLabs (default voice: ${DEFAULT_VOICE_ID})` : `⚠️ no provider — ElevenLabs key missing`}`); console.log(`📡 POST to http://localhost:${PORT}/notify`); console.log(`🔒 Security: CORS restricted to localhost, rate limiting enabled`); -console.log(`🔑 API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Missing'}`); +console.log(`🔑 ElevenLabs API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Not set'}`); console.log(`📖 Pronunciations: ${pronunciationRules.length} rules loaded`); diff --git a/Releases/v4.0.3/.claude/VoiceServer/server.ts b/Releases/v4.0.3/.claude/VoiceServer/server.ts index 9f5dec95c..6326ec658 100644 --- a/Releases/v4.0.3/.claude/VoiceServer/server.ts +++ b/Releases/v4.0.3/.claude/VoiceServer/server.ts @@ -20,8 +20,9 @@ import { homedir } from "os"; import { join } from "path"; import { existsSync, readFileSync } from "fs"; -// Load .env from user home directory -const envPath = join(homedir(), '.env'); +// Load .env — try PAI config dir first (~/.config/PAI/.env), fall back to ~/.env +const paiEnvPath = join(homedir(), '.config', 'PAI', '.env'); +const envPath = existsSync(paiEnvPath) ? 
paiEnvPath : join(homedir(), '.env'); if (existsSync(envPath)) { const envContent = await Bun.file(envPath).text(); envContent.split('\n').forEach(line => { @@ -36,8 +37,7 @@ const PORT = parseInt(process.env.PORT || "8888"); const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY; if (!ELEVENLABS_API_KEY) { - console.error('⚠️ ELEVENLABS_API_KEY not found in ~/.env'); - console.error('Add: ELEVENLABS_API_KEY=your_key_here'); + console.warn('⚠️ ELEVENLABS_API_KEY not set — ElevenLabs TTS unavailable, local TTS will be used as fallback'); } // ========================================================================== @@ -136,6 +136,8 @@ interface LoadedVoiceConfig { voices: Record; // keyed by name ("main", "algorithm") voicesByVoiceId: Record; // keyed by voiceId for lookup desktopNotifications: boolean; // whether to show macOS notification banners + ttsProvider: 'elevenlabs' | 'local'; // voiceServer.tts_provider in settings.json + localVoice: string; // voiceServer.local_voice in settings.json (macOS say voice name) } // Last-resort defaults if settings.json is entirely missing or unparseable @@ -155,7 +157,7 @@ function loadVoiceConfig(): LoadedVoiceConfig { try { if (!existsSync(settingsPath)) { console.warn('⚠️ settings.json not found — using fallback voice defaults'); - return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true }; + return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', localVoice: 'Samantha' }; } const content = readFileSync(settingsPath, 'utf-8'); @@ -163,6 +165,9 @@ function loadVoiceConfig(): LoadedVoiceConfig { const daidentity = settings.daidentity || {}; const voicesSection = daidentity.voices || {}; const desktopNotifications = settings.notifications?.desktop?.enabled !== false; + const voiceServer = settings.voiceServer || {}; + const ttsProvider: 'elevenlabs' | 'local' = voiceServer.tts_provider === 'local' ? 
'local' : 'elevenlabs'; + const localVoice: string = voiceServer.local_voice || 'Samantha'; // Build lookup maps const voices: Record = {}; @@ -195,10 +200,10 @@ function loadVoiceConfig(): LoadedVoiceConfig { console.log(` ${name}: ${entry.voiceName || entry.voiceId} (speed: ${entry.speed}, stability: ${entry.stability})`); } - return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications }; + return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications, ttsProvider, localVoice }; } catch (error) { console.error('⚠️ Failed to load settings.json voice config:', error); - return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true }; + return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', localVoice: 'Samantha' }; } } @@ -394,6 +399,93 @@ async function playAudio(audioBuffer: ArrayBuffer, volume: number = FALLBACK_VOL }); } +// ========================================================================== +// Local TTS Voice Catalogue +// ========================================================================== + +interface LocalVoiceInfo { + name: string; + locale: string; + accent: string; + gender: string; + category: 'natural' | 'classic' | 'novelty'; + sample: string; +} + +// Curated catalogue of realistic English voices available via macOS say command. +// Novelty voices (Albert, Bahh, Bells, etc.) are excluded from this list. 
// Entries are matched against the installed voice list by exact `name`.
const LOCAL_VOICE_CATALOGUE: LocalVoiceInfo[] = [
  // Natural — modern high-quality voices
  { name: 'Samantha', locale: 'en_US', accent: 'American', gender: 'female', category: 'natural', sample: 'Clear, neutral American female — good all-purpose default' },
  { name: 'Eddy (English (US))', locale: 'en_US', accent: 'American', gender: 'male', category: 'natural', sample: 'Modern American male with natural cadence' },
  { name: 'Flo (English (US))', locale: 'en_US', accent: 'American', gender: 'female', category: 'natural', sample: 'Modern American female, warm and conversational' },
  { name: 'Reed (English (US))', locale: 'en_US', accent: 'American', gender: 'male', category: 'natural', sample: 'Modern American male, clear and professional' },
  { name: 'Rocko (English (US))', locale: 'en_US', accent: 'American', gender: 'male', category: 'natural', sample: 'Modern American male, energetic tone' },
  { name: 'Sandy (English (US))', locale: 'en_US', accent: 'American', gender: 'female', category: 'natural', sample: 'Modern American female, upbeat and friendly' },
  { name: 'Shelley (English (US))', locale: 'en_US', accent: 'American', gender: 'female', category: 'natural', sample: 'Modern American female, calm and measured' },
  { name: 'Daniel', locale: 'en_GB', accent: 'British', gender: 'male', category: 'natural', sample: 'British male — professional, clear RP accent' },
  { name: 'Eddy (English (UK))', locale: 'en_GB', accent: 'British', gender: 'male', category: 'natural', sample: 'Modern British male, natural and conversational' },
  { name: 'Flo (English (UK))', locale: 'en_GB', accent: 'British', gender: 'female', category: 'natural', sample: 'Modern British female, warm and clear' },
  { name: 'Reed (English (UK))', locale: 'en_GB', accent: 'British', gender: 'male', category: 'natural', sample: 'Modern British male, measured and reliable' },
  { name: 'Rocko (English (UK))', locale: 'en_GB', accent: 'British', gender: 'male', category: 'natural', sample: 'Modern British male, confident tone' },
  { name: 'Sandy (English (UK))', locale: 'en_GB', accent: 'British', gender: 'female', category: 'natural', sample: 'Modern British female, friendly and clear' },
  { name: 'Shelley (English (UK))', locale: 'en_GB', accent: 'British', gender: 'female', category: 'natural', sample: 'Modern British female, calm and professional' },
  { name: 'Karen', locale: 'en_AU', accent: 'Australian', gender: 'female', category: 'natural', sample: 'Australian female — distinctive, friendly accent' },
  { name: 'Moira', locale: 'en_IE', accent: 'Irish', gender: 'female', category: 'natural', sample: 'Irish female — warm, melodic accent' },
  { name: 'Tessa', locale: 'en_ZA', accent: 'South African', gender: 'female', category: 'natural', sample: 'South African female — distinctive and clear' },
  { name: 'Rishi', locale: 'en_IN', accent: 'Indian English', gender: 'male', category: 'natural', sample: 'Indian English male — clear and distinctive' },
  // Classic — older synthesised voices, still intelligible
  { name: 'Fred', locale: 'en_US', accent: 'American', gender: 'male', category: 'classic', sample: 'Classic American male, robotic but reliable' },
  { name: 'Kathy', locale: 'en_US', accent: 'American', gender: 'female', category: 'classic', sample: 'Classic American female synthesiser' },
  { name: 'Junior', locale: 'en_US', accent: 'American', gender: 'male', category: 'classic', sample: 'Classic high-pitched American male' },
  { name: 'Ralph', locale: 'en_US', accent: 'American', gender: 'male', category: 'classic', sample: 'Classic gruff American male' },
];

/**
 * Extract the names of English voices from the output of `say -v ?`.
 *
 * Each row is expected to look like `<name> <locale> # <sample text>`. The name
 * may itself contain spaces and parentheses, and a long name may be separated
 * from the locale by a single space, so the row is parsed by locating the
 * `en_XX` locale token rather than by splitting on runs of spaces.
 *
 * @param listing - Raw stdout of `say -v ?`.
 * @returns The set of voice names whose locale starts with `en_` / `en-`.
 */
function parseSayVoiceNames(listing: string): Set<string> {
  const names = new Set<string>();
  for (const line of listing.split('\n')) {
    const match = /^(.+?)\s+(en[_-][A-Za-z0-9]+)\s*(?:#.*)?$/.exec(line);
    if (match) names.add(match[1].trim());
  }
  return names;
}

/**
 * Get voices installed on this system that are also in our catalogue.
 *
 * Runs `/usr/bin/say -v ?` and filters `LOCAL_VOICE_CATALOGUE` down to the
 * entries whose name appears in the listing.
 *
 * @returns The installed catalogue entries, or the full catalogue when the
 *   listing cannot be obtained (spawn failure or non-zero exit).
 */
async function getInstalledLocalVoices(): Promise<LocalVoiceInfo[]> {
  try {
    const listing = await new Promise<string>((resolve, reject) => {
      const proc = spawn('/usr/bin/say', ['-v', '?']);
      let output = '';
      // Decode as a stream so multi-byte characters split across chunks stay intact.
      proc.stdout?.setEncoding('utf8');
      proc.stdout?.on('data', (chunk: string) => { output += chunk; });
      proc.on('error', reject);
      // 'close' fires only after stdout is fully drained; 'exit' can fire while
      // data is still buffered, which would truncate the listing.
      proc.on('close', (code) => {
        if (code === 0) {
          resolve(output);
        } else {
          reject(new Error(`say -v ? exited with code ${code}`));
        }
      });
    });

    const installedNames = parseSayVoiceNames(listing);
    return LOCAL_VOICE_CATALOGUE.filter(v => installedNames.has(v.name));
  } catch {
    return LOCAL_VOICE_CATALOGUE; // fallback: return full catalogue
  }
}

/**
 * Play audio using macOS say command (local TTS — no API key required).
 *
 * Renders `text` to a temporary AIFF file with `/usr/bin/say`, then plays that
 * file with `/usr/bin/afplay` at the requested volume. The temporary file is
 * removed whether rendering or playback succeeds or fails.
 *
 * @param text - Message to speak.
 * @param voice - Voice name passed to `say -v`.
 * @param volume - Playback volume passed to `afplay -v`.
 * @throws Error when either process cannot be spawned or exits non-zero.
 */
async function playLocalSpeech(text: string, voice: string, volume: number = FALLBACK_VOLUME): Promise<void> {
  // Timestamp alone collides when two notifications arrive in the same millisecond.
  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
  const tempFile = `/tmp/voice-local-${uniqueSuffix}.aiff`;

  try {
    await new Promise<void>((resolve, reject) => {
      // The message is supplied on stdin rather than as an argument so that text
      // beginning with "-" cannot be interpreted by `say` as an option.
      const proc = spawn('/usr/bin/say', ['-v', voice, '-o', tempFile]);
      proc.on('error', reject);
      proc.on('exit', (code) => {
        if (code === 0) {
          resolve();
        } else {
          reject(new Error(`say exited with code ${code}`));
        }
      });
      // A write failure (e.g. EPIPE if `say` quits early) is reported through the
      // exit/error handlers above; swallow it here so it is not left unhandled.
      proc.stdin?.on('error', () => {});
      proc.stdin?.end(text);
    });

    await new Promise<void>((resolve, reject) => {
      const proc = spawn('/usr/bin/afplay', ['-v', volume.toString(), tempFile]);
      proc.on('error', reject);
      proc.on('exit', (code) => {
        if (code === 0) {
          resolve();
        } else {
          reject(new Error(`afplay exited with code ${code}`));
        }
      });
    });
  } finally {
    // Best-effort cleanup on every path, including failures.
    spawn('/bin/rm', ['-f', tempFile]).on('error', () => {});
  }
}
FALLBACK_VOLUME; - console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`); - } else { - resolvedSettings = { ...FALLBACK_VOICE_SETTINGS }; + if (voiceEnabled) { + const provider = voiceConfig.ttsProvider; + + if (provider === 'local' || !ELEVENLABS_API_KEY) { + // Local TTS: explicit config or no API key available + try { + const resolvedVolume = callerVolume ?? voiceConfig.voices.main?.volume ?? FALLBACK_VOLUME; + console.log(`🔊 Local TTS (${provider === 'local' ? 'configured' : 'no API key'}): voice=${voiceConfig.localVoice}`); + await playLocalSpeech(safeMessage, voiceConfig.localVoice, resolvedVolume); + voicePlayed = true; + } catch (error: any) { + console.error("Local TTS failed:", error); + voiceError = error.message || "Local TTS failed"; + } + } else { + // ElevenLabs with automatic local TTS fallback + try { + const voice = voiceId || DEFAULT_VOICE_ID; + + // 3-tier voice settings resolution + let resolvedSettings: ElevenLabsVoiceSettings; + let resolvedVolume: number; + + if (callerVoiceSettings && Object.keys(callerVoiceSettings).length > 0) { + // Tier 1: Caller provided explicit voice_settings → pass through + resolvedSettings = { + stability: callerVoiceSettings.stability ?? FALLBACK_VOICE_SETTINGS.stability, + similarity_boost: callerVoiceSettings.similarity_boost ?? FALLBACK_VOICE_SETTINGS.similarity_boost, + style: callerVoiceSettings.style ?? FALLBACK_VOICE_SETTINGS.style, + speed: callerVoiceSettings.speed ?? FALLBACK_VOICE_SETTINGS.speed, + use_speaker_boost: callerVoiceSettings.use_speaker_boost ?? FALLBACK_VOICE_SETTINGS.use_speaker_boost, + }; resolvedVolume = callerVolume ?? 
FALLBACK_VOLUME; - console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`); + console.log(`🔗 Voice settings: pass-through from caller`); + } else { + // Tier 2/3: Look up by voiceId, fall back to main + const voiceEntry = lookupVoiceByVoiceId(voice) || voiceConfig.voices.main; + if (voiceEntry) { + resolvedSettings = voiceEntryToSettings(voiceEntry); + resolvedVolume = callerVolume ?? voiceEntry.volume ?? FALLBACK_VOLUME; + console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`); + } else { + resolvedSettings = { ...FALLBACK_VOICE_SETTINGS }; + resolvedVolume = callerVolume ?? FALLBACK_VOLUME; + console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`); + } } - } - // Emotional preset overlay — modifies stability + similarity_boost only - if (emotion && EMOTIONAL_PRESETS[emotion]) { - resolvedSettings = { - ...resolvedSettings, - stability: EMOTIONAL_PRESETS[emotion].stability, - similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost, - }; - console.log(`🎭 Emotion overlay: ${emotion}`); - } + // Emotional preset overlay — modifies stability + similarity_boost only + if (emotion && EMOTIONAL_PRESETS[emotion]) { + resolvedSettings = { + ...resolvedSettings, + stability: EMOTIONAL_PRESETS[emotion].stability, + similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost, + }; + console.log(`🎭 Emotion overlay: ${emotion}`); + } - console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`); + console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`); - const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings); - 
await playAudio(audioBuffer, resolvedVolume); - voicePlayed = true; - } catch (error: any) { - console.error("Failed to generate/play speech:", error); - voiceError = error.message || "TTS generation failed"; + const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings); + await playAudio(audioBuffer, resolvedVolume); + voicePlayed = true; + } catch (error: any) { + console.warn(`⚠️ ElevenLabs TTS failed: ${error.message} — falling back to local TTS`); + try { + const resolvedVolume = callerVolume ?? voiceConfig.voices.main?.volume ?? FALLBACK_VOLUME; + await playLocalSpeech(safeMessage, voiceConfig.localVoice, resolvedVolume); + voicePlayed = true; + } catch (localError: any) { + console.error("Local TTS fallback also failed:", localError); + voiceError = error.message; + } + } } } @@ -683,14 +798,35 @@ const server = serve({ } } + if (url.pathname === "/voices/local" && req.method === "GET") { + const voices = await getInstalledLocalVoices(); + const current = voiceConfig.localVoice; + return new Response( + JSON.stringify({ + current_voice: current, + how_to_change: 'Set voiceServer.local_voice in ~/.claude/settings.json', + voices: voices.map(v => ({ + ...v, + active: v.name === current, + })), + }), + { + headers: { ...corsHeaders, "Content-Type": "application/json" }, + status: 200 + } + ); + } + if (url.pathname === "/health") { return new Response( JSON.stringify({ status: "healthy", port: PORT, - voice_system: "ElevenLabs", + tts_provider: voiceConfig.ttsProvider, + local_voice: voiceConfig.localVoice, + local_tts_available: existsSync('/usr/bin/say'), + elevenlabs_api_key_configured: !!ELEVENLABS_API_KEY, default_voice_id: DEFAULT_VOICE_ID, - api_key_configured: !!ELEVENLABS_API_KEY, pronunciation_rules: pronunciationRules.length, configured_voices: Object.keys(voiceConfig.voices), }), @@ -701,7 +837,7 @@ const server = serve({ ); } - return new Response("Voice Server - POST to /notify, /notify/personality, or /pai", { + return new 
Response("Voice Server — POST /notify | GET /health | GET /voices/local", { headers: corsHeaders, status: 200 }); @@ -709,8 +845,8 @@ const server = serve({ }); console.log(`🚀 Voice Server running on port ${PORT}`); -console.log(`🎙️ Using ElevenLabs TTS (default voice: ${DEFAULT_VOICE_ID})`); +console.log(`🔊 TTS: ${voiceConfig.ttsProvider === 'local' ? `local only (${voiceConfig.localVoice})` : ELEVENLABS_API_KEY ? `ElevenLabs + local fallback (${voiceConfig.localVoice})` : `local only — no ElevenLabs API key (${voiceConfig.localVoice})`}`); +console.log(`🔑 ElevenLabs API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Not set'}`); console.log(`📡 POST to http://localhost:${PORT}/notify`); console.log(`🔒 Security: CORS restricted to localhost, rate limiting enabled`); -console.log(`🔑 API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Missing'}`); console.log(`📖 Pronunciations: ${pronunciationRules.length} rules loaded`);