Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 114 additions & 61 deletions Releases/v4.0.1/.claude/VoiceServer/server.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bun
/**
* Voice Server - Personal AI Voice notification server using ElevenLabs TTS
* Voice Server - Personal AI Voice notification server with pluggable TTS providers (ElevenLabs, kokoro-fastapi)
*
* Architecture: Pure pass-through. All voice config comes from settings.json.
* The server has zero hardcoded voice parameters.
Expand Down Expand Up @@ -102,6 +102,13 @@ function applyPronunciations(text: string): string {
return result;
}

// Shared TTS pre-processing step: run the pronunciation rules over the text
// and emit a log line whenever they actually changed something.
function preprocessForTTS(text: string): string {
  const processed = applyPronunciations(text);
  if (processed !== text) {
    console.log(`📖 Pronunciation: "${text}" → "${processed}"`);
  }
  return processed;
}

// Load pronunciations at startup
loadPronunciations();

Expand Down Expand Up @@ -136,6 +143,9 @@ interface LoadedVoiceConfig {
voices: Record<string, VoiceEntry>; // keyed by name ("main", "algorithm")
voicesByVoiceId: Record<string, VoiceEntry>; // keyed by voiceId for lookup
desktopNotifications: boolean; // whether to show macOS notification banners
ttsProvider: 'elevenlabs' | 'kokoro'; // voiceServer.tts_provider in settings.json
kokoroUrl: string; // voiceServer.kokoro_url in settings.json
kokoroVoice: string; // voiceServer.kokoro_voice in settings.json
}

// Last-resort defaults if settings.json is entirely missing or unparseable
Expand All @@ -155,14 +165,18 @@ function loadVoiceConfig(): LoadedVoiceConfig {
try {
if (!existsSync(settingsPath)) {
console.warn('⚠️ settings.json not found — using fallback voice defaults');
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', kokoroUrl: 'http://localhost:8880', kokoroVoice: 'af_sky' };
}

const content = readFileSync(settingsPath, 'utf-8');
const settings = JSON.parse(content);
const daidentity = settings.daidentity || {};
const voicesSection = daidentity.voices || {};
const desktopNotifications = settings.notifications?.desktop?.enabled !== false;
const voiceServer = settings.voiceServer || {};
const ttsProvider: 'elevenlabs' | 'kokoro' = voiceServer.tts_provider === 'kokoro' ? 'kokoro' : 'elevenlabs';
const kokoroUrl: string = voiceServer.kokoro_url || 'http://localhost:8880';
const kokoroVoice: string = voiceServer.kokoro_voice || 'af_sky';

// Build lookup maps
const voices: Record<string, VoiceEntry> = {};
Expand Down Expand Up @@ -195,10 +209,10 @@ function loadVoiceConfig(): LoadedVoiceConfig {
console.log(` ${name}: ${entry.voiceName || entry.voiceId} (speed: ${entry.speed}, stability: ${entry.stability})`);
}

return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications };
return { defaultVoiceId, voices, voicesByVoiceId, desktopNotifications, ttsProvider, kokoroUrl, kokoroVoice };
} catch (error) {
console.error('⚠️ Failed to load settings.json voice config:', error);
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true };
return { defaultVoiceId: '', voices: {}, voicesByVoiceId: {}, desktopNotifications: true, ttsProvider: 'elevenlabs', kokoroUrl: 'http://localhost:8880', kokoroVoice: 'af_sky' };
}
}

Expand Down Expand Up @@ -339,12 +353,7 @@ async function generateSpeech(
throw new Error('ElevenLabs API key not configured');
}

// Apply pronunciation replacements before sending to TTS
const pronouncedText = applyPronunciations(text);
if (pronouncedText !== text) {
console.log(`📖 Pronunciation: "${text}" → "${pronouncedText}"`);
}

const pronouncedText = preprocessForTTS(text);
const url = `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`;

const response = await fetch(url, {
Expand All @@ -369,6 +378,30 @@ async function generateSpeech(
return await response.arrayBuffer();
}

// Generate speech using kokoro-fastapi (OpenAI-compatible local TTS).
// Reads the endpoint URL and voice name from the loaded voiceConfig;
// returns the raw MP3 bytes on success, throws on any non-2xx response.
async function generateKokoroSpeech(text: string): Promise<ArrayBuffer> {
  const endpoint = `${voiceConfig.kokoroUrl}/v1/audio/speech`;
  // OpenAI /v1/audio/speech request shape, as served by kokoro-fastapi
  const payload = {
    model: 'kokoro',
    input: preprocessForTTS(text),
    voice: voiceConfig.kokoroVoice,
    response_format: 'mp3',
  };

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (response.ok) {
    return await response.arrayBuffer();
  }

  const errorText = await response.text();
  throw new Error(`Kokoro API error: ${response.status} - ${errorText}`);
}

// Play audio using afplay (macOS)
async function playAudio(audioBuffer: ArrayBuffer, volume: number = FALLBACK_VOLUME): Promise<void> {
const tempFile = `/tmp/voice-${Date.now()}.mp3`;
Expand All @@ -379,6 +412,7 @@ async function playAudio(audioBuffer: ArrayBuffer, volume: number = FALLBACK_VOL
const proc = spawn('/usr/bin/afplay', ['-v', volume.toString(), tempFile]);

proc.on('error', (error) => {
spawn('/bin/rm', [tempFile]);
console.error('Error playing audio:', error);
reject(error);
});
Expand Down Expand Up @@ -454,61 +488,78 @@ async function sendNotification(
const { cleaned, emotion } = extractEmotionalMarker(safeMessage);
safeMessage = cleaned;

// Generate and play voice using ElevenLabs
// Generate and play voice
let voicePlayed = false;
let voiceError: string | undefined;

if (voiceEnabled && ELEVENLABS_API_KEY) {
try {
const voice = voiceId || DEFAULT_VOICE_ID;

// 3-tier voice settings resolution
let resolvedSettings: ElevenLabsVoiceSettings;
let resolvedVolume: number;

if (callerVoiceSettings && Object.keys(callerVoiceSettings).length > 0) {
// Tier 1: Caller provided explicit voice_settings → pass through
resolvedSettings = {
stability: callerVoiceSettings.stability ?? FALLBACK_VOICE_SETTINGS.stability,
similarity_boost: callerVoiceSettings.similarity_boost ?? FALLBACK_VOICE_SETTINGS.similarity_boost,
style: callerVoiceSettings.style ?? FALLBACK_VOICE_SETTINGS.style,
speed: callerVoiceSettings.speed ?? FALLBACK_VOICE_SETTINGS.speed,
use_speaker_boost: callerVoiceSettings.use_speaker_boost ?? FALLBACK_VOICE_SETTINGS.use_speaker_boost,
};
resolvedVolume = callerVolume ?? FALLBACK_VOLUME;
console.log(`🔗 Voice settings: pass-through from caller`);
} else {
// Tier 2/3: Look up by voiceId, fall back to main
const voiceEntry = lookupVoiceByVoiceId(voice) || voiceConfig.voices.main;
if (voiceEntry) {
resolvedSettings = voiceEntryToSettings(voiceEntry);
resolvedVolume = callerVolume ?? voiceEntry.volume ?? FALLBACK_VOLUME;
console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`);
} else {
resolvedSettings = { ...FALLBACK_VOICE_SETTINGS };
if (voiceEnabled) {
const provider = voiceConfig.ttsProvider;

if (provider === 'kokoro') {
// Kokoro-fastapi: self-hosted OpenAI-compatible neural TTS
try {
const resolvedVolume = callerVolume ?? voiceConfig.voices.main?.volume ?? FALLBACK_VOLUME;
console.log(`🍃 Kokoro TTS: voice=${voiceConfig.kokoroVoice}, url=${voiceConfig.kokoroUrl}`);
const audioBuffer = await generateKokoroSpeech(safeMessage);
await playAudio(audioBuffer, resolvedVolume);
voicePlayed = true;
} catch (error: any) {
console.error("Kokoro TTS failed:", error);
voiceError = error.message || "Kokoro TTS failed";
}
} else if (ELEVENLABS_API_KEY) {
// ElevenLabs cloud TTS
try {
const voice = voiceId || DEFAULT_VOICE_ID;

// 3-tier voice settings resolution
let resolvedSettings: ElevenLabsVoiceSettings;
let resolvedVolume: number;

if (callerVoiceSettings && Object.keys(callerVoiceSettings).length > 0) {
// Tier 1: Caller provided explicit voice_settings → pass through
resolvedSettings = {
stability: callerVoiceSettings.stability ?? FALLBACK_VOICE_SETTINGS.stability,
similarity_boost: callerVoiceSettings.similarity_boost ?? FALLBACK_VOICE_SETTINGS.similarity_boost,
style: callerVoiceSettings.style ?? FALLBACK_VOICE_SETTINGS.style,
speed: callerVoiceSettings.speed ?? FALLBACK_VOICE_SETTINGS.speed,
use_speaker_boost: callerVoiceSettings.use_speaker_boost ?? FALLBACK_VOICE_SETTINGS.use_speaker_boost,
};
resolvedVolume = callerVolume ?? FALLBACK_VOLUME;
console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`);
console.log(`🔗 Voice settings: pass-through from caller`);
} else {
// Tier 2/3: Look up by voiceId, fall back to main
const voiceEntry = lookupVoiceByVoiceId(voice) || voiceConfig.voices.main;
if (voiceEntry) {
resolvedSettings = voiceEntryToSettings(voiceEntry);
resolvedVolume = callerVolume ?? voiceEntry.volume ?? FALLBACK_VOLUME;
console.log(`📋 Voice settings: from settings.json (${voiceEntry.voiceName || voice})`);
} else {
resolvedSettings = { ...FALLBACK_VOICE_SETTINGS };
resolvedVolume = callerVolume ?? FALLBACK_VOLUME;
console.log(`⚠️ Voice settings: fallback defaults (no config found for ${voice})`);
}
}
}

// Emotional preset overlay — modifies stability + similarity_boost only
if (emotion && EMOTIONAL_PRESETS[emotion]) {
resolvedSettings = {
...resolvedSettings,
stability: EMOTIONAL_PRESETS[emotion].stability,
similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost,
};
console.log(`🎭 Emotion overlay: ${emotion}`);
}
// Emotional preset overlay — modifies stability + similarity_boost only
if (emotion && EMOTIONAL_PRESETS[emotion]) {
resolvedSettings = {
...resolvedSettings,
stability: EMOTIONAL_PRESETS[emotion].stability,
similarity_boost: EMOTIONAL_PRESETS[emotion].similarity_boost,
};
console.log(`🎭 Emotion overlay: ${emotion}`);
}

console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`);
console.log(`🎙️ Generating speech (voice: ${voice}, speed: ${resolvedSettings.speed}, stability: ${resolvedSettings.stability}, boost: ${resolvedSettings.similarity_boost}, style: ${resolvedSettings.style}, volume: ${resolvedVolume})`);

const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings);
await playAudio(audioBuffer, resolvedVolume);
voicePlayed = true;
} catch (error: any) {
console.error("Failed to generate/play speech:", error);
voiceError = error.message || "TTS generation failed";
const audioBuffer = await generateSpeech(safeMessage, voice, resolvedSettings);
await playAudio(audioBuffer, resolvedVolume);
voicePlayed = true;
} catch (error: any) {
console.error("Failed to generate/play speech:", error);
voiceError = error.message || "TTS generation failed";
}
}
}

Expand Down Expand Up @@ -688,11 +739,13 @@ const server = serve({
JSON.stringify({
status: "healthy",
port: PORT,
voice_system: "ElevenLabs",
tts_provider: voiceConfig.ttsProvider,
default_voice_id: DEFAULT_VOICE_ID,
api_key_configured: !!ELEVENLABS_API_KEY,
elevenlabs_api_key_configured: !!ELEVENLABS_API_KEY,
pronunciation_rules: pronunciationRules.length,
configured_voices: Object.keys(voiceConfig.voices),
kokoro_url: voiceConfig.kokoroUrl,
kokoro_voice: voiceConfig.kokoroVoice,
}),
{
headers: { ...corsHeaders, "Content-Type": "application/json" },
Expand All @@ -709,8 +762,8 @@ const server = serve({
});

console.log(`🚀 Voice Server running on port ${PORT}`);
console.log(`🎙️ Using ElevenLabs TTS (default voice: ${DEFAULT_VOICE_ID})`);
console.log(`🔊 TTS: ${voiceConfig.ttsProvider === 'kokoro' ? `kokoro-fastapi (${voiceConfig.kokoroVoice} @ ${voiceConfig.kokoroUrl})` : ELEVENLABS_API_KEY ? `ElevenLabs (default voice: ${DEFAULT_VOICE_ID})` : `⚠️ no provider — ElevenLabs key missing`}`);
console.log(`📡 POST to http://localhost:${PORT}/notify`);
console.log(`🔒 Security: CORS restricted to localhost, rate limiting enabled`);
console.log(`🔑 API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Missing'}`);
console.log(`🔑 ElevenLabs API Key: ${ELEVENLABS_API_KEY ? '✅ Configured' : '❌ Not set'}`);
console.log(`📖 Pronunciations: ${pronunciationRules.length} rules loaded`);
Loading