|
/**
 * ElevenLabs Text-to-Speech Service
 *
 * Converts script text into MP3 audio using the ElevenLabs TTS API v1.
 * Part of the CodingCat.dev automated video pipeline:
 * script text → ElevenLabs TTS → MP3 audio → upload to GCS → Remotion render
 */
| 8 | + |
/**
 * Root URL of the ElevenLabs REST API (v1).
 * Endpoint paths are appended with a leading slash, so it has no trailing slash.
 */
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";
| 10 | + |
/** Credentials and voice selection required to call the ElevenLabs TTS API. */
export type ElevenLabsConfig = {
  /** API key used to authenticate; sent in the `xi-api-key` request header. */
  apiKey: string;
  /** ID of the voice to synthesize with; used as the final path segment of the TTS URL. */
  voiceId: string;
};
| 18 | + |
/**
 * Voice tuning parameters sent as `voice_settings` in the TTS request body.
 *
 * Property names are snake_case because the object is serialized to JSON
 * unchanged and must match the field names the API expects.
 *
 * NOTE(review): the valid range of each value is not established in this
 * file (presumably 0–1) — confirm against the ElevenLabs API reference.
 */
interface VoiceSettings {
  /** Stability setting for the generated voice. */
  stability: number;
  /** Similarity-boost setting for the generated voice. */
  similarity_boost: number;
  /** Style setting for the generated voice. */
  style: number;
}
| 25 | + |
/**
 * JSON request body for the ElevenLabs text-to-speech endpoint.
 *
 * Property names are snake_case because the object is passed straight to
 * `JSON.stringify` and sent as the request payload.
 */
interface TTSRequestBody {
  /** The text to synthesize. */
  text: string;
  /** Identifier of the ElevenLabs model to use (e.g. `"eleven_multilingual_v2"`). */
  model_id: string;
  /** Voice tuning parameters applied to this request. */
  voice_settings: VoiceSettings;
}
| 32 | + |
/**
 * Shape of the script object used in the video pipeline.
 *
 * Only `hook`, each scene's `narration`, and `cta` are read when generating
 * speech in this module; the remaining scene fields are carried along but
 * not used here.
 */
export interface VideoScript {
  /** Opening line, spoken first. */
  hook: string;
  /** Scenes in the order they are narrated. */
  scenes: Array<{
    /** Optional scene number. */
    sceneNumber?: number;
    /** Text spoken for this scene. */
    narration: string;
    /** Optional description of the scene's visuals. */
    visualDescription?: string;
    /** Optional keywords for B-roll footage. */
    bRollKeywords?: string[];
    /** Optional estimated duration. NOTE(review): unit is not defined in this file — confirm. */
    durationEstimate?: number;
  }>;
  /** Call to action, spoken last. */
  cta: string;
}
| 45 | + |
/**
 * Reads the ElevenLabs configuration from environment variables.
 *
 * Uses `ELEVENLABS_API_KEY` and `ELEVENLABS_VOICE_ID`. Surrounding whitespace
 * (for example a trailing newline from a copied secret) is stripped, and a
 * value that is empty after trimming is treated as missing, so a blank value
 * fails here rather than later as a malformed header or URL.
 *
 * @returns The resolved {@link ElevenLabsConfig}.
 * @throws {Error} If either environment variable is unset or blank.
 */
function getConfig(): ElevenLabsConfig {
  const apiKey = process.env.ELEVENLABS_API_KEY?.trim();
  const voiceId = process.env.ELEVENLABS_VOICE_ID?.trim();

  if (!apiKey) {
    throw new Error(
      "Missing ELEVENLABS_API_KEY environment variable. " +
        "Set it in your .env.local or deployment environment."
    );
  }

  if (!voiceId) {
    throw new Error(
      "Missing ELEVENLABS_VOICE_ID environment variable. " +
        "Set it in your .env.local or deployment environment."
    );
  }

  return { apiKey, voiceId };
}
| 72 | + |
/**
 * Generate speech audio from plain text using the ElevenLabs TTS API.
 *
 * Calls the ElevenLabs v1 text-to-speech endpoint with the
 * `eleven_multilingual_v2` model and returns the resulting MP3 audio
 * as a Node.js `Buffer`.
 *
 * @param text - The text to convert to speech.
 * @returns A `Buffer` containing the MP3 audio data.
 * @throws {Error} If the text is empty, env vars are missing, the request
 *   cannot be sent, the API responds with a non-2xx status, or the response
 *   body is empty.
 *
 * @example
 * ```ts
 * import { generateSpeech } from "@/lib/services/elevenlabs";
 *
 * const mp3Buffer = await generateSpeech("Hello from CodingCat.dev!");
 * ```
 */
export async function generateSpeech(text: string): Promise<Buffer> {
  if (!text || text.trim().length === 0) {
    throw new Error("Cannot generate speech from empty text.");
  }

  const { apiKey, voiceId } = getConfig();

  // Percent-encode the voice ID so an unexpected character (slash, space, `?`)
  // cannot change which endpoint the request is sent to.
  const url = `${ELEVENLABS_API_BASE}/text-to-speech/${encodeURIComponent(voiceId)}`;

  const body: TTSRequestBody = {
    text,
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75,
      style: 0.5,
    },
  };

  let response: Response;

  try {
    response = await fetch(url, {
      method: "POST",
      headers: {
        Accept: "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify(body),
    });
  } catch (error) {
    // fetch rejects only when the request could not be completed at all
    // (DNS, connection, abort); HTTP error statuses are handled below.
    throw new Error(
      `ElevenLabs API request failed: ${error instanceof Error ? error.message : String(error)}`
    );
  }

  if (!response.ok) {
    let errorDetail: string;

    try {
      errorDetail = describeElevenLabsError(await response.json());
    } catch {
      // The error body was not valid JSON; fall back to the HTTP status text.
      errorDetail = response.statusText || "Unknown error";
    }

    throw new Error(
      `ElevenLabs TTS API error (${response.status}): ${errorDetail}`
    );
  }

  const arrayBuffer = await response.arrayBuffer();

  if (arrayBuffer.byteLength === 0) {
    throw new Error("ElevenLabs API returned an empty audio response.");
  }

  return Buffer.from(arrayBuffer);
}

/**
 * Turn a parsed ElevenLabs error payload into a readable one-line string.
 *
 * Prefers a string `detail`, then a string `detail.message`. Any other
 * `detail` shape (for example an array of validation errors) is serialized
 * as JSON so it never renders as "[object Object]". Without a usable
 * `detail`, the whole payload is serialized.
 */
function describeElevenLabsError(errorBody: unknown): string {
  const detail =
    typeof errorBody === "object" && errorBody !== null
      ? (errorBody as { detail?: unknown }).detail
      : undefined;

  if (typeof detail === "string" && detail.length > 0) {
    return detail;
  }

  if (typeof detail === "object" && detail !== null) {
    const message = (detail as { message?: unknown }).message;
    if (typeof message === "string" && message.length > 0) {
      return message;
    }
    return JSON.stringify(detail);
  }

  return JSON.stringify(errorBody) ?? String(errorBody);
}
| 154 | + |
/**
 * Generate speech audio from a structured video script.
 *
 * Collects the script's hook, scene narrations, and call-to-action (in that
 * order), drops blank parts, makes sure each part ends with sentence-final
 * punctuation, joins them with single spaces, and converts the result to
 * MP3 audio via {@link generateSpeech}.
 *
 * @param script - The video script containing a hook, scenes with narrations, and a CTA.
 * @returns A `Buffer` containing the MP3 audio data.
 * @throws {Error} If the script produces empty text or the TTS call fails.
 *
 * @example
 * ```ts
 * import { generateSpeechFromScript } from "@/lib/services/elevenlabs";
 *
 * const mp3Buffer = await generateSpeechFromScript({
 *   hook: "Did you know you can automate video creation?",
 *   scenes: [
 *     { narration: "First, we generate a script using AI." },
 *     { narration: "Then, we convert it to speech with ElevenLabs." },
 *   ],
 *   cta: "Subscribe to CodingCat.dev for more!",
 * });
 * ```
 */
export async function generateSpeechFromScript(
  script: VideoScript
): Promise<Buffer> {
  // Sentence-final punctuation, optionally followed by closing quotes or
  // brackets (e.g. `more!`, `really?"`, `done.)`).
  const endsSentence = /[.!?\u2026]["'\u201D\u2019)\]]*$/;

  const candidates: Array<string | undefined> = [
    script.hook,
    ...(Array.isArray(script.scenes)
      ? script.scenes.map((scene) => scene.narration)
      : []),
    script.cta,
  ];

  const sections = candidates
    .map((part) => part?.trim() ?? "")
    .filter((part) => part.length > 0);

  if (sections.length === 0) {
    throw new Error(
      "Cannot generate speech from an empty script. " +
        "Provide at least a hook, one scene narration, or a CTA."
    );
  }

  // Append a period only when a section lacks terminal punctuation, so
  // "…creation?" is not turned into "…creation?.". The sentence endings
  // plus the joining space give the TTS engine a natural pause.
  const combinedText = sections
    .map((section) => (endsSentence.test(section) ? section : `${section}.`))
    .join(" ");

  return generateSpeech(combinedText);
}
0 commit comments