|
6 | 6 | * script text → ElevenLabs TTS → MP3 audio → upload to GCS → Remotion render |
7 | 7 | */ |
8 | 8 |
|
| 9 | +import { |
| 10 | + aggregateToWordTimestamps, |
| 11 | + type CharacterAlignment, |
| 12 | + type WordTimestamp, |
| 13 | + type SceneAudioResult, |
| 14 | +} from "@/lib/utils/audio-timestamps"; |
| 15 | + |
9 | 16 | const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1"; |
10 | 17 |
|
11 | 18 | /** Configuration for the ElevenLabs TTS service. */ |
@@ -43,6 +50,12 @@ export interface VideoScript { |
43 | 50 | cta: string; |
44 | 51 | } |
45 | 52 |
|
/**
 * Response payload from the ElevenLabs
 * `/text-to-speech/{voiceId}/with-timestamps` endpoint.
 */
interface TTSWithTimestampsResponse {
  // Generated audio (MP3 per this module's pipeline) as a base64 string.
  audio_base64: string;
  // Per-character timing data; aggregated into word-level timestamps
  // downstream via aggregateToWordTimestamps().
  alignment: CharacterAlignment;
}
46 | 59 | /** |
47 | 60 | * Reads the ElevenLabs configuration from environment variables. |
48 | 61 | * |
@@ -213,3 +226,181 @@ export async function generateSpeechFromScript( |
213 | 226 |
|
214 | 227 | return generateSpeech(combinedText); |
215 | 228 | } |
| 229 | + |
| 230 | +/** |
| 231 | + * Generate speech with word-level timestamps using the ElevenLabs |
| 232 | + * `/text-to-speech/{voiceId}/with-timestamps` endpoint. |
| 233 | + * |
| 234 | + * Returns both the audio buffer and word-level timing data that can be |
| 235 | + * used to sync Remotion visuals to the narration. |
| 236 | + * |
| 237 | + * @param text - The text to convert to speech. |
| 238 | + * @returns Audio buffer + word-level timestamps. |
| 239 | + */ |
| 240 | +export async function generateSpeechWithTimestamps( |
| 241 | + text: string |
| 242 | +): Promise<SceneAudioResult> { |
| 243 | + if (!text || text.trim().length === 0) { |
| 244 | + throw new Error("Cannot generate speech from empty text."); |
| 245 | + } |
| 246 | + |
| 247 | + const { apiKey, voiceId } = getConfig(); |
| 248 | + |
| 249 | + const url = `${ELEVENLABS_API_BASE}/text-to-speech/${voiceId}/with-timestamps`; |
| 250 | + |
| 251 | + const body: TTSRequestBody = { |
| 252 | + text, |
| 253 | + model_id: "eleven_multilingual_v2", |
| 254 | + voice_settings: { |
| 255 | + stability: 0.5, |
| 256 | + similarity_boost: 0.75, |
| 257 | + style: 0.5, |
| 258 | + }, |
| 259 | + }; |
| 260 | + |
| 261 | + let response: Response; |
| 262 | + |
| 263 | + try { |
| 264 | + response = await fetch(url, { |
| 265 | + method: "POST", |
| 266 | + headers: { |
| 267 | + "Content-Type": "application/json", |
| 268 | + "xi-api-key": apiKey, |
| 269 | + }, |
| 270 | + body: JSON.stringify(body), |
| 271 | + }); |
| 272 | + } catch (error) { |
| 273 | + throw new Error( |
| 274 | + `ElevenLabs timestamps API request failed: ${error instanceof Error ? error.message : String(error)}` |
| 275 | + ); |
| 276 | + } |
| 277 | + |
| 278 | + if (!response.ok) { |
| 279 | + let errorDetail: string; |
| 280 | + try { |
| 281 | + const errorBody = await response.json(); |
| 282 | + errorDetail = |
| 283 | + errorBody?.detail?.message || |
| 284 | + errorBody?.detail || |
| 285 | + JSON.stringify(errorBody); |
| 286 | + } catch { |
| 287 | + errorDetail = response.statusText || "Unknown error"; |
| 288 | + } |
| 289 | + throw new Error( |
| 290 | + `ElevenLabs timestamps API error (${response.status}): ${errorDetail}` |
| 291 | + ); |
| 292 | + } |
| 293 | + |
| 294 | + const data = (await response.json()) as TTSWithTimestampsResponse; |
| 295 | + |
| 296 | + if (!data.audio_base64) { |
| 297 | + throw new Error("ElevenLabs timestamps API returned no audio data."); |
| 298 | + } |
| 299 | + |
| 300 | + const audioBuffer = Buffer.from(data.audio_base64, "base64"); |
| 301 | + const wordTimestamps = aggregateToWordTimestamps(data.alignment); |
| 302 | + |
| 303 | + // Calculate duration from the last word's end time, or estimate from buffer |
| 304 | + const durationMs = |
| 305 | + wordTimestamps.length > 0 |
| 306 | + ? wordTimestamps[wordTimestamps.length - 1].endMs |
| 307 | + : Math.round((audioBuffer.length / 32000) * 1000); // rough estimate for MP3 |
| 308 | + |
| 309 | + return { |
| 310 | + audioBase64: data.audio_base64, |
| 311 | + audioBuffer, |
| 312 | + wordTimestamps, |
| 313 | + durationMs, |
| 314 | + }; |
| 315 | +} |
| 316 | + |
| 317 | +/** |
| 318 | + * Generate per-scene audio with timestamps from a structured video script. |
| 319 | + * |
| 320 | + * Instead of concatenating everything into one blob, this generates |
| 321 | + * separate audio for each section (hook, scenes, CTA) with word-level |
| 322 | + * timestamps. This enables: |
| 323 | + * - Precise scene boundary timing |
| 324 | + * - Per-scene word timestamps for visual sync |
| 325 | + * - Fault isolation (retry one scene instead of all) |
| 326 | + * |
| 327 | + * @param script - The video script |
| 328 | + * @returns Array of SceneAudioResult, one per section (hook + scenes + CTA) |
| 329 | + */ |
| 330 | +export async function generatePerSceneAudio( |
| 331 | + script: VideoScript |
| 332 | +): Promise<{ |
| 333 | + hook: SceneAudioResult; |
| 334 | + scenes: SceneAudioResult[]; |
| 335 | + cta: SceneAudioResult; |
| 336 | + totalDurationMs: number; |
| 337 | +}> { |
| 338 | + const sections: { label: string; text: string }[] = []; |
| 339 | + |
| 340 | + if (script.hook?.trim()) { |
| 341 | + sections.push({ label: "hook", text: script.hook.trim() }); |
| 342 | + } else { |
| 343 | + throw new Error("Script must have a hook."); |
| 344 | + } |
| 345 | + |
| 346 | + if (!script.scenes?.length) { |
| 347 | + throw new Error("Script must have at least one scene."); |
| 348 | + } |
| 349 | + |
| 350 | + for (const scene of script.scenes) { |
| 351 | + if (scene.narration?.trim()) { |
| 352 | + sections.push({ |
| 353 | + label: `scene-${scene.sceneNumber ?? sections.length}`, |
| 354 | + text: scene.narration.trim(), |
| 355 | + }); |
| 356 | + } |
| 357 | + } |
| 358 | + |
| 359 | + if (script.cta?.trim()) { |
| 360 | + sections.push({ label: "cta", text: script.cta.trim() }); |
| 361 | + } else { |
| 362 | + throw new Error("Script must have a CTA."); |
| 363 | + } |
| 364 | + |
| 365 | + console.log( |
| 366 | + `[elevenlabs] Generating per-scene audio for ${sections.length} sections...` |
| 367 | + ); |
| 368 | + |
| 369 | + // Generate audio for all sections concurrently (with a concurrency limit) |
| 370 | + const CONCURRENCY = 3; |
| 371 | + const results: SceneAudioResult[] = []; |
| 372 | + |
| 373 | + for (let i = 0; i < sections.length; i += CONCURRENCY) { |
| 374 | + const batch = sections.slice(i, i + CONCURRENCY); |
| 375 | + const batchResults = await Promise.all( |
| 376 | + batch.map(async (section) => { |
| 377 | + console.log( |
| 378 | + `[elevenlabs] Generating audio for ${section.label} (${section.text.length} chars)...` |
| 379 | + ); |
| 380 | + return generateSpeechWithTimestamps(section.text); |
| 381 | + }) |
| 382 | + ); |
| 383 | + results.push(...batchResults); |
| 384 | + } |
| 385 | + |
| 386 | + const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0); |
| 387 | + |
| 388 | + console.log( |
| 389 | + `[elevenlabs] Per-scene audio complete: ${results.length} sections, ${Math.round(totalDurationMs / 1000)}s total` |
| 390 | + ); |
| 391 | + |
| 392 | + // Split results back into hook, scenes, CTA |
| 393 | + const hookResult = results[0]; |
| 394 | + const sceneResults = results.slice(1, results.length - 1); |
| 395 | + const ctaResult = results[results.length - 1]; |
| 396 | + |
| 397 | + return { |
| 398 | + hook: hookResult, |
| 399 | + scenes: sceneResults, |
| 400 | + cta: ctaResult, |
| 401 | + totalDurationMs, |
| 402 | + }; |
| 403 | +} |
| 404 | + |
| 405 | +// Re-export timestamp types for consumers |
| 406 | +export type { WordTimestamp, SceneAudioResult, CharacterAlignment } from "@/lib/utils/audio-timestamps"; |
0 commit comments