Skip to content

Commit 363805a

Browse files
author
Miriad
committed
Merge feat/scene-components-phase-a into dev — Scene type system, CodeMorphScene, DynamicListScene, ElevenLabs timestamps, Gemini prompt update
2 parents a671590 + 674e6a7 commit 363805a

File tree

10 files changed

+1340
-14
lines changed

10 files changed

+1340
-14
lines changed

app/api/cron/ingest/route.ts

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,30 @@ interface RSSItem {
1616

1717
/**
 * One scene of a generated video script.
 *
 * `sceneType` selects the visual treatment. The optional `code`, `list`,
 * `comparison`, and `mockup` members carry the data for the matching scene
 * type; the prompt asks the model to include only the member that matches
 * `sceneType`, but nothing in this type enforces that pairing.
 */
interface ScriptScene {
  sceneNumber: number;
  sceneType: "narration" | "code" | "list" | "comparison" | "mockup";
  /** What the narrator says during this scene. */
  narration: string;
  /** What to show on screen; requested as a fallback for every scene type. */
  visualDescription: string;
  bRollKeywords: string[];
  /** NOTE(review): presumably seconds (the prompt targets 60-90 s total) — confirm. */
  durationEstimate: number;

  // Scene-type-specific data

  /** Data for "code" scenes: the snippet, its language, and optional lines to highlight. */
  code?: { snippet: string; language: string; highlightLines?: number[] };

  /** Data for "list" scenes: the items and an optional icon. */
  list?: { items: string[]; icon?: string };

  /** Data for "comparison" scenes: two column labels and the paired rows. */
  comparison?: {
    leftLabel: string;
    rightLabel: string;
    rows: Array<{ left: string; right: string }>;
  };

  /** Data for "mockup" scenes: the device frame and a description of its screen. */
  mockup?: {
    deviceType: "browser" | "phone" | "terminal";
    screenContent: string;
  };
}
2444

2545
interface GeneratedScript {
@@ -159,7 +179,29 @@ function buildPrompt(topics: RSSItem[]): string {
159179
160180
${topicList}
161181
162-
Pick the MOST interesting and timely topic for a short explainer video (60-90 seconds). Then generate a complete video script as JSON matching this exact schema:
182+
Pick the MOST interesting and timely topic for a short explainer video (60-90 seconds). Then generate a complete video script as JSON.
183+
184+
## Scene Types
185+
186+
Each scene MUST have a "sceneType" that determines its visual treatment. Choose the best type for the content:
187+
188+
- **"code"** — Use when explaining code snippets, API usage, config files, or CLI commands. Provide the actual code in the "code" field.
189+
- **"list"** — Use for enumerated content: "Top 5 features", "3 reasons why", key takeaways. Provide items in the "list" field.
190+
- **"comparison"** — Use for A-vs-B content: "React vs Vue", "SQL vs NoSQL", pros/cons. Provide structured data in the "comparison" field.
191+
- **"mockup"** — Use when showing a UI, website, app screen, or terminal output. Provide device type and content description in the "mockup" field.
192+
- **"narration"** — Use for conceptual explanations, introductions, or transitions where B-roll footage is appropriate. This is the default/fallback.
193+
194+
**Guidelines:**
195+
- A good video uses 2-3 different scene types for visual variety
196+
- Code-heavy topics should have at least one "code" scene
197+
- Always include "bRollKeywords" and "visualDescription" as fallbacks even for non-narration scenes
198+
- For "code" scenes, provide REAL, working code snippets (not pseudocode)
199+
- For "list" scenes, provide 3-6 concise items
200+
- For "comparison" scenes, provide 2-4 rows
201+
202+
## JSON Schema
203+
204+
Return ONLY a JSON object matching this exact schema:
163205
164206
{
165207
"title": "string - catchy video title",
@@ -171,10 +213,31 @@ Pick the MOST interesting and timely topic for a short explainer video (60-90 se
171213
"scenes": [
172214
{
173215
"sceneNumber": 1,
216+
"sceneType": "code | list | comparison | mockup | narration",
174217
"narration": "string - what the narrator says",
175-
"visualDescription": "string - what to show on screen",
218+
"visualDescription": "string - what to show on screen (fallback for all types)",
176219
"bRollKeywords": ["keyword1", "keyword2"],
177-
"durationEstimate": 15
220+
"durationEstimate": 15,
221+
"code": {
222+
"snippet": "string - actual code to display (only for sceneType: code)",
223+
"language": "typescript | javascript | jsx | tsx | css | html | json | bash",
224+
"highlightLines": [1, 3]
225+
},
226+
"list": {
227+
"items": ["Item 1", "Item 2", "Item 3"],
228+
"icon": "🚀"
229+
},
230+
"comparison": {
231+
"leftLabel": "Option A",
232+
"rightLabel": "Option B",
233+
"rows": [
234+
{ "left": "Feature of A", "right": "Feature of B" }
235+
]
236+
},
237+
"mockup": {
238+
"deviceType": "browser | phone | terminal",
239+
"screenContent": "Description of what appears on the device screen"
240+
}
178241
}
179242
],
180243
"cta": "string - call to action (subscribe, check link, etc.)"
@@ -185,7 +248,9 @@ Pick the MOST interesting and timely topic for a short explainer video (60-90 se
185248
Requirements:
186249
- The script should have 3-5 scenes totaling 60-90 seconds
187250
- The hook should be punchy and curiosity-driven
188-
- Each scene should have clear visual direction
251+
- Use at least 2 different scene types for visual variety
252+
- Only include the type-specific field that matches the sceneType (e.g., only include "code" when sceneType is "code")
253+
- For "code" scenes, provide real, syntactically correct code
189254
- The qualityScore should be your honest self-assessment (0-100)
190255
- Return ONLY the JSON object, no markdown or extra text`;
191256
}

lib/services/elevenlabs.ts

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66
* script text → ElevenLabs TTS → MP3 audio → upload to GCS → Remotion render
77
*/
88

9+
import {
10+
aggregateToWordTimestamps,
11+
type CharacterAlignment,
12+
type WordTimestamp,
13+
type SceneAudioResult,
14+
} from "@/lib/utils/audio-timestamps";
15+
916
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";
1017

1118
/** Configuration for the ElevenLabs TTS service. */
@@ -43,6 +50,12 @@ export interface VideoScript {
4350
cta: string;
4451
}
4552

53+
/**
 * JSON body returned by the ElevenLabs
 * `/text-to-speech/{voiceId}/with-timestamps` endpoint, limited to the
 * fields this module reads.
 */
interface TTSWithTimestampsResponse {
  /** Generated audio, base64-encoded. */
  audio_base64: string;
  /** Character-level timing data; aggregated into word timestamps by the caller. */
  alignment: CharacterAlignment;
}
58+
4659
/**
4760
* Reads the ElevenLabs configuration from environment variables.
4861
*
@@ -213,3 +226,181 @@ export async function generateSpeechFromScript(
213226

214227
return generateSpeech(combinedText);
215228
}
229+
230+
/**
 * Generate speech with word-level timestamps using the ElevenLabs
 * `/text-to-speech/{voiceId}/with-timestamps` endpoint.
 *
 * Returns both the audio and word-level timing data that can be used to
 * sync Remotion visuals to the narration.
 *
 * @param text - The text to convert to speech. Must contain non-whitespace.
 * @returns The audio (base64 string and decoded buffer), the word-level
 *   timestamps, and the clip duration in milliseconds.
 * @throws Error when `text` is empty, the request cannot be sent, the API
 *   answers with a non-2xx status, or the response carries no audio.
 */
export async function generateSpeechWithTimestamps(
  text: string
): Promise<SceneAudioResult> {
  if (!text || text.trim().length === 0) {
    throw new Error("Cannot generate speech from empty text.");
  }

  const { apiKey, voiceId } = getConfig();

  const url = `${ELEVENLABS_API_BASE}/text-to-speech/${voiceId}/with-timestamps`;

  const body: TTSRequestBody = {
    text,
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75,
      style: 0.5,
    },
  };

  let response: Response;

  try {
    response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify(body),
    });
  } catch (error) {
    throw new Error(
      `ElevenLabs timestamps API request failed: ${error instanceof Error ? error.message : String(error)}`
    );
  }

  if (!response.ok) {
    let errorDetail: string;
    try {
      const errorBody = await response.json();
      const detail = errorBody?.detail;
      // Only interpolate genuine strings. An object-valued `detail` would
      // otherwise render as "[object Object]" and hide the real reason.
      if (typeof detail === "string" && detail.length > 0) {
        errorDetail = detail;
      } else if (
        typeof detail?.message === "string" &&
        detail.message.length > 0
      ) {
        errorDetail = detail.message;
      } else {
        errorDetail = JSON.stringify(errorBody);
      }
    } catch {
      // Error body was not JSON; fall back to the HTTP status text.
      errorDetail = response.statusText || "Unknown error";
    }
    throw new Error(
      `ElevenLabs timestamps API error (${response.status}): ${errorDetail}`
    );
  }

  const data = (await response.json()) as TTSWithTimestampsResponse | null;

  if (!data?.audio_base64) {
    throw new Error("ElevenLabs timestamps API returned no audio data.");
  }

  const audioBuffer = Buffer.from(data.audio_base64, "base64");

  // The alignment may be absent at runtime despite the declared type; treat
  // that as "no timestamps" so the size-based duration fallback can apply.
  const wordTimestamps = data.alignment
    ? aggregateToWordTimestamps(data.alignment)
    : [];

  // The request sets no `output_format`, so this assumes the ElevenLabs
  // default of mp3_44100_128: 128 kbit/s = 16,000 bytes per second.
  // Revisit this constant if an explicit output format is ever requested.
  const ASSUMED_MP3_BYTES_PER_SECOND = 16000;

  // Prefer the last word's end time; otherwise estimate from the buffer size.
  const lastWord = wordTimestamps[wordTimestamps.length - 1];
  const durationMs = lastWord
    ? lastWord.endMs
    : Math.round((audioBuffer.length / ASSUMED_MP3_BYTES_PER_SECOND) * 1000);

  return {
    audioBase64: data.audio_base64,
    audioBuffer,
    wordTimestamps,
    durationMs,
  };
}
316+
317+
/**
 * Generate per-scene audio with timestamps from a structured video script.
 *
 * Instead of concatenating everything into one blob, this generates
 * separate audio for each section (hook, scenes, CTA) with word-level
 * timestamps. This enables:
 * - Precise scene boundary timing
 * - Per-scene word timestamps for visual sync
 * - Fault isolation (retry one scene instead of all)
 *
 * Scenes whose narration is empty or whitespace-only produce no audio and
 * are skipped with a warning, so `scenes` in the result can be shorter than
 * `script.scenes` and is then not index-aligned with it.
 *
 * @param script - The video script; must have a hook, at least one scene,
 *   and a CTA.
 * @returns Audio for the hook, for each narrated scene (in script order),
 *   and for the CTA, plus the summed duration of all sections in ms.
 * @throws Error when the hook, the scene list, or the CTA is missing, or
 *   when speech generation fails for any section.
 */
export async function generatePerSceneAudio(
  script: VideoScript
): Promise<{
  hook: SceneAudioResult;
  scenes: SceneAudioResult[];
  cta: SceneAudioResult;
  totalDurationMs: number;
}> {
  // Validate up front, in the same precedence as before: hook, scenes, CTA.
  const hookText = script.hook?.trim();
  if (!hookText) {
    throw new Error("Script must have a hook.");
  }

  if (!script.scenes?.length) {
    throw new Error("Script must have at least one scene.");
  }

  const ctaText = script.cta?.trim();
  if (!ctaText) {
    throw new Error("Script must have a CTA.");
  }

  const sections: { label: string; text: string }[] = [
    { label: "hook", text: hookText },
  ];

  for (const scene of script.scenes) {
    const narration = scene.narration?.trim();
    if (!narration) {
      // Keep the best-effort skip, but make it visible: a dropped scene
      // shifts every later entry of the returned `scenes` array.
      console.warn(
        `[elevenlabs] Skipping scene ${scene.sceneNumber ?? "(unnumbered)"}: empty narration, no audio generated.`
      );
      continue;
    }
    sections.push({
      label: `scene-${scene.sceneNumber ?? sections.length}`,
      text: narration,
    });
  }

  sections.push({ label: "cta", text: ctaText });

  console.log(
    `[elevenlabs] Generating per-scene audio for ${sections.length} sections...`
  );

  // Generate audio in fixed-size batches: each batch runs concurrently and
  // must finish before the next one starts. Results keep section order.
  const CONCURRENCY = 3;
  const results: SceneAudioResult[] = [];

  for (let i = 0; i < sections.length; i += CONCURRENCY) {
    const batch = sections.slice(i, i + CONCURRENCY);
    const batchResults = await Promise.all(
      batch.map(async (section) => {
        console.log(
          `[elevenlabs] Generating audio for ${section.label} (${section.text.length} chars)...`
        );
        return generateSpeechWithTimestamps(section.text);
      })
    );
    results.push(...batchResults);
  }

  const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);

  console.log(
    `[elevenlabs] Per-scene audio complete: ${results.length} sections, ${Math.round(totalDurationMs / 1000)}s total`
  );

  // Hook is always first and CTA always last; everything between is scenes.
  const hookResult = results[0];
  const sceneResults = results.slice(1, results.length - 1);
  const ctaResult = results[results.length - 1];

  return {
    hook: hookResult,
    scenes: sceneResults,
    cta: ctaResult,
    totalDurationMs,
  };
}
404+
405+
// Re-export timestamp types for consumers
406+
export type { WordTimestamp, SceneAudioResult, CharacterAlignment } from "@/lib/utils/audio-timestamps";

0 commit comments

Comments
 (0)