Skip to content

Commit ad16538

Browse files
author
Miriad
committed
Merge branch 'phase1b/video-pipeline' into dev
2 parents d10f9e5 + 5492a5c commit ad16538

File tree

20 files changed

+40085
-5057
lines changed

20 files changed

+40085
-5057
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ next-env.d.ts
5050
.genkit/
5151
# Firebase debug files
5252
firebase-debug.log
53-
firebase-debug.*.log
53+
firebase-debug.*.log

package-lock.json
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import { NextResponse } from 'next/server';
2+
import { isValidSignature, SIGNATURE_HEADER_NAME } from '@sanity/webhook';
3+
import { processVideoProduction } from '@/lib/services/video-pipeline';
4+
/**
 * Shared secret used to verify Sanity webhook signatures.
 * Read from `SANITY_WEBHOOK_SECRET` when the module loads; `undefined` if unset.
 */
const WEBHOOK_SECRET = process.env.SANITY_WEBHOOK_SECRET;

/**
 * Fields the webhook handler reads from an incoming Sanity payload.
 */
interface SanityWebhookBody {
  /** ID of the Sanity document the event refers to. */
  _id: string;
  /** Type of that document. */
  _type: string;
  /** Status field of the document; may be absent from the payload. */
  status?: string;
}
12+
/**
 * Decodes the raw webhook payload into a JSON object.
 *
 * @param rawBody - The request body exactly as it was received.
 * @returns The decoded object, or `null` when the text is not valid JSON or
 *   decodes to anything other than a plain object (`null`, an array, a primitive).
 */
function parseWebhookBody(rawBody: string): SanityWebhookBody | null {
  let parsed: unknown;

  try {
    parsed = JSON.parse(rawBody);
  } catch {
    return null;
  }

  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null;
  }

  // Only the overall shape is known at this point; the handler still checks
  // each field it relies on before using it.
  return parsed as SanityWebhookBody;
}

/**
 * Sanity webhook handler for the video production pipeline.
 *
 * Listens for automatedVideo documents transitioning to "script_ready" status
 * and triggers the video production pipeline in the background.
 *
 * Configure in Sanity: Webhook → POST → filter: `_type == "automatedVideo"`
 * with projection: `{ _id, _type, status }`
 *
 * Responses:
 * - 500 when `SANITY_WEBHOOK_SECRET` is not configured or an unexpected error occurs.
 * - 401 when the signature header is missing or does not match the body.
 * - 200 with `{ skipped: true, reason }` when the payload is unusable or does
 *   not describe an automatedVideo in "script_ready" status.
 * - 200 with `{ triggered: true }` once the pipeline has been started.
 *
 * @param request - The incoming webhook request.
 * @returns A JSON response describing what was done with the event.
 */
export async function POST(request: Request) {
  try {
    if (!WEBHOOK_SECRET) {
      console.log('[WEBHOOK] Missing SANITY_WEBHOOK_SECRET environment variable');
      return NextResponse.json(
        { error: 'Server misconfigured: missing webhook secret' },
        { status: 500 }
      );
    }

    // Read the raw body as text: the signature is checked against this exact text.
    const rawBody = await request.text();
    const signature = request.headers.get(SIGNATURE_HEADER_NAME);

    if (!signature) {
      console.log('[WEBHOOK] Missing signature header');
      return NextResponse.json(
        { error: 'Missing signature' },
        { status: 401 }
      );
    }

    // Verify the webhook signature before trusting anything in the body.
    const isValid = await isValidSignature(rawBody, signature, WEBHOOK_SECRET);

    if (!isValid) {
      console.log('[WEBHOOK] Invalid signature received');
      return NextResponse.json(
        { error: 'Invalid signature' },
        { status: 401 }
      );
    }

    // Parse the verified body. A body that is not a JSON object (including
    // `null`, arrays and primitives) is skipped instead of crashing below.
    const body = parseWebhookBody(rawBody);

    if (!body) {
      console.log('[WEBHOOK] Failed to parse webhook body');
      return NextResponse.json(
        { skipped: true, reason: 'Invalid JSON body' },
        { status: 200 }
      );
    }

    console.log(`[WEBHOOK] Received document: type=${body._type}, id=${body._id}, status=${body.status}`);

    if (body._type !== 'automatedVideo') {
      console.log(`[WEBHOOK] Skipping: document type is "${body._type}", not "automatedVideo"`);
      return NextResponse.json(
        { skipped: true, reason: `Document type "${body._type}" is not "automatedVideo"` },
        { status: 200 }
      );
    }

    if (body.status !== 'script_ready') {
      console.log(`[WEBHOOK] Skipping: status is "${body.status}", not "script_ready"`);
      return NextResponse.json(
        { skipped: true, reason: `Status "${body.status}" is not "script_ready"` },
        { status: 200 }
      );
    }

    // Never start the pipeline without a usable document ID.
    if (typeof body._id !== 'string' || body._id.length === 0) {
      console.log('[WEBHOOK] Skipping: payload has no document _id');
      return NextResponse.json(
        { skipped: true, reason: 'Missing document _id' },
        { status: 200 }
      );
    }

    const documentId = body._id;

    // Fire and forget — trigger pipeline in background, return 200 immediately.
    // NOTE(review): the promise is intentionally not awaited; confirm the hosting
    // runtime keeps the invocation alive after the response has been sent.
    console.log(`[WEBHOOK] Triggering video production for document: ${documentId}`);
    void processVideoProduction(documentId).catch((error: unknown) => {
      console.log(`[WEBHOOK] Background processing error for ${documentId}:`, error);
    });

    return NextResponse.json({ triggered: true }, { status: 200 });
  } catch (error) {
    console.log('[WEBHOOK] Unexpected error processing webhook:', error);
    return NextResponse.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}

lib/services/elevenlabs.ts

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
/**
2+
* ElevenLabs Text-to-Speech Service
3+
*
4+
* Converts script text into MP3 audio using the ElevenLabs TTS API v1.
5+
* Part of the CodingCat.dev automated video pipeline:
6+
* script text → ElevenLabs TTS → MP3 audio → upload to GCS → Remotion render
7+
*/
8+
/** Base URL that every ElevenLabs request in this module is built on. */
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";

/** Settings required to call the ElevenLabs TTS service. */
export type ElevenLabsConfig = {
  /** API key sent with each request for authentication. */
  apiKey: string;
  /** ID of the ElevenLabs voice used for speech synthesis. */
  voiceId: string;
};

/** Voice tuning values sent as `voice_settings` in the TTS request. */
interface VoiceSettings {
  stability: number;
  similarity_boost: number;
  style: number;
}

/** JSON body posted to the ElevenLabs text-to-speech endpoint. */
interface TTSRequestBody {
  /** Text to synthesize. */
  text: string;
  /** Identifier of the ElevenLabs model to use. */
  model_id: string;
  /** Voice tuning values for this request. */
  voice_settings: VoiceSettings;
}
32+
/**
 * Script object consumed by the video pipeline.
 *
 * The speech helpers in this file read `hook`, each scene's `narration`,
 * and `cta`; the remaining scene fields are not used by them.
 */
export interface VideoScript {
  /** Opening line of the script. */
  hook: string;
  /** Scenes of the video, in order. */
  scenes: {
    sceneNumber?: number;
    /** Text spoken during this scene. */
    narration: string;
    visualDescription?: string;
    bRollKeywords?: string[];
    durationEstimate?: number;
  }[];
  /** Closing call-to-action. */
  cta: string;
}
45+
/**
 * Reads the ElevenLabs configuration from environment variables.
 *
 * Values are trimmed before validation, so a whitespace-only variable is
 * treated as missing and stray padding (for example a trailing newline from
 * a secrets file) never reaches the request URL or headers.
 *
 * @returns The resolved {@link ElevenLabsConfig}.
 * @throws {Error} If `ELEVENLABS_API_KEY` or `ELEVENLABS_VOICE_ID` is missing or blank.
 */
function getConfig(): ElevenLabsConfig {
  const apiKey = process.env.ELEVENLABS_API_KEY?.trim();
  const voiceId = process.env.ELEVENLABS_VOICE_ID?.trim();

  if (!apiKey) {
    throw new Error(
      "Missing ELEVENLABS_API_KEY environment variable. " +
        "Set it in your .env.local or deployment environment."
    );
  }

  if (!voiceId) {
    throw new Error(
      "Missing ELEVENLABS_VOICE_ID environment variable. " +
        "Set it in your .env.local or deployment environment."
    );
  }

  return { apiKey, voiceId };
}
72+
/**
 * Builds a readable message from the body of a failed ElevenLabs response.
 *
 * Preference order: `detail.message`, then `detail` itself, then the whole
 * JSON body. Structured values are serialized with `JSON.stringify` so they
 * never show up as "[object Object]" in the thrown error.
 *
 * @param rawBody - Response body text; may be empty or not JSON.
 * @param statusText - HTTP status text, used when the body is not JSON.
 * @returns A non-empty description of the failure.
 */
function describeErrorResponse(rawBody: string, statusText: string): string {
  let parsed: unknown;

  try {
    parsed = JSON.parse(rawBody);
  } catch {
    return statusText || "Unknown error";
  }

  const detail =
    typeof parsed === "object" && parsed !== null
      ? (parsed as { detail?: unknown }).detail
      : undefined;

  if (typeof detail === "object" && detail !== null) {
    const message = (detail as { message?: unknown }).message;
    if (typeof message === "string" && message.length > 0) {
      return message;
    }
  }

  if (detail) {
    return typeof detail === "string" ? detail : JSON.stringify(detail);
  }

  return JSON.stringify(parsed);
}

/**
 * Generate speech audio from plain text using the ElevenLabs TTS API.
 *
 * Calls the ElevenLabs v1 text-to-speech endpoint with the
 * `eleven_multilingual_v2` model and returns the resulting MP3 audio
 * as a Node.js `Buffer`.
 *
 * @param text - The text to convert to speech.
 * @returns A `Buffer` containing the MP3 audio data.
 * @throws {Error} If the text is empty, env vars are missing, the request
 *   cannot be sent, the API answers with a non-2xx status, or the audio is empty.
 *
 * @example
 * ```ts
 * import { generateSpeech } from "@/lib/services/elevenlabs";
 *
 * const mp3Buffer = await generateSpeech("Hello from CodingCat.dev!");
 * ```
 */
export async function generateSpeech(text: string): Promise<Buffer> {
  if (!text || text.trim().length === 0) {
    throw new Error("Cannot generate speech from empty text.");
  }

  const { apiKey, voiceId } = getConfig();

  // Encode the voice ID so an unexpected character cannot alter the URL path.
  const url = `${ELEVENLABS_API_BASE}/text-to-speech/${encodeURIComponent(voiceId)}`;

  const body: TTSRequestBody = {
    text,
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.75,
      style: 0.5,
    },
  };

  let response: Response;

  try {
    response = await fetch(url, {
      method: "POST",
      headers: {
        Accept: "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify(body),
    });
  } catch (error) {
    // Failures before any HTTP response arrives (DNS, connection, abort).
    throw new Error(
      `ElevenLabs API request failed: ${error instanceof Error ? error.message : String(error)}`
    );
  }

  if (!response.ok) {
    let rawError = "";

    try {
      rawError = await response.text();
    } catch {
      // Body could not be read; the status text is used instead.
    }

    throw new Error(
      `ElevenLabs TTS API error (${response.status}): ${describeErrorResponse(rawError, response.statusText)}`
    );
  }

  const arrayBuffer = await response.arrayBuffer();

  if (arrayBuffer.byteLength === 0) {
    throw new Error("ElevenLabs API returned an empty audio response.");
  }

  return Buffer.from(arrayBuffer);
}
154+
/**
 * Matches text that already ends a sentence: `.`, `!`, `?` or an ellipsis,
 * optionally followed by closing quotes or brackets.
 */
const SENTENCE_END_PATTERN = /[.!?\u2026]["')\]\u201D\u2019]*$/;

/**
 * Joins the spoken parts of a script into one block of text.
 *
 * Parts are taken in order (hook, each scene narration, CTA), trimmed, and
 * blank parts are dropped. A period is appended only to parts that do not
 * already end a sentence, so "Ready?" stays "Ready?" rather than "Ready?.".
 *
 * @param script - The script to flatten.
 * @returns The combined text, or an empty string when nothing is speakable.
 */
function buildNarrationText(script: VideoScript): string {
  const sections: string[] = [];

  const hook = script.hook?.trim();
  if (hook) {
    sections.push(hook);
  }

  if (Array.isArray(script.scenes)) {
    for (const scene of script.scenes) {
      // Script data may contain empty scene entries; skip them.
      const narration = scene?.narration?.trim();
      if (narration) {
        sections.push(narration);
      }
    }
  }

  const cta = script.cta?.trim();
  if (cta) {
    sections.push(cta);
  }

  return sections
    .map((section) => (SENTENCE_END_PATTERN.test(section) ? section : `${section}.`))
    .join(" ");
}

/**
 * Generate speech audio from a structured video script.
 *
 * Concatenates the script's hook, scene narrations, and call-to-action
 * into a single text block (each part ending in sentence punctuation so the
 * voice pauses between parts) and converts it to MP3 audio via
 * {@link generateSpeech}.
 *
 * @param script - The video script containing a hook, scenes with narrations, and a CTA.
 * @returns A `Buffer` containing the MP3 audio data.
 * @throws {Error} If the script produces empty text or the TTS call fails.
 *
 * @example
 * ```ts
 * import { generateSpeechFromScript } from "@/lib/services/elevenlabs";
 *
 * const mp3Buffer = await generateSpeechFromScript({
 *   hook: "Did you know you can automate video creation?",
 *   scenes: [
 *     { narration: "First, we generate a script using AI." },
 *     { narration: "Then, we convert it to speech with ElevenLabs." },
 *   ],
 *   cta: "Subscribe to CodingCat.dev for more!",
 * });
 * ```
 */
export async function generateSpeechFromScript(
  script: VideoScript
): Promise<Buffer> {
  const combinedText = buildNarrationText(script);

  if (combinedText.length === 0) {
    throw new Error(
      "Cannot generate speech from an empty script. " +
        "Provide at least a hook, one scene narration, or a CTA."
    );
  }

  return generateSpeech(combinedText);
}

0 commit comments

Comments
 (0)