|
| 1 | +#!/bin/bash |
| 2 | +# generate-audio.sh — Generate walkthrough narration audio from a JSON script. |
| 3 | +# |
| 4 | +# Generates one TTS call per segment, producing individual WAV clips directly. |
| 5 | +# No chunking, alignment, or splitting needed. |
| 6 | +# |
| 7 | +# Usage: |
| 8 | +# ./generate-audio.sh <script.json> [output-dir] |
| 9 | +# |
| 10 | +# Input JSON format: |
| 11 | +# { |
| 12 | +# "style": "Read in a calm, steady, professional tone...", |
| 13 | +# "voice": "Iapetus", (optional, default: Iapetus) |
| 14 | +# "slides": [ |
| 15 | +# "Intro narration text...", |
| 16 | +# "Problem slide narration...", |
| 17 | +# "Approach narration...", |
| 18 | +# ... |
| 19 | +# ] |
| 20 | +# } |
| 21 | +# |
| 22 | +# Output: |
| 23 | +# <output-dir>/audio-00.wav, audio-01.wav, ... |
| 24 | +# <output-dir>/durations.json |
| 25 | +# |
# Dependencies:
#   python3 (standard library only)
#   ffmpeg / ffprobe
| 28 | +# |
| 29 | +# Environment: |
| 30 | +# GEMINI_API_KEY — required. Auto-sourced from .env if not set. |
| 31 | +# |
set -euo pipefail

# Print an error message to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# --- Args ---
SCRIPT_JSON="${1:?Usage: generate-audio.sh <script.json> [output-dir]}"
OUTPUT_DIR="${2:-.}"

# Resolve relative paths against the caller's working directory so the
# embedded Python program always receives absolute paths.
if [[ "$SCRIPT_JSON" != /* ]]; then
  SCRIPT_JSON="${PWD}/${SCRIPT_JSON}"
fi
if [[ "$OUTPUT_DIR" != /* ]]; then
  OUTPUT_DIR="${PWD}/${OUTPUT_DIR}"
fi

[[ -f "$SCRIPT_JSON" ]] || die "${SCRIPT_JSON} not found"

mkdir -p -- "$OUTPUT_DIR"

PYTHON="python3"

# Fail fast on a missing tool instead of discovering it after TTS requests
# have already been made.
for tool in "$PYTHON" ffmpeg ffprobe; do
  command -v "$tool" >/dev/null 2>&1 \
    || die "required tool '${tool}' not found on PATH"
done

# --- API key ---
# Outside a git checkout, fall back to the current directory.
REPO_ROOT="$(git rev-parse --show-toplevel 2>/dev/null || echo ".")"

GEMINI_API_KEY="${GEMINI_API_KEY:-}"
if [[ -z "$GEMINI_API_KEY" && -f "${REPO_ROOT}/.env" && -r "${REPO_ROOT}/.env" ]]; then
  # Read the file line by line rather than `export $(grep ... | xargs)`:
  # the value is never word-split or glob-expanded. As before, the last
  # matching line wins.
  while IFS= read -r env_line || [[ -n "$env_line" ]]; do
    if [[ "$env_line" == GEMINI_API_KEY=* ]]; then
      GEMINI_API_KEY="${env_line#GEMINI_API_KEY=}"
    fi
  done < "${REPO_ROOT}/.env"
  # Tolerate CRLF line endings and one pair of surrounding quotes.
  GEMINI_API_KEY="${GEMINI_API_KEY%$'\r'}"
  if [[ "$GEMINI_API_KEY" == \"*\" || "$GEMINI_API_KEY" == \'*\' ]]; then
    GEMINI_API_KEY="${GEMINI_API_KEY:1:${#GEMINI_API_KEY}-2}"
  fi
fi
: "${GEMINI_API_KEY:?Set GEMINI_API_KEY environment variable or add it to .env}"
export GEMINI_API_KEY

# --- Config ---
TTS_MODEL="gemini-2.5-pro-preview-tts"
TTS_ENDPOINT="https://generativelanguage.googleapis.com/v1beta/models/${TTS_MODEL}:generateContent"
SPEED=1.2
| 67 | + |
# --- Run everything in Python for reliability ---
# The API key is handed to the child through its environment rather than its
# command line: argv can be read by any local user via `ps`, whereas the
# environment is only visible to the owning user.
GEMINI_API_KEY="$GEMINI_API_KEY" "$PYTHON" - "$SCRIPT_JSON" "$OUTPUT_DIR" "$TTS_MODEL" "$TTS_ENDPOINT" "$SPEED" <<'PYTHON_SCRIPT'
import json, sys, os, subprocess, base64, urllib.request, urllib.error, re

# Positional arguments supplied by the shell wrapper above.
script_json = sys.argv[1]
output_dir = sys.argv[2]
tts_model = sys.argv[3]
tts_endpoint = sys.argv[4]
speed = float(sys.argv[5])

# Secret comes from the environment (see the invocation line).
api_key = os.environ.get("GEMINI_API_KEY", "")
if not api_key:
    sys.exit("Error: GEMINI_API_KEY is not set")

# Number of additional attempts per segment after the first one.
MAX_RETRIES = 2
| 80 | +
|
def api_call(endpoint, body_dict, timeout=300):
    """POST ``body_dict`` as JSON to ``endpoint`` and return the decoded JSON reply.

    Args:
        endpoint: Full URL of the API method; the module-level ``api_key`` is
            appended as the ``key`` query parameter.
        body_dict: JSON-serialisable request payload.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            connection cannot hang the script forever.

    Returns:
        The parsed JSON response body.

    Raises:
        urllib.error.HTTPError: On a non-2xx status. When the error body is
            JSON carrying ``error.message``, that text is appended to the
            exception's message so the caller's log shows the API's reason.
        urllib.error.URLError / OSError: On network failures or timeouts.
    """
    payload = json.dumps(body_dict).encode()
    req = urllib.request.Request(
        f"{endpoint}?key={api_key}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        # Best effort only: an unreadable or non-JSON error body must not mask
        # the original HTTP error.
        detail = ""
        try:
            detail = json.loads(exc.read()).get("error", {}).get("message", "")
        except (ValueError, AttributeError, OSError):
            pass
        if detail:
            exc.msg = f"{exc.msg}: {detail}"
        raise
| 91 | +
|
# --- Load narration ---
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding, and report malformed input as a one-line error instead of
# a traceback.
try:
    with open(script_json, encoding="utf-8") as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    sys.exit(f"Error: cannot read {script_json}: {e}")

if not isinstance(data, dict):
    sys.exit(f"Error: {script_json} must contain a JSON object")

slides = data.get("slides")
if not isinstance(slides, list) or not all(isinstance(s, str) for s in slides):
    sys.exit('Error: "slides" must be a list of narration strings')

voice = data.get("voice", "Iapetus")
style = data.get("style",
    "Read the following in a calm, steady, professional tone. "
    "Speak at a measured pace.")

# Whitespace-separated word total across all slides (informational only).
word_count = sum(len(s.split()) for s in slides)
print("=== Generating narration audio ===")
print(f" Voice: {voice}")
print(f" Slides: {len(slides)}")
print(f" Words: {word_count}")
print()
| 108 | +
|
def call_tts(prompt_text):
    """Request speech for ``prompt_text`` and return the decoded audio bytes.

    Sends one request to ``tts_endpoint`` asking for the AUDIO modality with
    the module-level ``voice``, then base64-decodes the first inline audio
    payload found in the response.

    Raises:
        RuntimeError: If the response carries an ``error`` message or contains
            no inline audio data. Raising RuntimeError here (rather than
            letting a KeyError/IndexError escape) lets the retry loop handle
            an empty response like any other failed attempt.
    """
    response = api_call(tts_endpoint, {
        "contents": [{"parts": [{"text": prompt_text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {
                        "voiceName": voice
                    }
                }
            }
        }
    })

    error_msg = response.get("error", {}).get("message", "")
    if error_msg:
        raise RuntimeError(f"TTS API error: {error_msg}")

    candidates = response.get("candidates") or []
    for candidate in candidates:
        for part in (candidate.get("content") or {}).get("parts") or []:
            encoded = (part.get("inlineData") or {}).get("data")
            if encoded:
                return base64.b64decode(encoded)

    # No audio anywhere in the reply: surface whatever reason the API gave.
    if candidates:
        reason = candidates[0].get("finishReason", "unknown")
    else:
        reason = (response.get("promptFeedback") or {}).get("blockReason", "no candidates")
    raise RuntimeError(f"TTS API returned no audio data ({reason})")
| 129 | +
|
def pcm_to_wav(pcm_bytes, out_wav):
    """Convert raw PCM audio to a WAV file at ``out_wav`` using ffmpeg.

    The input is interpreted as signed 16-bit little-endian, 24 kHz, mono.
    ffmpeg applies the ``atempo`` filter with the module-level ``speed`` and
    writes the result at 48 kHz.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    pcm_tmp = out_wav + ".pcm"
    try:
        with open(pcm_tmp, "wb") as f:
            f.write(pcm_bytes)
        subprocess.run([
            "ffmpeg", "-y", "-f", "s16le", "-ar", "24000", "-ac", "1",
            "-i", pcm_tmp, "-af", f"atempo={speed}", "-ar", "48000", out_wav
        ], capture_output=True, check=True)
    finally:
        # Remove the scratch file even when ffmpeg fails, so failed attempts
        # do not leave .pcm files behind in the output directory.
        if os.path.exists(pcm_tmp):
            os.remove(pcm_tmp)
| 139 | +
|
def get_duration(wav_path):
    """Return the duration of ``wav_path`` in seconds, as reported by ffprobe.

    Raises:
        RuntimeError: If ffprobe exits non-zero or prints something that is
            not a number; the message includes ffprobe's own output so the
            cause is visible instead of a bare ``float('')`` ValueError.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "csv=p=0", wav_path],
        capture_output=True, text=True
    )
    reported = result.stdout.strip()
    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {wav_path}: {result.stderr.strip() or 'no error output'}")
    try:
        return float(reported)
    except ValueError:
        raise RuntimeError(
            f"ffprobe reported no usable duration for {wav_path}: {reported!r}") from None
| 146 | +
|
def validate_duration(wav_path, word_count):
    """Sanity-check a clip's length against its word count.

    The nominal length is ``word_count / 150`` minutes, converted to seconds
    and divided by the module-level ``speed``. A clip is accepted when its
    measured length lies between 0.3x and 3x that figure. Texts shorter than
    15 words are only required to be under 30 seconds.

    Returns:
        A ``(is_plausible, measured_seconds)`` tuple.
    """
    measured = get_duration(wav_path)
    nominal = word_count / 150 * 60 / speed

    # Very short texts: the proportional window is too narrow to be useful.
    if word_count < 15:
        return measured < 30, measured

    plausible = nominal * 0.3 <= measured <= nominal * 3.0
    return plausible, measured
| 155 | +
|
# --- Generate one TTS call per segment ---
# Each slide gets up to MAX_RETRIES + 1 attempts. An attempt fails either by
# raising (API/network error) or by producing a clip whose length fails
# validate_duration(). If every attempt produced audio but none validated,
# the last clip is kept with a warning; if the final attempt raised, the
# script exits with status 1.
import time  # local import: only this retry loop needs it

durations = {}

for i, text in enumerate(slides):
    num = f"{i:02d}"
    out_path = os.path.join(output_dir, f"audio-{num}.wav")
    wc = len(text.split())
    prompt = f"{style}\n\n{text}"

    ok = False
    for attempt in range(MAX_RETRIES + 1):
        is_final = attempt == MAX_RETRIES
        try:
            label = f" [{num}] " + ("" if attempt == 0 else f"(retry {attempt}) ")
            print(f"{label}Generating ({wc} words)...", end=" ", flush=True)
            pcm_data = call_tts(prompt)
            pcm_to_wav(pcm_data, out_path)
            ok, dur = validate_duration(out_path, wc)
            if ok:
                print(f"{dur:.1f}s")
                durations[f"audio-{num}.wav"] = round(dur, 2)
                break
            expected = wc / 150 * 60 / speed
            # Only announce a retry when one will actually happen.
            outcome = "giving up" if is_final else "retrying"
            print(f"{dur:.1f}s (expected ~{expected:.0f}s, {outcome})")
        except (OSError, RuntimeError) as e:
            # OSError is the common base of urllib's HTTPError/URLError,
            # socket timeouts and connection resets, so transient network
            # failures are retried instead of aborting with a traceback.
            print(f"error: {e}")
            if is_final:
                print(f" [error] Segment {i} failed after {MAX_RETRIES + 1} attempts")
                sys.exit(1)
            # Brief, growing pause before the next attempt (1s, 2s, ...).
            time.sleep(2 ** attempt)

    if not ok:
        # Every attempt produced audio of implausible length; keep the last
        # clip but flag it.
        dur = get_duration(out_path)
        durations[f"audio-{num}.wav"] = round(dur, 2)
        print(f" [warn] Segment {i} audio may be unreliable ({dur:.1f}s for {wc} words)")
| 190 | +
|
# --- Trim silence from each clip ---
# Leading and trailing silence longer than MAX_SILENCE seconds is cut back to
# MAX_SILENCE. The last clip keeps its trailing silence untouched.
MAX_SILENCE = 0.15
SILENCE_THRESHOLD = "-40dB"
# Timestamp pattern for silencedetect's log lines. Besides plain decimals it
# accepts a leading minus sign and an exponent: a start reported slightly
# before zero, or a tiny value in exponent form, would otherwise be skipped
# or mis-read and throw the start/end lists out of step.
TIMESTAMP = r'(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)'
print()
print("=== Trimming silence ===")

for i in range(len(slides)):
    num = f"{i:02d}"
    clip_path = os.path.join(output_dir, f"audio-{num}.wav")

    # silencedetect writes its findings to stderr; the decoded audio itself
    # is discarded via the null muxer.
    detect = subprocess.run([
        "ffmpeg", "-i", clip_path, "-af",
        f"silencedetect=noise={SILENCE_THRESHOLD}:d=0.1",
        "-f", "null", "-"
    ], capture_output=True, text=True)
    stderr = detect.stderr

    clip_dur = get_duration(clip_path)

    silence_starts = [float(t) for t in re.findall(r'silence_start: ' + TIMESTAMP, stderr)]
    silence_ends = [float(t) for t in re.findall(r'silence_end: ' + TIMESTAMP, stderr)]

    # Leading silence: the first silent span must begin at (or before) the
    # very start of the clip.
    trim_start = 0.0
    if silence_starts and silence_starts[0] < 0.05 and silence_ends:
        leading_silence = silence_ends[0]
        if leading_silence > MAX_SILENCE:
            trim_start = leading_silence - MAX_SILENCE

    # Trailing silence: the last silent span counts as trailing unless a
    # silence_end after its start falls clearly before the end of the clip.
    trim_end = clip_dur
    is_last = (i == len(slides) - 1)
    if not is_last and silence_starts:
        last_silence_start = silence_starts[-1]
        last_silence_is_trailing = not any(
            last_silence_start < se < clip_dur - 0.05 for se in silence_ends
        )
        if last_silence_is_trailing and last_silence_start > 0.05:
            trailing_silence = clip_dur - last_silence_start
            if trailing_silence > MAX_SILENCE:
                trim_end = last_silence_start + MAX_SILENCE

    if trim_start > 0 or trim_end < clip_dur:
        trimmed_path = clip_path + ".tmp.wav"
        try:
            subprocess.run([
                "ffmpeg", "-y", "-i", clip_path,
                "-ss", str(trim_start), "-to", str(trim_end),
                "-c", "copy", trimmed_path
            ], capture_output=True, check=True)
        except subprocess.CalledProcessError:
            # Trimming is cosmetic: never replace a good clip with a missing
            # or partial file. Keep the original and its recorded duration.
            if os.path.exists(trimmed_path):
                os.remove(trimmed_path)
            print(f" [warn] audio-{num}.wav: trim failed, keeping untrimmed clip")
            continue
        os.replace(trimmed_path, clip_path)
        # Record the length of the file actually written; a stream copy may
        # not cut exactly at the requested timestamps.
        new_dur = get_duration(clip_path)
        durations[f"audio-{num}.wav"] = round(new_dur, 2)
        print(f" audio-{num}.wav: {clip_dur:.1f}s -> {new_dur:.1f}s (trimmed {clip_dur - new_dur:.1f}s)")
    else:
        print(f" audio-{num}.wav: {clip_dur:.1f}s (no trim needed)")
| 248 | +
|
# --- Write durations.json ---
# Persist the clip-name -> seconds mapping next to the audio files, then
# report how many entries were written and their combined length.
durations_path = os.path.join(output_dir, "durations.json")
serialized = json.dumps(durations, indent=2)
with open(durations_path, "w") as out_file:
    out_file.write(serialized)

entry_count = len(durations)
total_dur = sum(durations.values())
print(f"\n Wrote durations.json ({entry_count} entries, {total_dur:.1f}s total)")

print()
print("=== Done ===")
| 259 | +PYTHON_SCRIPT |
| 260 | + |
# Show the generated clips, or a placeholder line when none exist.
printf '\n'
printf '%s\n' "Output:"
if ! ls -la "${OUTPUT_DIR}"/audio-*.wav 2>/dev/null; then
  printf '%s\n' " (no files generated)"
fi
0 commit comments