From e8227bf8678597c4255059029d628789b4518144 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Sun, 22 Mar 2026 07:04:53 -0700 Subject: [PATCH] update: add podcast-creator skill Adds a podcast-creator skill that converts text scripts into podcast episodes using MiniMax TTS and Music APIs. Supports plain text, Markdown, and structured JSON input formats. Uses ffmpeg for audio assembly with crossfading between narration and intro/outro music. Co-Authored-By: Claude Opus 4.6 --- README.md | 1 + README_zh.md | 1 + skills/podcast-creator/SKILL.md | 153 ++++++++++++++++ .../references/requirements.txt | 1 + .../references/script-format.md | 81 +++++++++ .../podcast-creator/scripts/minimax_music.py | 153 ++++++++++++++++ skills/podcast-creator/scripts/minimax_tts.py | 123 +++++++++++++ .../podcast-creator/scripts/podcast_create.py | 172 ++++++++++++++++++ 8 files changed, 685 insertions(+) create mode 100644 skills/podcast-creator/SKILL.md create mode 100644 skills/podcast-creator/references/requirements.txt create mode 100644 skills/podcast-creator/references/script-format.md create mode 100644 skills/podcast-creator/scripts/minimax_music.py create mode 100755 skills/podcast-creator/scripts/minimax_tts.py create mode 100644 skills/podcast-creator/scripts/podcast_create.py diff --git a/README.md b/README.md index 9fccd363..a59b280d 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Development skills for AI coding agents. Plug into your favorite AI coding tool | `ios-application-dev` | iOS application development guide covering UIKit, SnapKit, and SwiftUI. Touch targets, safe areas, navigation patterns, Dynamic Type, Dark Mode, accessibility, collection views, and Apple HIG compliance. | | `shader-dev` | Comprehensive GLSL shader techniques for creating stunning visual effects — ray marching, SDF modeling, fluid simulation, particle systems, procedural generation, lighting, post-processing, and more. ShaderToy-compatible. 
| | `gif-sticker-maker` | Convert photos (people, pets, objects, logos) into 4 animated GIF stickers with captions. Funko Pop / Pop Mart style, powered by MiniMax Image & Video Generation API. | +| `podcast-creator` | Convert text scripts into podcast episodes with narration and music. Supports plain text, Markdown, or structured JSON input. Generates narration via MiniMax TTS API and intro/outro music via MiniMax Music API, then assembles with ffmpeg. | | `minimax-pdf` | Generate, fill, and reformat PDF documents with a token-based design system. CREATE polished PDFs from scratch (15 cover styles), FILL existing form fields, or REFORMAT documents into a new design. Print-ready output with typography and color derived from document type. | | `pptx-generator` | Generate, edit, and read PowerPoint presentations. Create from scratch with PptxGenJS (cover, TOC, content, section divider, summary slides), edit existing PPTX via XML workflows, or extract text with markitdown. | | `minimax-xlsx` | Open, create, read, analyze, edit, or validate Excel/spreadsheet files (.xlsx, .xlsm, .csv, .tsv). Covers creating new xlsx from scratch via XML templates, reading and analyzing with pandas, editing existing files with zero format loss, formula recalculation, validation, and professional financial formatting. 
| diff --git a/README_zh.md b/README_zh.md index 57902fb6..e8cc2c7d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -16,6 +16,7 @@ | `ios-application-dev` | iOS 应用开发指南,涵盖 UIKit、SnapKit 和 SwiftUI。触控目标、安全区域、导航模式、Dynamic Type、深色模式、无障碍、集合视图,符合 Apple HIG 规范。 | | `shader-dev` | 全面的 GLSL 着色器技术,用于创建惊艳的视觉效果 — 光线行进、SDF 建模、流体模拟、粒子系统、程序化生成、光照、后处理等。兼容 ShaderToy。 | | `gif-sticker-maker` | 将照片(人物、宠物、物品、Logo)转换为 4 张带字幕的动画 GIF 贴纸。Funko Pop / Pop Mart 盲盒风格,基于 MiniMax 图片与视频生成 API。 | +| `podcast-creator` | 将文本脚本转换为播客节目。支持纯文本、Markdown 或 JSON 格式输入。通过 MiniMax TTS API 生成旁白,通过 MiniMax 音乐生成 API 创建片头/片尾音乐,使用 ffmpeg 合成最终音频。 | | `minimax-pdf` | 基于 token 化设计系统生成、填写和重排 PDF 文档。支持三种模式:CREATE(从零生成,15 种封面风格)、FILL(填写现有表单字段)、REFORMAT(将已有文档重排为新设计)。排版与配色由文档类型自动推导,输出即可打印。 | | `pptx-generator` | 生成、编辑和读取 PowerPoint 演示文稿。支持用 PptxGenJS 从零创建(封面、目录、内容、分节页、总结页),通过 XML 工作流编辑现有 PPTX,或用 markitdown 提取文本。 | | `minimax-xlsx` | 打开、创建、读取、分析、编辑或验证 Excel/电子表格文件(.xlsx、.xlsm、.csv、.tsv)。支持通过 XML 模板从零创建 xlsx、使用 pandas 读取分析、零格式损失编辑现有文件、公式重算与验证、专业财务格式化。 | diff --git a/skills/podcast-creator/SKILL.md b/skills/podcast-creator/SKILL.md new file mode 100644 index 00000000..c0a162f3 --- /dev/null +++ b/skills/podcast-creator/SKILL.md @@ -0,0 +1,153 @@ +--- +name: podcast-creator +description: | + Convert text scripts into podcast episodes using MiniMax TTS and Music APIs. + Use when: creating podcasts from text, generating audio narration with background music, + converting articles or blog posts to audio, producing voiceover content with intro/outro music. + Triggers: podcast, audio episode, narration, text-to-speech with music, voiceover, audio content. +license: MIT +metadata: + version: "1.0" + category: creative-tools + output_format: mp3 + sources: + - MiniMax Text-to-Speech API (speech-2.8-hd) + - MiniMax Music Generation API (music-2.5+) +--- + +# Podcast Creator + +Convert text scripts into polished podcast episodes with narration and music. + +## Prerequisites + +Before starting, ensure: + +1. 
**Python venv** is activated with dependencies from [requirements.txt](references/requirements.txt) installed +2. **`MINIMAX_API_KEY`** is exported (e.g. `export MINIMAX_API_KEY='your-key'`) +3. **`ffmpeg`** is available on PATH (for audio assembly) + +If any prerequisite is missing, set it up first. Do NOT proceed without all three. + +## Workflow + +### Step 0: Collect Script + +Accept the podcast script in one of three formats: + +1. **Plain text** - Split into segments by blank lines or headings +2. **Markdown** - Use headings as chapter markers +3. **Structured JSON** - See [script-format.md](references/script-format.md) for the schema + +If the user provides plain text or markdown, convert it to the internal JSON structure before proceeding. + +Ask the user: +> "Do you have a podcast script ready, or would you like me to help write one?" + +If the user wants help writing a script, ask for the topic and target length (short ~2min, medium ~5min, long ~10min), then draft chapters. + +### Step 1: Voice Selection + +Present voice options based on content type. Reference the [MiniMax Voice Catalog](../frontend-dev/references/minimax-voice-catalog.md) for the full list. + +**Quick picks by content type:** + +| Content type | Recommended voice | Voice ID | +|---|---|---| +| Tech / tutorial | Male, clear, neutral | `male-qn-qingse` | +| Storytelling | Male, warm, narrative | `male-qn-jingying` | +| News / formal | Female, professional | `female-shaonv` | +| Conversational | Female, friendly | `female-yujie` | + +Ask the user: +> "Which voice style fits your podcast? Pick from the suggestions above or describe what you want." + +Record the selected `voice_id` for Step 2. 
+ +### Step 2: Generate Narration + +**Tool**: `scripts/minimax_tts.py` + +For each chapter/segment in the script: + +```bash +python3 scripts/minimax_tts.py "CHAPTER_TEXT" -o output/chapter_01.mp3 -v VOICE_ID --speed 0.95 +``` + +**Tips:** +- Use `--speed 0.95` for narration (slightly slower than default for clarity) +- Keep each TTS call under 10,000 characters. Split longer chapters. +- Generate chapters sequentially to maintain consistent pacing. + +After generation, verify each file exists: +```bash +ls -la output/chapter_*.mp3 +``` + +### Step 3: Generate Music + +**Tool**: `scripts/minimax_music.py` + +Generate intro and outro music based on the podcast tone. + +```bash +# Intro music (instrumental, 15-30 seconds feel) +python3 scripts/minimax_music.py --prompt "MUSIC_STYLE, short intro jingle" --instrumental -o output/intro_music.mp3 + +# Outro music (instrumental, fade-out feel) +python3 scripts/minimax_music.py --prompt "MUSIC_STYLE, gentle outro, fade out" --instrumental -o output/outro_music.mp3 +``` + +**Music style examples:** +- Tech podcast: "Electronic ambient, minimal, modern, professional" +- Story podcast: "Acoustic guitar, warm, intimate, indie folk" +- News podcast: "Clean piano, confident, broadcast quality" +- Casual podcast: "Lo-fi beats, relaxed, warm, coffee shop" + +### Step 4: Assemble + +**Tool**: `scripts/podcast_create.py` + +Stitch all audio segments into a final episode: + +```bash +python3 scripts/podcast_create.py \ + --intro output/intro_music.mp3 \ + --chapters output/chapter_01.mp3 output/chapter_02.mp3 output/chapter_03.mp3 \ + --outro output/outro_music.mp3 \ + --title "Episode Title" \ + -o output/episode.mp3 +``` + +This handles: +- Crossfading intro music into first chapter (2s overlap) +- Adding 1s silence between chapters +- Crossfading last chapter into outro music (2s overlap) +- Writing ID3 tags (title, artist) to the final mp3 + +### Step 5: Deliver + +Output format: +1. 
Summary line: "Podcast episode created: {title}" +2. File path and duration +3. Chapter breakdown with timestamps + +``` +Podcast episode created: "Episode Title" + File: output/episode.mp3 + Duration: 5m 23s + Chapters: + 00:00 - Intro + 00:08 - Chapter 1: Introduction + 01:45 - Chapter 2: Main Topic + 04:10 - Chapter 3: Wrap-up + 05:05 - Outro +``` + +## Rules + +- Always generate intro and outro music. A podcast without music sounds unfinished. +- Use `--instrumental` for all music generation. Vocals in background music compete with narration. +- Keep intro music short (the generated clip will be ~30s, but crossfading trims it naturally). +- Detect user's language and match the TTS voice language accordingly. +- All music prompts must be in **English** regardless of user language. diff --git a/skills/podcast-creator/references/requirements.txt b/skills/podcast-creator/references/requirements.txt new file mode 100644 index 00000000..a8608b2c --- /dev/null +++ b/skills/podcast-creator/references/requirements.txt @@ -0,0 +1 @@ +requests>=2.28.0 diff --git a/skills/podcast-creator/references/script-format.md b/skills/podcast-creator/references/script-format.md new file mode 100644 index 00000000..9f37663a --- /dev/null +++ b/skills/podcast-creator/references/script-format.md @@ -0,0 +1,81 @@ +# Podcast Script Format + +The podcast-creator skill accepts scripts in three formats. + +## Format 1: Plain Text + +Separate segments with blank lines. The first line becomes the title. + +``` +My Podcast Episode + +Welcome to the show. Today we talk about AI-generated audio content. + +The main topic is how text-to-speech and music generation APIs +can work together to produce complete podcast episodes. + +Thanks for listening. See you next time. +``` + +## Format 2: Markdown + +Use headings as chapter markers. + +```markdown +# My Podcast Episode + +## Introduction +Welcome to the show. Today we talk about AI-generated audio content. 
+ +## Main Topic +The main topic is how text-to-speech and music generation APIs +can work together to produce complete podcast episodes. + +## Wrap-up +Thanks for listening. See you next time. +``` + +## Format 3: Structured JSON + +Full control over chapters, voice, and music style. + +```json +{ + "title": "My Podcast Episode", + "voice_id": "male-qn-qingse", + "music_style": "Lo-fi beats, relaxed, warm", + "chapters": [ + { + "type": "intro", + "title": "Introduction", + "text": "Welcome to the show. Today we talk about AI-generated audio content." + }, + { + "type": "segment", + "title": "Main Topic", + "text": "The main topic is how text-to-speech and music generation APIs can work together to produce complete podcast episodes." + }, + { + "type": "outro", + "title": "Wrap-up", + "text": "Thanks for listening. See you next time." + } + ] +} +``` + +### Chapter types + +| Type | Description | +|------|-------------| +| `intro` | Opening segment. Narrated over intro music fade. | +| `segment` | Main content chapter. | +| `outro` | Closing segment. Fades into outro music. | + +### Optional fields + +| Field | Default | Description | +|-------|---------|-------------| +| `voice_id` | `male-qn-qingse` | MiniMax voice ID for narration | +| `music_style` | `"Ambient, professional, modern"` | Prompt for music generation | +| `speed` | `0.95` | Narration speed (0.5-2.0) | diff --git a/skills/podcast-creator/scripts/minimax_music.py b/skills/podcast-creator/scripts/minimax_music.py new file mode 100644 index 00000000..54d531a8 --- /dev/null +++ b/skills/podcast-creator/scripts/minimax_music.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +""" +MiniMax Music Generation (HTTP) +Self-contained: no external dependencies beyond `requests`. 
+
+Usage:
+    python minimax_music.py --prompt "Indie folk, melancholic" --lyrics "[verse]\nStreetlights flicker" -o song.mp3
+    python minimax_music.py --prompt "Upbeat pop, energetic" --auto-lyrics -o pop.mp3
+    python minimax_music.py --prompt "Jazz piano, smooth, relaxing" --instrumental -o jazz.mp3
+
+Env: MINIMAX_API_KEY (required)
+"""
+
+import os
+import sys
+import json
+import argparse
+import requests
+
+API_KEY = os.getenv("MINIMAX_API_KEY")
+API_BASE = os.getenv("MINIMAX_API_BASE", "https://api.minimax.io/v1")
+
+
+def generate_music(
+    prompt: str = "",
+    lyrics: str = "",
+    model: str = "music-2.5+",
+    is_instrumental: bool = False,
+    lyrics_optimizer: bool = False,
+    sample_rate: int = 44100,
+    bitrate: int = 256000,
+    fmt: str = "mp3",
+    output_format: str = "hex",
+    timeout: int = 600,
+) -> dict:
+    """Synchronous HTTP music generation.
+
+    Returns a dict with `audio_bytes` (raw audio when output_format="hex"),
+    `audio_url` (when output_format="url"), and duration/sample-rate/channel/
+    bitrate/size metadata taken from the API's `extra_info` block.
+    Raises SystemExit on a missing API key, an API-level error, or an
+    incomplete generation.
+    """
+    if not API_KEY:
+        raise SystemExit("ERROR: MINIMAX_API_KEY is not set.\n export MINIMAX_API_KEY='your-key'")
+
+    payload = {
+        "model": model,
+        "audio_setting": {
+            "sample_rate": sample_rate,
+            "bitrate": bitrate,
+            "format": fmt,
+        },
+        "output_format": output_format,
+    }
+
+    # Only send optional fields the caller actually set.
+    if prompt:
+        payload["prompt"] = prompt
+    if lyrics:
+        payload["lyrics"] = lyrics
+    if is_instrumental:
+        payload["is_instrumental"] = True
+    if lyrics_optimizer:
+        payload["lyrics_optimizer"] = True
+
+    resp = requests.post(
+        f"{API_BASE}/music_generation",
+        headers={
+            "Authorization": f"Bearer {API_KEY}",
+            "Content-Type": "application/json",
+        },
+        json=payload,
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Check API-level error (HTTP 200 can still carry a business error).
+    base_resp = data.get("base_resp", {})
+    if base_resp.get("status_code", 0) != 0:
+        raise SystemExit(f"API Error [{base_resp.get('status_code')}]: {base_resp.get('status_msg')}")
+
+    # status == 2 marks a finished generation in this API's response schema.
+    status = data.get("data", {}).get("status")
+    if status != 2:
+        raise SystemExit(f"Generation incomplete (status={status}): {json.dumps(data, indent=2)}")
+
+    audio_data = data.get("data", {}).get("audio", "")
+    if not audio_data:
+        raise SystemExit(f"No audio in response: {json.dumps(data, indent=2)}")
+
+    extra = data.get("extra_info", {})
+
+    if output_format == "hex":
+        audio_bytes = bytes.fromhex(audio_data)
+    else:
+        # URL mode — audio_data is a URL string
+        audio_bytes = None
+
+    return {
+        "audio_bytes": audio_bytes,
+        "audio_url": audio_data if output_format == "url" else None,
+        "duration": extra.get("music_duration"),
+        "sample_rate": extra.get("music_sample_rate"),
+        "channels": extra.get("music_channel"),
+        "bitrate": extra.get("bitrate"),
+        "size": extra.get("music_size"),
+    }
+
+
+def main():
+    p = argparse.ArgumentParser(description="MiniMax Music Generation (HTTP)")
+    p.add_argument("-o", "--output", required=True, help="Output file path")
+    p.add_argument("--prompt", default="", help="Music description: style, mood, scenario (max 2000 chars)")
+    p.add_argument("--lyrics", default="", help="Song lyrics with structure tags (max 3500 chars)")
+    p.add_argument("--lyrics-file", default="", help="Read lyrics from file instead of --lyrics")
+    p.add_argument("--model", default="music-2.5+", choices=["music-2.5+", "music-2.5"], help="Model (default: music-2.5+)")
+    p.add_argument("--instrumental", action="store_true", help="Generate instrumental only (no vocals)")
+    p.add_argument("--auto-lyrics", action="store_true", help="Auto-generate lyrics from prompt")
+    p.add_argument("--format", default="mp3", dest="fmt", choices=["mp3", "wav", "pcm"], help="Audio format (default: mp3)")
+    p.add_argument("--sample-rate", type=int, default=44100, choices=[16000, 24000, 32000, 44100], help="Sample rate (default: 44100)")
+    p.add_argument("--bitrate", type=int, default=256000, choices=[32000, 64000, 128000, 256000], help="Bitrate (default: 256000)")
+    args = p.parse_args()
+
+    lyrics = args.lyrics
+    if args.lyrics_file:
+        with open(args.lyrics_file, "r") as f:
+            lyrics = f.read()
+
+    # Fail fast with a clear message instead of sending an empty payload
+    # and getting an opaque API error back.
+    if not args.prompt and not lyrics:
+        p.error("provide --prompt and/or --lyrics/--lyrics-file")
+
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+
+    result = generate_music(
+        prompt=args.prompt,
+        lyrics=lyrics,
+        model=args.model,
+        is_instrumental=args.instrumental,
+        lyrics_optimizer=args.auto_lyrics,
+        sample_rate=args.sample_rate,
+        bitrate=args.bitrate,
+        fmt=args.fmt,
+    )
+
+    if result["audio_bytes"]:
+        with open(args.output, "wb") as f:
+            f.write(result["audio_bytes"])
+        size = len(result["audio_bytes"])
+    else:
+        # URL mode — download
+        r = requests.get(result["audio_url"], timeout=120)
+        r.raise_for_status()
+        with open(args.output, "wb") as f:
+            f.write(r.content)
+        size = len(r.content)
+
+    # BUGFIX: generate_music() always sets the "duration" key (possibly to
+    # None), so a dict .get() default never fires; use `or` so None also
+    # falls back to "?" instead of printing "duration: Nones".
+    duration = result.get("duration") or "?"
+    print(f"OK: {size} bytes -> {args.output} (duration: {duration}s)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/podcast-creator/scripts/minimax_tts.py b/skills/podcast-creator/scripts/minimax_tts.py
new file mode 100755
index 00000000..9f78d67b
--- /dev/null
+++ b/skills/podcast-creator/scripts/minimax_tts.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+"""
+MiniMax Sync TTS (HTTP)
+Self-contained: no external dependencies beyond `requests`.
+
+Usage:
+    python minimax_tts.py "Hello world" -o output.mp3
+    python minimax_tts.py "你好世界" -o hi.mp3 -v female-shaonv --model speech-2.8-hd
+    python minimax_tts.py "Welcome" -o out.wav -v male-qn-jingying --speed 0.8 --format wav
+
+Env: MINIMAX_API_KEY (required)
+"""
+
+import os
+import sys
+import json
+import argparse
+import requests
+
+API_KEY = os.getenv("MINIMAX_API_KEY")
+API_BASE = os.getenv("MINIMAX_API_BASE", "https://api.minimax.io/v1")
+
+
+def tts(
+    text: str,
+    voice_id: str = "male-qn-qingse",
+    model: str = "speech-2.8-hd",
+    speed: float = 1.0,
+    volume: float = 1.0,
+    pitch: int = 0,
+    emotion: str = "",
+    sample_rate: int = 32000,
+    bitrate: int = 128000,
+    fmt: str = "mp3",
+    language_boost: str = "auto",
+    timeout: int = 120,
+) -> bytes:
+    """Synchronous HTTP TTS. Returns raw audio bytes.
+
+    Raises SystemExit on a missing API key, an API-level error, or an
+    empty audio payload in the response.
+    """
+    if not API_KEY:
+        raise SystemExit("ERROR: MINIMAX_API_KEY is not set.\n export MINIMAX_API_KEY='your-key'")
+
+    voice_setting = {"voice_id": voice_id, "speed": speed, "vol": volume, "pitch": pitch}
+    if emotion:
+        # Emotion is optional; only send it when explicitly requested.
+        voice_setting["emotion"] = emotion
+
+    payload = {
+        "model": model,
+        "text": text,
+        "stream": False,
+        "voice_setting": voice_setting,
+        "audio_setting": {
+            "sample_rate": sample_rate,
+            "bitrate": bitrate,
+            "format": fmt,
+            "channel": 1,
+        },
+        "language_boost": language_boost,
+        "output_format": "hex",
+    }
+
+    resp = requests.post(
+        f"{API_BASE}/t2a_v2",
+        headers={
+            "Authorization": f"Bearer {API_KEY}",
+            "Content-Type": "application/json",
+        },
+        json=payload,
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Check API-level error (HTTP 200 can still carry a business error).
+    base_resp = data.get("base_resp", {})
+    if base_resp.get("status_code", 0) != 0:
+        raise SystemExit(f"API Error [{base_resp.get('status_code')}]: {base_resp.get('status_msg')}")
+
+    audio_hex = data.get("data", {}).get("audio", "")
+    if not audio_hex:
+        raise SystemExit(f"No audio in response: {json.dumps(data, indent=2)}")
+
+    return bytes.fromhex(audio_hex)
+
+
+def main():
+    p = argparse.ArgumentParser(description="MiniMax Sync TTS (HTTP)")
+    p.add_argument("text", help="Text to synthesize (max 10000 chars)")
+    p.add_argument("-o", "--output", required=True, help="Output file path")
+    p.add_argument("-v", "--voice", default="male-qn-qingse", help="Voice ID")
+    p.add_argument("--model", default="speech-2.8-hd", help="Model (default: speech-2.8-hd)")
+    p.add_argument("--speed", type=float, default=1.0, help="Speed 0.5-2.0")
+    p.add_argument("--volume", type=float, default=1.0, help="Volume 0.1-10")
+    p.add_argument("--pitch", type=int, default=0, help="Pitch -12 to 12")
+    p.add_argument("--emotion", default="", help="Emotion tag (happy/sad/angry/...)")
+    p.add_argument("--format", default="mp3", dest="fmt", help="Audio format (mp3/wav/flac)")
+    p.add_argument("--sample-rate", type=int, default=32000, help="Sample rate")
+    p.add_argument("--lang", default="auto", help="Language boost")
+    args = p.parse_args()
+
+    # Fail fast on empty input rather than burning an API call.
+    if not args.text.strip():
+        p.error("text is empty")
+
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+
+    audio = tts(
+        text=args.text,
+        voice_id=args.voice,
+        model=args.model,
+        speed=args.speed,
+        volume=args.volume,
+        pitch=args.pitch,
+        emotion=args.emotion,
+        fmt=args.fmt,
+        sample_rate=args.sample_rate,
+        language_boost=args.lang,
+    )
+
+    with open(args.output, "wb") as f:
+        f.write(audio)
+
+    print(f"OK: {len(audio)} bytes -> {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/podcast-creator/scripts/podcast_create.py b/skills/podcast-creator/scripts/podcast_create.py
new file mode 100644
index 00000000..ffd2598e
--- /dev/null
+++ b/skills/podcast-creator/scripts/podcast_create.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+"""
+Podcast assembler — stitch narration + music into a final episode.
+
+Usage:
+    python podcast_create.py --intro intro.mp3 --chapters ch1.mp3 ch2.mp3 --outro outro.mp3 -o episode.mp3
+    python podcast_create.py --chapters ch1.mp3 -o episode.mp3 --title "My Episode"
+
+Requires: ffmpeg on PATH
+"""
+
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+
+def _check_ffmpeg():
+    """Abort early with an install hint if ffmpeg is missing from PATH."""
+    try:
+        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+    except FileNotFoundError:
+        raise SystemExit("ERROR: ffmpeg not found on PATH.\n Install: brew install ffmpeg (macOS) or apt install ffmpeg (Linux)")
+
+
+def _get_duration(path: str) -> float:
+    """Get audio duration in seconds using ffprobe.
+
+    Best effort: returns 0.0 when ffprobe cannot read the file.
+    """
+    result = subprocess.run(
+        ["ffprobe", "-v", "quiet", "-show_entries", "format=duration", "-of", "csv=p=0", path],
+        capture_output=True, text=True,
+    )
+    try:
+        return float(result.stdout.strip())
+    except ValueError:
+        return 0.0
+
+
+def _run_ffmpeg(cmd: list):
+    """Run an ffmpeg command; on failure surface ffmpeg's stderr.
+
+    The original `check=True` + `capture_output=True` combination raised a
+    bare CalledProcessError with the actual ffmpeg error text hidden.
+    """
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        stderr = result.stderr.decode(errors="replace")
+        raise SystemExit(f"ERROR: ffmpeg command failed:\n{stderr[-2000:]}")
+
+
+def _crossfade_pair(a: str, b: str, output: str, overlap: float = 2.0):
+    """Crossfade two audio files with the given overlap duration.
+
+    BUGFIX: the original pre-faded both inputs with `afade` and then ran
+    `acrossfade`, which applies its own tri fade curves — the signal was
+    faded twice, producing an audible volume dip at every transition.
+    acrossfade alone performs the complete crossfade.
+    """
+    dur_a = _get_duration(a)
+    if dur_a <= overlap:
+        # First clip shorter than the requested overlap: shrink it but keep
+        # a minimum blend so the transition is never a hard cut.
+        overlap = max(0.5, dur_a / 2)
+
+    _run_ffmpeg([
+        "ffmpeg", "-y",
+        "-i", a,
+        "-i", b,
+        "-filter_complex",
+        f"[0][1]acrossfade=d={overlap}:c1=tri:c2=tri",
+        output,
+    ])
+
+
+def _concat_with_silence(files: list, output: str, gap: float = 1.0):
+    """Concatenate audio files with `gap` seconds of silence between them."""
+    if len(files) == 1:
+        # Nothing to join — plain file copy. shutil is portable; shelling
+        # out to `cp` breaks on Windows.
+        shutil.copyfile(files[0], output)
+        return
+
+    inputs = []
+    filter_parts = []
+    for i, f in enumerate(files):
+        inputs.extend(["-i", f])
+        if i < len(files) - 1:
+            # Pad every file except the last with trailing silence.
+            filter_parts.append(f"[{i}]apad=pad_dur={gap}[p{i}];")
+        else:
+            filter_parts.append(f"[{i}]acopy[p{i}];")
+
+    concat_inputs = "".join(f"[p{i}]" for i in range(len(files)))
+    filter_parts.append(f"{concat_inputs}concat=n={len(files)}:v=0:a=1[out]")
+
+    _run_ffmpeg(
+        ["ffmpeg", "-y"] + inputs + [
+            "-filter_complex", "".join(filter_parts),
+            "-map", "[out]",
+            output,
+        ]
+    )
+
+
+def _set_id3(path: str, title: str = "", artist: str = ""):
+    """Set basic ID3 tags on the output mp3 (no-op when both are empty)."""
+    if not title and not artist:
+        return
+    tmp = path + ".tagged.mp3"
+    cmd = ["ffmpeg", "-y", "-i", path]
+    if title:
+        cmd.extend(["-metadata", f"title={title}"])
+    if artist:
+        cmd.extend(["-metadata", f"artist={artist}"])
+    # Stream copy: tagging must not re-encode the audio.
+    cmd.extend(["-codec", "copy", tmp])
+    _run_ffmpeg(cmd)
+    os.replace(tmp, path)
+
+
+def main():
+    p = argparse.ArgumentParser(description="Assemble podcast episode from narration + music")
+    p.add_argument("--intro", default="", help="Intro music file (mp3)")
+    p.add_argument("--chapters", nargs="+", required=True, help="Chapter narration files in order")
+    p.add_argument("--outro", default="", help="Outro music file (mp3)")
+    p.add_argument("--title", default="", help="Episode title (ID3 tag)")
+    p.add_argument("--artist", default="", help="Artist/show name (ID3 tag)")
+    p.add_argument("--gap", type=float, default=1.0, help="Silence between chapters in seconds (default: 1.0)")
+    p.add_argument("--crossfade", type=float, default=2.0, help="Crossfade duration for intro/outro (default: 2.0)")
+    p.add_argument("-o", "--output", required=True, help="Output file path")
+    args = p.parse_args()
+
+    _check_ffmpeg()
+
+    # Validate every input up front so we fail before any ffmpeg work.
+    for f in args.chapters:
+        if not os.path.exists(f):
+            raise SystemExit(f"ERROR: Chapter file not found: {f}")
+    if args.intro and not os.path.exists(args.intro):
+        raise SystemExit(f"ERROR: Intro file not found: {args.intro}")
+    if args.outro and not os.path.exists(args.outro):
+        raise SystemExit(f"ERROR: Outro file not found: {args.outro}")
+
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as tmp:
+        # Concatenate chapters with silence gaps
+        chapters_combined = os.path.join(tmp, "chapters.mp3")
+        _concat_with_silence(args.chapters, chapters_combined, gap=args.gap)
+
+        current = chapters_combined
+
+        # Crossfade intro if provided
+        if args.intro:
+            intro_merged = os.path.join(tmp, "with_intro.mp3")
+            _crossfade_pair(args.intro, current, intro_merged, overlap=args.crossfade)
+            current = intro_merged
+
+        # Crossfade outro if provided
+        if args.outro:
+            final_merged = os.path.join(tmp, "with_outro.mp3")
+            _crossfade_pair(current, args.outro, final_merged, overlap=args.crossfade)
+            current = final_merged
+
+        # Copy to output. BUGFIX: `subprocess.run(["cp", ...])` is not
+        # portable (no `cp` on Windows); shutil.copyfile is.
+        shutil.copyfile(current, args.output)
+
+    # Set ID3 tags (output lives outside the temp dir, so this is safe here)
+    _set_id3(args.output, title=args.title, artist=args.artist)
+
+    duration = _get_duration(args.output)
+    size = os.path.getsize(args.output)
+    mins = int(duration // 60)
+    secs = int(duration % 60)
+
+    # Print chapter timestamps
+    print(f"OK: {size:,} bytes -> {args.output} (duration: {mins}m {secs:02d}s)")
+    offset = 0.0
+    if args.intro:
+        intro_dur = _get_duration(args.intro)
+        print(f"  00:00 - Intro ({intro_dur:.0f}s)")
+        offset = max(0, intro_dur - args.crossfade)
+    for i, ch in enumerate(args.chapters, 1):
+        m, s = int(offset // 60), int(offset % 60)
+        ch_dur = _get_duration(ch)
+        print(f"  {m:02d}:{s:02d} - Chapter {i} ({ch_dur:.0f}s)")
+        offset += ch_dur + args.gap
+    if args.outro:
+        # BUGFIX: no silence gap follows the last chapter, and the outro
+        # overlaps it by the crossfade — back both out of the running offset
+        # so the printed outro timestamp matches the assembled audio.
+        offset = max(0.0, offset - args.gap - args.crossfade)
+        m, s = int(offset // 60), int(offset % 60)
+        print(f"  {m:02d}:{s:02d} - Outro")
+
+
+if __name__ == "__main__":
+    main()