|
| 1 | +""" |
| 2 | +scraperx/silent_video_ocr.py — frame-OCR transcription for silent videos. |
| 3 | +
|
| 4 | +When a video has no audio stream (e.g. Twitter screen-recording demos), neither |
| 5 | +captions nor Whisper can produce a transcript. This module fills the gap by |
| 6 | +sampling N evenly-spaced frames via PyAV, running tesseract OCR on each, and |
| 7 | +assembling a markdown narrative. |
| 8 | +
|
| 9 | +Designed as the third leg of scraperx's video-transcription cascade: |
| 10 | + captions → whisper → silent_video_ocr |
| 11 | +
|
| 12 | +The motivating incident (2026-05-17): a tweet at |
| 13 | +https://x.com/gitlawb/status/2055992174358274431 contained a silent demo of |
| 14 | +Hermes Agent + OpenGateway. The model name `mimo-v2.5-pro` was on-screen in the |
| 15 | +TUI footer but Whisper failed because there was no audio. Frame OCR recovers |
| 16 | +the on-screen text in 6 frames at 1 fps with deterministic tesseract output |
| 17 | +(no LLM hallucination risk). |
| 18 | +
|
| 19 | +Public API: |
| 20 | + transcribe_silent_video(url_or_path, n_frames=12, lang="eng") -> SilentVideoResult |
| 21 | +
|
| 22 | +The function downloads remote URLs to a temp file, opens with PyAV, samples |
| 23 | +frames at uniform stride, OCRs each, and returns timestamped text + assembled |
| 24 | +full_text. PyAV detects "no audio stream" by checking that container.streams.audio |
| 25 | +is empty; if a video DOES have audio, this function still works but the caller |
| 26 | +should prefer the whisper path for that case. |
| 27 | +
|
| 28 | +Dependencies (declare in pyproject.toml [silent-video] extra): |
| 29 | + av >= 14.0 (already transitive of faster_whisper) |
| 30 | + Pillow >= 10.0 (already transitive of av) |
| 31 | + pytesseract >= 0.3.13 |
| 32 | +
|
| 33 | +System dep: tesseract binary (`apt install tesseract-ocr` / brew install tesseract). |
| 34 | +""" |
| 35 | +from __future__ import annotations |
| 36 | + |
| 37 | +import io |
| 38 | +import logging |
| 39 | +import os |
| 40 | +import shutil |
| 41 | +import tempfile |
| 42 | +import urllib.request |
| 43 | +from dataclasses import dataclass, field |
| 44 | +from pathlib import Path |
| 45 | +from typing import Optional |
| 46 | +from urllib.parse import urlparse |
| 47 | + |
| 48 | +logger = logging.getLogger(__name__) |
| 49 | + |
| 50 | +# Soft imports — silent_video_ocr is opt-in |
| 51 | +try: |
| 52 | + import av |
| 53 | + HAS_PYAV = True |
| 54 | +except ImportError: |
| 55 | + HAS_PYAV = False |
| 56 | + |
| 57 | +try: |
| 58 | + import pytesseract |
| 59 | + from PIL import Image |
| 60 | + HAS_OCR = True |
| 61 | +except ImportError: |
| 62 | + HAS_OCR = False |
| 63 | + |
| 64 | + |
class SilentVideoNotAvailable(RuntimeError):
    """Signal that the optional silent-video OCR stack cannot be used.

    Raised when PyAV or pytesseract is not importable. Both ship with the
    optional extra: `pip install 'scraperx[silent-video]'`.
    The system `tesseract` binary must be on PATH as well.
    """
| 71 | + |
| 72 | + |
@dataclass
class FrameOCR:
    """OCR output for a single sampled video frame.

    Attributes:
        index: frame index in the source stream.
        timestamp_sec: timestamp of the frame measured from the video start.
        text: tesseract output for the frame, raw (NOT post-processed).
        image_path: path to the saved PNG when frames are kept, else None.
    """

    index: int
    timestamp_sec: float
    text: str
    image_path: Optional[str] = None
| 80 | + |
| 81 | + |
@dataclass
class SilentVideoResult:
    """Everything produced by OCR-transcribing one video.

    Attributes:
        source: original URL or path that was transcribed.
        duration_sec: total video duration.
        has_audio: whether any audio stream was present (sanity flag).
        fps: source frame rate.
        total_frames: total number of video frames.
        n_frames_sampled: how many frames were OCR'd.
        frames: per-frame OCR records, in sampling order.
        full_text: all frame texts joined under "--- frame N @ Xs ---" headers.
        summary: one-line summary (first 200 chars of full_text).
    """

    source: str
    duration_sec: float
    has_audio: bool
    fps: float
    total_frames: int
    n_frames_sampled: int
    frames: list[FrameOCR] = field(default_factory=list)
    full_text: str = ""
    summary: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result, with frames as nested dicts."""
        scalar_keys = (
            "source",
            "duration_sec",
            "has_audio",
            "fps",
            "total_frames",
            "n_frames_sampled",
            "full_text",
            "summary",
        )
        payload = {key: getattr(self, key) for key in scalar_keys}
        # "frames" goes last so the key order matches the serialized layout.
        payload["frames"] = [
            {
                "index": frame.index,
                "timestamp_sec": frame.timestamp_sec,
                "text": frame.text,
                "image_path": frame.image_path,
            }
            for frame in self.frames
        ]
        return payload
| 111 | + |
| 112 | + |
| 113 | +def _download_to_temp(url: str, suffix: str = ".mp4", timeout: int = 60) -> str: |
| 114 | + """Download URL to a temp file. Caller owns cleanup.""" |
| 115 | + parsed = urlparse(url) |
| 116 | + if parsed.scheme not in ("http", "https"): |
| 117 | + raise ValueError(f"unsupported scheme for download: {parsed.scheme}") |
| 118 | + req = urllib.request.Request(url, headers={ |
| 119 | + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) scraperx/silent_video_ocr" |
| 120 | + }) |
| 121 | + fd, path = tempfile.mkstemp(suffix=suffix, prefix="scraperx-svo-") |
| 122 | + os.close(fd) |
| 123 | + try: |
| 124 | + # Stream to disk via copyfileobj (no full-file buffering — avoids OOM on >100MB videos). |
| 125 | + with urllib.request.urlopen(req, timeout=timeout) as r: |
| 126 | + with open(path, "wb") as f: |
| 127 | + shutil.copyfileobj(r, f, length=64 * 1024) |
| 128 | + except Exception: |
| 129 | + try: |
| 130 | + os.unlink(path) |
| 131 | + except OSError: |
| 132 | + pass |
| 133 | + raise |
| 134 | + return path |
| 135 | + |
| 136 | + |
def _one_line_summary(full_text: str, limit: int = 200) -> str:
    """Flatten full_text onto one line and cap it at `limit` characters.

    An ellipsis is appended only when text was actually cut off.
    """
    flat = full_text.replace("\n", " ").strip()
    if len(flat) > limit:
        return flat[:limit] + "…"
    return flat


def transcribe_silent_video(
    url_or_path: str,
    n_frames: int = 12,
    lang: str = "eng",
    tesseract_psm: int = 6,
    save_frames: bool = False,
    frames_dir: Optional[str] = None,
) -> SilentVideoResult:
    """OCR a silent video by sampling N evenly-spaced frames.

    Args:
        url_or_path: HTTP(S) URL or local file path. Remote URLs are downloaded
            to a temp file which is always deleted before returning (saved
            frames live in frames_dir and are unaffected).
        n_frames: how many frames to sample. 12 = roughly 1 every 3.5 sec for a
            40-sec demo video. Higher = more text recovery, slower runtime.
            Clamped to the range [1, total frame count].
        lang: tesseract language code(s). Default `eng`. Use `eng+chi_sim` for
            mixed-language content (requires `tesseract-ocr-chi-sim` package).
        tesseract_psm: page segmentation mode. 6 = "uniform block of text",
            best for full-screen TUI captures. 3 = auto, slower. 11 = sparse text.
        save_frames: if True, keep extracted PNGs in frames_dir.
        frames_dir: directory to save frames (created if missing). Only used
            when save_frames is True; if omitted then, a fresh temp dir is
            created and its path is visible via each frame's image_path.

    Returns:
        SilentVideoResult with timestamped OCR text per frame + full_text.

    Raises:
        SilentVideoNotAvailable: if PyAV or pytesseract are not installed.
        FileNotFoundError: if the local file doesn't exist.
        av.error.InvalidDataError: if the video file is malformed.
    """
    if not HAS_PYAV or not HAS_OCR:
        missing = []
        if not HAS_PYAV:
            missing.append("av")
        if not HAS_OCR:
            missing.append("pytesseract+Pillow")
        raise SilentVideoNotAvailable(
            f"silent_video_ocr requires {', '.join(missing)} — "
            f"install with `pip install 'scraperx[silent-video]'`"
        )

    # Resolve source
    cleanup_path: Optional[str] = None
    if url_or_path.startswith(("http://", "https://")):
        logger.info("downloading silent video: %s", url_or_path)
        local_path = _download_to_temp(url_or_path)
        # The temp video is never exposed to the caller, so it is always ours
        # to delete — keeping it when save_frames=True would only leak it.
        cleanup_path = local_path
    else:
        local_path = url_or_path
        if not os.path.exists(local_path):
            raise FileNotFoundError(local_path)

    container = None
    try:
        # Frame save dir. Done inside the try so a failure here still removes
        # the downloaded temp video in the finally block.
        if save_frames:
            frames_dir = frames_dir or tempfile.mkdtemp(prefix="scraperx-svo-frames-")
            Path(frames_dir).mkdir(parents=True, exist_ok=True)

        container = av.open(local_path)
        video_stream = container.streams.video[0]
        has_audio = len(container.streams.audio) > 0
        duration_sec = (
            float(video_stream.duration * video_stream.time_base)
            if video_stream.duration else 0.0
        )
        fps = float(video_stream.average_rate) if video_stream.average_rate else 0.0
        total_frames = int(video_stream.frames or 0)

        if total_frames <= 0:
            logger.warning("video has no frames metadata; decoding to count")
            # rare codec branch — count by iterating (cheap for short clips).
            # Re-open so the decoder cursor starts fresh.
            total_frames = sum(1 for _ in container.decode(video=0))
            container.close()
            container = av.open(local_path)
            video_stream = container.streams.video[0]

        # Clamp the request, then spread the targets uniformly over the stream.
        # The set drops duplicate indices produced by integer truncation.
        n_frames = max(1, min(n_frames, total_frames))
        target_indices = {int(total_frames * i / n_frames) for i in range(n_frames)}

        # Sample frames at the target indices
        frames: list[FrameOCR] = []
        for i, frame in enumerate(container.decode(video=0)):
            if i not in target_indices:
                continue
            pil_img = frame.to_image()
            # PTS-based timestamp preferred; fall back to fps-derived; else 0.
            if frame.pts is not None:
                ts = float(frame.pts * video_stream.time_base)
            elif fps > 0:
                ts = i / fps
            else:
                ts = 0.0
            image_path: Optional[str] = None
            if save_frames and frames_dir:
                image_path = os.path.join(frames_dir, f"frame_{i:06d}.png")
                pil_img.save(image_path)
            text = pytesseract.image_to_string(
                pil_img, lang=lang, config=f"--psm {tesseract_psm}"
            ).strip()
            frames.append(FrameOCR(
                index=i, timestamp_sec=ts, text=text, image_path=image_path
            ))
            # Stop decoding as soon as every target has been collected.
            if len(frames) >= len(target_indices):
                break

        # Assemble full text
        parts = []
        for f in frames:
            parts.append(f"--- frame {f.index} @ {f.timestamp_sec:.1f}s ---")
            parts.append(f.text or "(blank)")
        full_text = "\n".join(parts)
        # Always single-line, including when full_text is short.
        summary = _one_line_summary(full_text)

        return SilentVideoResult(
            source=url_or_path,
            duration_sec=duration_sec,
            has_audio=has_audio,
            fps=fps,
            total_frames=total_frames,
            n_frames_sampled=len(frames),
            frames=frames,
            full_text=full_text,
            summary=summary,
        )
    finally:
        # Guarantee container close even if PyAV iteration / OCR raises.
        if container is not None:
            try:
                container.close()
            except Exception:
                # Best-effort: a close failure must not mask the real error.
                pass
        if cleanup_path:
            try:
                os.unlink(cleanup_path)
            except OSError:
                pass
| 276 | + |
| 277 | + |
def has_audio_stream(local_path: str) -> bool:
    """Probe whether a video file has any audio stream.

    Used by callers as a pre-flight to decide between whisper and
    silent_video_ocr.

    Args:
        local_path: path of the media file to open with PyAV.

    Returns:
        True if at least one audio stream is present. Also True when PyAV is
        not installed or the probe fails (conservative — let whisper try and
        fail).
    """
    if not HAS_PYAV:
        return True
    try:
        # Context manager closes the container even if reading the stream
        # list raises, which the explicit close() call could miss.
        with av.open(local_path) as container:
            return len(container.streams.audio) > 0
    except Exception as e:
        logger.warning("has_audio_stream probe failed: %s", e)
        return True
0 commit comments