Skip to content

Commit 5b138b5

Browse files
prezisclaude
andcommitted
checkpoint (22:06): session crash protection — README.md,pyproject.toml,__init__.py,silent_video_ocr.py
Auto-committed by stop-auto-commit hook to prevent data loss. Files changed: 4 | Repo: scraperx Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5f08f64 commit 5b138b5

4 files changed

Lines changed: 315 additions & 0 deletions

File tree

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,20 @@ res = vm.get_transcript(
282282

283283
Transcription priority: creator-uploaded VTT → `faster-whisper` (GPU) → `whisper` CLI. Auto-detects GPU (float16 on CUDA, int8 on Metal, CPU fallback).
284284

285+
### 4b. Silent video transcription — frame OCR for audio-less videos
286+
287+
Videos with no audio track (screen recordings, silent TUI demos) can't be transcribed by Whisper — `transcribe_silent_video` fills the gap by sampling frames and running tesseract OCR.
288+
289+
```python
290+
from scraperx import transcribe_silent_video
291+
292+
result = transcribe_silent_video("https://x.com/gitlawb/status/2055992174358274431", n_frames=6)
293+
print(result.full_text) # timestamped on-screen text
294+
print(result.has_audio) # False = confirms silent path was correct
295+
```
296+
297+
Requires the `tesseract` system binary (`apt install tesseract-ocr` or `brew install tesseract`) and the `[silent-video]` extra (`pip install 'scraperx[silent-video]'`).
298+
285299
### 5. Video discovery — scan any webpage
286300

287301
```python

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ blockchain = ["playwright>=1.58.0"]
4646
# See scraperx/scrapling_stealth.py + scraperx/fetch.py for cascade integration.
4747
stealth = ["scrapling[fetchers]>=0.4.7"]
4848
tv-resolve = ["tvdatafeed>=2.1.0"]
49+
silent-video = ["pytesseract>=0.3.13", "av>=14.0"]
4950
dev = ["pytest>=9.0.3", "pytest-cov>=4.0"]
5051

5152
[project.scripts]

scraperx/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@
1515
)
1616
from scraperx.video_discovery import VideoRef, discover_videos, fetch_any_video_transcript
1717
from scraperx.vimeo_scraper import VimeoResult, VimeoScraper, parse_vimeo_url
18+
from scraperx.silent_video_ocr import (
19+
FrameOCR,
20+
SilentVideoNotAvailable,
21+
SilentVideoResult,
22+
has_audio_stream,
23+
transcribe_silent_video,
24+
)
1825

1926
from .cookie_banner import (
2027
BannerSelector,

scraperx/silent_video_ocr.py

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
"""
2+
scraperx/silent_video_ocr.py — frame-OCR transcription for silent videos.
3+
4+
When a video has no audio stream (e.g. Twitter screen-recording demos), neither
5+
captions nor Whisper can produce a transcript. This module fills the gap by
6+
sampling N evenly-spaced frames via PyAV, running tesseract OCR on each, and
7+
assembling a markdown narrative.
8+
9+
Designed as the third leg of scraperx's video-transcription cascade:
10+
captions → whisper → silent_video_ocr
11+
12+
The motivating incident (2026-05-17): a tweet at
13+
https://x.com/gitlawb/status/2055992174358274431 contained a silent demo of
14+
Hermes Agent + OpenGateway. The model name `mimo-v2.5-pro` was on-screen in the
15+
TUI footer but Whisper failed because there was no audio. Frame OCR recovers
16+
the on-screen text in 6 frames at 1 fps with deterministic tesseract output
17+
(no LLM hallucination risk).
18+
19+
Public API:
20+
transcribe_silent_video(url_or_path, n_frames=12, lang="eng") -> SilentVideoResult
21+
22+
The function downloads remote URLs to a temp file, opens with PyAV, samples
23+
frames at uniform stride, OCRs each, and returns timestamped text + assembled
24+
full_text. PyAV detects "no audio stream" by checking that container.streams.audio
25+
is empty; if a video DOES have audio, this function still works but the caller
26+
should prefer the whisper path for that case.
27+
28+
Dependencies (declare in pyproject.toml [silent-video] extra):
29+
av >= 14.0 (already transitive of faster_whisper)
30+
Pillow >= 10.0 (pulled in by pytesseract; also needed by PyAV's frame.to_image())
31+
pytesseract >= 0.3.13
32+
33+
System dep: tesseract binary (`apt install tesseract-ocr` / `brew install tesseract`).
34+
"""
35+
from __future__ import annotations
36+
37+
import io
38+
import logging
39+
import os
40+
import shutil
41+
import tempfile
42+
import urllib.request
43+
from dataclasses import dataclass, field
44+
from pathlib import Path
45+
from typing import Optional
46+
from urllib.parse import urlparse
47+
48+
logger = logging.getLogger(__name__)
49+
50+
# Soft imports — silent_video_ocr is opt-in
51+
try:
52+
import av
53+
HAS_PYAV = True
54+
except ImportError:
55+
HAS_PYAV = False
56+
57+
try:
58+
import pytesseract
59+
from PIL import Image
60+
HAS_OCR = True
61+
except ImportError:
62+
HAS_OCR = False
63+
64+
65+
class SilentVideoNotAvailable(RuntimeError):
    """Signal that the optional OCR stack is missing.

    Raised when the `av` (PyAV) or `pytesseract` package cannot be imported.
    Fix with `pip install 'scraperx[silent-video]'`; the system `tesseract`
    binary must be on PATH as well.
    """
71+
72+
73+
@dataclass
class FrameOCR:
    """OCR output for a single sampled video frame."""

    # Position of the frame in decode order within the source video stream.
    index: int
    # Seconds from the start of the video (PTS-based when available).
    timestamp_sec: float
    # Raw tesseract output; no post-processing beyond whitespace stripping.
    text: str
    # Path of the saved PNG when frames are persisted, otherwise None.
    image_path: Optional[str] = None
80+
81+
82+
@dataclass
class SilentVideoResult:
    """Outcome of OCR-transcribing a video, plus basic stream metadata."""

    # Original URL or local path exactly as the caller supplied it.
    source: str
    # Total video duration in seconds.
    duration_sec: float
    # True when the container held at least one audio stream (sanity flag).
    has_audio: bool
    # Source frame rate.
    fps: float
    # Total number of frames in the video stream.
    total_frames: int
    # Number of frames that were actually OCR'd.
    n_frames_sampled: int
    # Per-frame OCR results, in decode order.
    frames: list[FrameOCR] = field(default_factory=list)
    # Frame texts joined under "--- frame N @ Xs ---" headers.
    full_text: str = ""
    # Short summary derived from the start of full_text.
    summary: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result, with frames expanded to dicts."""
        scalar_keys = (
            "source",
            "duration_sec",
            "has_audio",
            "fps",
            "total_frames",
            "n_frames_sampled",
            "full_text",
            "summary",
        )
        payload = {key: getattr(self, key) for key in scalar_keys}
        payload["frames"] = [
            {
                "index": item.index,
                "timestamp_sec": item.timestamp_sec,
                "text": item.text,
                "image_path": item.image_path,
            }
            for item in self.frames
        ]
        return payload
111+
112+
113+
def _download_to_temp(url: str, suffix: str = ".mp4", timeout: int = 60) -> str:
    """Download an HTTP(S) URL to a fresh temp file and return its path.

    The caller owns the returned file and is responsible for deleting it.

    Args:
        url: absolute http:// or https:// URL.
        suffix: filename suffix for the temp file.
        timeout: timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        Path of the temp file holding the downloaded bytes.

    Raises:
        ValueError: if the URL scheme is not http or https.
        Exception: whatever ``urlopen`` / file I/O raises; the partially
            written temp file is removed before the error propagates.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"unsupported scheme for download: {parsed.scheme}")
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) scraperx/silent_video_ocr"
    })
    fd, path = tempfile.mkstemp(suffix=suffix, prefix="scraperx-svo-")
    os.close(fd)
    try:
        # Stream to disk in 64 KiB chunks so large videos are never held in memory.
        with urllib.request.urlopen(req, timeout=timeout) as response, \
                open(path, "wb") as out:
            shutil.copyfileobj(response, out, length=64 * 1024)
    except BaseException:
        # BaseException (not just Exception) so Ctrl-C / SystemExit during a
        # long download does not leave a partial temp file behind.
        try:
            os.unlink(path)
        except OSError:
            pass
        raise
    return path
135+
136+
137+
def _pick_sample_indices(total_frames: int, n_frames: int) -> set[int]:
    """Return up to ``n_frames`` evenly spaced frame indices in ``[0, total_frames)``."""
    count = max(1, min(n_frames, total_frames))
    return {int(total_frames * i / count) for i in range(count)}


def _assemble_transcript(frames: list[FrameOCR]) -> tuple[str, str]:
    """Build ``(full_text, summary)`` from OCR'd frames; summary is always one line."""
    parts = []
    for f in frames:
        parts.append(f"--- frame {f.index} @ {f.timestamp_sec:.1f}s ---")
        parts.append(f.text or "(blank)")
    full_text = "\n".join(parts)
    # Flatten unconditionally so short transcripts also yield a single line.
    flat = full_text.replace("\n", " ").strip()
    summary = flat[:200] + "…" if len(flat) > 200 else flat
    return full_text, summary


def transcribe_silent_video(
    url_or_path: str,
    n_frames: int = 12,
    lang: str = "eng",
    tesseract_psm: int = 6,
    save_frames: bool = False,
    frames_dir: Optional[str] = None,
) -> SilentVideoResult:
    """OCR a silent video by sampling N evenly-spaced frames.

    Args:
        url_or_path: HTTP(S) URL or local file path. Remote URLs are downloaded
            to a temp file which is always deleted before returning.
        n_frames: how many frames to sample (clamped to 1..total frames).
            12 = roughly 1 every 3.5 sec for a 40-sec demo video.
            Higher = more text recovery, slower runtime.
        lang: tesseract language code(s). Default `eng`. Use `eng+chi_sim` for
            mixed-language content (requires `tesseract-ocr-chi-sim` package).
        tesseract_psm: page segmentation mode. 6 = "uniform block of text",
            best for full-screen TUI captures. 3 = auto, slower. 11 = sparse text.
        save_frames: if True, write each sampled frame as a PNG into frames_dir
            and record its path on the corresponding FrameOCR.
        frames_dir: directory to save frames (created if missing). Only used
            when save_frames is True; defaults to a fresh temp directory.

    Returns:
        SilentVideoResult with timestamped OCR text per frame + full_text.

    Raises:
        SilentVideoNotAvailable: if PyAV or pytesseract are not installed.
        FileNotFoundError: if the local file doesn't exist.
        IndexError: if the file contains no video stream.
        av.error.InvalidDataError: if the video file is malformed.
    """
    if not HAS_PYAV or not HAS_OCR:
        missing = []
        if not HAS_PYAV:
            missing.append("av")
        if not HAS_OCR:
            missing.append("pytesseract+Pillow")
        raise SilentVideoNotAvailable(
            f"silent_video_ocr requires {', '.join(missing)} — "
            f"install with `pip install 'scraperx[silent-video]'`"
        )

    # Resolve source. A downloaded copy is private to this call (its path is
    # never returned), so it is always scheduled for deletion.
    cleanup_path: Optional[str] = None
    if url_or_path.startswith(("http://", "https://")):
        logger.info("downloading silent video: %s", url_or_path)
        local_path = _download_to_temp(url_or_path)
        cleanup_path = local_path
    else:
        local_path = url_or_path
        if not os.path.exists(local_path):
            raise FileNotFoundError(local_path)

    container = None
    try:
        # Frame save dir — set up inside the try so a failure here still
        # removes the downloaded temp file in the finally block.
        if save_frames:
            frames_dir = frames_dir or tempfile.mkdtemp(prefix="scraperx-svo-frames-")
            Path(frames_dir).mkdir(parents=True, exist_ok=True)

        container = av.open(local_path)
        if not container.streams.video:
            raise IndexError(f"no video stream found in {local_path!r}")
        video_stream = container.streams.video[0]
        has_audio = len(container.streams.audio) > 0
        duration_sec = (
            float(video_stream.duration * video_stream.time_base)
            if video_stream.duration else 0.0
        )
        fps = float(video_stream.average_rate) if video_stream.average_rate else 0.0
        total_frames = int(video_stream.frames or 0)

        if total_frames <= 0:
            logger.warning("video has no frames metadata; decoding to count")
            # Count by decoding everything, then re-open so the sampling pass
            # below starts from the first frame again.
            total_frames = sum(1 for _ in container.decode(video=0))
            container.close()
            container = av.open(local_path)
            video_stream = container.streams.video[0]

        wanted = _pick_sample_indices(total_frames, n_frames)

        # Sample frames at the target indices
        frames: list[FrameOCR] = []
        for i, frame in enumerate(container.decode(video=0)):
            if i not in wanted:
                continue
            pil_img = frame.to_image()
            # PTS-based timestamp preferred; fall back to fps-derived; else 0.
            if frame.pts is not None:
                ts = float(frame.pts * video_stream.time_base)
            elif fps > 0:
                ts = i / fps
            else:
                ts = 0.0
            image_path: Optional[str] = None
            if save_frames and frames_dir:
                image_path = os.path.join(frames_dir, f"frame_{i:06d}.png")
                pil_img.save(image_path)
            text = pytesseract.image_to_string(
                pil_img, lang=lang, config=f"--psm {tesseract_psm}"
            ).strip()
            frames.append(FrameOCR(
                index=i, timestamp_sec=ts, text=text, image_path=image_path
            ))
            # Stop decoding as soon as every wanted index has been handled.
            if len(frames) >= len(wanted):
                break

        full_text, summary = _assemble_transcript(frames)

        return SilentVideoResult(
            source=url_or_path,
            duration_sec=duration_sec,
            has_audio=has_audio,
            fps=fps,
            total_frames=total_frames,
            n_frames_sampled=len(frames),
            frames=frames,
            full_text=full_text,
            summary=summary,
        )
    finally:
        # Guarantee container close even if PyAV iteration / OCR raises.
        # Cleanup stays best-effort, but failures are logged rather than hidden.
        if container is not None:
            try:
                container.close()
            except Exception:
                logger.debug("failed to close container for %s", local_path, exc_info=True)
        if cleanup_path:
            try:
                os.unlink(cleanup_path)
            except OSError:
                logger.debug("failed to remove temp file %s", cleanup_path, exc_info=True)
276+
277+
278+
def has_audio_stream(local_path: str) -> bool:
    """Probe whether a video file has any audio stream.

    Used by callers as a pre-flight to decide between whisper and
    silent_video_ocr.

    Args:
        local_path: path of the video file to open with PyAV.

    Returns:
        True if the container reports at least one audio stream. Also True
        when PyAV is not installed or the probe fails (conservative — let
        whisper try and fail).
    """
    if not HAS_PYAV:
        return True
    try:
        # Context manager closes the container even if reading streams raises.
        with av.open(local_path) as container:
            return len(container.streams.audio) > 0
    except Exception as e:
        logger.warning("has_audio_stream probe failed: %s", e)
        return True

0 commit comments

Comments
 (0)