Skip to content

Commit a4a47eb

Browse files
committed
fix: harden YouTube downloads against yt-dlp 403 failures
- add multi-strategy yt-dlp retries in Python downloader using fallback format/client profiles
- add fallback download functions in interactive shell script for audio and video modes
- improve cleanup pattern for downloaded video extensions
- add downloader retry test covering first-attempt 403 failure path
- document YouTube reliability behavior and troubleshooting steps in README
1 parent 70e109e commit a4a47eb

4 files changed

Lines changed: 254 additions & 31 deletions

File tree

README.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Local-first audio/video transcription tool powered by [whisper-cli](https://gith
66

77
- **Live microphone transcription** with incremental chunk-based output (text appears every ~5 seconds while you speak)
88
- **YouTube download + transcribe** — paste a URL, get a transcript
9+
- **YouTube fallback download strategies** — retries multiple YouTube client profiles to reduce 403 failures
910
- **File transcription** — supports Zoom recordings, WhatsApp audio, and any audio/video file
1011
- **Multiple output formats** — txt, srt, vtt, json
1112
- **Multi-language support** — English, French, auto-detect
@@ -53,7 +54,23 @@ This launches an interactive menu with these options:
5354
5. **WhatsApp Audio** — Transcribe a WhatsApp voice message
5455
6. **Other File** — Transcribe any audio/video file
5556

56-
Transcripts are saved to `~/Desktop/Transcripts/`. Audio downloads are saved to `~/whisper-downloads/` and auto-cleaned after 7 days.
57+
Transcripts are saved to `~/Documents/Transcripts/`. Audio downloads are saved to `~/whisper-downloads/` and auto-cleaned after 7 days.
58+
59+
## YouTube Download Reliability
60+
61+
When YouTube changes delivery behavior (for example SABR-related client restrictions), a single `yt-dlp` strategy can fail with `HTTP Error 403: Forbidden`.
62+
63+
This project now retries YouTube downloads using multiple fallback profiles:
64+
65+
- Alternative audio/video format selectors
66+
- Multiple YouTube player clients (`android`, `ios`, `web`)
67+
- Conservative transfer settings (`--retries`, `--fragment-retries`, `--force-ipv4`)
68+
69+
If you still hit 403 errors, run:
70+
71+
```bash
72+
yt-dlp -U
73+
```
5774

5875
## How Live Transcription Works
5976

src/transcriber/downloader.py

Lines changed: 91 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import tempfile
44
from pathlib import Path
5-
from typing import Optional
5+
from typing import Any, Dict, List, Optional
66

77
try:
88
import yt_dlp
@@ -45,8 +45,8 @@ def download_audio(
4545
Returns:
4646
Path to downloaded audio file
4747
"""
48-
# Configure yt-dlp options
49-
ydl_opts = {
48+
# Common yt-dlp options shared across retry strategies.
49+
base_opts = {
5050
"format": f"{quality}/best[height<=480]/best",
5151
"extract_audio": True,
5252
"audio_format": format,
@@ -62,29 +62,29 @@ def download_audio(
6262
}
6363
],
6464
}
65-
66-
# Download the video
67-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
68-
info = ydl.extract_info(url, download=True)
69-
filename = ydl.prepare_filename(info)
70-
71-
# Convert to audio format if needed
72-
if info.get("ext") != format:
73-
audio_filename = filename.rsplit(".", 1)[0] + f".{format}"
74-
else:
75-
audio_filename = filename
76-
77-
audio_path = Path(audio_filename)
78-
if not audio_path.exists():
79-
# Fallback: look for the file with the expected extension
80-
expected_path = Path(filename.rsplit(".", 1)[0] + f".{format}")
81-
if expected_path.exists():
82-
audio_path = expected_path
83-
84-
if not audio_path.exists():
85-
raise FileNotFoundError(f"Downloaded audio file not found: {audio_path}")
86-
87-
return audio_path
65+
errors: List[str] = []
66+
for strategy in self._download_strategies(quality):
67+
ydl_opts = dict(base_opts)
68+
ydl_opts.update(strategy)
69+
70+
try:
71+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
72+
info = ydl.extract_info(url, download=True)
73+
filename = ydl.prepare_filename(info)
74+
return self._resolve_audio_path(
75+
filename=filename,
76+
input_ext=info.get("ext", ""),
77+
output_format=format,
78+
)
79+
except Exception as exc: # pragma: no cover - passthrough from yt-dlp
80+
errors.append(str(exc))
81+
82+
last_error = errors[-1] if errors else "unknown yt-dlp failure"
83+
raise RuntimeError(
84+
"Failed to download YouTube audio after multiple strategies. "
85+
"Update yt-dlp (`yt-dlp -U`) and retry. "
86+
f"Last error: {last_error}"
87+
)
8888

8989
def get_video_info(self, url: str) -> dict:
9090
"""Get video information without downloading.
@@ -99,6 +99,11 @@ def get_video_info(self, url: str) -> dict:
9999
"quiet": True,
100100
"no_warnings": True,
101101
"extract_flat": False,
102+
"extractor_args": {
103+
"youtube": {
104+
"player_client": ["android", "ios", "web"],
105+
}
106+
},
102107
}
103108

104109
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
@@ -115,7 +120,67 @@ def list_formats(self, url: str) -> None:
115120
"listformats": True,
116121
"quiet": True,
117122
"no_warnings": True,
123+
"extractor_args": {
124+
"youtube": {
125+
"player_client": ["android", "ios", "web"],
126+
}
127+
},
118128
}
119129

120130
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
121131
ydl.extract_info(url, download=False)
132+
133+
def _download_strategies(self, quality: str) -> List[Dict[str, Any]]:
134+
"""Return prioritized yt-dlp download strategies for flaky YouTube clients."""
135+
return [
136+
{
137+
"format": f"{quality}/bestaudio[ext=m4a]/bestaudio/best",
138+
"extractor_args": {
139+
"youtube": {
140+
"player_client": ["android", "ios", "web"],
141+
}
142+
},
143+
},
144+
{
145+
"format": "bestaudio[ext=m4a]/bestaudio/best",
146+
"extractor_args": {
147+
"youtube": {
148+
"player_client": ["android", "ios", "web"],
149+
}
150+
},
151+
},
152+
{
153+
"format": "140/bestaudio[ext=m4a]/bestaudio/best",
154+
"extractor_args": {
155+
"youtube": {
156+
"player_client": ["android", "web"],
157+
}
158+
},
159+
},
160+
{
161+
"format": "bestaudio/best",
162+
},
163+
]
164+
165+
def _resolve_audio_path(
166+
self,
167+
filename: str,
168+
input_ext: str,
169+
output_format: str,
170+
) -> Path:
171+
"""Resolve audio output path after post-processing."""
172+
if input_ext != output_format:
173+
audio_filename = filename.rsplit(".", 1)[0] + f".{output_format}"
174+
else:
175+
audio_filename = filename
176+
177+
audio_path = Path(audio_filename)
178+
if not audio_path.exists():
179+
expected_path = Path(filename.rsplit(".", 1)[0] + f".{output_format}")
180+
if expected_path.exists():
181+
audio_path = expected_path
182+
183+
if not audio_path.exists():
184+
raise FileNotFoundError(f"Downloaded audio file not found: {audio_path}")
185+
186+
return audio_path

tests/test_downloader.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
"""Tests for YouTube downloader fallback behavior."""
2+
3+
from pathlib import Path
4+
from types import SimpleNamespace
5+
6+
from transcriber import downloader
7+
8+
9+
def test_download_audio_retries_after_first_attempt_fails(monkeypatch, tmp_path):
    """Downloader should retry with a different profile after a 403 failure."""
    attempts = []
    expected_output = tmp_path / "sample.mp3"

    class FakeYoutubeDL:
        """Minimal stand-in for yt_dlp.YoutubeDL that fails on its first use."""

        def __init__(self, opts):
            self.opts = opts
            # One instance is created per strategy, so this records the
            # options of every attempt in order.
            attempts.append(opts)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def extract_info(self, url, download=True):  # noqa: ARG002
            if len(attempts) == 1:
                raise RuntimeError("HTTP Error 403: Forbidden")
            # Simulate the post-processor leaving the converted mp3 on disk.
            expected_output.write_bytes(b"audio")
            return {"ext": "webm", "title": "sample"}

        def prepare_filename(self, info):  # noqa: ARG002
            return str(tmp_path / "sample.webm")

    monkeypatch.setattr(
        downloader,
        "yt_dlp",
        SimpleNamespace(YoutubeDL=FakeYoutubeDL),
    )
    monkeypatch.setattr(downloader, "YT_DLP_AVAILABLE", True)

    dl = downloader.YouTubeDownloader(output_dir=str(tmp_path))
    result = dl.download_audio("https://youtu.be/test-id", quality="bestaudio", format="mp3")

    assert result == Path(expected_output)
    assert len(attempts) == 2
    # A retry that reuses the failing options would be pointless; make sure
    # the second attempt actually switched to a different format profile.
    assert attempts[0]["format"] != attempts[1]["format"]

whisper-transcribe-with-download.sh

Lines changed: 100 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,102 @@ cleanup_old_audio() {
123123
print_color "$BLUE" "🧹 Cleaning up audio files older than 7 days..." >&2
124124
find "$AUDIO_DOWNLOAD_DIR" -name "*_live_recording.wav" -type f -mtime +7 -delete 2>/dev/null
125125
find "$AUDIO_DOWNLOAD_DIR" -name "*_audio.mp3" -type f -mtime +7 -delete 2>/dev/null
126-
find "$AUDIO_DOWNLOAD_DIR" -name "*_video.mp4" -type f -mtime +7 -delete 2>/dev/null
126+
find "$AUDIO_DOWNLOAD_DIR" -name "*_video.*" -type f -mtime +7 -delete 2>/dev/null
127127
print_color "$GREEN" "✅ Cleanup completed" >&2
128128
}
129129

130+
# Download the audio track of a YouTube URL as MP3, trying several yt-dlp
# profiles (format selector + player-client set) until one succeeds.
# Arguments:
#   $1 - YouTube URL
#   $2 - yt-dlp output template (e.g. "dir/name.%(ext)s")
# Returns:
#   0 as soon as one profile succeeds; 1 if an argument is missing or every
#   profile fails. All yt-dlp output is sent to stderr.
download_youtube_audio_with_fallback() {
    local youtube_url=$1
    local output_template=$2
    # Keep loop/scratch variables local so they do not leak into the caller.
    local i format_selector extractor_args
    local -a cmd

    if [ -z "$youtube_url" ] || [ -z "$output_template" ]; then
        print_color "$RED" "❌ Missing YouTube URL or output template." >&2
        return 1
    fi

    # formats[i] is paired with clients[i]; an empty client entry means
    # "let yt-dlp pick its default player clients".
    local -a formats=(
        "bestaudio[ext=m4a]/bestaudio/best"
        "140/bestaudio[ext=m4a]/bestaudio/best"
        "bestaudio/best"
    )
    local -a clients=(
        "youtube:player_client=android,ios,web"
        "youtube:player_client=android,web"
        ""
    )

    for i in "${!formats[@]}"; do
        format_selector="${formats[$i]}"
        extractor_args="${clients[$i]}"

        print_color "$YELLOW" "Trying download profile $((i+1))/${#formats[@]}..." >&2
        cmd=(
            yt-dlp
            --no-playlist
            --retries 10
            --fragment-retries 10
            --force-ipv4
            --concurrent-fragments 1
            -f "$format_selector"
            --extract-audio
            --audio-format mp3
            --audio-quality 0
            -o "$output_template"
        )

        if [ -n "$extractor_args" ]; then
            cmd+=(--extractor-args "$extractor_args")
        fi
        cmd+=("$youtube_url")

        # Test the command directly instead of inspecting $? afterwards.
        if "${cmd[@]}" >&2; then
            return 0
        fi
    done

    return 1
}
177+
178+
# Download a YouTube video (720p or lower preferred, merged to MP4), trying
# several yt-dlp profiles (format selector + player-client set) until one
# succeeds.
# Arguments:
#   $1 - YouTube URL
#   $2 - yt-dlp output template (e.g. "dir/name.%(ext)s")
# Returns:
#   0 as soon as one profile succeeds; 1 if an argument is missing or every
#   profile fails. All yt-dlp output is sent to stderr.
download_youtube_video_with_fallback() {
    local youtube_url=$1
    local output_template=$2
    # Keep loop/scratch variables local so they do not leak into the caller.
    local i format_selector extractor_args
    local -a cmd

    if [ -z "$youtube_url" ] || [ -z "$output_template" ]; then
        print_color "$RED" "❌ Missing YouTube URL or output template." >&2
        return 1
    fi

    # formats[i] is paired with clients[i]; an empty client entry means
    # "let yt-dlp pick its default player clients".
    local -a formats=(
        "bv*[height<=720]+ba[ext=m4a]/b[height<=720]/best[height<=720]"
        "best[height<=720]/best"
    )
    local -a clients=(
        "youtube:player_client=android,ios,web"
        ""
    )

    for i in "${!formats[@]}"; do
        format_selector="${formats[$i]}"
        extractor_args="${clients[$i]}"

        print_color "$YELLOW" "Trying video profile $((i+1))/${#formats[@]}..." >&2
        cmd=(
            yt-dlp
            --no-playlist
            --retries 10
            --fragment-retries 10
            --force-ipv4
            --concurrent-fragments 1
            --merge-output-format mp4
            -f "$format_selector"
            -o "$output_template"
        )

        if [ -n "$extractor_args" ]; then
            cmd+=(--extractor-args "$extractor_args")
        fi
        cmd+=("$youtube_url")

        # Test the command directly instead of inspecting $? afterwards.
        if "${cmd[@]}" >&2; then
            return 0
        fi
    done

    return 1
}
221+
130222
# Function for ORIGINAL live transcription
131223
original_live_transcription() {
132224
local model_file=$1
@@ -296,7 +388,7 @@ case $choice in
296388
2) # YouTube Video + Transcript
297389
read -p "Enter YouTube URL: " youtube_url
298390
print_color "$BLUE" "\n📥 Downloading YouTube audio..."
299-
yt-dlp -f "bestaudio/best" --extract-audio --audio-format mp3 --audio-quality 0 -o "$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_audio.%(ext)s" "$youtube_url" >&2
391+
download_youtube_audio_with_fallback "$youtube_url" "$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_audio.%(ext)s"
300392
if [ $? -eq 0 ]; then
301393
audio_file=$(find "$AUDIO_DOWNLOAD_DIR" -name "${TIMESTAMP}_audio.mp3" -type f -print -quit)
302394
if [ -n "$audio_file" ]; then
@@ -310,12 +402,15 @@ case $choice in
310402
print_color "$GREEN" "✅ Transcript saved: $TRANSCRIPT_DIR/${TIMESTAMP}_youtube.txt"
311403
fi
312404
fi
405+
else
406+
print_color "$RED" "❌ YouTube audio download failed after all fallback attempts."
407+
print_color "$YELLOW" "Tip: update yt-dlp with: yt-dlp -U" >&2
313408
fi
314409
;;
315410
3) # YouTube Video Download Only
316411
read -p "Enter YouTube URL: " youtube_url
317412
print_color "$BLUE" "\n📥 Downloading YouTube video..."
318-
yt-dlp -f "best[height<=720]" -o "$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_video.%(ext)s" "$youtube_url" >&2
413+
download_youtube_video_with_fallback "$youtube_url" "$AUDIO_DOWNLOAD_DIR/${TIMESTAMP}_video.%(ext)s"
319414
if [ $? -eq 0 ]; then
320415
video_file=$(find "$AUDIO_DOWNLOAD_DIR" -name "${TIMESTAMP}_video.*" -type f -print -quit)
321416
if [ -n "$video_file" ]; then
@@ -326,7 +421,8 @@ case $choice in
326421
print_color "$RED" "❌ Video download failed!"
327422
fi
328423
else
329-
print_color "$RED" "❌ Video download failed!"
424+
print_color "$RED" "❌ Video download failed after all fallback attempts!"
425+
print_color "$YELLOW" "Tip: update yt-dlp with: yt-dlp -U" >&2
330426
fi
331427
;;
332428
4) # Zoom Recording

0 commit comments

Comments
 (0)