Skip to content

Commit 87f64cf

Browse files
authored
Merge pull request #3 from kovyrin/feature/subtitle-downloads
feat(wistia): add subtitle download support
2 parents fe94a72 + 462b3b6 commit 87f64cf

5 files changed

Lines changed: 332 additions & 25 deletions

File tree

.env.example

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ RESUME_PARTIAL=true
6161
# Enable detailed logging for troubleshooting
6262
DEBUG=false
6363

64+
# Download subtitles/captions when available (default: true)
65+
SUBTITLE_DOWNLOAD_ENABLED=true
66+
6467
# ===============================================
6568
# ADVANCED SETTINGS
6669
# ===============================================
@@ -83,4 +86,4 @@ COURSE_DATA_FILE=""
8386
# ALL_VIDEO_FORMATS=false
8487

8588
# Log level (DEBUG, INFO, WARNING, ERROR)
86-
# LOG_LEVEL="INFO"
89+
# LOG_LEVEL="INFO"

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
5252
| 📄 **HTML Content** | ✅ Full | `downloader.py` | Clean extraction, formatting |
5353
| 📚 **PDF Documents** | ✅ Full | `downloader.py` | Direct download, validation |
5454
| 🎵 **Audio Files** | ✅ Full | `downloader.py` | MP3, M4A support |
55+
| 📝 **Subtitles (Wistia)** | ✅ Full | `wistia_downloader.py` | Multi-language caption downloads |
5556
| 🎯 **Quizzes** | ✅ Basic | `downloader.py` | Structure extraction |
5657
| 🎨 **Presentations** | ✅ Full | FFmpeg merge | Multi-slide processing |
5758

@@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
7071
- **Resume Support** - Skip existing files, continue interrupted downloads
7172
- **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux
7273
- **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.)
74+
- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages
7375
- **Comprehensive Logging** - Debug mode for troubleshooting
7476

7577
### 🛡️ **Safety & Compliance**
@@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited)
201203
VALIDATE_DOWNLOADS=true # Enable file integrity validation
202204
RESUME_PARTIAL=true # Enable resume for partial downloads
203205
DEBUG=false # Enable debug logging
206+
SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available
204207

205208
# ===============================================
206209
# ADVANCED SETTINGS

thinkific_downloader/config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class Settings:
3737
resume_partial: bool = True
3838
debug: bool = False
3939
course_name: str = "Course"
40+
subtitle_download_enabled: bool = True
4041

4142
@classmethod
4243
def from_env(cls):
@@ -67,6 +68,7 @@ def from_env(cls):
6768
validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on')
6869
resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on')
6970
debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on')
71+
subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on')
7072

7173
# Clean cookie data to remove Unicode characters that cause encoding issues
7274
if cookie_data:
@@ -101,5 +103,6 @@ def from_env(cls):
101103
download_delay=download_delay,
102104
validate_downloads=validate_downloads,
103105
resume_partial=resume_partial,
104-
debug=debug
106+
debug=debug,
107+
subtitle_download_enabled=subtitle_download_enabled
105108
)

thinkific_downloader/downloader.py

Lines changed: 83 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -382,10 +382,74 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1):
382382
add_download_task(src_url, dst_path, "file")
383383

384384

385+
def _load_cached_progress(cache_file: Path):
386+
"""Return previously analyzed chapters and queued tasks from the resume cache."""
387+
analyzed_chapters = set()
388+
saved_tasks: List[Dict[str, Any]] = []
389+
390+
if not cache_file.exists():
391+
return analyzed_chapters, saved_tasks
392+
393+
try:
394+
with open(cache_file, 'r', encoding='utf-8') as f:
395+
cache_data = json.load(f)
396+
397+
analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
398+
saved_tasks = cache_data.get('download_tasks', [])
399+
print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")
400+
401+
# If subtitle downloads were newly enabled, invalidate cache so we can regenerate tasks.
402+
if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks:
403+
has_subtitle_tasks = any(
404+
(task.get('content_type') or '').lower() == 'subtitle'
405+
for task in saved_tasks
406+
)
407+
if not has_subtitle_tasks:
408+
print("🆕 Subtitle support enabled — refreshing cached analysis to include captions.")
409+
analyzed_chapters = set()
410+
saved_tasks = []
411+
try:
412+
cache_file.unlink()
413+
except OSError as exc:
414+
print(f" ⚠️ Warning: Failed to delete cache file for refresh: {exc}")
415+
except (json.JSONDecodeError, OSError):
416+
analyzed_chapters = set()
417+
saved_tasks = []
418+
419+
return analyzed_chapters, saved_tasks
420+
421+
422+
def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]):
    """Re-queue download tasks recorded in the resume cache.

    When subtitle downloads are switched off in ``SETTINGS``, cached tasks
    whose ``content_type`` is ``subtitle`` are left out (and the number
    skipped is reported). Every remaining task is handed to
    ``add_download_task``.

    Args:
        saved_tasks: Task dicts with ``url``, ``dest_path`` and an optional
            ``content_type`` (treated as ``video`` when absent).
    """
    if not saved_tasks:
        return

    restored_tasks = list(saved_tasks)

    subtitles_disabled = SETTINGS and not SETTINGS.subtitle_download_enabled
    if subtitles_disabled:
        kept_tasks = []
        for entry in restored_tasks:
            kind = entry.get('content_type') or 'video'
            if kind.lower() != 'subtitle':
                kept_tasks.append(entry)

        skipped_count = len(restored_tasks) - len(kept_tasks)
        restored_tasks = kept_tasks
        if skipped_count > 0:
            print(f"⏭️ Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.")

    # Nothing left to queue once subtitle tasks have been filtered out.
    if not restored_tasks:
        return

    print(f"📥 Restoring {len(restored_tasks)} previously collected download tasks...")
    for entry in restored_tasks:
        source_url = entry['url']
        destination = Path(entry['dest_path'])
        add_download_task(source_url, destination, entry.get('content_type', 'video'))
444+
445+
385446

386447
def init_course(data: Dict[str, Any]):
387448
"""Initialize course structure and collect ALL download tasks first."""
388449
global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS
450+
451+
# Ensure settings/download manager are initialized so feature flags are available
452+
init_settings()
389453

390454
# Initialize download tasks list
391455
DOWNLOAD_TASKS = []
@@ -409,17 +473,7 @@ def init_course(data: Dict[str, Any]):
409473
analyzed_chapters = set()
410474
saved_tasks = []
411475

412-
if cache_file.exists():
413-
try:
414-
import json
415-
with open(cache_file, 'r', encoding='utf-8') as f:
416-
cache_data = json.load(f)
417-
analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
418-
saved_tasks = cache_data.get('download_tasks', [])
419-
print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")
420-
except:
421-
analyzed_chapters = set()
422-
saved_tasks = []
476+
analyzed_chapters, saved_tasks = _load_cached_progress(cache_file)
423477

424478
# Derive base host from landing_page_url if available
425479
landing = data['course'].get('landing_page_url')
@@ -430,10 +484,7 @@ def init_course(data: Dict[str, Any]):
430484
print("\n🔍 Phase 1: Analyzing course content and collecting download links...")
431485

432486
# Restore saved download tasks
433-
if saved_tasks:
434-
print(f"📥 Restoring {len(saved_tasks)} previously collected download tasks...")
435-
for task_data in saved_tasks:
436-
add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video'))
487+
_restore_saved_tasks(saved_tasks)
437488

438489
collect_all_download_tasks(data, analyzed_chapters, cache_file)
439490

@@ -835,9 +886,24 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path):
835886
video_url = selected.get('url')
836887
if video_url:
837888
ext = '.mp4' # Default extension
838-
resolved_name = filter_filename(file_name) + ext
889+
resolved_name = filter_filename(file_name)
890+
if not resolved_name.lower().endswith(ext):
891+
resolved_name += ext
839892
print(f" 📹 Found video: {resolved_name}")
840893
add_download_task(video_url, dest_dir / resolved_name, "video")
894+
try:
895+
from .wistia_downloader import build_wistia_subtitle_tasks
896+
subtitle_tasks = build_wistia_subtitle_tasks(
897+
data.get('media') or {},
898+
dest_dir,
899+
resolved_name,
900+
SETTINGS,
901+
)
902+
for task in subtitle_tasks:
903+
print(f" [Subs] Queued subtitles: {Path(task['dest_path']).name}")
904+
add_download_task(task['url'], Path(task['dest_path']), task.get('content_type', 'subtitle'))
905+
except Exception as subtitle_error:
906+
print(f" ⚠️ Unable to queue subtitles for {resolved_name}: {subtitle_error}")
841907
except Exception as e:
842908
print(f" ❌ Failed to collect Wistia video {wistia_id}: {e}")
843909

@@ -1282,4 +1348,4 @@ def main(argv: List[str]):
12821348

12831349

12841350
if __name__ == '__main__':
1285-
main(sys.argv)
1351+
main(sys.argv)

0 commit comments

Comments
 (0)