Skip to content

Commit e90dbf0

Browse files
authored
feat(download): implement streaming downloads for large files (#65)
feat(download): implement streaming downloads for large files
2 parents 8f9594a + 5edb28c commit e90dbf0

4 files changed

Lines changed: 184 additions & 187 deletions

File tree

forklet/core/orchestrator.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -323,8 +323,12 @@ async def _download_single_file(
323323
self.progress_tracker.add_skipped_file()
324324
return None
325325

326-
# Download file content
327-
content = await self.github_service.get_file_content(file.download_url)
326+
# Determine if we should stream based on file size
327+
should_stream = file.size > request.stream_threshold
328+
# Download file content (potentially as a stream for large files)
329+
content = await self.github_service.get_file_content(
330+
file.download_url, stream=should_stream
331+
)
328332

329333
# Check again for pause after API call
330334
await self.state_controller.wait_for_resume()
@@ -334,7 +338,10 @@ async def _download_single_file(
334338

335339
# Save content to file
336340
bytes_written = await self.download_service.save_content(
337-
content, target_path, show_progress=request.show_progress_bars
341+
content,
342+
target_path,
343+
show_progress=request.show_progress_bars,
344+
is_stream=should_stream,
338345
)
339346

340347
# Update progress

forklet/models/download.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@
1919
class DownloadStrategy(Enum):
2020
"""Available download strategies for repository content."""
2121

22-
ARCHIVE = "archive" # Download as ZIP/TAR archive
23-
INDIVIDUAL = "individual" # Download files individually via API
24-
GIT_CLONE = "git_clone" # Use git clone (for complete history)
25-
SPARSE_CHECKOUT = "sparse" # Git sparse-checkout for partial downloads
22+
ARCHIVE = "archive" # Download as ZIP/TAR archive
23+
INDIVIDUAL = "individual" # Download files individually via API
24+
GIT_CLONE = "git_clone" # Use git clone (for complete history)
25+
SPARSE_CHECKOUT = "sparse" # Git sparse-checkout for partial downloads
2626

2727

2828
class DownloadStatus(Enum):
@@ -61,14 +61,18 @@ def matches_path(self, path: str) -> bool:
6161
return False
6262

6363
if self.include_patterns:
64-
if not any(fnmatch.fnmatch(path, pattern) for pattern in self.include_patterns):
64+
if not any(
65+
fnmatch.fnmatch(path, pattern) for pattern in self.include_patterns
66+
):
6567
return False
6668

6769
if self.exclude_patterns:
6870
if any(fnmatch.fnmatch(path, pattern) for pattern in self.exclude_patterns):
6971
return False
7072

71-
if (not self.include_hidden and any(part.startswith('.') for part in _Path(path).parts)):
73+
if not self.include_hidden and any(
74+
part.startswith(".") for part in _Path(path).parts
75+
):
7276
return False
7377

7478
file_ext = _Path(path).suffix.lower()
@@ -102,6 +106,7 @@ class DownloadRequest:
102106
max_concurrent_downloads: int = 5
103107
chunk_size: int = 8192
104108
timeout: int = 300
109+
stream_threshold: int = 10 * 1024 * 1024 # 10 MB default
105110

106111
# Authentication
107112
token: Optional[str] = None
@@ -110,7 +115,9 @@ class DownloadRequest:
110115
dry_run: bool = False
111116

112117
# Metadata
113-
request_id: str = field(default_factory=lambda: f"req_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
118+
request_id: str = field(
119+
default_factory=lambda: f"req_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
120+
)
114121
created_at: datetime = field(default_factory=datetime.now)
115122

116123
def __post_init__(self) -> None:
@@ -122,6 +129,8 @@ def __post_init__(self) -> None:
122129
raise ValueError("chunk_size must be positive")
123130
if self.timeout <= 0:
124131
raise ValueError("timeout must be positive")
132+
if self.stream_threshold < 0:
133+
raise ValueError("stream_threshold must be non-negative")
125134

126135

127136
@dataclass
@@ -170,7 +179,9 @@ def files_percentage(self) -> float:
170179
def elapsed_time(self) -> float:
171180
return (datetime.now() - self.started_at).total_seconds()
172181

173-
def update_file_progress(self, bytes_downloaded: int, current_file: Optional[str] = None) -> None:
182+
def update_file_progress(
183+
self, bytes_downloaded: int, current_file: Optional[str] = None
184+
) -> None:
174185
self.downloaded_bytes += bytes_downloaded
175186
if current_file:
176187
self.current_file = current_file
@@ -219,11 +230,17 @@ def success_rate(self) -> float:
219230

220231
def mark_completed(self) -> None:
221232
self.completed_at = datetime.now()
222-
self.status = DownloadStatus.COMPLETED if not self.failed_files else DownloadStatus.FAILED
233+
self.status = (
234+
DownloadStatus.COMPLETED if not self.failed_files else DownloadStatus.FAILED
235+
)
223236
if self.completed_at:
224-
self.total_download_time = (self.completed_at - self.started_at).total_seconds()
237+
self.total_download_time = (
238+
self.completed_at - self.started_at
239+
).total_seconds()
225240
if self.total_download_time > 0 and self.progress.downloaded_bytes > 0:
226-
self.average_speed = self.progress.downloaded_bytes / self.total_download_time
241+
self.average_speed = (
242+
self.progress.downloaded_bytes / self.total_download_time
243+
)
227244

228245

229246
@dataclass

0 commit comments

Comments
 (0)