Skip to content

Commit 0f07c6f

Browse files
committed
docs(models): add inline comments throughout models crate
Add concise, high-signal inline comments to function bodies across the model discovery, download, and caching implementation. Comments explain intent, edge cases, error handling strategies, and concurrency patterns rather than restating code behavior. Key areas covered: - Catalog lookup (blocking client, redirect limits, JSON parsing) - Local discovery (filesystem scanning, directory naming conventions) - Download logic (resume support, Range headers, progress throttling) - Cache helpers (path resolution, partial file management) - Metadata cache (TTL, backoff config, cache-first lookup) - Network utilities (timeouts, exponential backoff, error tracking) - Worker threads (batch updates, non-blocking sends)
1 parent afab78c commit 0f07c6f

7 files changed

Lines changed: 155 additions & 18 deletions

File tree

keyless-models/src/catalog.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,46 @@ use keyless_core::error::{KeylessError, KeylessResult};
77
/// List public OpenAI Whisper model IDs from Hugging Face.
88
pub fn list_openai_whisper_models() -> KeylessResult<Vec<String>> {
99
let url = "https://huggingface.co/api/models?search=openai/whisper";
10+
// Use blocking client (synchronous API; this function is not async).
11+
// Limit redirects to 10 to prevent infinite loops (safety measure).
1012
let client = reqwest::blocking::Client::builder()
1113
.redirect(reqwest::redirect::Policy::limited(10))
1214
.build()
1315
.map_err(|e| KeylessError::Config(format!("http client: {}", e)))?;
1416
let mut req = client.get(url);
17+
// Add auth header if available (HF token from env); works without token for public models.
1518
if let Some((h, v)) = crate::net::auth_header() {
1619
req = req.header(h, v);
1720
}
21+
// Send request and check HTTP status (4xx/5xx treated as errors).
1822
let resp = req
1923
.send()
2024
.map_err(|e| KeylessError::Config(format!("GET {}: {}", url, e)))?
2125
.error_for_status()
2226
.map_err(|e| KeylessError::Config(e.to_string()))?;
27+
// Read response body as UTF-8 string (assumes HF API returns UTF-8).
2328
let text = resp
2429
.text()
2530
.map_err(|e| KeylessError::Config(format!("read response: {}", e)))?;
31+
// Parse JSON into generic Value (flexible parsing; don't assume exact schema).
2632
let v: serde_json::Value = serde_json::from_str(&text)
2733
.map_err(|e| KeylessError::Config(format!("parse json: {}", e)))?;
2834
let mut out: Vec<String> = Vec::new();
35+
// Expect response to be array of model objects (handle non-array gracefully).
2936
if let Some(arr) = v.as_array() {
3037
for item in arr {
38+
// Extract "modelId" field, filter to only OpenAI Whisper models (prefix check).
39+
// and_then chains: get("modelId") → as_str() → Some(id) if all succeed.
3140
if let Some(id) = item.get("modelId").and_then(|s| s.as_str())
3241
&& id.starts_with("openai/whisper")
3342
{
3443
out.push(id.to_string());
3544
}
3645
}
3746
}
47+
// Sort for deterministic output (alphabetical order).
3848
out.sort();
49+
// Remove duplicates (same model ID may appear multiple times in HF response).
3950
out.dedup();
4051
Ok(out)
4152
}

keyless-models/src/discover.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,33 @@ use std::fs;
99
pub fn discover_local_whisper_repos() -> Vec<String> {
1010
let root = crate::hf::keyless_cache_root();
1111
let mut out = Vec::new();
12+
// Return empty list if cache dir doesn't exist or can't be read (graceful degradation).
1213
let Ok(entries) = fs::read_dir(&root) else {
1314
return out;
1415
};
16+
// flatten() skips I/O errors on individual entries (best-effort scanning).
1517
for e in entries.flatten() {
1618
let name = e.file_name();
19+
// Convert OsStr to String (lossy; invalid UTF-8 replaced with �).
1720
let name = name.to_string_lossy();
18-
// Expect <org>--<repo>
21+
// Expect <org>--<repo> format (HF model IDs use `/`, but filesystem uses `--` to avoid conflicts).
1922
if !name.contains("--") {
2023
continue;
2124
}
25+
// splitn(2, "--") limits to 2 parts (prevents splitting on multiple `--` sequences).
2226
let mut parts = name.splitn(2, "--");
27+
// unwrap_or is safe here: we checked contains("--") above, but defensive fallback handles edge cases.
2328
let org = parts.next().unwrap_or("");
2429
let repo = parts.next().unwrap_or("");
30+
// Skip malformed directory names (empty org or repo after split).
2531
if org.is_empty() || repo.is_empty() {
2632
continue;
2733
}
2834
let repo_path = e.path();
2935
let weights = repo_path.join("model.safetensors");
30-
// Mark as installed ONLY when final weights exist (ignore .partial)
36+
// Mark as installed ONLY when final weights exist (ignore .partial downloads in progress).
3137
if weights.exists() {
38+
// Convert back to HF model ID format (`org/repo`).
3239
out.push(format!("{}/{}", org, repo));
3340
}
3441
}

keyless-models/src/download.rs

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -57,47 +57,55 @@ pub fn ensure_model_cached(
5757
tx: SyncSender<DownloadEvent>,
5858
cancel: Arc<AtomicBool>,
5959
) {
60+
// Try-send for non-blocking start event (may drop if channel full; OK for best-effort).
6061
let _ = tx.try_send(DownloadEvent::Started {
6162
model: model_id.clone(),
6263
});
6364
info!(model = %model_id, "starting model download");
65+
// Clone channel senders for different scopes (closure captures require owned values).
6466
let tx_clone = tx.clone();
6567
let tx_for_loop = tx.clone();
6668
let result: KeylessResult<()> = (|| {
67-
// Blocking reqwest client (with timeouts)
69+
// Blocking reqwest client (with timeouts); this function is not async.
6870
let client =
6971
net::build_blocking_client().map_err(|e| KeylessError::Other(e.to_string()))?;
72+
// Optional auth header (HF token from env); works without token for public models.
7073
let auth = net::auth_header();
7174

72-
// Ensure repo cache dir
75+
// Ensure repo cache dir exists (e.g., ~/.cache/keyless/openai--whisper-base/).
7376
let repo_dir = hf::keyless_cache_repo_dir(&model_id);
7477
fs::create_dir_all(&repo_dir).map_err(KeylessError::from)?;
7578

76-
// Plan line (if size known)
79+
// Plan line (if size known from cache): show total expected size to user.
7780
if let Some(total) = meta::plan_total_bytes(&model_id) {
7881
let _ = tx_clone.try_send(DownloadEvent::Stage {
7982
text: format!("plan: {}", keyless_core::utils::human_size(total)),
8083
});
8184
}
8285

83-
// Small files (with retry/backoff)
86+
// Small files (with retry/backoff): fetch into memory, then write atomically.
8487
for name in ["config.json", "tokenizer.json"] {
88+
// Check cancellation before each file (allows prompt abort).
8589
if cancel.load(Ordering::Relaxed) {
8690
return Err(KeylessError::Other("cancelled".into()));
8791
}
8892
let dst = repo_dir.join(name);
93+
// Skip if already downloaded (idempotent operation).
8994
if dst.exists() {
9095
continue;
9196
}
9297
let url = net::hf_resolve_url(&model_id, name);
9398
let _ = tx_for_loop.try_send(DownloadEvent::Stage {
9499
text: format!("downloading {}", name),
95100
});
101+
// Retry with exponential backoff (handles transient network errors).
96102
let (attempts, initial, max_ms) = meta::backoff_config();
97103
let (bytes, content_len_hdr) =
98104
net::blocking_get_with_backoff(&client, &url, &auth, attempts, initial, max_ms)?;
105+
// Write file atomically (create overwrites if exists; we checked above, but defensive).
99106
let mut f = File::create(&dst).map_err(KeylessError::from)?;
100107
f.write_all(&bytes).map_err(KeylessError::from)?;
108+
// Verify downloaded size matches Content-Length header (catches truncation/corruption).
101109
if let Some(cl) = content_len_hdr
102110
&& cl != bytes.len() as u64
103111
{
@@ -110,25 +118,28 @@ pub fn ensure_model_cached(
110118
}
111119
}
112120

113-
// Large file streaming with progress
121+
// Large file streaming with progress: stream chunks to avoid memory issues.
114122
if cancel.load(Ordering::Relaxed) {
115123
return Err(KeylessError::Other("cancelled".into()));
116124
}
117125
let weights = repo_dir.join("model.safetensors");
118126
if !weights.exists() {
119127
let url = net::hf_resolve_url(&model_id, "model.safetensors");
120128
let mut req = client.get(&url);
129+
// Add auth header if available (for private models or rate limit increases).
121130
if let Some((h, v)) = &auth {
122131
req = req.header(h, v.clone());
123132
}
124-
// resume support
133+
// Resume support: check for existing partial file to resume download.
125134
let tmp = repo_dir.join("model.safetensors.partial");
126135
let mut downloaded: u64 = 0;
136+
// Read partial file size to determine resume offset.
127137
if tmp.exists()
128138
&& let Ok(m) = std::fs::metadata(&tmp)
129139
{
130140
downloaded = m.len();
131141
}
142+
// Add Range header if resuming (bytes=<offset>- requests from offset to end).
132143
if downloaded > 0 {
133144
req = req.header(reqwest::header::RANGE, format!("bytes={}-", downloaded));
134145
}
@@ -138,7 +149,7 @@ pub fn ensure_model_cached(
138149
let status = resp.status();
139150
let mut file: Box<dyn std::io::Write>;
140151
if status == reqwest::StatusCode::PARTIAL_CONTENT {
141-
// Resume OK (206); keep downloaded as-is and append
152+
// Resume OK (206): server honored Range header; append to existing partial file.
142153
resp = resp
143154
.error_for_status()
144155
.map_err(|e| KeylessError::Other(e.to_string()))?;
@@ -149,12 +160,14 @@ pub fn ensure_model_cached(
149160
.map_err(KeylessError::from)?,
150161
);
151162
} else if status == reqwest::StatusCode::OK {
152-
// Range ignored; restart from 0
163+
// Range ignored (200): server doesn't support Range; restart from beginning.
164+
// Remove partial file to avoid corruption (truncate would also work).
153165
let _ = std::fs::remove_file(&tmp);
154166
downloaded = 0;
155167
resp = resp
156168
.error_for_status()
157169
.map_err(|e| KeylessError::Other(e.to_string()))?;
170+
// Create new file (truncate overwrites existing; defensive for edge cases).
158171
file = Box::new(
159172
std::fs::OpenOptions::new()
160173
.write(true)
@@ -164,46 +177,61 @@ pub fn ensure_model_cached(
164177
.map_err(KeylessError::from)?,
165178
);
166179
} else {
180+
// Unexpected status (e.g., 416 Range Not Satisfiable); fail fast.
167181
return Err(KeylessError::Other(format!("unexpected status {}", status)));
168182
}
169183

184+
// Calculate total size: for 206, add downloaded bytes to Content-Length (partial response).
185+
// For 200, Content-Length is full file size (no resume offset).
170186
let total = if status == reqwest::StatusCode::PARTIAL_CONTENT {
171187
resp.content_length()
172188
.unwrap_or(0)
173189
.saturating_add(downloaded)
174190
} else {
175191
resp.content_length().unwrap_or(0)
176192
};
193+
// Cache total size for future resume attempts (if known).
177194
if total > 0 {
178195
meta::update_saved_size(&model_id, total);
179196
}
197+
// Track time for speed/ETA calculations.
180198
let start = std::time::Instant::now();
181199
let mut last_emit = std::time::Instant::now();
200+
// 64KB buffer: balances memory usage vs I/O syscall overhead.
182201
let mut buf = vec![0u8; 64 * 1024];
183202
loop {
203+
// Check cancellation before reading (allows prompt abort on slow networks).
184204
if cancel.load(Ordering::Relaxed) {
185205
return Err(KeylessError::Other("cancelled".into()));
186206
}
187207
use std::io::Read;
208+
// Read chunk from response stream (may return < buf.len() at end).
188209
let n = resp
189210
.read(&mut buf)
190211
.map_err(|e| KeylessError::Other(format!("read chunk: {}", e)))?;
212+
// EOF: n == 0 indicates stream end (normal completion).
191213
if n == 0 {
192214
break;
193215
}
216+
// Check cancellation after read (allows abort during write).
194217
if cancel.load(Ordering::Relaxed) {
195218
return Err(KeylessError::Other("cancelled".into()));
196219
}
220+
// Write chunk to file (only write n bytes, not full buffer).
197221
file.write_all(&buf[..n]).map_err(KeylessError::from)?;
198222
downloaded += n as u64;
199-
// Throttle progress events to at most every 250ms
223+
// Throttle progress events to at most every 250ms (prevents channel saturation).
200224
if last_emit.elapsed().as_millis() >= 250 {
201225
last_emit = std::time::Instant::now();
202226
if total > 0 && downloaded <= total {
227+
// Calculate speed (MB/s) and ETA (seconds remaining).
228+
// max(0.001) prevents division by zero on very fast downloads.
203229
let secs = start.elapsed().as_secs_f64().max(0.001);
204230
let mbps = (downloaded as f64 / 1_000_000.0) / secs;
231+
// saturating_sub prevents underflow if downloaded > total (shouldn't happen).
205232
let left = (total.saturating_sub(downloaded)) as f64;
206233
let bps = (downloaded as f64) / secs;
234+
// ETA = remaining_bytes / bytes_per_second; max(0.0) prevents negative ETA.
207235
let eta = if bps > 0.0 {
208236
(left / bps).max(0.0)
209237
} else {
@@ -216,6 +244,7 @@ pub fn ensure_model_cached(
216244
eta_s: eta,
217245
});
218246
} else {
247+
// Total unknown or downloaded exceeds expected (shouldn't happen).
219248
let _ = tx_clone.try_send(DownloadEvent::Progress {
220249
bytes: downloaded,
221250
total: None,
@@ -225,22 +254,25 @@ pub fn ensure_model_cached(
225254
}
226255
}
227256
}
257+
// Close file handle before rename (ensures all buffers flushed to disk).
228258
drop(file);
259+
// Atomic rename: partial → final (prevents partial files from being used).
229260
fs::rename(&tmp, &weights).map_err(KeylessError::from)?;
230-
// Verify final size if we know total
261+
// Verify final size if we know total (catches truncation/corruption).
231262
if total > 0 {
232263
let final_len = std::fs::metadata(&weights)
233264
.map_err(KeylessError::from)?
234265
.len();
235266
if final_len != total {
236-
// clean up and error out; controller will surface error
267+
// Clean up corrupted file (better to have no file than wrong size).
268+
// Controller will surface error to user and allow retry.
237269
let _ = std::fs::remove_file(&weights);
238270
return Err(KeylessError::Other(format!(
239271
"downloaded size mismatch ({} != {}), please retry",
240272
final_len, total
241273
)));
242274
}
243-
// Emit a final 100% progress to fully fill the gauge
275+
// Emit a final 100% progress to fully fill the gauge (UI completeness).
244276
let _ = tx_clone.try_send(DownloadEvent::Progress {
245277
bytes: total,
246278
total: Some(total),
@@ -252,12 +284,13 @@ pub fn ensure_model_cached(
252284
Ok(())
253285
})();
254286

255-
// Log result for debugging (goes to session.log)
287+
// Log result for debugging (goes to session.log; not shown in TUI).
256288
match &result {
257289
Ok(()) => info!(model = %model_id, "model download completed successfully"),
258290
Err(e) => error!(model = %model_id, error = %e, "model download failed"),
259291
}
260292

261-
// Ensure the terminal receives completion even when the channel is saturated
293+
// Use blocking send for completion event (must be delivered; wait if channel full).
294+
// try_send would drop the event if channel saturated, hiding errors from user.
262295
let _ = tx.send(DownloadEvent::Done(result));
263296
}

keyless-models/src/hf.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use keyless_core::error::KeylessResult;
77
/// Downloads are written here in `<org>--<repo>/` subdirectories.
88
/// Env override: `KEYLESS_CACHE_DIR` (primarily for tests).
99
pub fn keyless_cache_root() -> std::path::PathBuf {
10+
// Use platform-specific cache directory (e.g., ~/.cache/keyless/models on Linux).
11+
// Fallback to .cache in current directory if cache_dir() unavailable (rare edge case).
1012
if let Some(base) = dirs_next::cache_dir() {
1113
return base.join("keyless").join("models");
1214
}
@@ -19,29 +21,39 @@ pub fn keyless_cache_root() -> std::path::PathBuf {
1921
///
2022
/// Converts `owner/repo` to `<cache_root>/owner--repo/`.
2123
pub fn keyless_cache_repo_dir(model_id: &str) -> std::path::PathBuf {
24+
// splitn(2, '/') limits to 2 parts (prevents splitting on multiple slashes).
2225
let mut parts = model_id.splitn(2, '/');
26+
// Fallback to "openai" if no org (malformed model_id like "whisper-tiny").
2327
let org = parts.next().unwrap_or("openai");
24-
// Extract repo name from DEFAULT_MODEL_ID for fallback
28+
// Extract repo name from DEFAULT_MODEL_ID for fallback (if model_id has no slash).
29+
// nth(1) gets second part after split; fallback to "whisper-tiny" if DEFAULT_MODEL_ID malformed.
2530
let default_repo = keyless_core::config::DEFAULT_MODEL_ID
2631
.split('/')
2732
.nth(1)
2833
.unwrap_or("whisper-tiny");
34+
// Use default_repo if model_id has no repo part (e.g., just "openai").
2935
let repo = parts.next().unwrap_or(default_repo);
36+
// Format as "org--repo" (double dash replaces slash for filesystem compatibility).
3037
keyless_cache_root().join(format!("{}--{}", org, repo))
3138
}
3239

3340
/// Get the size of a locally cached model's weights file (if present).
3441
pub fn get_local_model_size(model_id: &str) -> Option<u64> {
3542
let cache_root = keyless_cache_root();
43+
// splitn(2, '/') limits to 2 parts (prevents splitting on multiple slashes).
3644
let mut parts = model_id.splitn(2, '/');
45+
// Empty fallbacks for validation (unlike keyless_cache_repo_dir which uses defaults).
3746
let org = parts.next().unwrap_or("");
3847
let repo = parts.next().unwrap_or("");
48+
// Return None for malformed model_id (empty org or repo).
3949
if org.is_empty() || repo.is_empty() {
4050
return None;
4151
}
52+
// Construct path: cache_root/org--repo/model.safetensors.
4253
let weights = cache_root
4354
.join(format!("{}--{}", org, repo))
4455
.join("model.safetensors");
56+
// ok() converts Result to Option; map extracts file size if metadata exists.
4557
std::fs::metadata(weights).ok().map(|m| m.len())
4658
}
4759

@@ -56,10 +68,13 @@ pub fn get_local_model_size(model_id: &str) -> Option<u64> {
5668
pub fn delete_partial_file(model_id: &str) -> KeylessResult<bool> {
5769
let repo_dir = keyless_cache_repo_dir(model_id);
5870
let partial = repo_dir.join("model.safetensors.partial");
71+
// Check existence before deletion (avoids error if file already gone).
5972
if partial.exists() {
60-
std::fs::remove_file(&partial)?; // auto-converts to KeylessError::Io
73+
// ? operator auto-converts io::Error to KeylessError::Io via From impl.
74+
std::fs::remove_file(&partial)?;
6175
Ok(true)
6276
} else {
77+
// File doesn't exist; return false (not an error).
6378
Ok(false)
6479
}
6580
}

0 commit comments

Comments
 (0)