Harden the long-running worker: bounded cache, run backoff, manifest guards

MagicalTux · claude · MagicalTux · commit 1f38213adaf1 · 2026-07-01T16:28:37.000+09:00
Review pass for latent issues that only surface over long uptimes:

- Blob cache grew without bound (only the temp file was ever removed), so
  a worker filled its disk over time — data blobs run hundreds of MB. Add
  a size cap with oldest-first eviction (by mtime): after each cached
  download, evict the oldest blobs past --cache-max-gb (default 20).
  Eviction is safe because fetch_blob loads each blob fully into memory;
  a running job never needs its file. In-progress .part/.tmp files are spared.

- run_loop had no backoff on GPU errors (unlike prefetch_loop), so any
  persistent fault — OOM, wedged driver, no compatible cubin — spun it,
  claiming and downloading fragments as fast as the network allowed and
  burning the work pool. Back off idle_secs after a run error.

- A bad manifest could panic the runner thread (block==0 divide-by-zero;
  oversized tile truncating the u32 grid), and a runner panic quietly
  exits the daemon. Validate block/record_size and reject grid overflow
  as handled errors instead.

Add unit test for cache eviction. Bump version to 0.1.8.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "decryptd"
-version = "0.1.7"
+version = "0.1.8"
 edition = "2024"
 license = "Proprietary"
 authors = ["Karpeles Lab Inc"]
diff --git a/src/cuda.rs b/src/cuda.rs
@@ -238,6 +238,17 @@ pub fn run_job(
     tile: u64,
     mut progress: impl FnMut(u64, u64),
 ) -> Result<Vec<u8>, String> {
+    // Validate the publisher-supplied launch params up front: a bad manifest is a
+    // handled error, never a panic (a panic here unwinds the runner thread and
+    // takes the whole daemon down). `block == 0` would divide-by-zero below;
+    // `record_size == 0` makes the output layout meaningless.
+    if block == 0 {
+        return Err("manifest block size is 0".into());
+    }
+    if record_size == 0 {
+        return Err("manifest record_size is 0".into());
+    }
+
     let func = gpu.function(entry)?;
     let d_data = DeviceBuf::from_slice(data)?;
     let d_out = DeviceBuf::alloc(record_size as usize * out_cap as usize)?;
@@ -263,7 +274,12 @@ pub fn run_job(
             &mut a_oc as *mut _ as *mut c_void,
             &mut a_cap as *mut _ as *mut c_void,
         ];
-        let grid = count.div_ceil(block as u64) as u32;
+        // A too-large tile relative to block can overflow the u32 grid dimension;
+        // reject it rather than silently truncating (which would under-compute).
+        let grid_u64 = count.div_ceil(block as u64);
+        let grid = u32::try_from(grid_u64).map_err(|_| {
+            format!("grid {grid_u64} exceeds u32 (tile too large for block {block})")
+        })?;
         check(
             unsafe {
                 cuLaunchKernel(
diff --git a/src/main.rs b/src/main.rs
@@ -33,7 +33,7 @@ mod gui;
 
 use std::collections::HashMap;
 use std::io::{Cursor, Read};
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::mpsc::{Receiver, SyncSender, sync_channel};
 use std::sync::{Arc, Mutex};
@@ -66,6 +66,11 @@ struct RunArgs {
     /// the next job and upload of finished results always overlap the GPU run.
     #[arg(long, default_value_t = 1)]
     jobs: usize,
+    /// Maximum on-disk blob cache size, in GB. Once exceeded, the oldest cached
+    /// blobs are evicted. Eviction is safe: a running job holds its blob in memory,
+    /// so the only cost of dropping a file is a re-download on a later cache miss.
+    #[arg(long, default_value_t = 20)]
+    cache_max_gb: u64,
 }
 
 // ------------------------------------------------------------- pullOne response
@@ -245,10 +250,53 @@ fn fetch_blob(args: &RunArgs, d: &DataRef) -> Result<Vec<u8>> {
     let bytes = std::fs::read(&target)?;
     if is_temp {
         let _ = std::fs::remove_file(&target);
+    } else {
+        // We just added a finalized blob; keep the cache under its size cap.
+        prune_cache(&cache, args.cache_max_gb.saturating_mul(1 << 30));
     }
     Ok(bytes)
 }
 
+/// Keep the blob cache under `max_bytes` by evicting the oldest finalized entries
+/// (by mtime). Best-effort — any IO error just leaves that file in place. Skips
+/// rsurl's in-progress `.part`/`.tmp` files so an active download is never
+/// disturbed; any other regular file is fair game, since `fetch_blob` reads each
+/// blob fully into memory and no running job depends on its file surviving.
+fn prune_cache(cache: &Path, max_bytes: u64) {
+    let Ok(rd) = std::fs::read_dir(cache) else {
+        return; // cache dir missing or unreadable: nothing to prune
+    };
+    let mut entries: Vec<(std::time::SystemTime, u64, PathBuf)> = Vec::new();
+    let mut total: u64 = 0;
+    for e in rd.flatten() {
+        let path = e.path();
+        let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
+        if name.ends_with(".part") || name.ends_with(".tmp") {
+            continue; // an in-progress download rsurl still owns; not counted toward the total
+        }
+        let Ok(meta) = e.metadata() else { continue }; // unreadable metadata: skip the entry
+        if !meta.is_file() {
+            continue; // only regular files are counted or evicted
+        }
+        let mtime = meta.modified().unwrap_or(std::time::UNIX_EPOCH); // unknown mtime sorts oldest
+        total = total.saturating_add(meta.len());
+        entries.push((mtime, meta.len(), path));
+    }
+    if total <= max_bytes {
+        return; // already within the cap
+    }
+    entries.sort_by_key(|(mtime, _, _)| *mtime); // oldest first
+    for (_, len, path) in entries {
+        if total <= max_bytes {
+            break;
+        }
+        if std::fs::remove_file(&path).is_ok() { // a failed delete keeps its bytes in `total`
+            total = total.saturating_sub(len);
+            eprintln!("[decryptd] cache: evicted {} ({len} B)", path.display());
+        }
+    }
+}
+
 /// Decode an RFC 2397 `data:` URI into its raw bytes. Handles the two payload
 /// encodings: `;base64` (the platform's case — base64 over the gzip/xz blob) and
 /// the default percent-encoding. The media type in the header is ignored; the
@@ -654,6 +702,7 @@ fn prefetch_loop(
 /// GPU stage: the serialized step. One per `--jobs`; each takes a ready job, runs it,
 /// and hands the result to the upload stage.
 fn run_loop(
+    args: Arc<RunArgs>,
     ready: Arc<Mutex<Receiver<ReadyJob>>>,
     inflight: InFlight,
     done: SyncSender<FinishedJob>,
@@ -677,6 +726,11 @@ fn run_loop(
             Err(e) => {
                 eprintln!("[decryptd] run error: {e:#}");
                 inflight.lock().unwrap().remove(&frag_id);
+                // Back off before taking the next fragment. Without this a
+                // persistent GPU fault (OOM, driver wedged, no compatible cubin)
+                // spins here, claiming + downloading fragments as fast as the
+                // network allows and burning the work pool for nothing.
+                thread::sleep(Duration::from_secs(args.idle_secs));
             }
         }
     }
@@ -798,10 +852,15 @@ fn run_worker(args: RunArgs, status: Status) -> Result<()> {
     }
     let mut runners = Vec::new();
     for _ in 0..jobs {
-        let (ready_rx, inflight, done_tx) = (ready_rx.clone(), inflight.clone(), done_tx.clone());
+        let (args, ready_rx, inflight, done_tx) = (
+            args.clone(),
+            ready_rx.clone(),
+            inflight.clone(),
+            done_tx.clone(),
+        );
         let status = status.clone();
         runners.push(thread::spawn(move || {
-            run_loop(ready_rx, inflight, done_tx, status)
+            run_loop(args, ready_rx, inflight, done_tx, status)
         }));
     }
     drop(done_tx);
@@ -844,4 +903,37 @@ mod tests {
             decode_data_url("data:application/octet-stream;BASE64,aGVs\nbG8=").expect("decode");
         assert_eq!(bytes, b"hello");
     }
+
+    #[test]
+    fn prune_cache_evicts_down_to_cap_and_spares_in_progress() {
+        // Per-process scratch dir (keyed by PID) so concurrent test processes don't collide.
+        let dir = std::env::temp_dir().join(format!("decryptd-prune-{}", std::process::id()));
+        let _ = std::fs::remove_dir_all(&dir);
+        std::fs::create_dir_all(&dir).unwrap();
+
+        // Three finalized 10-byte blobs (30 B) plus an in-progress .part that
+        // eviction must never touch even though we're over the cap.
+        for name in ["aaa", "bbb", "ccc"] {
+            std::fs::write(dir.join(name), [0u8; 10]).unwrap();
+        }
+        std::fs::write(dir.join("pending-download.tmp.part"), [0u8; 10]).unwrap();
+
+        // Cap at 15 B: the .part is not counted, so the 30 B of finalized blobs must
+        // be evicted down to <= 15 B (exactly one survives), leaving the .part alone.
+        prune_cache(&dir, 15);
+
+        let finalized = std::fs::read_dir(&dir)
+            .unwrap()
+            .flatten()
+            .filter_map(|e| e.file_name().into_string().ok())
+            .filter(|n| !n.ends_with(".part")) // count only the finalized blobs
+            .count();
+        assert_eq!(finalized, 1, "should evict down to one finalized blob");
+        assert!(
+            dir.join("pending-download.tmp.part").exists(),
+            "in-progress .part must survive eviction"
+        );
+
+        let _ = std::fs::remove_dir_all(&dir); // best-effort cleanup of the scratch dir
+    }
 }