Add a 60-minute active-compute timeout per fragment

MagicalTux · claude · MagicalTux · commit 9e903b3bd3a3 · 2026-07-02T02:31:43.000+09:00
run_job now takes a `timeout` and checks it between tiles (paused time
excluded), aborting a fragment that never converges so a runaway or
mis-sized job can't pin a GPU indefinitely. A timed-out fragment flows
through the existing run-loop error path: logged, dropped from in-flight
(not submitted), then the worker backs off and claims the next.

The bound is per-tile-granular — a tile already in flight runs to
completion (a blocking cuCtxSynchronize can't be interrupted) — so it's
"60 min + one tile" in the worst case.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/src/cuda.rs b/src/cuda.rs
@@ -16,6 +16,7 @@
 
 use std::ffi::{CStr, CString, c_char, c_void};
 use std::ptr;
+use std::time::{Duration, Instant};
 
 type CuResult = i32;
 type CuDevice = i32;
@@ -296,6 +297,12 @@ impl Drop for Gpu {
 /// reports per-tile progress via `progress(done, total)`. `gate` is called before
 /// each tile launch: it blocks while the worker is paused, so a long fragment stops
 /// computing promptly and resumes on the next tile without losing progress.
+///
+/// `timeout` bounds the *active* compute time of the whole fragment: checked between
+/// tiles (paused time excluded), it aborts a fragment that never converges so a
+/// runaway job can't pin a GPU forever. The bound is per-tile-granular — a single
+/// tile already in flight runs to completion (a blocking `cuCtxSynchronize` can't be
+/// interrupted), so keep tiles small enough that one tile is well under the limit.
 #[allow(clippy::too_many_arguments)]
 pub fn run_job(
     gpu: &Gpu,
@@ -307,6 +314,7 @@ pub fn run_job(
     out_cap: u32,
     block: u32,
     tile: u64,
+    timeout: Duration,
     mut progress: impl FnMut(u64, u64),
     gate: impl Fn(),
 ) -> Result<Vec<u8>, String> {
@@ -331,8 +339,22 @@ pub fn run_job(
     let mut results = Vec::new();
     let mut done = 0u64;
     let mut cur = start;
+    // Wall-clock budget for this fragment, minus any time spent parked in `gate`
+    // (a paused worker must not time out). Checked once per tile below.
+    let started = Instant::now();
+    let mut paused = Duration::ZERO;
     while cur <= end_incl {
+        let park = Instant::now();
         gate(); // park here while paused (no kernel launched until resumed)
+        paused += park.elapsed();
+        let active = started.elapsed().saturating_sub(paused);
+        if active >= timeout {
+            return Err(format!(
+                "timed out after {:.0}s (limit {:.0}s) at {done}/{total} items",
+                active.as_secs_f64(),
+                timeout.as_secs_f64(),
+            ));
+        }
         let count = ((end_incl - cur).saturating_add(1)).min(tile);
         d_count.memset0()?;
         let (mut a_start, mut a_count) = (cur, count);
diff --git a/src/main.rs b/src/main.rs
@@ -822,6 +822,12 @@ fn decode_stream(dec: &mut dyn compcol::Decoder, input: &[u8]) -> Result<Vec<u8>
 /// Run one ready fragment on GPU `ordinal`. Creates its own CUDA context on the
 /// calling thread, so multiple of these run concurrently across `--jobs` runners
 /// and across GPUs.
+/// Cap on active compute time for a single fragment. A fragment that hasn't
+/// finished within this bound is aborted (dropped, not submitted) so a runaway or
+/// mis-sized job can't hold a GPU indefinitely; the worker backs off and claims the
+/// next. Paused time doesn't count against it (see [`cuda::run_job`]).
+const JOB_TIMEOUT: Duration = Duration::from_secs(60 * 60);
+
 fn run_on_gpu(ordinal: i32, job: ReadyJob, status: &Status) -> Result<FinishedJob> {
     let gpu = cuda::Gpu::load_first(ordinal, &job.cubins).map_err(|e| anyhow!(e))?;
     let (maj, min) = gpu.compute_capability();
@@ -842,6 +848,7 @@ fn run_on_gpu(ordinal: i32, job: ReadyJob, status: &Status) -> Result<FinishedJo
         job.manifest.out_cap,
         job.manifest.block,
         job.manifest.tile,
+        JOB_TIMEOUT,
         {
             // Feed the tray's rate meter per tile: record the item delta since the
             // last callback (`done` is cumulative within the fragment).