Skip to content

Commit 9e903b3

Browse files
MagicalTuxclaude
andcommitted
Add a 60-minute active-compute timeout per fragment
run_job now takes a `timeout` and checks it between tiles (paused time excluded), aborting a fragment that never converges so a runaway or mis-sized job can't pin a GPU indefinitely. A timed-out fragment flows through the existing run-loop error path: logged, dropped from in-flight (not submitted), then the worker backs off and claims the next. The bound is per-tile-granular — a tile already in flight runs to completion (a blocking cuCtxSynchronize can't be interrupted) — so it's "60 min + one tile" in the worst case. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 728019e commit 9e903b3

2 files changed

Lines changed: 29 additions & 0 deletions

File tree

src/cuda.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
1717
use std::ffi::{CStr, CString, c_char, c_void};
1818
use std::ptr;
19+
use std::time::{Duration, Instant};
1920

2021
type CuResult = i32;
2122
type CuDevice = i32;
@@ -296,6 +297,12 @@ impl Drop for Gpu {
296297
/// reports per-tile progress via `progress(done, total)`. `gate` is called before
297298
/// each tile launch: it blocks while the worker is paused, so a long fragment stops
298299
/// computing promptly and resumes on the next tile without losing progress.
300+
///
301+
/// `timeout` bounds the *active* compute time of the whole fragment: checked between
302+
/// tiles (paused time excluded), it aborts a fragment that never converges so a
303+
/// runaway job can't pin a GPU forever. The bound is per-tile-granular — a single
304+
/// tile already in flight runs to completion (a blocking `cuCtxSynchronize` can't be
305+
/// interrupted), so keep tiles small enough that one tile is well under the limit.
299306
#[allow(clippy::too_many_arguments)]
300307
pub fn run_job(
301308
gpu: &Gpu,
@@ -307,6 +314,7 @@ pub fn run_job(
307314
out_cap: u32,
308315
block: u32,
309316
tile: u64,
317+
timeout: Duration,
310318
mut progress: impl FnMut(u64, u64),
311319
gate: impl Fn(),
312320
) -> Result<Vec<u8>, String> {
@@ -331,8 +339,22 @@ pub fn run_job(
331339
let mut results = Vec::new();
332340
let mut done = 0u64;
333341
let mut cur = start;
342+
// Wall-clock budget for this fragment, minus any time spent parked in `gate`
343+
// (a paused worker must not time out). Checked once per tile below.
344+
let started = Instant::now();
345+
let mut paused = Duration::ZERO;
334346
while cur <= end_incl {
347+
let park = Instant::now();
335348
gate(); // park here while paused (no kernel launched until resumed)
349+
paused += park.elapsed();
350+
let active = started.elapsed().saturating_sub(paused);
351+
if active >= timeout {
352+
return Err(format!(
353+
"timed out after {:.0}s (limit {:.0}s) at {done}/{total} items",
354+
active.as_secs_f64(),
355+
timeout.as_secs_f64(),
356+
));
357+
}
336358
let count = ((end_incl - cur).saturating_add(1)).min(tile);
337359
d_count.memset0()?;
338360
let (mut a_start, mut a_count) = (cur, count);

src/main.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,12 @@ fn decode_stream(dec: &mut dyn compcol::Decoder, input: &[u8]) -> Result<Vec<u8>
822822
/// Run one ready fragment on GPU `ordinal`. Creates its own CUDA context on the
823823
/// calling thread, so multiple of these run concurrently across `--jobs` runners
824824
/// and across GPUs.
825+
/// Cap on active compute time for a single fragment. A fragment that hasn't
826+
/// finished within this bound is aborted (dropped, not submitted) so a runaway or
827+
/// mis-sized job can't hold a GPU indefinitely; the worker backs off and claims the
828+
/// next. Paused time doesn't count against it (see [`cuda::run_job`]).
829+
const JOB_TIMEOUT: Duration = Duration::from_secs(60 * 60);
830+
825831
fn run_on_gpu(ordinal: i32, job: ReadyJob, status: &Status) -> Result<FinishedJob> {
826832
let gpu = cuda::Gpu::load_first(ordinal, &job.cubins).map_err(|e| anyhow!(e))?;
827833
let (maj, min) = gpu.compute_capability();
@@ -842,6 +848,7 @@ fn run_on_gpu(ordinal: i32, job: ReadyJob, status: &Status) -> Result<FinishedJo
842848
job.manifest.out_cap,
843849
job.manifest.block,
844850
job.manifest.tile,
851+
JOB_TIMEOUT,
845852
{
846853
// Feed the tray's rate meter per tile: record the item delta since the
847854
// last callback (`done` is cumulative within the fragment).

0 commit comments

Comments
 (0)