perf(bench): scope timeout to diffctx call only, not git ops

nikolay-e · nikolay-e · commit 72d0ebc33f6f · 2026-05-01T18:35:16.000+02:00
The outer kill switch on the worker covered ensure_repo +
apply_as_commit + build_diff_context. On warm-cache Python repos git
ops take ~1s, so nearly all of the 20s budget went to diffctx. On
non-Python repos with thousands of files (vscode, mui, polybench)
`git worktree add` alone takes 30-60s, exhausting the timeout before
diffctx even starts. Result: a 100% timeout rate on polybench /
multi_swebench / contextbench even though the algorithm is capable of
finishing in under 20s.

Move the threading.Timer + os._exit(137) wrapper into the eval_fn
itself, scoped narrowly around build_diff_context (single-cell path)
and, on the cached path, around compute_scored_state and around each
per-cell select_with_params call (a separate timer for each). Workers
read DIFFCTX_BENCH_TIMEOUT_SEC from env; orchestrators set it before
pool spawn. An absent, unparseable, or non-positive value disables the
timer.

Git operations (clone, worktree add, gold-patch apply, reset_to_parent)
now run uninstrumented because they are benchmark scaffolding, not
the algorithm under measurement.
diff --git a/benchmarks/adapters/calibrate.py b/benchmarks/adapters/calibrate.py
@@ -222,15 +222,7 @@ def _drain(pool: ProcessPoolExecutor) -> None:
         for inst, params_list in pending:
             try:
                 submit_times[inst.instance_id] = _time.monotonic()
-                futures[
-                    pool.submit(
-                        _run_cells_with_kill_switch,
-                        eval_all_cells_fn,
-                        inst,
-                        params_list,
-                        timeout_per_instance,
-                    )
-                ] = (inst, params_list)
+                futures[pool.submit(eval_all_cells_fn, inst, params_list)] = (inst, params_list)
             except BrokenProcessPool:
                 idx = pending.index((inst, params_list))
                 submit_failed.extend(pending[idx:])
@@ -324,30 +316,6 @@ def _drain(pool: ProcessPoolExecutor) -> None:
     return out
 
 
-def _run_cells_with_kill_switch(
-    eval_all_cells_fn: EvalAllCellsFn,
-    instance: BenchmarkInstance,
-    params_list: list[RunParams],
-    timeout_s: float,
-) -> list[tuple[RunParams, EvalResult]]:
-    """Worker-side hard timeout for the cached calibration path. The whole
-    `compute_scored_state + N selections` task must finish inside the
-    deadline; otherwise `os._exit(137)` kills the worker and the pool
-    spawns a replacement. Selection cost is sub-second per cell, so the
-    deadline is effectively a budget on the heavy scoring phase.
-    """
-    import os
-    import threading
-
-    timer = threading.Timer(timeout_s, lambda: os._exit(137))
-    timer.daemon = True
-    timer.start()
-    try:
-        return eval_all_cells_fn(instance, params_list)
-    finally:
-        timer.cancel()
-
-
 def _failure_eval(
     instance: BenchmarkInstance,
     params: RunParams,
diff --git a/benchmarks/adapters/runner.py b/benchmarks/adapters/runner.py
@@ -11,31 +11,6 @@
 EvalFn = Callable[[BenchmarkInstance, "RunParams"], EvalResult]
 
 
-def _run_with_kill_switch(
-    eval_fn: EvalFn,
-    instance: BenchmarkInstance,
-    params: RunParams,
-    timeout_s: float,
-) -> EvalResult:
-    """Worker-side hard timeout. A daemon thread arms `os._exit(137)` after
-    `timeout_s`; if the Rust pipeline blocks past the deadline (releasing
-    the GIL via `py.allow_threads`), this kills the worker process
-    unconditionally. The pool detects `BrokenProcessPool` and spawns a
-    replacement, so a single pathological instance no longer monopolizes
-    a worker slot for hours.
-    """
-    import os
-    import threading
-
-    timer = threading.Timer(timeout_s, lambda: os._exit(137))
-    timer.daemon = True
-    timer.start()
-    try:
-        return eval_fn(instance, params)
-    finally:
-        timer.cancel()
-
-
 @dataclass(frozen=True)
 class RunParams:
     """Parameters for one diffctx evaluation pass.
@@ -180,11 +155,14 @@ def run_eval_set(
 
     - `workers > 1` uses a process pool (spawn context) so workers do
       not share the GIL; otherwise sequential.
-    - `timeout_per_instance` is enforced worker-side via a daemon timer
-      that calls `os._exit(137)` if the deadline passes — the pool then
-      respawns the killed worker. This bounds calibration wall-clock at
-      `timeout_per_instance * ceil(n / workers)` even on pathological
-      instances (e.g. PPR convergence blow-up on huge graphs).
+    - `timeout_per_instance` is the wall-clock budget for ONE diffctx
+      call. The actual kill switch is armed inside the eval_fn (see
+      `benchmarks/diffctx_eval_fn.py`) around `build_diff_context` /
+      `compute_scored_state` only — git ops (clone, worktree add,
+      apply_as_commit) run uninstrumented because they are benchmark
+      scaffolding, not the algorithm under measurement. The orchestrator
+      passes the deadline to workers via the `DIFFCTX_BENCH_TIMEOUT_SEC`
+      environment variable.
     - `resume_from` (JSONL path): instance_ids already present in that file
       are skipped — re-running after a crash continues where it left off.
     - `checkpoint_path` (JSONL path): each completed result is appended
@@ -263,7 +241,7 @@ def _drain(active_pool: ProcessPoolExecutor) -> None:  # noqa: C901
         for inst in pending:
             try:
                 submit_times[inst.instance_id] = time.monotonic()
-                futures[active_pool.submit(_run_with_kill_switch, eval_fn, inst, params, timeout_per_instance)] = inst
+                futures[active_pool.submit(eval_fn, inst, params)] = inst
             except BrokenProcessPool:
                 submit_failed.extend([inst, *pending[pending.index(inst) + 1 :]])
                 pool_broken = True
diff --git a/benchmarks/calibrate.py b/benchmarks/calibrate.py
@@ -113,6 +113,12 @@ def main() -> int:
 
     _prewarm_bare_clones(instances)
 
+    # Workers read DIFFCTX_BENCH_TIMEOUT_SEC to scope the kill switch
+    # to the diffctx call only (excluding git clone / worktree setup).
+    import os as _os
+
+    _os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
+
     eval_all_cells_fn = make_diffctx_eval_all_cells_fn(repo_root)
     args.out.mkdir(parents=True, exist_ok=True)
     checkpoint_dir = args.out / "checkpoints"
diff --git a/benchmarks/diffctx_eval_fn.py b/benchmarks/diffctx_eval_fn.py
@@ -61,6 +61,37 @@ def _selected_files(fragments: tuple[GoldenFragment, ...]) -> frozenset[str]:
     return frozenset(f.path for f in fragments)
 
 
+def _read_diffctx_timeout_sec() -> float:
+    """Read the wall-clock budget for a single diffctx invocation from
+    the worker's environment. Set by the CLI orchestrators before pool
+    spawn; absent, unparseable, or <=0 disables the timer (git/clone/setup
+    operations around the diffctx call are intentionally NOT covered).
+    """
+    raw = os.environ.get("DIFFCTX_BENCH_TIMEOUT_SEC", "")
+    try:
+        return float(raw) if raw else 0.0
+    except ValueError:
+        return 0.0
+
+
+def _arm_diffctx_kill_switch(timeout_s: float):
+    """Arm a daemon timer that calls `os._exit(137)` after `timeout_s`
+    elapse. Used to bound the wall-clock cost of ONE diffctx call (heavy
+    scoring + selection); the surrounding `ensure_repo` / `apply_as_commit`
+    / `reset_to_parent` git operations are deliberately uninstrumented
+    because they are benchmark scaffolding, not part of the algorithm
+    under measurement.
+    """
+    import threading
+
+    if timeout_s <= 0:
+        return None
+    timer = threading.Timer(timeout_s, lambda: os._exit(137))
+    timer.daemon = True
+    timer.start()
+    return timer
+
+
 def _ensure_worker_state(repos_dir_str: str) -> tuple[Path, UniversalEvaluator]:
     state = _WORKER_STATE.get(repos_dir_str)
     if state is None:
@@ -98,13 +129,23 @@ def _pool_eval(repos_dir_str: str, instance: BenchmarkInstance, params: RunParam
         t0 = time.perf_counter()
         from treemapper.diffctx.pipeline import build_diff_context
 
-        output = build_diff_context(
-            repo_dir,
-            "HEAD~1..HEAD",
-            budget_tokens=params.budget,
-            scoring_mode=params.scoring,
-            tau=params.tau,
-        )
+        # Kill switch covers ONLY the diffctx call. Git ops above
+        # (ensure_repo, apply_as_commit) and reset_to_parent below run
+        # uninstrumented — `git worktree add` on huge repos (vscode,
+        # mui) takes 30-60s of pure filesystem I/O which is benchmark
+        # scaffolding, not the algorithm under measurement.
+        timer = _arm_diffctx_kill_switch(_read_diffctx_timeout_sec())
+        try:
+            output = build_diff_context(
+                repo_dir,
+                "HEAD~1..HEAD",
+                budget_tokens=params.budget,
+                scoring_mode=params.scoring,
+                tau=params.tau,
+            )
+        finally:
+            if timer is not None:
+                timer.cancel()
         elapsed = time.perf_counter() - t0
         if output is None:
             result = EvalResult(
@@ -231,6 +272,12 @@ def pool_eval_all_cells(
         apply_as_commit(repo_dir, instance.gold_patch, "diffctx-eval-gold")
 
         t_heavy_start = time.perf_counter()
+        # Kill switch around the heavy phase ONLY. Same rationale as
+        # `_pool_eval`: git ops outside this scope are scaffolding. The
+        # selection sub-loop below is sub-second per cell and arms its
+        # own per-cell timer, so this deadline covers scoring alone.
+        bench_timeout = _read_diffctx_timeout_sec()
+        timer = _arm_diffctx_kill_switch(bench_timeout)
         try:
             state = compute_scored_state(
                 repo_dir,
@@ -240,6 +287,9 @@ def pool_eval_all_cells(
         except Exception as e:
             err = f"{type(e).__name__}: {e}"
             return [(p, _failure_result(instance, p, "diffctx_fail", err)) for p in params_list]
+        finally:
+            if timer is not None:
+                timer.cancel()
         heavy_elapsed = time.perf_counter() - t_heavy_start
 
         for params in params_list:
@@ -248,11 +298,20 @@ def pool_eval_all_cells(
                 for k, v in params.to_env().items():
                     os.environ[k] = v
                 t_select_start = time.perf_counter()
-                output = select_with_params(
-                    state,
-                    budget_tokens=params.budget,
-                    tau=params.tau,
-                )
+                # Each cell selection is sub-second on cached state, but
+                # arm a fresh timer per cell as a defense in depth: a
+                # pathological lazy-greedy regression on one cell should
+                # not poison the whole instance.
+                cell_timer = _arm_diffctx_kill_switch(bench_timeout)
+                try:
+                    output = select_with_params(
+                        state,
+                        budget_tokens=params.budget,
+                        tau=params.tau,
+                    )
+                finally:
+                    if cell_timer is not None:
+                        cell_timer.cancel()
                 select_elapsed = time.perf_counter() - t_select_start
                 # Charge the heavy cost to the first cell only — subsequent
                 # cells reuse the cached state, so they only pay select cost.
@@ -295,5 +354,5 @@ def _failure_result(
 
 def make_diffctx_eval_all_cells_fn(repos_dir: Path):
     """Sibling of `make_diffctx_eval_fn` for the inverted orchestrator
-    (one task = one instance × N cells)."""
+    (one task = one instance times N cells)."""
     return functools.partial(pool_eval_all_cells, str(repos_dir))
diff --git a/benchmarks/run_eval.py b/benchmarks/run_eval.py
@@ -71,6 +71,10 @@ def main() -> int:
     )
     print(f"Params: {params.label()}")
 
+    import os as _os
+
+    _os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
+
     eval_fn = make_diffctx_eval_fn(repo_root)
     results = run_eval_set(
         instances,
diff --git a/benchmarks/run_final_eval.py b/benchmarks/run_final_eval.py
@@ -91,6 +91,11 @@ def main() -> int:
         return 1
 
     adapters = default_test_adapters() + default_calibration_pool_adapters()
+
+    import os as _os
+
+    _os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
+
     eval_fn = _make_eval_fn(args.baseline, repo_root, request_timeout=args.timeout_per_instance)
 
     args.out.mkdir(parents=True, exist_ok=True)
diff --git a/benchmarks/select_final.py b/benchmarks/select_final.py
@@ -65,6 +65,10 @@ def main() -> int:
     instances = list(filter_instances_by_manifest(adapters, manifest_ids))
     print(f"Validation set: {len(instances)} instances")
 
+    import os as _os
+
+    _os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
+
     eval_fn = make_diffctx_eval_fn(repo_root)
     evaluator = UniversalEvaluator()