Skip to content

Commit 72d0ebc

Browse files
committed
perf(bench): scope timeout to diffctx call only, not git ops
Outer kill switch on the worker covered ensure_repo + apply_as_commit + build_diff_context. On warm-cache Python repos git ops are ~1s so the 20s budget covered diffctx. On non-Python repos with thousands of files (vscode, mui, polybench) `git worktree add` alone is 30-60s, exhausting the timeout before diffctx even starts. Result: 100% timeout on polybench / multi_swebench / contextbench despite the algorithm being capable of finishing under 20s. Move the threading.Timer + os._exit(137) wrapper into the eval_fn itself, scoped narrowly around build_diff_context (single-cell path) and compute_scored_state + per-cell select_with_params (cached path). Workers read DIFFCTX_BENCH_TIMEOUT_SEC from env; orchestrators set it before pool spawn. Git operations (clone, worktree add, gold-patch apply, reset_to_parent) now run uninstrumented because they are benchmark scaffolding, not the algorithm under measurement.
1 parent eecebaf commit 72d0ebc

7 files changed

Lines changed: 101 additions & 77 deletions

File tree

benchmarks/adapters/calibrate.py

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -222,15 +222,7 @@ def _drain(pool: ProcessPoolExecutor) -> None:
222222
for inst, params_list in pending:
223223
try:
224224
submit_times[inst.instance_id] = _time.monotonic()
225-
futures[
226-
pool.submit(
227-
_run_cells_with_kill_switch,
228-
eval_all_cells_fn,
229-
inst,
230-
params_list,
231-
timeout_per_instance,
232-
)
233-
] = (inst, params_list)
225+
futures[pool.submit(eval_all_cells_fn, inst, params_list)] = (inst, params_list)
234226
except BrokenProcessPool:
235227
idx = pending.index((inst, params_list))
236228
submit_failed.extend(pending[idx:])
@@ -324,30 +316,6 @@ def _drain(pool: ProcessPoolExecutor) -> None:
324316
return out
325317

326318

327-
def _run_cells_with_kill_switch(
328-
eval_all_cells_fn: EvalAllCellsFn,
329-
instance: BenchmarkInstance,
330-
params_list: list[RunParams],
331-
timeout_s: float,
332-
) -> list[tuple[RunParams, EvalResult]]:
333-
"""Worker-side hard timeout for the cached calibration path. The whole
334-
`compute_scored_state + N selections` task must finish inside the
335-
deadline; otherwise `os._exit(137)` kills the worker and the pool
336-
spawns a replacement. Selection cost is sub-second per cell, so the
337-
deadline is effectively a budget on the heavy scoring phase.
338-
"""
339-
import os
340-
import threading
341-
342-
timer = threading.Timer(timeout_s, lambda: os._exit(137))
343-
timer.daemon = True
344-
timer.start()
345-
try:
346-
return eval_all_cells_fn(instance, params_list)
347-
finally:
348-
timer.cancel()
349-
350-
351319
def _failure_eval(
352320
instance: BenchmarkInstance,
353321
params: RunParams,

benchmarks/adapters/runner.py

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,6 @@
1111
EvalFn = Callable[[BenchmarkInstance, "RunParams"], EvalResult]
1212

1313

14-
def _run_with_kill_switch(
15-
eval_fn: EvalFn,
16-
instance: BenchmarkInstance,
17-
params: RunParams,
18-
timeout_s: float,
19-
) -> EvalResult:
20-
"""Worker-side hard timeout. A daemon thread arms `os._exit(137)` after
21-
`timeout_s`; if the Rust pipeline blocks past the deadline (releasing
22-
the GIL via `py.allow_threads`), this kills the worker process
23-
unconditionally. The pool detects `BrokenProcessPool` and spawns a
24-
replacement, so a single pathological instance no longer monopolizes
25-
a worker slot for hours.
26-
"""
27-
import os
28-
import threading
29-
30-
timer = threading.Timer(timeout_s, lambda: os._exit(137))
31-
timer.daemon = True
32-
timer.start()
33-
try:
34-
return eval_fn(instance, params)
35-
finally:
36-
timer.cancel()
37-
38-
3914
@dataclass(frozen=True)
4015
class RunParams:
4116
"""Parameters for one diffctx evaluation pass.
@@ -180,11 +155,14 @@ def run_eval_set(
180155
181156
- `workers > 1` uses a process pool (spawn context) so workers do
182157
not share the GIL; otherwise sequential.
183-
- `timeout_per_instance` is enforced worker-side via a daemon timer
184-
that calls `os._exit(137)` if the deadline passes — the pool then
185-
respawns the killed worker. This bounds calibration wall-clock at
186-
`timeout_per_instance * ceil(n / workers)` even on pathological
187-
instances (e.g. PPR convergence blow-up on huge graphs).
158+
- `timeout_per_instance` is the wall-clock budget for ONE diffctx
159+
call. The actual kill switch is armed inside the eval_fn (see
160+
`benchmarks/diffctx_eval_fn.py`) around `build_diff_context` /
161+
`compute_scored_state` only — git ops (clone, worktree add,
162+
apply_as_commit) run uninstrumented because they are benchmark
163+
scaffolding, not the algorithm under measurement. The orchestrator
164+
passes the deadline to workers via the `DIFFCTX_BENCH_TIMEOUT_SEC`
165+
environment variable.
188166
- `resume_from` (JSONL path): instance_ids already present in that file
189167
are skipped — re-running after a crash continues where it left off.
190168
- `checkpoint_path` (JSONL path): each completed result is appended
@@ -263,7 +241,7 @@ def _drain(active_pool: ProcessPoolExecutor) -> None: # noqa: C901
263241
for inst in pending:
264242
try:
265243
submit_times[inst.instance_id] = time.monotonic()
266-
futures[active_pool.submit(_run_with_kill_switch, eval_fn, inst, params, timeout_per_instance)] = inst
244+
futures[active_pool.submit(eval_fn, inst, params)] = inst
267245
except BrokenProcessPool:
268246
submit_failed.extend([inst, *pending[pending.index(inst) + 1 :]])
269247
pool_broken = True

benchmarks/calibrate.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,12 @@ def main() -> int:
113113

114114
_prewarm_bare_clones(instances)
115115

116+
# Workers read DIFFCTX_BENCH_TIMEOUT_SEC to scope the kill switch
117+
# to the diffctx call only (excluding git clone / worktree setup).
118+
import os as _os
119+
120+
_os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
121+
116122
eval_all_cells_fn = make_diffctx_eval_all_cells_fn(repo_root)
117123
args.out.mkdir(parents=True, exist_ok=True)
118124
checkpoint_dir = args.out / "checkpoints"

benchmarks/diffctx_eval_fn.py

Lines changed: 72 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,37 @@ def _selected_files(fragments: tuple[GoldenFragment, ...]) -> frozenset[str]:
6161
return frozenset(f.path for f in fragments)
6262

6363

64+
def _read_diffctx_timeout_sec() -> float:
65+
"""Read the wall-clock budget for a single diffctx invocation from
66+
the worker's environment. Set by the CLI orchestrators before pool
67+
spawn; absent or <=0 disables the timer (git/clone/setup operations
68+
around the diffctx call are intentionally NOT covered).
69+
"""
70+
raw = os.environ.get("DIFFCTX_BENCH_TIMEOUT_SEC", "")
71+
try:
72+
return float(raw) if raw else 0.0
73+
except ValueError:
74+
return 0.0
75+
76+
77+
def _arm_diffctx_kill_switch(timeout_s: float):
78+
"""Arm a daemon timer that calls `os._exit(137)` after `timeout_s`
79+
elapse. Used to bound the wall-clock cost of ONE diffctx call (heavy
80+
scoring + selection); the surrounding `ensure_repo` / `apply_as_commit`
81+
/ `reset_to_parent` git operations are deliberately uninstrumented
82+
because they are benchmark scaffolding, not part of the algorithm
83+
under measurement.
84+
"""
85+
import threading
86+
87+
if timeout_s <= 0:
88+
return None
89+
timer = threading.Timer(timeout_s, lambda: os._exit(137))
90+
timer.daemon = True
91+
timer.start()
92+
return timer
93+
94+
6495
def _ensure_worker_state(repos_dir_str: str) -> tuple[Path, UniversalEvaluator]:
6596
state = _WORKER_STATE.get(repos_dir_str)
6697
if state is None:
@@ -98,13 +129,23 @@ def _pool_eval(repos_dir_str: str, instance: BenchmarkInstance, params: RunParam
98129
t0 = time.perf_counter()
99130
from treemapper.diffctx.pipeline import build_diff_context
100131

101-
output = build_diff_context(
102-
repo_dir,
103-
"HEAD~1..HEAD",
104-
budget_tokens=params.budget,
105-
scoring_mode=params.scoring,
106-
tau=params.tau,
107-
)
132+
# Kill switch covers ONLY the diffctx call. Git ops above
133+
# (ensure_repo, apply_as_commit) and reset_to_parent below run
134+
# uninstrumented — `git worktree add` on huge repos (vscode,
135+
# mui) takes 30-60s of pure filesystem I/O which is benchmark
136+
# scaffolding, not the algorithm under measurement.
137+
timer = _arm_diffctx_kill_switch(_read_diffctx_timeout_sec())
138+
try:
139+
output = build_diff_context(
140+
repo_dir,
141+
"HEAD~1..HEAD",
142+
budget_tokens=params.budget,
143+
scoring_mode=params.scoring,
144+
tau=params.tau,
145+
)
146+
finally:
147+
if timer is not None:
148+
timer.cancel()
108149
elapsed = time.perf_counter() - t0
109150
if output is None:
110151
result = EvalResult(
@@ -231,6 +272,12 @@ def pool_eval_all_cells(
231272
apply_as_commit(repo_dir, instance.gold_patch, "diffctx-eval-gold")
232273

233274
t_heavy_start = time.perf_counter()
275+
# Kill switch around the heavy phase ONLY. Same rationale as
276+
# `_pool_eval`: git ops outside this scope are scaffolding. The
277+
# selection sub-loop below is sub-second per cell so a single
278+
# heavy-phase deadline is sufficient.
279+
bench_timeout = _read_diffctx_timeout_sec()
280+
timer = _arm_diffctx_kill_switch(bench_timeout)
234281
try:
235282
state = compute_scored_state(
236283
repo_dir,
@@ -240,6 +287,9 @@ def pool_eval_all_cells(
240287
except Exception as e:
241288
err = f"{type(e).__name__}: {e}"
242289
return [(p, _failure_result(instance, p, "diffctx_fail", err)) for p in params_list]
290+
finally:
291+
if timer is not None:
292+
timer.cancel()
243293
heavy_elapsed = time.perf_counter() - t_heavy_start
244294

245295
for params in params_list:
@@ -248,11 +298,20 @@ def pool_eval_all_cells(
248298
for k, v in params.to_env().items():
249299
os.environ[k] = v
250300
t_select_start = time.perf_counter()
251-
output = select_with_params(
252-
state,
253-
budget_tokens=params.budget,
254-
tau=params.tau,
255-
)
301+
# Each cell selection is sub-second on cached state, but
302+
# arm a fresh timer per cell as a defense in depth: a
303+
# pathological lazy-greedy regression on one cell should
304+
# not poison the whole instance.
305+
cell_timer = _arm_diffctx_kill_switch(bench_timeout)
306+
try:
307+
output = select_with_params(
308+
state,
309+
budget_tokens=params.budget,
310+
tau=params.tau,
311+
)
312+
finally:
313+
if cell_timer is not None:
314+
cell_timer.cancel()
256315
select_elapsed = time.perf_counter() - t_select_start
257316
# Charge the heavy cost to the first cell only — subsequent
258317
# cells reuse the cached state, so they only pay select cost.
@@ -295,5 +354,5 @@ def _failure_result(
295354

296355
def make_diffctx_eval_all_cells_fn(repos_dir: Path):
297356
"""Sibling of `make_diffctx_eval_fn` for the inverted orchestrator
298-
(one task = one instance × N cells)."""
357+
(one task = one instance times N cells)."""
299358
return functools.partial(pool_eval_all_cells, str(repos_dir))

benchmarks/run_eval.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ def main() -> int:
7171
)
7272
print(f"Params: {params.label()}")
7373

74+
import os as _os
75+
76+
_os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
77+
7478
eval_fn = make_diffctx_eval_fn(repo_root)
7579
results = run_eval_set(
7680
instances,

benchmarks/run_final_eval.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,11 @@ def main() -> int:
9191
return 1
9292

9393
adapters = default_test_adapters() + default_calibration_pool_adapters()
94+
95+
import os as _os
96+
97+
_os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
98+
9499
eval_fn = _make_eval_fn(args.baseline, repo_root, request_timeout=args.timeout_per_instance)
95100

96101
args.out.mkdir(parents=True, exist_ok=True)

benchmarks/select_final.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ def main() -> int:
6565
instances = list(filter_instances_by_manifest(adapters, manifest_ids))
6666
print(f"Validation set: {len(instances)} instances")
6767

68+
import os as _os
69+
70+
_os.environ["DIFFCTX_BENCH_TIMEOUT_SEC"] = str(args.timeout_per_instance)
71+
6872
eval_fn = make_diffctx_eval_fn(repo_root)
6973
evaluator = UniversalEvaluator()
7074

0 commit comments

Comments
 (0)