nikolay-e
diff --git a/‎benchmarks/adapters/calibrate.py‎
Lines changed: 176 additions & 0 deletions b/‎benchmarks/adapters/calibrate.py‎
Lines changed: 176 additions & 0 deletions
diff --git a/‎benchmarks/calibrate.py‎
Lines changed: 10 additions & 5 deletions b/‎benchmarks/calibrate.py‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎benchmarks/diffctx_eval_fn.py‎
Lines changed: 148 additions & 0 deletions b/‎benchmarks/diffctx_eval_fn.py‎
Lines changed: 148 additions & 0 deletions
diff --git a/‎diffctx/python/_diffctx/__init__.py‎
Lines changed: 6 additions & 1 deletion b/‎diffctx/python/_diffctx/__init__.py‎
Lines changed: 6 additions & 1 deletion
@@ -133,6 +133,182 @@ def _make_pool() -> ProcessPoolExecutor:
     return out
 
 
+# Signature of the per-instance worker accepted by `evaluate_grid_cached`:
+# it receives one benchmark instance plus the grid cells to evaluate for it
+# and returns one `(params, result)` pair per evaluated cell.
+EvalAllCellsFn = Callable[
+    [BenchmarkInstance, list[RunParams]],
+    list[tuple[RunParams, EvalResult]],
+]
+
+
+def evaluate_grid_cached(  # noqa: C901 — pool teardown + per-cell demux + retry-on-BPP do not factor cleanly
+    spec: GridSpec,
+    instances: list[BenchmarkInstance],
+    eval_all_cells_fn: EvalAllCellsFn,
+    workers: int = 1,
+    on_trial: TrialCallback | None = None,
+    timeout_per_instance: float = 300.0,
+    checkpoint_dir: Path | None = None,
+    max_stalled_rebuilds: int = 3,
+) -> list[TrialResult]:
+    """Inverted-loop calibration: outer = instance, inner = grid cells.
+
+    Each ProcessPool task computes the heavy `ScoredState` ONCE per
+    instance, then runs all (`tau`, `core_budget_fraction`) cells against
+    it cheaply. Cuts wall time by ~12x for a 12-cell grid because the
+    expensive parse/fragment/discover/score work is no longer redone per
+    cell. State never crosses the pickle boundary — only the resulting
+    `EvalResult` list does — so ProcessPool is preserved and per-process
+    memory pressure is bounded.
+
+    Per-cell checkpoint files (`<params.label()>.jsonl`) match the
+    layout produced by `evaluate_grid` so the existing aggregator,
+    `top_k_trials`, and `render_grid_report` work unchanged.
+
+    A dead worker (`BrokenProcessPool`) is retryable: nothing is recorded
+    for the affected instances, the pool is rebuilt, and only the
+    instances still missing a result are resubmitted. Completion is
+    tracked in memory, so the retry is correct with or without
+    `checkpoint_dir` and never records the same instance twice in a cell.
+
+    Args:
+        spec: Grid definition; `spec.points()` yields the cells to run.
+        instances: Benchmark instances to evaluate against every cell.
+        eval_all_cells_fn: Worker callable; must be picklable when
+            `workers > 1` because it is submitted to a process pool.
+        workers: Pool size. Values `<= 1` run everything serially in
+            this process.
+        on_trial: Optional callback invoked as
+            `on_trial(index, total, trial)` once per aggregated cell.
+        timeout_per_instance: Per-instance, per-cell time allowance used
+            to derive one overall deadline for a drain of the pool.
+        checkpoint_dir: Directory for per-cell `.jsonl` checkpoints;
+            `None` disables checkpointing and resume.
+        max_stalled_rebuilds: Number of consecutive pool rebuilds that
+            finish no additional cell before the remaining work is given
+            up and recorded as failures.
+
+    Returns:
+        One `TrialResult` per grid cell, in `spec.points()` order.
+
+    Raises:
+        TimeoutError: if the overall deadline elapses while results are
+            still outstanding (raised by `as_completed`, not handled here).
+    """
+    import multiprocessing as mp
+    import time
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+    from concurrent.futures.process import BrokenProcessPool
+
+    from benchmarks.adapters.runner import (
+        _load_existing_results,
+        append_checkpoint,
+        read_checkpoint,
+    )
+
+    evaluator = UniversalEvaluator()
+    points = list(spec.points())
+
+    ckpts: dict[RunParams, Path | None] = {
+        p: (checkpoint_dir / f"{p.label()}.jsonl") if checkpoint_dir is not None else None for p in points
+    }
+    # Private, mutable copies: these sets are extended as results arrive
+    # and are the single source of truth for "what is still outstanding".
+    done_ids: dict[RunParams, set[str]] = {
+        p: set(read_checkpoint(c)) if c is not None else set() for p, c in ckpts.items()
+    }
+    results_by_cell: dict[RunParams, list[EvalResult]] = {
+        p: (_load_existing_results(c, done_ids[p]) if c is not None else []) for p, c in ckpts.items()
+    }
+
+    def _outstanding() -> list[tuple[BenchmarkInstance, list[RunParams]]]:
+        # Instances that still lack a recorded result for at least one cell,
+        # paired with exactly the cells they are missing.
+        work: list[tuple[BenchmarkInstance, list[RunParams]]] = []
+        for inst in instances:
+            needed = [p for p in points if inst.instance_id not in done_ids[p]]
+            if needed:
+                work.append((inst, needed))
+        return work
+
+    def _cell_count(work: list[tuple[BenchmarkInstance, list[RunParams]]]) -> int:
+        return sum(len(cells) for _, cells in work)
+
+    def _make_pool() -> ProcessPoolExecutor:
+        ctx = mp.get_context("spawn")
+        p = ProcessPoolExecutor(max_workers=workers, mp_context=ctx, max_tasks_per_child=50)
+        # Run one trivial task per worker slot before real work is submitted.
+        list(p.map(int, range(workers)))
+        return p
+
+    def _record_per_cell(inst: BenchmarkInstance, per_cell_results: list[tuple[RunParams, EvalResult]]) -> None:
+        # Persist (when checkpointing) and remember each cell result, and
+        # mark the instance as done for that cell so it is never resubmitted.
+        for params, result in per_cell_results:
+            ckpt = ckpts.get(params)
+            if ckpt is not None:
+                append_checkpoint(ckpt, result)
+            results_by_cell[params].append(result)
+            done_ids[params].add(inst.instance_id)
+
+    def _drain(pool: ProcessPoolExecutor, work: list[tuple[BenchmarkInstance, list[RunParams]]]) -> bool:
+        # Submit `work`, record every result that comes back, and report
+        # whether the pool broke. Instances hit by a broken pool are left
+        # unrecorded on purpose: the caller resubmits them on a fresh pool.
+        futures: dict = {}
+        pool_broken = False
+        for inst, params_list in work:
+            try:
+                futures[pool.submit(eval_all_cells_fn, inst, params_list)] = (inst, params_list)
+            except BrokenProcessPool:
+                # Everything from here on stays outstanding and is retried.
+                pool_broken = True
+                break
+        waves = max(1, (len(work) + workers - 1) // workers)
+        outer_deadline = time.monotonic() + timeout_per_instance * len(points) * waves
+        try:
+            for future in as_completed(
+                futures,
+                timeout=max(0.0, outer_deadline - time.monotonic()),
+            ):
+                inst, params_list = futures[future]
+                try:
+                    per_cell = future.result(timeout=0)
+                except BrokenProcessPool:
+                    pool_broken = True
+                    continue
+                except Exception as e:
+                    per_cell = [(p, _failure_eval(inst, p, "error", f"{type(e).__name__}: {e}")) for p in params_list]
+                _record_per_cell(inst, per_cell)
+        except BrokenProcessPool:
+            pool_broken = True
+        return pool_broken
+
+    pending = _outstanding()
+
+    pool: ProcessPoolExecutor | None = _make_pool() if workers > 1 else None
+    try:
+        if pending and pool is not None:
+            stalled = 0
+            while pending:
+                if not _drain(pool, pending):
+                    break
+                remaining = _outstanding()
+                # A rebuild round "stalls" when it finished no additional cell.
+                stalled = 0 if _cell_count(remaining) < _cell_count(pending) else stalled + 1
+                pending = remaining
+                if not pending:
+                    break
+                if stalled >= max_stalled_rebuilds:
+                    # Give up on what is left. The failures are kept in memory
+                    # only (not checkpointed) so a later run retries them.
+                    for inst, params_list in pending:
+                        for p in params_list:
+                            results_by_cell[p].append(
+                                _failure_eval(inst, p, "error", "BrokenProcessPool: worker died")
+                            )
+                    break
+                try:
+                    pool.shutdown(wait=False, cancel_futures=True)
+                except Exception:
+                    # Best-effort teardown of an already broken pool.
+                    pass
+                pool = _make_pool()
+        elif pending and pool is None:
+            # workers == 1: serial fallback
+            for inst, params_list in pending:
+                try:
+                    per_cell = eval_all_cells_fn(inst, params_list)
+                except Exception as e:
+                    per_cell = [(p, _failure_eval(inst, p, "error", f"{type(e).__name__}: {e}")) for p in params_list]
+                _record_per_cell(inst, per_cell)
+    finally:
+        if pool is not None:
+            pool.shutdown(wait=False, cancel_futures=True)
+
+    out: list[TrialResult] = []
+    for i, params in enumerate(points):
+        agg = evaluator.aggregate_per_benchmark(results_by_cell[params])
+        trial = TrialResult(
+            params=params,
+            per_benchmark=agg,
+            raw_results=tuple(results_by_cell[params]),
+        )
+        out.append(trial)
+        if on_trial is not None:
+            on_trial(i, len(points), trial)
+    return out
+
+
+def _failure_eval(
+    instance: BenchmarkInstance,
+    params: RunParams,
+    status: str,
+    error: str,
+) -> EvalResult:
+    """Build a zero-recall, zero-precision `EvalResult` for a failed run.
+
+    The result carries the instance identity and the cell budget, and its
+    `extra` mapping is tagged with `status`, `error` and the instance
+    language.
+    """
+    failed = EvalResult(
+        budget=params.budget,
+        file_precision=0.0,
+        file_recall=0.0,
+        source_benchmark=instance.source_benchmark,
+        instance_id=instance.instance_id,
+    )
+    tags = (
+        ("status", status),
+        ("error", error),
+        ("language", instance.language),
+    )
+    for key, value in tags:
+        failed.extra[key] = value
+    return failed
+
+
 def top_k_trials(trials: Iterable[TrialResult], k: int = 3) -> list[TrialResult]:
     """Pick the k highest-score trials, breaking ties by lower mean tokens."""
 
 
@@ -18,12 +18,17 @@
 from dataclasses import asdict
 from pathlib import Path
 
-from benchmarks.adapters.calibrate import GridSpec, evaluate_grid, render_grid_report, top_k_trials
+from benchmarks.adapters.calibrate import (
+    GridSpec,
+    evaluate_grid_cached,
+    render_grid_report,
+    top_k_trials,
+)
 from benchmarks.adapters.runner import filter_instances_by_manifest, read_manifest
 from benchmarks.adapters.runtime_probe import probe_resources, report_and_maybe_exit
 from benchmarks.build_splits import default_calibration_pool_adapters, default_test_adapters
 from benchmarks.common import repos_dir as default_repos_dir
-from benchmarks.diffctx_eval_fn import make_diffctx_eval_fn
+from benchmarks.diffctx_eval_fn import make_diffctx_eval_all_cells_fn
 
 
 def _parse_floats(s: str) -> tuple[float, ...]:
@@ -108,7 +113,7 @@ def main() -> int:
 
     _prewarm_bare_clones(instances)
 
-    eval_fn = make_diffctx_eval_fn(repo_root)
+    eval_all_cells_fn = make_diffctx_eval_all_cells_fn(repo_root)
     args.out.mkdir(parents=True, exist_ok=True)
     checkpoint_dir = args.out / "checkpoints"
 
@@ -119,10 +124,10 @@ def _on_trial(idx: int, total: int, trial) -> None:
             f"min(per_benchmark file_recall) = {trial.score:.4f}"
         )
 
-    trials = evaluate_grid(
+    trials = evaluate_grid_cached(
         spec,
         instances,
-        eval_fn,
+        eval_all_cells_fn,
         workers=args.workers,
         on_trial=_on_trial,
         timeout_per_instance=args.timeout_per_instance,
 
@@ -149,3 +149,151 @@ def _pool_eval(repos_dir_str: str, instance: BenchmarkInstance, params: RunParam
 
 def make_diffctx_eval_fn(repos_dir: Path):
     return functools.partial(_pool_eval, str(repos_dir))
+
+
+def _build_eval_result_from_output(
+    output: dict | None,
+    instance: BenchmarkInstance,
+    params: RunParams,
+    elapsed: float,
+    evaluator: UniversalEvaluator,
+) -> EvalResult:
+    """Turn one selection output into an `EvalResult` for `instance`.
+
+    Args:
+        output: Selection output mapping, or `None` when no output was
+            produced for this cell.
+        instance: Benchmark instance the output belongs to.
+        params: Grid cell that produced the output; supplies the budget.
+        elapsed: Seconds to attribute to this cell.
+        evaluator: Evaluator used to score the selection.
+
+    Returns:
+        A scored result tagged `status="ok"`, or a zero-score result tagged
+        `status="diffctx_fail"` when `output` is `None`. Both carry the
+        instance language in `extra`.
+    """
+    if output is None:
+        result = EvalResult(
+            instance_id=instance.instance_id,
+            source_benchmark=instance.source_benchmark,
+            file_recall=0.0,
+            file_precision=0.0,
+            budget=params.budget,
+            elapsed_seconds=elapsed,
+        )
+        result.extra["status"] = "diffctx_fail"
+        # Tag failures with the language too, as the success path and
+        # `_failure_result` do, so per-language grouping sees every record.
+        result.extra["language"] = instance.language
+        return result
+    fragments = _output_fragments(output)
+    used_tokens = _compute_used_tokens(output)
+    selection = SelectionOutput(
+        selected_files=_selected_files(fragments),
+        selected_fragments=fragments,
+        used_tokens=used_tokens,
+        elapsed_seconds=elapsed,
+    )
+    result = evaluator.evaluate(instance, selection, budget=params.budget)
+    result.used_tokens = used_tokens
+    result.extra["status"] = "ok"
+    result.extra["language"] = instance.language
+    result.extra["fragment_count"] = len(fragments)
+    # Latency is optional: copy it only when the output reports some.
+    latency = output.get("latency") or {}
+    if latency:
+        result.extra["latency_total_ms"] = latency.get("total_ms")
+        result.extra["latency_breakdown"] = {k: v for k, v in latency.items() if k != "total_ms"}
+    return result
+
+
+def pool_eval_all_cells(
+    repos_dir_str: str,
+    instance: BenchmarkInstance,
+    params_list: list[RunParams],
+) -> list[tuple[RunParams, EvalResult]]:
+    """Compute the heavy `ScoredState` ONCE for the instance, then run
+    every (`tau`, `core_budget_fraction`) cell against it cheaply.
+
+    Returns one (params, result) tuple per input params. The orchestrator
+    demuxes these into per-cell checkpoints. This is the ProcessPool
+    worker entry point — the entire ScoredState lives only inside this
+    process and is dropped before return; only EvalResults cross the
+    pickle boundary.
+
+    Failures are reported per cell instead of aborting the instance:
+    a failed clone yields `clone_fail` for every cell, a failed
+    `compute_scored_state` yields `diffctx_fail` for every cell, and a
+    failed `select_with_params` yields `diffctx_fail` for that cell only,
+    so results already computed for other cells are kept.
+
+    Args:
+        repos_dir_str: Repositories directory, passed to
+            `_ensure_worker_state`.
+        instance: Benchmark instance to evaluate.
+        params_list: Grid cells to run; an empty list returns `[]`.
+
+    Returns:
+        `(params, result)` pairs in `params_list` order.
+    """
+    import logging
+
+    from benchmarks.common import apply_as_commit, ensure_repo, reset_to_parent
+    from treemapper.diffctx.pipeline import compute_scored_state, select_with_params
+
+    if not params_list:
+        return []
+
+    worktree_dir, evaluator = _ensure_worker_state(repos_dir_str)
+
+    repo_url = str(instance.extra.get("repo_url") or f"https://github.com/{instance.repo}")
+    repo_dir = ensure_repo(repo_url, instance.repo, instance.base_commit, worktree_dir)
+    if repo_dir is None:
+        return [
+            (
+                p,
+                _failure_result(instance, p, "clone_fail", "ensure_repo returned None"),
+            )
+            for p in params_list
+        ]
+
+    # All params in a sweep share scoring_mode (BM25/PPR/Ego/Hybrid is a
+    # discovery-and-scoring choice, not a (τ, cbf) one). Use the first.
+    scoring_mode = params_list[0].scoring
+
+    out: list[tuple[RunParams, EvalResult]] = []
+    try:
+        apply_as_commit(repo_dir, instance.gold_patch, "diffctx-eval-gold")
+
+        t_heavy_start = time.perf_counter()
+        try:
+            state = compute_scored_state(
+                repo_dir,
+                "HEAD~1..HEAD",
+                scoring_mode=scoring_mode,
+            )
+        except Exception as e:
+            err = f"{type(e).__name__}: {e}"
+            return [(p, _failure_result(instance, p, "diffctx_fail", err)) for p in params_list]
+        heavy_elapsed = time.perf_counter() - t_heavy_start
+
+        # Charge the heavy cost exactly once, to the first cell whose
+        # selection completes; later cells reuse the cached state and only
+        # pay their own select cost.
+        heavy_charged = False
+        for params in params_list:
+            cell_env = params.to_env()
+            prior_env = {k: os.environ.get(k) for k in cell_env}
+            try:
+                os.environ.update(cell_env)
+                t_select_start = time.perf_counter()
+                try:
+                    output = select_with_params(
+                        state,
+                        budget_tokens=params.budget,
+                        tau=params.tau,
+                    )
+                except Exception as e:
+                    # Keep the other cells: one bad cell must not discard
+                    # results that were already computed for this instance.
+                    err = f"{type(e).__name__}: {e}"
+                    out.append((params, _failure_result(instance, params, "diffctx_fail", err)))
+                    continue
+                select_elapsed = time.perf_counter() - t_select_start
+                charged = select_elapsed if heavy_charged else heavy_elapsed + select_elapsed
+                heavy_charged = True
+                result = _build_eval_result_from_output(output, instance, params, charged, evaluator)
+                out.append((params, result))
+            finally:
+                # Restore the environment exactly as it was before this cell.
+                for k, v in prior_env.items():
+                    if v is None:
+                        os.environ.pop(k, None)
+                    else:
+                        os.environ[k] = v
+    finally:
+        try:
+            reset_to_parent(repo_dir)
+        except Exception:
+            # Best-effort cleanup: never mask the evaluation outcome, but
+            # leave a trace so a dirty worktree can be diagnosed.
+            logging.getLogger(__name__).warning("reset_to_parent failed for %s", repo_dir, exc_info=True)
+
+    return out
+
+
+def _failure_result(
+    instance: BenchmarkInstance,
+    params: RunParams,
+    status: str,
+    error: str,
+) -> EvalResult:
+    """Return a zero-score `EvalResult` describing a failed evaluation.
+
+    Recall and precision are 0.0, the budget comes from `params`, and
+    `extra` records `status`, `error` and the instance language.
+    """
+    zero_score = {"file_recall": 0.0, "file_precision": 0.0}
+    failure = EvalResult(
+        instance_id=instance.instance_id,
+        source_benchmark=instance.source_benchmark,
+        budget=params.budget,
+        **zero_score,
+    )
+    extra = failure.extra
+    extra["status"] = status
+    extra["error"] = error
+    extra["language"] = instance.language
+    return failure
+
+
+def make_diffctx_eval_all_cells_fn(repos_dir: Path):
+    """Bind `repos_dir` (as a string) into `pool_eval_all_cells`.
+
+    Counterpart of `make_diffctx_eval_fn` for the inverted orchestrator,
+    where one task covers one instance × N cells.
+    """
+    repos_dir_str = str(repos_dir)
+    bound = functools.partial(pool_eval_all_cells, repos_dir_str)
+    return bound
@@ -1,2 +1,7 @@
 from ._diffctx import *  # noqa: F403
-from ._diffctx import GitError  # noqa: F401  # exception class is not picked up by `import *`
+from ._diffctx import (  # noqa: F401  # not picked up by `import *`
+    GitError,
+    PyScoredState,
+    compute_scored_state,
+    select_with_params,
+)