perf(bench): inverted-index BM25 discovery, label-keyed cell dicts

nikolay-e · nikolay-e · commit f3275b6855e9 · 2026-05-01T14:49:19.000+02:00
diff --git a/benchmarks/adapters/calibrate.py b/benchmarks/adapters/calibrate.py
@@ -174,35 +174,40 @@ def evaluate_grid_cached(  # noqa: C901 — pool teardown + per-cell demux + ret
 
     evaluator = UniversalEvaluator()
     points = list(spec.points())
+    # Index by `params.label()` because `RunParams.extra_env: dict` makes
+    # the dataclass unhashable — can't use the params object itself as a
+    # dict key. Labels are stable and uniquely identify a grid cell.
+    points_by_label: dict[str, RunParams] = {p.label(): p for p in points}
 
-    ckpts: dict[RunParams, Path | None] = {
-        p: (checkpoint_dir / f"{p.label()}.jsonl") if checkpoint_dir is not None else None for p in points
+    ckpts: dict[str, Path | None] = {
+        lbl: (checkpoint_dir / f"{lbl}.jsonl") if checkpoint_dir is not None else None for lbl in points_by_label
     }
-    done_ids: dict[RunParams, set[str]] = {p: read_checkpoint(c) if c is not None else set() for p, c in ckpts.items()}
-    results_by_cell: dict[RunParams, list[EvalResult]] = {
-        p: (_load_existing_results(c, done_ids[p]) if c is not None else []) for p, c in ckpts.items()
+    done_ids: dict[str, set[str]] = {lbl: read_checkpoint(c) if c is not None else set() for lbl, c in ckpts.items()}
+    results_by_cell: dict[str, list[EvalResult]] = {
+        lbl: (_load_existing_results(c, done_ids[lbl]) if c is not None else []) for lbl, c in ckpts.items()
     }
 
     pending: list[tuple[BenchmarkInstance, list[RunParams]]] = []
     for inst in instances:
-        needed = [p for p in points if inst.instance_id not in done_ids[p]]
+        needed = [p for p in points if inst.instance_id not in done_ids[p.label()]]
         if needed:
             pending.append((inst, needed))
 
     def _make_pool() -> ProcessPoolExecutor:
         ctx = mp.get_context("spawn")
-        p = ProcessPoolExecutor(max_workers=workers, mp_context=ctx, max_tasks_per_child=50)
+        p = ProcessPoolExecutor(max_workers=workers, mp_context=ctx, max_tasks_per_child=20)
         list(p.map(int, range(workers)))
         return p
 
     def _record_per_cell(per_cell_results: list[tuple[RunParams, EvalResult]]) -> None:
         for params, result in per_cell_results:
-            ckpt = ckpts.get(params)
+            lbl = params.label()
+            ckpt = ckpts.get(lbl)
             if ckpt is not None:
                 err = str((result.extra or {}).get("error", ""))
                 if "BrokenProcessPool" not in err:
                     append_checkpoint(ckpt, result)
-            results_by_cell[params].append(result)
+            results_by_cell[lbl].append(result)
 
     def _drain(pool: ProcessPoolExecutor) -> None:
         futures: dict = {}
@@ -258,11 +263,11 @@ def _drain(pool: ProcessPoolExecutor) -> None:
                     # Recompute pending for the rebuild from current
                     # checkpoint state — instances completed since last
                     # rebuild should be skipped.
-                    done_ids_now = {p: read_checkpoint(c) if c is not None else set() for p, c in ckpts.items()}
+                    done_ids_now = {lbl: read_checkpoint(c) if c is not None else set() for lbl, c in ckpts.items()}
                     pending[:] = [
-                        (inst, [p for p in points if inst.instance_id not in done_ids_now[p]])
+                        (inst, [p for p in points if inst.instance_id not in done_ids_now[p.label()]])
                         for inst, _ in pending
-                        if any(inst.instance_id not in done_ids_now[p] for p in points)
+                        if any(inst.instance_id not in done_ids_now[p.label()] for p in points)
                     ]
         elif pending and pool is None:
             # workers == 1: serial fallback
@@ -278,11 +283,12 @@ def _drain(pool: ProcessPoolExecutor) -> None:
 
     out: list[TrialResult] = []
     for i, params in enumerate(points):
-        agg = evaluator.aggregate_per_benchmark(results_by_cell[params])
+        cell_results = results_by_cell[params.label()]
+        agg = evaluator.aggregate_per_benchmark(cell_results)
         trial = TrialResult(
             params=params,
             per_benchmark=agg,
-            raw_results=tuple(results_by_cell[params]),
+            raw_results=tuple(cell_results),
         )
         out.append(trial)
         if on_trial is not None:
diff --git a/diffctx/src/discovery.rs b/diffctx/src/discovery.rs
@@ -184,42 +184,54 @@ impl DiscoveryStrategy for BM25Discovery {
         if query_tokens.is_empty() {
             return Vec::new();
         }
+        let query_set: FxHashSet<String> = query_tokens.into_iter().collect();
 
         let changed_set: FxHashSet<&Path> = ctx.changed_files.iter().map(|p| p.as_path()).collect();
-        let mut corpus: Vec<Vec<String>> = Vec::new();
-        let mut paths: Vec<PathBuf> = Vec::new();
 
-        for f in &ctx.all_candidates {
-            if changed_set.contains(f.as_path()) {
-                continue;
-            }
-            let content = match ctx.read_file(f) {
-                Some(c) => c,
-                None => continue,
-            };
-            corpus.push(extract_identifier_list(
-                &content,
-                BM25.min_query_token_length,
-            ));
-            paths.push(f.clone());
-        }
+        // Parallel tokenization: this was previously a serial loop and the
+        // dominant cost on mega-repos (vscode/mui, ~5k TS files). `par_iter`
+        // spreads the per-file work across the available rayon threads.
+        let pairs: Vec<(PathBuf, Vec<String>)> = ctx
+            .all_candidates
+            .par_iter()
+            .filter(|f| !changed_set.contains(f.as_path()))
+            .filter_map(|f| {
+                let content = ctx.read_file(f)?;
+                Some((
+                    f.clone(),
+                    extract_identifier_list(&content, BM25.min_query_token_length),
+                ))
+            })
+            .collect();
 
-        if corpus.is_empty() {
+        if pairs.is_empty() {
             return Vec::new();
         }
+        let n_docs = pairs.len();
+        if n_docs > 5000 {
+            tracing::warn!(
+                "BM25Discovery: large candidate corpus ({n_docs} docs) — using inverted-index fast path"
+            );
+        }
 
-        let n_docs = corpus.len();
-        let avgdl = corpus.iter().map(|d| d.len()).sum::<usize>() as f64 / n_docs as f64;
-
+        // Single pass: compute df for every term, but build inverted-index
+        // posting lists for query terms only (postings for non-query terms
+        // are never read and would balloon memory on large repos).
         let mut df: FxHashMap<String, usize> = FxHashMap::default();
-        for doc in &corpus {
+        let mut postings: FxHashMap<String, Vec<usize>> = FxHashMap::default();
+        let mut total_len: usize = 0;
+        for (doc_id, (_, doc)) in pairs.iter().enumerate() {
+            total_len += doc.len();
             let unique: FxHashSet<&str> = doc.iter().map(|s| s.as_str()).collect();
             for term in unique {
                 *df.entry(term.to_string()).or_insert(0) += 1;
+                if query_set.contains(term) {
+                    postings.entry(term.to_string()).or_default().push(doc_id);
+                }
             }
         }
+        let avgdl = total_len as f64 / n_docs as f64;
 
-        let query_set: FxHashSet<String> = query_tokens.into_iter().collect();
         let idf: FxHashMap<String, f64> = query_set
             .iter()
             .map(|t| {
@@ -230,23 +242,37 @@ impl DiscoveryStrategy for BM25Discovery {
             })
             .collect();
 
-        let scores: Vec<f64> = corpus
+        // Candidate doc-ids = union of posting lists for query terms. Docs
+        // not in this set contain zero query terms and would score 0 — skip
+        // them. This is the algorithmic win: scoring shrinks from O(N_docs)
+        // to O(|posting-list union|), typically ~10-100× smaller on big
+        // corpora where the query is sparse against the corpus vocabulary.
+        let mut candidate_ids: FxHashSet<usize> = FxHashSet::default();
+        for term in &query_set {
+            if let Some(p) = postings.get(term) {
+                candidate_ids.extend(p);
+            }
+        }
+        if candidate_ids.is_empty() {
+            return Vec::new();
+        }
+
+        let candidate_vec: Vec<usize> = candidate_ids.into_iter().collect();
+        let scored: Vec<(usize, f64)> = candidate_vec
             .par_iter()
-            .map(|doc| Self::bm25_score(doc, &query_set, &idf, avgdl))
+            .map(|&doc_id| {
+                let s = Self::bm25_score(&pairs[doc_id].1, &query_set, &idf, avgdl);
+                (doc_id, s)
+            })
             .collect();
 
-        let mut ranked: Vec<usize> = (0..scores.len()).collect();
-        ranked.sort_by(|&a, &b| {
-            scores[b]
-                .partial_cmp(&scores[a])
-                .unwrap_or(std::cmp::Ordering::Equal)
-        });
+        let mut ranked: Vec<(usize, f64)> = scored.into_iter().filter(|(_, s)| *s > 0.0).collect();
+        ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
 
         ranked
             .into_iter()
             .take(self.top_k)
-            .filter(|&i| scores[i] > 0.0)
-            .map(|i| paths[i].clone())
+            .map(|(i, _)| pairs[i].0.clone())
             .collect()
     }
 }