Add bench results, rename bench tiers, and fix premature termination in beam retriever

carsontung666 · carsontung666 · commit dad82a831f6e · 2026-04-24T11:04:00.000+08:00
diff --git a/README.md b/README.md
@@ -142,18 +142,30 @@ export ANTHROPIC_API_KEY=sk-ant-...
 python bench/run_swebench_filetree.py --tier medium
 ```
 
-Tiers:
+Tiers (by retriever difficulty; lower difficulty = more path signal in query):
 
 ```
-strict   107 queries   sanity check (gold path appears in query text)
-medium   133 queries   main report
-loose    261 queries   fuzzy matching
-full     500 queries   includes ~48% path-signal-less queries
+easy     107 queries   gold path appears in query text (sanity check)
+medium   133 queries   gold filename appears in query    (main report)
+hard     261 queries   gold module stem appears          (fuzzy matching)
+all      500 queries   no filter, includes ~48% path-signal-less queries
 ```
 
 Output goes to `bench/runs/<timestamp>__<tier>/`: `report.md`, `summary.json`,
 `per_query.jsonl`.
 
+#### Snapshot (Claude Sonnet 4.6, `--strategy auto`, top-k=10)
+
+| tier   | n   | hit@1 | hit@3 | hit@5 | hit@10 |  MRR  | nDCG@10 |
+|--------|-----|-------|-------|-------|--------|-------|---------|
+| easy   | 107 | 0.776 | 0.841 | 0.841 | 0.841  | 0.805 | 0.772   |
+| medium | 133 | 0.797 | 0.850 | 0.850 | 0.850  | 0.821 | 0.787   |
+
+Path-only filesystem retrieval lands gold in the top 10 for ~85% of queries
+whose text contains the gold path or filename. The `hard` tier (261 queries,
+module-stem signal only) is in progress and will be added once Anthropic API
+rate-limit retries are wired in. `all` (unfiltered; ~48% of its queries carry no path signal) has not been run.
+
 ### Document mode — single long document
 
 Compares retriever algorithms (Block / Beam / Vertical / ...) on one
diff --git a/bench/run_swebench_filetree.py b/bench/run_swebench_filetree.py
@@ -4,8 +4,8 @@
 
 Usage:
   python bench/run_swebench_filetree.py --tier medium
-  python bench/run_swebench_filetree.py --tier strict --limit 20
-  python bench/run_swebench_filetree.py --tier full --model claude-sonnet-4-7
+  python bench/run_swebench_filetree.py --tier easy --limit 20
+  python bench/run_swebench_filetree.py --tier all --model claude-sonnet-4-6
 
 Outputs to bench/runs/<timestamp>__<tier>/:
   config.json         run metadata
@@ -18,14 +18,12 @@
 import argparse
 import json
 import math
-import os
 import shutil
-import statistics
 import sys
 import tempfile
 import time
 import traceback
-from collections import Counter, defaultdict
+from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
 
@@ -45,7 +43,8 @@ def load_jsonl(path: Path) -> list[dict]:
 
 
 def load_tier(data_dir: Path, tier: str) -> tuple[list[dict], list[dict]]:
-    if tier == "full":
+    # "all" is the unfiltered 500-query set, stored as queries.jsonl / qrels.jsonl
+    if tier == "all":
         q_path, qr_path = data_dir / "queries.jsonl", data_dir / "qrels.jsonl"
     else:
         q_path = data_dir / f"queries_{tier}.jsonl"
@@ -198,14 +197,12 @@ def run(args):
                 print(f"[skip] missing snapshot {fs_json.name}")
                 continue
 
-            t0 = time.time()
             try:
                 tree_id = build_tree_for_snapshot(db, fs_json)
             except Exception as e:
                 print(f"[ingest-err] {slug}__{commit}: {e}")
                 continue
             info = db.tree_info(tree_id)
-            ingest_ms = int((time.time() - t0) * 1000)
 
             for q in qs:
                 qid = q["id"]
@@ -274,6 +271,8 @@ def run(args):
         "per_path_signal_level": by_signal,
         "per_snapshot_size_bucket": by_bucket,
     }
+    # Re-ensure out_dir exists in case it was removed mid-run
+    out_dir.mkdir(parents=True, exist_ok=True)
     (out_dir / "summary.json").write_text(json.dumps(summary, indent=2))
     (out_dir / "report.md").write_text(render_report(summary, records))
 
@@ -367,15 +366,12 @@ def table(title, d, key_label):
 
 def main():
     p = argparse.ArgumentParser()
-    p.add_argument("--tier", choices=["strict", "medium", "loose", "full"], default="medium")
+    p.add_argument("--tier", choices=["easy", "medium", "hard", "all"], default="medium")
     p.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR))
     p.add_argument("--model", default=DEFAULT_MODEL)
     p.add_argument("--provider", default="anthropic")
     p.add_argument("--top-k", type=int, default=10)
-    # Default to block: beam terminates prematurely on path-only virtual-JSON roots
-    # when node_count <= 50 (it selects the virtual root dir and stops).
-    # Block retriever handles the whole path-only tree correctly across all sizes.
-    p.add_argument("--strategy", choices=["auto", "beam", "block"], default="block")
+    p.add_argument("--strategy", choices=["auto", "beam", "block"], default="auto")
     p.add_argument("--max-turns", type=int, default=None)
     p.add_argument("--limit", type=int, default=0, help="0 = all")
     p.add_argument("--output-dir", default=None)
diff --git a/contextdb/retriever/algorithm/beam_retriever.py b/contextdb/retriever/algorithm/beam_retriever.py
@@ -113,6 +113,18 @@ def retrieve(
             # Build lookup map for O(1) access
             candidates_map = {c["node_id"]: c for c in candidates}
 
+            # Guard against premature termination in filesystem mode: the LLM must
+            # commit to at least one non-directory before we accept done=True.
+            # Otherwise we'd stop after picking a directory and return no files.
+            if done and self.mode == "filesystem":
+                picked_file = any(
+                    not candidates_map.get(nid, {}).get("is_dir", False)
+                    for nid in ranked_ids
+                )
+                if not picked_file:
+                    log.debug("turn %d: overriding done=True (all ranked are directories)", turn)
+                    done = False
+
             # Show LLM decision
             log.debug("turn %d: LLM ranked top-%d, done=%s", turn, len(ranked_ids), done)
             for i, nid in enumerate(ranked_ids[:5]):