Skip to content

Commit dad82a8

Browse files
committed
bench results and fix bugs for beam retriever
1 parent 979b494 commit dad82a8

3 files changed

Lines changed: 38 additions & 18 deletions

File tree

README.md

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,18 +142,30 @@ export ANTHROPIC_API_KEY=sk-ant-...
142142
python bench/run_swebench_filetree.py --tier medium
143143
```
144144

145-
Tiers:
145+
Tiers (by retriever difficulty; lower difficulty = more path signal in query):
146146

147147
```
148-
strict 107 queries sanity check (gold path appears in query text)
149-
medium 133 queries main report
150-
loose 261 queries fuzzy matching
151-
full 500 queries includes ~48% path-signal-less queries
148+
easy 107 queries gold path appears in query text (sanity check)
149+
medium 133 queries gold filename appears in query (main report)
150+
hard 261 queries gold module stem appears (fuzzy matching)
151+
all 500 queries no filter, includes ~48% path-signal-less queries
152152
```
153153

154154
Output goes to `bench/runs/<timestamp>__<tier>/`: `report.md`, `summary.json`,
155155
`per_query.jsonl`.
156156

157+
#### Snapshot (Claude Sonnet 4.6, `--strategy auto`, top-k=10)
158+
159+
| tier | n | hit@1 | hit@3 | hit@5 | hit@10 | MRR | nDCG@10 |
160+
|--------|-----|-------|-------|-------|--------|-------|---------|
161+
| easy | 107 | 0.776 | 0.841 | 0.841 | 0.841 | 0.805 | 0.772 |
162+
| medium | 133 | 0.797 | 0.850 | 0.850 | 0.850 | 0.821 | 0.787 |
163+
164+
Path-only filesystem retrieval lands gold in top-10 for ~85% of queries that
165+
contain the gold path or filename. The `hard` tier (261 queries, module-stem
166+
signal only) is in progress and will be added once Anthropic API rate-limit
167+
retries are wired in. The `all` tier (unfiltered; ~48% of its queries carry no path signal) has not been run.
168+
157169
### Document mode — single long document
158170

159171
Compares retriever algorithms (Block / Beam / Vertical / ...) on one

bench/run_swebench_filetree.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
55
Usage:
66
python bench/run_swebench_filetree.py --tier medium
7-
python bench/run_swebench_filetree.py --tier strict --limit 20
8-
python bench/run_swebench_filetree.py --tier full --model claude-sonnet-4-7
7+
python bench/run_swebench_filetree.py --tier easy --limit 20
8+
python bench/run_swebench_filetree.py --tier all --model claude-sonnet-4-6
99
1010
Outputs to bench/runs/<timestamp>__<tier>/:
1111
config.json run metadata
@@ -18,14 +18,12 @@
1818
import argparse
1919
import json
2020
import math
21-
import os
2221
import shutil
23-
import statistics
2422
import sys
2523
import tempfile
2624
import time
2725
import traceback
28-
from collections import Counter, defaultdict
26+
from collections import defaultdict
2927
from datetime import datetime
3028
from pathlib import Path
3129

@@ -45,7 +43,8 @@ def load_jsonl(path: Path) -> list[dict]:
4543

4644

4745
def load_tier(data_dir: Path, tier: str) -> tuple[list[dict], list[dict]]:
48-
if tier == "full":
46+
# "all" is the unfiltered 500-query set, stored as queries.jsonl / qrels.jsonl
47+
if tier == "all":
4948
q_path, qr_path = data_dir / "queries.jsonl", data_dir / "qrels.jsonl"
5049
else:
5150
q_path = data_dir / f"queries_{tier}.jsonl"
@@ -198,14 +197,12 @@ def run(args):
198197
print(f"[skip] missing snapshot {fs_json.name}")
199198
continue
200199

201-
t0 = time.time()
202200
try:
203201
tree_id = build_tree_for_snapshot(db, fs_json)
204202
except Exception as e:
205203
print(f"[ingest-err] {slug}__{commit}: {e}")
206204
continue
207205
info = db.tree_info(tree_id)
208-
ingest_ms = int((time.time() - t0) * 1000)
209206

210207
for q in qs:
211208
qid = q["id"]
@@ -274,6 +271,8 @@ def run(args):
274271
"per_path_signal_level": by_signal,
275272
"per_snapshot_size_bucket": by_bucket,
276273
}
274+
# Re-ensure out_dir exists in case it was removed mid-run
275+
out_dir.mkdir(parents=True, exist_ok=True)
277276
(out_dir / "summary.json").write_text(json.dumps(summary, indent=2))
278277
(out_dir / "report.md").write_text(render_report(summary, records))
279278

@@ -367,15 +366,12 @@ def table(title, d, key_label):
367366

368367
def main():
369368
p = argparse.ArgumentParser()
370-
p.add_argument("--tier", choices=["strict", "medium", "loose", "full"], default="medium")
369+
p.add_argument("--tier", choices=["easy", "medium", "hard", "all"], default="medium")
371370
p.add_argument("--data-dir", default=str(DEFAULT_DATA_DIR))
372371
p.add_argument("--model", default=DEFAULT_MODEL)
373372
p.add_argument("--provider", default="anthropic")
374373
p.add_argument("--top-k", type=int, default=10)
375-
# Default to block: beam terminates prematurely on path-only virtual-JSON roots
376-
# when node_count <= 50 (it selects the virtual root dir and stops).
377-
# Block retriever handles the whole path-only tree correctly across all sizes.
378-
p.add_argument("--strategy", choices=["auto", "beam", "block"], default="block")
374+
p.add_argument("--strategy", choices=["auto", "beam", "block"], default="auto")
379375
p.add_argument("--max-turns", type=int, default=None)
380376
p.add_argument("--limit", type=int, default=0, help="0 = all")
381377
p.add_argument("--output-dir", default=None)

contextdb/retriever/algorithm/beam_retriever.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,18 @@ def retrieve(
113113
# Build lookup map for O(1) access
114114
candidates_map = {c["node_id"]: c for c in candidates}
115115

116+
# Guard against premature termination in filesystem mode: the LLM must
117+
# commit to at least one non-directory before we accept done=True.
118+
# Otherwise we'd stop after picking a directory and return no files.
119+
if done and self.mode == "filesystem":
120+
picked_file = any(
121+
not candidates_map.get(nid, {}).get("is_dir", False)
122+
for nid in ranked_ids
123+
)
124+
if not picked_file:
125+
log.debug("turn %d: overriding done=True (all ranked are directories)", turn)
126+
done = False
127+
116128
# Show LLM decision
117129
log.debug("turn %d: LLM ranked top-%d, done=%s", turn, len(ranked_ids), done)
118130
for i, nid in enumerate(ranked_ids[:5]):

0 commit comments

Comments
 (0)