44
55Usage:
66 python bench/run_swebench_filetree.py --tier medium
7- python bench/run_swebench_filetree.py --tier strict --limit 20
8- python bench/run_swebench_filetree.py --tier full --model claude-sonnet-4-7
7+ python bench/run_swebench_filetree.py --tier easy --limit 20
8+ python bench/run_swebench_filetree.py --tier all --model claude-sonnet-4-6
99
1010Outputs to bench/runs/<timestamp>__<tier>/:
1111 config.json run metadata
1818import argparse
1919import json
2020import math
21- import os
2221import shutil
23- import statistics
2422import sys
2523import tempfile
2624import time
2725import traceback
28- from collections import Counter , defaultdict
26+ from collections import defaultdict
2927from datetime import datetime
3028from pathlib import Path
3129
@@ -45,7 +43,8 @@ def load_jsonl(path: Path) -> list[dict]:
4543
4644
4745def load_tier (data_dir : Path , tier : str ) -> tuple [list [dict ], list [dict ]]:
48- if tier == "full" :
46+ # "all" is the unfiltered 500-query set, stored as queries.jsonl / qrels.jsonl
47+ if tier == "all" :
4948 q_path , qr_path = data_dir / "queries.jsonl" , data_dir / "qrels.jsonl"
5049 else :
5150 q_path = data_dir / f"queries_{ tier } .jsonl"
@@ -198,14 +197,12 @@ def run(args):
198197 print (f"[skip] missing snapshot { fs_json .name } " )
199198 continue
200199
201- t0 = time .time ()
202200 try :
203201 tree_id = build_tree_for_snapshot (db , fs_json )
204202 except Exception as e :
205203 print (f"[ingest-err] { slug } __{ commit } : { e } " )
206204 continue
207205 info = db .tree_info (tree_id )
208- ingest_ms = int ((time .time () - t0 ) * 1000 )
209206
210207 for q in qs :
211208 qid = q ["id" ]
@@ -274,6 +271,8 @@ def run(args):
274271 "per_path_signal_level" : by_signal ,
275272 "per_snapshot_size_bucket" : by_bucket ,
276273 }
274+ # Recreate out_dir if it was removed mid-run so the writes below do not fail
275+ out_dir .mkdir (parents = True , exist_ok = True )
277276 (out_dir / "summary.json" ).write_text (json .dumps (summary , indent = 2 ))
278277 (out_dir / "report.md" ).write_text (render_report (summary , records ))
279278
@@ -367,15 +366,12 @@ def table(title, d, key_label):
367366
368367def main ():
369368 p = argparse .ArgumentParser ()
370- p .add_argument ("--tier" , choices = ["strict " , "medium" , "loose " , "full " ], default = "medium" )
369+ p .add_argument ("--tier" , choices = ["easy " , "medium" , "hard " , "all " ], default = "medium" )
371370 p .add_argument ("--data-dir" , default = str (DEFAULT_DATA_DIR ))
372371 p .add_argument ("--model" , default = DEFAULT_MODEL )
373372 p .add_argument ("--provider" , default = "anthropic" )
374373 p .add_argument ("--top-k" , type = int , default = 10 )
375- # Default to block: beam terminates prematurely on path-only virtual-JSON roots
376- # when node_count <= 50 (it selects the virtual root dir and stops).
377- # Block retriever handles the whole path-only tree correctly across all sizes.
378- p .add_argument ("--strategy" , choices = ["auto" , "beam" , "block" ], default = "block" )
374+ p .add_argument ("--strategy" , choices = ["auto" , "beam" , "block" ], default = "auto" )
379375 p .add_argument ("--max-turns" , type = int , default = None )
380376 p .add_argument ("--limit" , type = int , default = 0 , help = "0 = all" )
381377 p .add_argument ("--output-dir" , default = None )
0 commit comments