Skip to content

Commit 773096d

Browse files
sjarmak and claude committed
Add GT coverage infrastructure: audit, verification, cross-validation, registry, and routing
New scripts:
- audit_gt_coverage.py: scan all suites, report GT status per task (valid/invalid/empty/missing)
- verify_oracle_fail2pass.py: verify GT files produce perfect retrieval scores
- cross_validate_gt.py: compare curator vs manual GT with file-level F1
- update_gt_registry.py: rebuild configs/ground_truth_files.json from GT files

Modified:
- daytona_curator_runner.py: add --manifest, --auto-route/--local-only/--daytona-only flags, classify_task_environment() for SWEAP image routing, load_manifest_tasks() for batch generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 91ed94e commit 773096d

File tree

7 files changed

+993
-11
lines changed

7 files changed

+993
-11
lines changed

docs/ops/SCRIPT_INDEX.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
129129
- `scripts/swap_default_branch.sh` - Infrastructure or mirror management script for swap default branch.
130130
- `scripts/sync_oracle_files.py` - Infrastructure or mirror management script for sync oracle files.
131131
- `scripts/sync_pytorch_verifiers.sh` - Infrastructure or mirror management script for sync pytorch verifiers.
132+
- `scripts/update_gt_registry.py` - Infrastructure or mirror management script for update gt registry.
132133
- `scripts/update_sg_only_mirrors.py` - Infrastructure or mirror management script for update sg only mirrors.
133134

134135
## Library / Helpers
@@ -166,6 +167,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
166167
## Misc
167168

168169
- `scripts/add_verification_metadata.py` - Utility script for add verification metadata.
170+
- `scripts/audit_gt_coverage.py` - Utility script for audit gt coverage.
169171
- `scripts/audit_official_scores.py` - Utility script for audit official scores.
170172
- `scripts/audit_unpinned_repos.py` - Utility script for audit unpinned repos.
171173
- `scripts/audit_v2_report_data.py` - Utility script for audit v2 report data.
@@ -181,6 +183,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
181183
- `scripts/context_retrieval_agent.py` - Utility script for context retrieval agent.
182184
- `scripts/control_plane.py` - Utility script for control plane.
183185
- `scripts/convert_harbor_to_contextbench.py` - Utility script for convert harbor to contextbench.
186+
- `scripts/cross_validate_gt.py` - Utility script for cross validate gt.
184187
- `scripts/cross_validate_oracles.py` - Utility script for cross validate oracles.
185188
- `scripts/daytona_curator_runner.py` - Utility script for daytona curator runner.
186189
- `scripts/daytona_poc_runner.py` - Utility script for daytona poc runner.
@@ -249,6 +252,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
249252
- `scripts/select_contextbench_pilot.py` - Utility script for select contextbench pilot.
250253
- `scripts/smoke_artifact_verifier.py` - Utility script for smoke artifact verifier.
251254
- `scripts/smoke_test_tasks.py` - Utility script for smoke test tasks.
255+
- `scripts/verify_oracle_fail2pass.py` - Utility script for verify oracle fail2pass.
252256
- `scripts/verify_retrieval_eval_smoke.py` - Utility script for verify retrieval eval smoke.
253257

254258
## Regeneration

scripts/audit_gt_coverage.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
#!/usr/bin/env python3
2+
"""Audit ground truth coverage across all benchmark suites.
3+
4+
Scans benchmarks/csb_*/ and benchmarks/ccb_*/ suite directories for GT files,
5+
categorizes each task, and reports coverage per suite.
6+
7+
Usage:
8+
python3 scripts/audit_gt_coverage.py
9+
python3 scripts/audit_gt_coverage.py --output manifest.json
10+
python3 scripts/audit_gt_coverage.py --threshold 0.90
11+
"""
12+
from __future__ import annotations
13+
14+
import argparse
15+
import json
16+
import sys
17+
from pathlib import Path
18+
19+
# Repository root: two directory levels above this file's resolved location.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory whose sub-directories are the benchmark suites.
BENCHMARKS_DIR = REPO_ROOT.joinpath("benchmarks")

# GT file names in priority order: manually authored files are listed (and
# therefore probed) before curator-generated ones.
MANUAL_GT_FILES = ["ground_truth.json", "oracle_answer.json"]
CURATOR_GT_FILES = ["ground_truth_agent.json"]
ALL_GT_FILES = [*MANUAL_GT_FILES, *CURATOR_GT_FILES]
26+
27+
28+
def classify_gt(tests_dir: Path) -> tuple[str, str, str | None]:
    """Classify a task's GT status and provenance.

    The candidate names in ``ALL_GT_FILES`` are probed in order (manual
    names first, then curator names). The first candidate that exists as a
    regular file decides the result, even when that file turns out to be
    unusable; later candidates are not consulted.

    Args:
        tests_dir: The task's ``tests`` directory.

    Returns:
        A ``(status, provenance, gt_file_path)`` tuple:
            status: 'valid' | 'invalid-schema' | 'empty' | 'missing'
            provenance: 'manual' | 'curator' | 'none'
            gt_file_path: path of the GT file used, relative to REPO_ROOT
                when it lies inside the repository, otherwise as given;
                None when no GT file was found.
    """
    for gt_name in ALL_GT_FILES:
        gt_path = tests_dir / gt_name
        # Only regular files count; a directory that happens to carry a GT
        # file name is skipped instead of crashing the read below.
        if not gt_path.is_file():
            continue

        provenance = "manual" if gt_name in MANUAL_GT_FILES else "curator"
        try:
            rel_path = str(gt_path.relative_to(REPO_ROOT))
        except ValueError:
            # tests_dir is outside the repository: report the path as-is.
            rel_path = str(gt_path)

        try:
            # Decode as UTF-8 explicitly so the result does not depend on
            # the platform's locale encoding.
            data = json.loads(gt_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError, UnicodeDecodeError):
            # Unreadable, undecodable, or malformed JSON: report the file
            # as unusable rather than aborting the whole audit.
            return "invalid-schema", provenance, rel_path

        if not isinstance(data, dict):
            return "invalid-schema", provenance, rel_path

        if "files" not in data:
            # Legacy onboard-search format uses 'function_id' instead.
            if "function_id" in data:
                return "valid", provenance, rel_path
            return "invalid-schema", provenance, rel_path

        files = data["files"]
        if not isinstance(files, list) or not files:
            return "empty", provenance, rel_path

        return "valid", provenance, rel_path

    return "missing", "none", None
67+
68+
69+
def detect_dockerfile_type(task_dir: Path) -> str:
    """Detect which Dockerfile variants exist for a task.

    Looks at the entries of ``task_dir / "environment"`` whose name starts
    with ``Dockerfile``. Each such entry contributes the remainder of its
    name after that prefix, with leading dots removed; an empty remainder
    (a plain ``Dockerfile``) is reported as ``default``.

    Args:
        task_dir: Directory of a single task.

    Returns:
        The variant names joined by commas, in sorted entry order, or
        ``"none"`` when the environment directory is absent or contains no
        matching entry.
    """
    env_dir = task_dir / "environment"
    if not env_dir.is_dir():
        return "none"

    prefix = "Dockerfile"
    variants = []
    for entry in sorted(env_dir.iterdir()):
        if not entry.name.startswith(prefix):
            continue
        # Strip only the leading prefix. str.replace() would also delete
        # later occurrences of "Dockerfile" inside the variant name.
        suffix = entry.name[len(prefix):].lstrip(".")
        variants.append(suffix or "default")
    return ",".join(variants) if variants else "none"
81+
82+
83+
def scan_benchmarks() -> list[dict]:
    """Walk every benchmark suite and classify each of its tasks.

    A suite is a sub-directory of BENCHMARKS_DIR whose name starts with
    ``csb_`` or ``ccb_``; a task is a sub-directory of a suite. Suites and
    tasks are visited in sorted order.

    Returns:
        One dict per task with the keys ``suite``, ``task_id``, ``status``,
        ``provenance``, ``gt_file`` and ``dockerfile_type``.
    """
    suite_prefixes = ("csb_", "ccb_")
    suite_dirs = [
        candidate
        for candidate in sorted(BENCHMARKS_DIR.iterdir())
        if candidate.is_dir() and candidate.name.startswith(suite_prefixes)
    ]

    records = []
    for suite in suite_dirs:
        for task in sorted(suite.iterdir()):
            if not task.is_dir():
                continue

            tests = task / "tests"
            if tests.is_dir():
                verdict = classify_gt(tests)
            else:
                # A task without a tests/ directory counts as missing.
                verdict = ("missing", "none", None)
            status, provenance, gt_file = verdict

            records.append({
                "suite": suite.name,
                "task_id": task.name,
                "status": status,
                "provenance": provenance,
                "gt_file": gt_file,
                "dockerfile_type": detect_dockerfile_type(task),
            })

    return records
116+
117+
118+
def print_summary_table(results: list[dict]) -> None:
    """Print a per-suite coverage table, plus a TOTAL row, to stdout.

    Args:
        results: Task records; only the ``suite`` and ``status`` keys are
            read. A status outside the four known values still counts
            toward ``total`` but toward no other column.
    """
    columns = ("total", "valid", "invalid", "empty", "missing")
    # Which counter each task status increments.
    column_for_status = {
        "valid": "valid",
        "invalid-schema": "invalid",
        "empty": "empty",
        "missing": "missing",
    }

    per_suite: dict[str, dict] = {}
    for record in results:
        counts = per_suite.setdefault(record["suite"], dict.fromkeys(columns, 0))
        counts["total"] += 1
        column = column_for_status.get(record["status"])
        if column is not None:
            counts[column] += 1

    def render_row(label: str, counts: dict) -> str:
        # Coverage is the share of valid tasks, as a percentage.
        if counts["total"] > 0:
            cov = counts["valid"] / counts["total"] * 100
        else:
            cov = 0
        return f"{label:<40} {counts['total']:>5} {counts['valid']:>5} {counts['invalid']:>7} {counts['empty']:>5} {counts['missing']:>7} {cov:>8.1f}%"

    header = f"{'suite':<40} {'total':>5} {'valid':>5} {'invalid':>7} {'empty':>5} {'missing':>7} {'coverage%':>9}"
    rule = "-" * len(header)
    print(header)
    print(rule)

    overall = dict.fromkeys(columns, 0)
    for suite_name in sorted(per_suite):
        counts = per_suite[suite_name]
        print(render_row(suite_name, counts))
        for column in columns:
            overall[column] += counts[column]

    print(rule)
    print(render_row("TOTAL", overall))
152+
153+
154+
def build_manifest(results: list[dict]) -> list[dict]:
    """Build the manifest of tasks whose GT status is not 'valid'.

    Args:
        results: Task records carrying at least the keys ``suite``,
            ``task_id``, ``status`` and ``dockerfile_type``.

    Returns:
        A new list, in input order, holding one dict per non-valid task
        reduced to those four keys.
    """
    kept_fields = ("suite", "task_id", "status", "dockerfile_type")
    manifest = []
    for record in results:
        if record["status"] != "valid":
            manifest.append({field: record[field] for field in kept_fields})
    return manifest
166+
167+
168+
def main() -> int:
    """Run the audit from the command line.

    Scans the benchmarks, prints the summary table and the overall
    coverage, optionally writes the manifest of non-valid tasks as JSON,
    and compares the coverage with the requested threshold.

    Returns:
        0 when coverage meets the threshold, 1 when it is below it.
    """
    cli = argparse.ArgumentParser(
        description="Audit ground truth coverage across benchmark suites"
    )
    cli.add_argument(
        "--output", "-o",
        help="Write manifest JSON of non-valid tasks to this path",
    )
    cli.add_argument(
        "--threshold", "-t",
        type=float, default=0.75,
        help="Coverage threshold (0-1). Exit 1 if below (default: 0.75)",
    )
    options = cli.parse_args()

    records = scan_benchmarks()
    print_summary_table(records)

    total = len(records)
    valid = len([rec for rec in records if rec["status"] == "valid"])
    # With no tasks at all, coverage is reported as 0 instead of dividing by zero.
    coverage = valid / total if total > 0 else 0

    print(f"\nOverall coverage: {valid}/{total} = {coverage:.1%}")
    print(f"Threshold: {options.threshold:.0%}")

    if options.output:
        manifest = build_manifest(records)
        serialized = json.dumps(manifest, indent=2) + "\n"
        Path(options.output).write_text(serialized)
        print(f"Manifest written to {options.output} ({len(manifest)} tasks)")

    below_threshold = coverage < options.threshold
    if below_threshold:
        print("FAIL: Coverage below threshold")
    else:
        print("PASS: Coverage meets threshold")
    return 1 if below_threshold else 0
204+
205+
206+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)