Skip to content

Commit 64c02a5

Browse files
committed
feat: [US-010] - [Add preflight harness readiness script]
1 parent 610f7cb commit 64c02a5

File tree

4 files changed

+270
-2
lines changed

4 files changed

+270
-2
lines changed

.beads/issues.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@
149149
{"id":"CodeContextBench-wfx","title":"Rewrite MCP preamble in claude_baseline_agent.py","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-12T13:22:07.535142622Z","created_by":"LoCoBench Bot","updated_at":"2026-02-12T13:25:17.883599Z","closed_at":"2026-02-12T13:25:17.883599Z","close_reason":"Replaced 330-line SG_BASE/FULL_PREAMBLE with 15-line SG_TOOL_REFERENCE, simplified instruction preamble (removed two-phase mandate), removed system prompt MCP append, fixed finder→nls_search, simplified CLAUDE.md builder"}
150150
{"id":"CodeContextBench-wmh","title":"US-004 Create cursor_2config and gemini_2config runners","status":"closed","priority":3,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:32:40.428395443Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:45:47.513433108Z","closed_at":"2026-02-17T03:45:47.513433108Z","close_reason":"done"}
151151
{"id":"CodeContextBench-xn2","title":"US-013: Ensure all new tasks have Dockerfiles","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T00:48:27.133537053Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T00:50:42.240071565Z","closed_at":"2026-02-16T00:50:42.240071565Z","close_reason":"US-013 complete: all 25 tasks have Dockerfiles and pass syntax check"}
152-
{"id":"CodeContextBench-xp1","title":"US-010 Add preflight harness readiness script","status":"open","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:42.377723263Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:33:42.377723263Z"}
152+
{"id":"CodeContextBench-xp1","title":"US-010 Add preflight harness readiness script","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:42.377723263Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:11:04.64784541Z","closed_at":"2026-02-17T04:11:04.64784541Z","close_reason":"done"}
153153
{"id":"CodeContextBench-xp2","title":"Evaluate whether governance/enterprise tasks exercise MCP well","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T18:28:34.316428116Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T18:39:10.504574323Z","closed_at":"2026-02-15T18:39:10.504574323Z","close_reason":"60% of tasks (9/15) use ZERO MCP tools despite preamble injection. Tasks are too local for MCP benefit. Only 3 tasks exercise MCP well: dep-discovery-001 (58% ratio), conflicting-docs-001 (29%), dep-impact-001 (40%). Recommendation: redesign around cross-codebase discovery tasks."}
154154
{"id":"CodeContextBench-xs4","title":"US-016: Build enterprise report generator","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T14:03:20.824191062Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T14:07:08.049549155Z","closed_at":"2026-02-15T14:07:08.049549155Z","close_reason":"US-016 implemented and passing"}
155155
{"id":"CodeContextBench-xs8","title":"US-010: Build reliability analysis pipeline","status":"closed","priority":1,"issue_type":"feature","owner":"locobench@anthropic.com","created_at":"2026-02-15T13:50:48.063408521Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T13:53:00.916908239Z","closed_at":"2026-02-15T13:53:00.916908239Z","close_reason":"US-010 implemented and passing all acceptance criteria"}

ralph-multi-harness/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@
149149
"python3 scripts/check_harness_readiness.py --help runs successfully"
150150
],
151151
"priority": 10,
152-
"passes": false,
152+
"passes": true,
153153
"notes": ""
154154
}
155155
]

ralph-multi-harness/progress.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
- In sandboxed environments, `runs/staging` may resolve to an external symlink target; use a writable `--category` override when dry-running scaffolds locally.
1212
- In `scripts/ccb_metrics`, resolve transcript artifacts through a shared candidate list (not hardcoded `agent/claude-code.txt`) so non-Claude harness outputs are discoverable.
1313
- In `scripts/ccb_metrics/extractors.py`, treat unknown `MODEL_PRICING` keys deterministically by falling back to `_DEFAULT_MODEL` and emitting a one-time warning to keep cross-harness cost reports explainable.
14+
- Harness readiness preflight should enforce exact rollout MCP policy (`none` + `sourcegraph_full`) from registry entries and fail non-zero when required env vars are missing.
1415

1516
## Progress
1617

@@ -125,3 +126,16 @@
125126
- Useful context (e.g., "the evaluation panel is in component X")
126127
- `docs/CONFIGS.md` is already referenced for rollout config behavior, making it the lowest-friction location for Codex policy additions versus creating a new doc.
127128
---
129+
130+
## 2026-02-17 04:10:21 UTC - US-010
131+
- Implemented `scripts/check_harness_readiness.py` to validate `configs/harness_registry.json` structure, required harness keys, and rollout MCP constraints (`none` and `sourcegraph_full`) with support for `--harness` scoped checks.
132+
- Added required environment validation with non-zero exit behavior on missing vars, including global MCP readiness (`SOURCEGRAPH_ACCESS_TOKEN`) and harness auth env checks where applicable.
133+
- Files changed: `scripts/check_harness_readiness.py`, `ralph-multi-harness/prd.json`, `ralph-multi-harness/progress.txt`
134+
- **Learnings for future iterations:**
135+
- Patterns discovered (e.g., "this codebase uses X for Y")
136+
- Keep readiness checks deterministic by validating registry policy and environment prerequisites in one script that returns `0` only when both pass.
137+
- Gotchas encountered (e.g., "don't forget to update Z when changing W")
138+
- Treat auth marker files as valid credential sources to avoid false negatives when harnesses authenticate via local login state instead of env vars.
139+
- Useful context (e.g., "the evaluation panel is in component X")
140+
- `--harness` with `choices` (`codex`, `cursor`, `gemini`, `copilot`, `openhands`) provides a low-noise preflight path before full-batch runs.
141+
---

scripts/check_harness_readiness.py

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
#!/usr/bin/env python3
2+
"""Preflight harness readiness checks for multi-harness benchmark runs.
3+
4+
Validates:
5+
- `configs/harness_registry.json` exists and has required structure
6+
- required harness registry keys and rollout MCP constraints
7+
- required environment variables for selected harnesses
8+
9+
Usage:
10+
python3 scripts/check_harness_readiness.py
11+
python3 scripts/check_harness_readiness.py --harness codex
12+
python3 scripts/check_harness_readiness.py --format json
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import argparse
18+
import json
19+
import os
20+
from dataclasses import dataclass
21+
from pathlib import Path
22+
from typing import Any
23+
24+
# Two directory levels above this file; the default registry path is resolved from here.
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_REGISTRY = ROOT.joinpath("configs", "harness_registry.json")

# Harness ids that must all have a registry entry. Also used as the accepted
# values for the --harness command-line option.
REQUIRED_HARNESSES = ["codex", "cursor", "gemini", "copilot", "openhands"]

# Fields every harness entry is required to define.
REQUIRED_ENTRY_KEYS = [
    "harness_name",
    "agent_import_path",
    "default_model",
    "allowed_mcp_modes",
]

# The exact set of MCP modes each harness entry must allow in this rollout.
REQUIRED_MCP_MODES = {"none", "sourcegraph_full"}

# Environment variables that must be set regardless of which harness is selected
# (needed for MCP-enabled runs in this rollout).
GLOBAL_REQUIRED_ENVS = ["SOURCEGRAPH_ACCESS_TOKEN"]

# Per-harness auth variables. Each list is an "any one of" group: a harness passes
# when at least one of its listed variables is set. Harnesses not listed here have
# no env-based auth check.
HARNESS_REQUIRED_ENVS: dict[str, list[str]] = {
    "codex": ["OPENAI_API_KEY", "CODEX_API_KEY"],
    "gemini": ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
}

# Optional marker files; when one exists for a harness, its auth is treated as
# file-backed and the env-variable check for that harness is skipped.
HARNESS_AUTH_MARKER_FILES: dict[str, list[Path]] = {
    "codex": [Path.home().joinpath(".codex", "auth.json")],
}
48+
49+
50+
@dataclass
class CheckResult:
    """Outcome of one readiness run.

    Holds the overall pass/fail flag, the collected error and warning
    messages, and the harness ids that were checked.
    """

    # Overall verdict for the run.
    ok: bool
    # Human-readable problems that make the run fail.
    errors: list[str]
    # Human-readable notes that do not affect the verdict.
    warnings: list[str]
    # Harness ids the run covered.
    checked_harnesses: list[str]
56+
57+
58+
def _load_registry(path: Path) -> tuple[dict[str, Any] | None, list[str]]:
    """Read and decode the harness registry file.

    Args:
        path: Location of the registry JSON file.

    Returns:
        A ``(registry, errors)`` pair. ``registry`` is the decoded top-level
        JSON object, or ``None`` when the file is missing, unreadable, not
        valid UTF-8 JSON, or its root is not an object. ``errors`` contains one
        message describing the failure and is empty on success.
    """
    if not path.is_file():
        return None, [f"Registry file not found: {path}"]

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale's default encoding.
        data = json.loads(path.read_text(encoding="utf-8"))
    except (ValueError, OSError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError, so
        # a file with invalid bytes is reported instead of crashing the preflight.
        return None, [f"Failed to parse registry JSON: {exc}"]

    if not isinstance(data, dict):
        return None, ["Registry root must be a JSON object keyed by harness id"]

    return data, []
73+
74+
75+
def _validate_registry_structure(
    registry: dict[str, Any],
    selected_harnesses: list[str],
) -> tuple[list[str], list[str]]:
    """Check registry entries for the selected harnesses.

    The registry as a whole must contain every id in ``REQUIRED_HARNESSES``.
    Each selected entry must be an object that defines every key in
    ``REQUIRED_ENTRY_KEYS``, with string values for ``harness_name`` (equal to
    its registry key), ``agent_import_path`` and ``default_model``, and an
    ``allowed_mcp_modes`` list of strings whose set equals
    ``REQUIRED_MCP_MODES``. Keys beyond the required ones only produce a
    warning.

    Each problem is reported once: a field that is absent yields only the
    "missing required field" error (no additional type error), and a required
    harness that is absent from the registry yields only the summary error.

    Args:
        registry: Decoded registry object keyed by harness id.
        selected_harnesses: Harness ids whose entries should be inspected.

    Returns:
        An ``(errors, warnings)`` pair of message lists.
    """
    errors: list[str] = []
    warnings: list[str] = []

    missing_harnesses = sorted(set(REQUIRED_HARNESSES) - set(registry))
    if missing_harnesses:
        errors.append(
            "Registry missing required harness entries: " + ", ".join(missing_harnesses)
        )

    for harness in selected_harnesses:
        entry = registry.get(harness)
        if entry is None:
            # Already covered by the summary error above when the id is a
            # required harness; only report ids the summary does not mention
            # (non-required ids, or keys explicitly set to null).
            if harness not in missing_harnesses:
                errors.append(f"Harness '{harness}' not found in registry")
            continue

        if not isinstance(entry, dict):
            errors.append(f"Harness '{harness}' entry must be an object")
            continue

        for key in REQUIRED_ENTRY_KEYS:
            if key not in entry:
                errors.append(f"Harness '{harness}' missing required field '{key}'")

        # Type checks apply only to fields that are present; absent ones were
        # reported by the loop above.
        if "harness_name" in entry:
            declared_name = entry["harness_name"]
            if not isinstance(declared_name, str):
                errors.append(f"Harness '{harness}' field 'harness_name' must be a string")
            elif declared_name != harness:
                errors.append(
                    f"Harness '{harness}' has harness_name='{declared_name}' (must match key)"
                )

        for key in ("agent_import_path", "default_model"):
            if key in entry and not isinstance(entry[key], str):
                errors.append(f"Harness '{harness}' field '{key}' must be a string")

        if "allowed_mcp_modes" in entry:
            mcp_modes = entry["allowed_mcp_modes"]
            if not isinstance(mcp_modes, list) or not all(
                isinstance(mode, str) for mode in mcp_modes
            ):
                errors.append(
                    f"Harness '{harness}' field 'allowed_mcp_modes' must be a list of strings"
                )
            elif set(mcp_modes) != REQUIRED_MCP_MODES:
                errors.append(
                    f"Harness '{harness}' allowed_mcp_modes must be exactly "
                    f"{sorted(REQUIRED_MCP_MODES)}"
                )

        unknown_keys = sorted(set(entry) - set(REQUIRED_ENTRY_KEYS))
        if unknown_keys:
            warnings.append(
                f"Harness '{harness}' has extra keys (allowed but ignored): "
                + ", ".join(unknown_keys)
            )

    return errors, warnings
136+
137+
138+
def _has_auth_marker(harness: str) -> bool:
    """Return True when any marker path registered for *harness* exists.

    Harnesses with no entry in ``HARNESS_AUTH_MARKER_FILES`` return False.
    """
    for candidate in HARNESS_AUTH_MARKER_FILES.get(harness, []):
        if candidate.exists():
            return True
    return False
141+
142+
143+
def _validate_env(selected_harnesses: list[str]) -> tuple[list[str], list[str]]:
    """Check that the environment variables needed for a run are set.

    Every variable in ``GLOBAL_REQUIRED_ENVS`` must be set to a non-empty value.
    For each selected harness listed in ``HARNESS_REQUIRED_ENVS``, at least one
    of its variables must be non-empty, unless an auth marker file exists for
    that harness, in which case the check is skipped with a warning.

    Args:
        selected_harnesses: Harness ids to check.

    Returns:
        An ``(errors, warnings)`` pair of message lists.
    """
    env = os.environ
    errors: list[str] = [
        f"Missing required env var: {name}"
        for name in GLOBAL_REQUIRED_ENVS
        if not env.get(name)
    ]
    warnings: list[str] = []

    for harness in selected_harnesses:
        candidates = HARNESS_REQUIRED_ENVS.get(harness)
        if candidates:
            if _has_auth_marker(harness):
                # File-backed login state counts as valid credentials.
                warnings.append(
                    f"Harness '{harness}' auth marker file found; skipping strict env auth check"
                )
            elif not any(env.get(name) for name in candidates):
                errors.append(
                    f"Harness '{harness}' requires one of env vars: {', '.join(candidates)}"
                )

    return errors, warnings
168+
169+
170+
def evaluate_readiness(registry_path: Path, harness: str | None) -> CheckResult:
    """Run the registry and environment checks and collect the outcome.

    Args:
        registry_path: Path to the harness registry JSON file.
        harness: A single harness id to check, or ``None`` (or an empty string)
            to check every id in ``REQUIRED_HARNESSES``.

    Returns:
        A ``CheckResult`` whose ``ok`` is True only when no errors were
        collected. Environment checks run even when the registry could not be
        loaded, so all problems are reported together.
    """
    # Copy the default so the returned ``checked_harnesses`` list never aliases
    # the module-level constant; a caller mutating the result must not be able
    # to change which harnesses later runs (or the CLI choices) consider required.
    selected_harnesses = [harness] if harness else list(REQUIRED_HARNESSES)

    registry, load_errors = _load_registry(registry_path)
    errors = list(load_errors)
    warnings: list[str] = []

    if registry is not None:
        structure_errors, structure_warnings = _validate_registry_structure(
            registry,
            selected_harnesses,
        )
        errors.extend(structure_errors)
        warnings.extend(structure_warnings)

    env_errors, env_warnings = _validate_env(selected_harnesses)
    errors.extend(env_errors)
    warnings.extend(env_warnings)

    return CheckResult(
        ok=not errors,
        errors=errors,
        warnings=warnings,
        checked_harnesses=selected_harnesses,
    )
195+
196+
197+
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the readiness script.

    Options:
        --registry: registry JSON path; defaults to ``DEFAULT_REGISTRY``.
        --harness: restrict the check to one id from ``REQUIRED_HARNESSES``.
        --format: ``text`` (default) or ``json`` output.
    """
    cli = argparse.ArgumentParser(
        description="Validate harness registry and environment readiness before runs.",
    )

    registry_help = f"Path to harness registry JSON (default: {DEFAULT_REGISTRY})"
    cli.add_argument("--registry", default=str(DEFAULT_REGISTRY), help=registry_help)

    cli.add_argument(
        "--harness",
        choices=REQUIRED_HARNESSES,
        help="Validate a single harness instead of all harnesses",
    )

    cli.add_argument(
        "--format",
        choices=("text", "json"),
        default="text",
        help="Output format (default: text)",
    )
    return cli
218+
219+
220+
def main() -> int:
    """Parse command-line arguments, run the checks, and print a report.

    Returns:
        ``0`` when the readiness check passed, ``2`` otherwise.
    """
    args = _build_parser().parse_args()
    result = evaluate_readiness(Path(args.registry), args.harness)
    exit_code = 0 if result.ok else 2

    if args.format == "json":
        payload = {
            "ok": result.ok,
            "checked_harnesses": result.checked_harnesses,
            "errors": result.errors,
            "warnings": result.warnings,
        }
        print(json.dumps(payload, indent=2))
        return exit_code

    status = "OK" if result.ok else "FAILED"
    print(f"Harness readiness: {status}")
    print("Checked harnesses: " + ", ".join(result.checked_harnesses))

    # Errors first, then warnings; a section is omitted when it has no entries.
    sections = (("Errors:", result.errors), ("Warnings:", result.warnings))
    for heading, messages in sections:
        if messages:
            print(heading)
            for message in messages:
                print(f" - {message}")

    return exit_code
251+
252+
253+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())

0 commit comments

Comments
 (0)