Skip to content

Commit 14f615f

Browse files
agent: add benchmark runner CLI artifacts
1 parent 80a093c commit 14f615f

4 files changed

Lines changed: 603 additions & 0 deletions

File tree

benchmarks/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Reproducible benchmark runner plumbing for python-docs-mcp-server."""

benchmarks/__main__.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Command line entry point for the benchmark runner."""
2+
3+
from __future__ import annotations
4+
5+
import argparse
6+
import json
7+
import sys
8+
from pathlib import Path
9+
10+
from benchmarks.runner import BenchmarkConfig, BenchmarkValidationError, run_benchmark
11+
12+
13+
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser behind ``python -m benchmarks``.

    A sub-command is mandatory; ``run`` is the only one registered. It takes
    three required path options (``--corpus``, ``--manifest``, ``--out``) plus
    the optional ``--run-id`` and ``--dry-run``.
    """
    root = argparse.ArgumentParser(
        prog="python -m benchmarks",
        description="Run the reproducible Python docs benchmark harness.",
    )
    commands = root.add_subparsers(dest="command", required=True)
    run_cmd = commands.add_parser("run", help="Run benchmark cells and write raw artifacts.")

    # The required path options all share one shape, so register them from a table.
    required_paths = (
        ("--corpus", "Path to the corpus YAML file."),
        ("--manifest", "Path to the competitor manifest YAML file."),
        ("--out", "Output directory for run artifacts."),
    )
    for flag, help_text in required_paths:
        run_cmd.add_argument(flag, required=True, type=Path, help=help_text)

    run_cmd.add_argument("--run-id", help="Stable run identifier. Defaults to a UTC timestamp.")
    run_cmd.add_argument(
        "--dry-run",
        action="store_true",
        help="Validate inputs and emit planned benchmark cells without execution.",
    )
    return root
36+
37+
38+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, dispatch the chosen sub-command and return an exit status.

    For ``run`` the benchmark summary is printed to stdout as sorted, indented
    JSON and 0 is returned. A ``BenchmarkValidationError`` ends the process
    through ``parser.exit`` with status 2 and a message.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command != "run":
        # Fallback for a command without a handler: show usage, report failure.
        parser.print_help()
        return 2

    config = BenchmarkConfig(
        corpus_path=args.corpus,
        manifest_path=args.manifest,
        out_dir=args.out,
        run_id=args.run_id,
        dry_run=args.dry_run,
    )
    try:
        summary = run_benchmark(config)
    except BenchmarkValidationError as exc:
        # parser.exit raises SystemExit, so control never reaches the print below.
        parser.exit(2, f"benchmark validation failed: {exc}\n")

    rendered = json.dumps(summary, indent=2, sort_keys=True)
    print(rendered)
    return 0
60+
61+
62+
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)

benchmarks/runner.py

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
"""Benchmark runner artifact plumbing.
2+
3+
This module intentionally contains only local fake/baseline execution. Real
4+
provider adapters and report generation are follow-up work packages.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import json
10+
import platform
11+
import re
12+
import subprocess
13+
import sys
14+
import time
15+
from dataclasses import dataclass
16+
from datetime import UTC, datetime
17+
from pathlib import Path
18+
from typing import Any
19+
20+
import yaml
21+
22+
# Identifier pattern: one ASCII letter or digit, then any mix of letters, digits,
# underscores, dots, and dashes. Ids that pass it are used as directory and file
# names when per-cell artifacts are written.
_SAFE_ID = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_.-]*$")
23+
# Adapter names the local fake provider will execute; a cell whose competitor
# uses any other adapter fails with a "not implemented" error.
_EXECUTABLE_ADAPTERS = {"fake", "no-mcp-baseline", "no_mcp_baseline"}
24+
25+
26+
class BenchmarkValidationError(ValueError):
    """Signal that the corpus or manifest input cannot be run as given."""
28+
29+
30+
@dataclass(frozen=True)
31+
class BenchmarkConfig:
32+
"""Benchmark runner configuration."""
33+
34+
corpus_path: Path
35+
manifest_path: Path
36+
out_dir: Path
37+
run_id: str | None = None
38+
dry_run: bool = False
39+
40+
41+
@dataclass(frozen=True)
class Question:
    """A corpus entry that has passed validation."""

    # Unique, path-safe identifier of the question.
    id: str
    # Prompt text recorded as the user message in the transcript.
    prompt: str
    # The mapping the question was read from, kept unmodified.
    raw: dict[str, Any]
48+
49+
50+
@dataclass(frozen=True)
class Competitor:
    """A manifest entry that has passed validation."""

    # Unique, path-safe identifier of the competitor.
    id: str
    # Adapter name; decides whether the local runner can execute the competitor.
    adapter: str
    # The mapping the competitor was read from, kept unmodified.
    raw: dict[str, Any]
57+
58+
59+
@dataclass(frozen=True)
class BenchmarkCell:
    """Pairing of one competitor with one question, the unit of execution."""

    competitor: Competitor
    question: Question

    @property
    def cell_id(self) -> str:
        """Return ``"<competitor id>/<question id>"`` for this cell."""
        competitor_id = self.competitor.id
        question_id = self.question.id
        return f"{competitor_id}/{question_id}"
69+
70+
71+
def run_benchmark(config: BenchmarkConfig) -> dict[str, Any]:
    """Validate inputs, plan every cell, optionally execute them, and write artifacts.

    Files written under ``config.out_dir``: YAML snapshots of the manifest and
    corpus, ``environment.json``, ``planned-cells.json``, the per-cell
    artifacts (skipped in dry-run mode), and ``run-summary.json``.

    Args:
        config: Paths, optional run id, and the dry-run switch.

    Returns:
        The summary mapping that is also stored as ``run-summary.json``.

    Raises:
        BenchmarkValidationError: If the corpus or manifest fails validation.
    """
    # Load and validate everything before the output directory is touched.
    corpus_data = _load_yaml_mapping(config.corpus_path, "corpus")
    manifest_data = _load_yaml_mapping(config.manifest_path, "manifest")
    questions = _load_questions(corpus_data)
    competitors = _load_competitors(manifest_data)

    # Competitor-major order: every question for the first competitor, then the next.
    cells = [
        BenchmarkCell(competitor=entrant, question=item)
        for entrant in competitors
        for item in questions
    ]

    run_id = config.run_id if config.run_id else _default_run_id()
    out_root = config.out_dir
    out_root.mkdir(parents=True, exist_ok=True)

    snapshots_dir = out_root / "snapshots"
    _write_artifact(snapshots_dir / "competitor-manifest.yml", manifest_data)
    _write_artifact(snapshots_dir / "corpus.yml", corpus_data)

    started_at = _utc_now()
    environment = _environment_metadata(run_id=run_id, dry_run=config.dry_run)
    _write_json(out_root / "environment.json", environment)

    plan = [
        {"competitor_id": cell.competitor.id, "corpus_id": cell.question.id}
        for cell in cells
    ]
    _write_json(out_root / "planned-cells.json", {"cells": plan})

    # Any status other than "succeeded" is tallied as a failure.
    tally = {"succeeded": 0, "failed": 0}
    if not config.dry_run:
        for cell in cells:
            result = _execute_cell(cell)
            _write_cell_artifacts(out_root, cell, result)
            bucket = "succeeded" if result["status"] == "succeeded" else "failed"
            tally[bucket] += 1

    summary = {
        "run_id": run_id,
        "dry_run": config.dry_run,
        "started_at": started_at,
        "completed_at": _utc_now(),
        "corpus_path": str(config.corpus_path),
        "manifest_path": str(config.manifest_path),
        "artifact_root": str(out_root),
        "repo_commit": environment["repo_commit"],
        "external_provider_calls": False,
        "planned_cells": len(cells),
        "succeeded_cells": tally["succeeded"],
        "failed_cells": tally["failed"],
        "competitors": [entrant.id for entrant in competitors],
        "corpus_ids": [item.id for item in questions],
    }
    _write_json(out_root / "run-summary.json", summary)
    return summary
128+
129+
130+
def _load_yaml_mapping(path: Path, label: str) -> dict[str, Any]:
    """Read *path* as YAML and return its top-level mapping.

    Every way the file can be unusable is reported as a
    ``BenchmarkValidationError`` so callers that catch only that exception
    (such as the CLI) never surface a raw traceback for bad input.

    Args:
        path: Location of the YAML file.
        label: Human-readable name of the input ("corpus", "manifest") used in
            error messages.

    Returns:
        The parsed top-level mapping.

    Raises:
        BenchmarkValidationError: If the file is missing, cannot be read or
            decoded as UTF-8, is not valid YAML, or does not hold a mapping at
            the top level.
    """
    # Open directly instead of checking exists() first: one step, no race window,
    # and directories / permission problems are caught by the same handler chain.
    try:
        with path.open("r", encoding="utf-8") as file:
            data = yaml.safe_load(file)
    except FileNotFoundError as exc:
        raise BenchmarkValidationError(f"{label} file does not exist: {path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise BenchmarkValidationError(
            f"{label} file could not be read: {path}: {exc}"
        ) from exc
    except yaml.YAMLError as exc:
        raise BenchmarkValidationError(
            f"{label} file is not valid YAML: {path}: {exc}"
        ) from exc

    if not isinstance(data, dict):
        raise BenchmarkValidationError(f"{label} file must contain a YAML mapping")
    return data
138+
139+
140+
def _load_questions(data: dict[str, Any]) -> list[Question]:
    """Convert the corpus mapping's ``questions`` list into ``Question`` objects.

    Args:
        data: Parsed corpus mapping.

    Returns:
        The validated questions in file order.

    Raises:
        BenchmarkValidationError: If ``questions`` is absent, empty, or not a
            list; if an entry is not a mapping; if an id is missing, unsafe, or
            duplicated; or if a prompt is missing or blank.
    """
    entries = data.get("questions")
    if not (isinstance(entries, list) and entries):
        raise BenchmarkValidationError("corpus must contain a non-empty 'questions' list")

    known_ids: set[str] = set()
    validated: list[Question] = []
    for position, entry in enumerate(entries):
        where = f"corpus question at index {position}"
        if not isinstance(entry, dict):
            raise BenchmarkValidationError(f"{where} must be a mapping")

        qid = _required_safe_id(entry, "id", where)
        if qid in known_ids:
            raise BenchmarkValidationError(f"duplicate corpus id: {qid}")
        known_ids.add(qid)

        text = entry.get("prompt")
        has_prompt = isinstance(text, str) and bool(text.strip())
        if not has_prompt:
            raise BenchmarkValidationError(f"corpus question {qid} must include prompt")

        validated.append(Question(id=qid, prompt=text, raw=entry))
    return validated
159+
160+
161+
def _load_competitors(data: dict[str, Any]) -> list[Competitor]:
    """Convert the manifest mapping's ``competitors`` list into ``Competitor`` objects.

    Args:
        data: Parsed manifest mapping.

    Returns:
        The validated competitors in file order.

    Raises:
        BenchmarkValidationError: If ``competitors`` is absent, empty, or not a
            list; if an entry is not a mapping; if an id is missing, unsafe, or
            duplicated; or if an adapter is missing or blank.
    """
    entries = data.get("competitors")
    if not (isinstance(entries, list) and entries):
        raise BenchmarkValidationError("manifest must contain a non-empty 'competitors' list")

    known_ids: set[str] = set()
    validated: list[Competitor] = []
    for position, entry in enumerate(entries):
        where = f"competitor at index {position}"
        if not isinstance(entry, dict):
            raise BenchmarkValidationError(f"{where} must be a mapping")

        cid = _required_safe_id(entry, "id", where)
        if cid in known_ids:
            raise BenchmarkValidationError(f"duplicate competitor id: {cid}")
        known_ids.add(cid)

        adapter_name = entry.get("adapter")
        has_adapter = isinstance(adapter_name, str) and bool(adapter_name.strip())
        if not has_adapter:
            raise BenchmarkValidationError(f"competitor {cid} must include adapter")

        validated.append(Competitor(id=cid, adapter=adapter_name, raw=entry))
    return validated
180+
181+
182+
def _required_safe_id(item: dict[str, Any], key: str, label: str) -> str:
    """Return ``item[key]`` after checking that it is a usable identifier.

    Args:
        item: Mapping the identifier is read from.
        key: Name of the identifier field.
        label: Description of *item* used as the prefix of error messages.

    Returns:
        The identifier string, unchanged.

    Raises:
        BenchmarkValidationError: If the value is absent or blank, is present
            but not a string, or does not match the safe-id pattern.
    """
    value = item.get(key)
    if value is None or (isinstance(value, str) and not value.strip()):
        raise BenchmarkValidationError(f"{label} is missing required {key!r}")
    if not isinstance(value, str):
        # YAML turns unquoted scalars such as ``1`` or ``true`` into non-strings;
        # say so explicitly rather than claiming the field is missing.
        raise BenchmarkValidationError(
            f"{label} has non-string {key!r}: {value!r}; quote the value so it is "
            "read as a string"
        )
    if not _SAFE_ID.fullmatch(value):
        raise BenchmarkValidationError(
            f"{label} has unsafe {key!r}: {value!r}; use letters, numbers, dots, dashes, "
            "or underscores"
        )
    return value
192+
193+
194+
def _execute_cell(cell: BenchmarkCell) -> dict[str, Any]:
    """Run one cell through the fake provider and assemble its artifact records.

    Provider exceptions are captured, not propagated: the cell is marked
    ``"failed"`` and the exception's type name and message are recorded.

    Args:
        cell: The competitor/question pair to execute.

    Returns:
        A mapping with ``status``, ``transcript``, ``tokens``, ``latency``,
        ``scoring`` and ``failure`` (``None`` when the cell succeeded).
    """
    clock_start = time.perf_counter()
    started_at = _utc_now()

    answer = ""
    error: dict[str, str] | None = None
    try:
        answer = _fake_provider_answer(cell)
    except Exception as exc:  # noqa: BLE001 - failures are benchmark artifacts
        error = {"type": type(exc).__name__, "message": str(exc)}
    status = "succeeded" if error is None else "failed"

    latency_ms = round((time.perf_counter() - clock_start) * 1000, 3)
    completed_at = _utc_now()

    # Fragments shared by several of the records below.
    identity = {"competitor_id": cell.competitor.id, "corpus_id": cell.question.id}
    timing = {"started_at": started_at, "completed_at": completed_at}
    prompt = cell.question.prompt

    transcript = {
        **identity,
        "adapter": cell.competitor.adapter,
        "status": status,
        **timing,
        "messages": [{"role": "user", "content": prompt}],
        "answer": answer,
        "error": error,
        "external_provider_calls": False,
    }
    # Token counts are placeholders; only character counts are measured here.
    token_record = {
        **identity,
        "status": "placeholder",
        "input_characters": len(prompt),
        "output_characters": len(answer),
        "client_wrapped_tokens": None,
        "raw_payload_tokens": None,
        "notes": "Token counting integration is intentionally out of scope for issue #72.",
    }
    latency_record = {
        **identity,
        "status": status,
        "latency_ms": latency_ms,
        **timing,
    }
    scoring_record = {
        **identity,
        "status": "placeholder",
        "score": None,
        "requires_manual_scoring": True,
        "notes": "Correctness scoring automation is intentionally out of scope for issue #72.",
    }
    return {
        "status": status,
        "transcript": transcript,
        "tokens": token_record,
        "latency": latency_record,
        "scoring": scoring_record,
        "failure": error,
    }
255+
256+
257+
def _fake_provider_answer(cell: BenchmarkCell) -> str:
    """Produce the local stand-in answer for *cell*.

    Returns:
        The manifest's ``fake_answer`` when it is a string, otherwise the
        prompt prefixed with ``[fake:<competitor id>]``.

    Raises:
        RuntimeError: If the competitor's adapter is not one the local runner
            executes, or the manifest entry sets ``force_failure`` to ``True``.
    """
    competitor = cell.competitor
    if competitor.adapter not in _EXECUTABLE_ADAPTERS:
        raise RuntimeError(
            f"adapter {competitor.adapter!r} is not implemented in issue #72 runner"
        )

    options = competitor.raw
    # Only a literal True triggers the forced failure; other truthy values do not.
    if options.get("force_failure") is True:
        raise RuntimeError("forced fake provider failure")

    canned = options.get("fake_answer")
    if not isinstance(canned, str):
        return f"[fake:{competitor.id}] {cell.question.prompt}"
    return canned
267+
268+
269+
def _write_cell_artifacts(run_dir: Path, cell: BenchmarkCell, result: dict[str, Any]) -> None:
    """Write one cell's records as ``<kind>/<competitor id>/<corpus id>.json``.

    The transcript, tokens, latency and scoring records are always written. A
    record under ``failures`` is added only when ``result["failure"]`` is set.
    """
    competitor_id = cell.competitor.id
    corpus_id = cell.question.id
    filename = f"{corpus_id}.json"

    # (directory name, key in ``result``) for the records every cell produces.
    always_written = (
        ("transcripts", "transcript"),
        ("tokens", "tokens"),
        ("latency", "latency"),
        ("scoring", "scoring"),
    )
    for folder, key in always_written:
        _write_json(run_dir / folder / competitor_id / filename, result[key])

    failure = result["failure"]
    if failure is None:
        return
    _write_json(
        run_dir / "failures" / competitor_id / filename,
        {
            "competitor_id": competitor_id,
            "corpus_id": corpus_id,
            "status": "failed",
            "error": failure,
        },
    )
286+
287+
288+
def _environment_metadata(*, run_id: str, dry_run: bool) -> dict[str, Any]:
    """Describe the run and the interpreter/host it executes on.

    Args:
        run_id: Identifier of the current run.
        dry_run: Whether the run only plans cells.

    Returns:
        A JSON-serialisable mapping holding the run id, creation timestamp,
        repository commit, Python version and executable, and platform details.
    """
    created_at = _utc_now()
    commit = _repo_commit_sha()
    interpreter = {
        "python_version": sys.version,
        "python_executable": sys.executable,
    }
    host = {
        "platform": platform.platform(),
        "system": platform.system(),
        "machine": platform.machine(),
    }
    return {
        "run_id": run_id,
        "created_at": created_at,
        "dry_run": dry_run,
        "repo_commit": commit,
        **interpreter,
        **host,
        "benchmark_runner": "benchmarks",
        "external_provider_calls": False,
    }
302+
303+
304+
def _repo_commit_sha() -> str:
    """Return the output of ``git rev-parse HEAD``, or ``"unknown"``.

    ``"unknown"`` is returned when git cannot be started, exits non-zero,
    takes longer than five seconds, or prints nothing.
    """
    command = ["git", "rev-parse", "HEAD"]
    git_failures = (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired)
    try:
        completed = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except git_failures:
        return "unknown"

    sha = completed.stdout.strip()
    return sha if sha else "unknown"
316+
317+
318+
def _write_json(path: Path, data: dict[str, Any]) -> None:
    """Write *data* to *path* as sorted, two-space-indented JSON plus a newline.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2, sort_keys=True)
    path.write_text(f"{payload}\n", encoding="utf-8")
321+
322+
323+
def _write_artifact(path: Path, data: dict[str, Any]) -> None:
    """Write *data* to *path* as YAML with sorted keys.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = yaml.safe_dump(data, sort_keys=True)
    path.write_text(rendered, encoding="utf-8")
326+
327+
328+
def _default_run_id() -> str:
329+
return datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
330+
331+
332+
def _utc_now() -> str:
333+
return datetime.now(UTC).isoformat(timespec="seconds").replace("+00:00", "Z")

0 commit comments

Comments
 (0)