Skip to content

Commit 07d176f

Browse files
committed
add validation for graders on ci
Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
1 parent c859e0e commit 07d176f

5 files changed

Lines changed: 345 additions & 5 deletions

File tree

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
name: Validate graders
2+
3+
on:
4+
pull_request:
5+
paths:
6+
- "graders/**"
7+
- "scripts/validate_grader.py"
8+
- "scripts/test_input.json"
9+
10+
jobs:
11+
validate:
12+
runs-on: ubuntu-latest
13+
steps:
14+
- uses: actions/checkout@v4
15+
16+
- name: Set up Python
17+
uses: actions/setup-python@v5
18+
with:
19+
python-version: "3.12"
20+
21+
- name: Install dependencies
22+
run: pip install agentevals-grader-sdk pyyaml
23+
24+
- name: Discover and validate all graders
25+
run: |
26+
grader_dirs=$(find graders -mindepth 1 -maxdepth 1 -type d | sort)
27+
if [ -z "$grader_dirs" ]; then
28+
echo "No grader directories found."
29+
exit 0
30+
fi
31+
python scripts/validate_grader.py $grader_dirs

README.md

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,15 +99,27 @@ tags: [quality, tools]
9999
author: your-github-username
100100
```
101101
102-
### 4. Test locally
102+
### 4. Validate locally
103103
104-
Add it to an eval config as a local `type: code` grader and run it:
104+
Run the validation script to catch issues before submitting:
105+
106+
```bash
107+
pip install agentevals-grader-sdk pyyaml
108+
python scripts/validate_grader.py graders/my_grader
109+
```
110+
111+
This checks:
112+
- **Manifest schema** -- required fields, entrypoint exists, name matches directory
113+
- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
114+
- **Smoke run** -- runs the grader with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
115+
116+
You can also test with a full eval run:
105117

106118
```yaml
107119
metrics:
108120
- name: my_grader
109121
type: code
110-
path: ./my_grader/my_grader.py
122+
path: ./graders/my_grader/my_grader.py
111123
threshold: 0.5
112124
```
113125
@@ -133,7 +145,7 @@ graders/
133145

134146
3. Open a PR against `main`
135147

136-
A CI workflow will validate your `grader.yaml` manifest. Once merged, the workflow regenerates `index.yaml` automatically, and your grader becomes available to everyone via `agentevals grader list`.
148+
CI will automatically validate your grader (manifest, syntax, and smoke run). Once merged, a separate workflow regenerates `index.yaml`, and your grader becomes available to everyone via `agentevals grader list`.
137149

138150
## Supported languages
139151

graders/peters_grader/peters_grader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414

1515
@grader
1616
def peters_grader(input: EvalInput) -> EvalResult:
17-
return EvalResult(score=0.123, details="All good")
17+
return EvalResult(score=0.123, details={"message": "All good"})

scripts/test_input.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"metric_name": "test",
3+
"threshold": 0.5,
4+
"config": {},
5+
"invocations": [
6+
{
7+
"invocation_id": "ci-test-001",
8+
"user_content": "What is 2+2?",
9+
"final_response": "The answer is 4.",
10+
"tool_calls": [{"name": "calculator", "args": {"expr": "2+2"}}],
11+
"tool_responses": [{"name": "calculator", "output": "4"}]
12+
}
13+
],
14+
"expected_invocations": null
15+
}

scripts/validate_grader.py

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
#!/usr/bin/env python3
2+
"""Validate a grader directory: manifest, syntax, and smoke run.
3+
4+
Usage:
5+
python scripts/validate_grader.py graders/my_grader
6+
python scripts/validate_grader.py graders/* # validate all
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import json
12+
import subprocess
13+
import sys
14+
from pathlib import Path
15+
16+
import yaml
17+
18+
# Directory holding this script; the smoke-run fixture is stored next to it.
SCRIPT_DIR = Path(__file__).resolve().parents[0]
TEST_INPUT = SCRIPT_DIR.joinpath("test_input.json")

# Keys that every grader.yaml manifest has to define.
REQUIRED_MANIFEST_FIELDS = {"description", "entrypoint", "language", "name"}
# Accepted values for the optional "status" field of a grader's output.
VALID_STATUSES = {"FAILED", "NOT_EVALUATED", "PASSED"}

# Entrypoint file suffixes accepted for each manifest language.
LANGUAGE_EXTENSIONS = dict(
    python={".py"},
    javascript={".js"},
    typescript={".ts"},
)
29+
30+
31+
def _fail(msg: str) -> None:
    """Report a failed check on stderr.

    Args:
        msg: Human-readable description of what went wrong.
    """
    # Progress and OK lines go to stdout, which is block-buffered when piped
    # (e.g. in CI logs). Flush it first so this failure shows up after the
    # lines that were printed before it instead of ahead of them.
    sys.stdout.flush()
    print(f" FAIL: {msg}", file=sys.stderr)
33+
34+
35+
def _ok(msg: str) -> None:
    """Report a passed check on stdout.

    Args:
        msg: Human-readable description of what passed.
    """
    line = f" OK: {msg}"
    print(line)
37+
38+
39+
def validate_manifest(grader_dir: Path) -> dict | None:
    """Check that grader.yaml exists and describes a usable grader.

    A manifest is accepted when it parses as a YAML mapping, defines every
    key in REQUIRED_MANIFEST_FIELDS, names an existing entrypoint file
    relative to ``grader_dir``, and has a ``name`` equal to the directory
    name.

    Args:
        grader_dir: Directory holding the grader and its grader.yaml.

    Returns:
        The parsed manifest, or None after reporting the first problem found.
    """
    manifest_path = grader_dir / "grader.yaml"
    # is_file() rather than exists(): a directory named grader.yaml is not a
    # manifest.
    if not manifest_path.is_file():
        _fail(f"Missing grader.yaml in {grader_dir}")
        return None

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale of the machine running the validator.
        manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError) as exc:
        _fail(f"Cannot read {manifest_path}: {exc}")
        return None
    except yaml.YAMLError as exc:
        _fail(f"Invalid YAML in {manifest_path}: {exc}")
        return None

    if not isinstance(manifest, dict):
        _fail(f"grader.yaml must be a YAML mapping, got {type(manifest).__name__}")
        return None

    missing = REQUIRED_MANIFEST_FIELDS - set(manifest.keys())
    if missing:
        _fail(f"grader.yaml missing required fields: {sorted(missing)}")
        return None

    entrypoint = manifest["entrypoint"]
    # YAML can yield any scalar or collection here; joining a non-string onto
    # a Path would raise TypeError instead of producing a readable failure.
    if not isinstance(entrypoint, str) or not entrypoint.strip():
        _fail(f"'entrypoint' must be a non-empty string, got {entrypoint!r}")
        return None

    entry_path = grader_dir / entrypoint
    # The entrypoint is executed later, so it has to be a regular file; a
    # directory satisfies exists() but cannot be compiled or run.
    if not entry_path.is_file():
        _fail(f"Entrypoint file not found: {entry_path}")
        return None

    dir_name = grader_dir.name
    if manifest["name"] != dir_name:
        _fail(
            f"Manifest name '{manifest['name']}' does not match "
            f"directory name '{dir_name}'"
        )
        return None

    _ok(f"Manifest valid ({manifest_path})")
    return manifest
77+
78+
79+
def validate_syntax(grader_dir: Path, manifest: dict) -> bool:
    """Check the entrypoint's extension, syntax, and basic structure.

    For every language listed in LANGUAGE_EXTENSIONS the entrypoint's file
    extension must match the language. Python entrypoints are additionally
    byte-compiled in a subprocess and must mention ``agentevals_grader_sdk``
    and ``@grader`` in their source text. Languages that are not listed are
    accepted without any check.

    Args:
        grader_dir: Directory holding the grader.
        manifest: Parsed grader.yaml; must contain ``entrypoint``.

    Returns:
        True when every applicable check passed, False after reporting the
        first failure.
    """
    language = manifest.get("language", "python")
    entrypoint = manifest["entrypoint"]
    entry_path = grader_dir / entrypoint

    # Enforce the extension table for every language it lists, Python
    # included. Non-string languages are skipped because they may be
    # unhashable and cannot appear in the table anyway.
    expected = LANGUAGE_EXTENSIONS.get(language) if isinstance(language, str) else None
    if expected is not None:
        ext = Path(entrypoint).suffix
        if ext not in expected:
            _fail(
                f"Entrypoint extension '{ext}' doesn't match "
                f"language '{language}' (expected {expected})"
            )
            return False
        _ok(f"Extension matches language ({entry_path})")

    if language == "python":
        # Compile in a child interpreter so a broken grader cannot affect
        # this process.
        result = subprocess.run(
            [sys.executable, "-m", "py_compile", str(entry_path)],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            _fail(f"Syntax error in {entry_path}:\n{result.stderr}")
            return False
        _ok(f"Python syntax valid ({entry_path})")

        # Only ASCII markers are searched for, so undecodable bytes are
        # replaced rather than allowed to raise; the compile step above has
        # already vetted the file's real encoding.
        source = entry_path.read_text(encoding="utf-8", errors="replace")
        if "agentevals_grader_sdk" not in source:
            _fail(
                f"{entry_path} does not import agentevals_grader_sdk. "
                f"Graders must use the SDK or implement the stdin/stdout protocol."
            )
            return False
        if "@grader" not in source:
            _fail(f"{entry_path} does not use the @grader decorator")
            return False
        _ok("Imports and decorator present")

    return True
120+
121+
122+
def validate_smoke_run(grader_dir: Path, manifest: dict, *, timeout: float = 30) -> bool:
    """Run the grader on the synthetic input and validate what it prints.

    The entrypoint is started as a subprocess (the current interpreter for
    Python, ``node`` for JavaScript/TypeScript), fed the contents of
    TEST_INPUT on stdin, and expected to exit with code 0 and print a JSON
    object on stdout. The ``score``, ``status``, ``details`` and
    ``per_invocation_scores`` fields are type-checked, and the object is
    validated against the SDK's ``EvalResult`` model when the SDK can be
    imported.

    Args:
        grader_dir: Directory holding the grader.
        manifest: Parsed grader.yaml; must contain ``entrypoint``.
        timeout: Seconds the grader may run before the check fails.

    Returns:
        True when the run and every output check passed (or the language has
        no smoke run), False after reporting the first failure.
    """
    language = manifest.get("language", "python")
    entrypoint = manifest["entrypoint"]
    entry_path = grader_dir / entrypoint

    if language == "python":
        cmd = [sys.executable, str(entry_path)]
    elif language in ("javascript", "typescript"):
        cmd = ["node", str(entry_path)]
    else:
        _ok(f"Skipping smoke run for unsupported language: {language}")
        return True

    test_input = TEST_INPUT.read_text(encoding="utf-8")

    # A hung grader or a missing runtime must fail this one grader, not
    # abort the whole validation run with a traceback.
    try:
        result = subprocess.run(
            cmd,
            input=test_input,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        _fail(f"Grader did not finish within {timeout} seconds")
        return False
    except OSError as exc:
        _fail(f"Could not start grader with '{cmd[0]}': {exc}")
        return False

    if result.returncode != 0:
        stderr_preview = result.stderr.strip()[:500]
        _fail(
            f"Grader exited with code {result.returncode}\n"
            f" stderr: {stderr_preview}"
        )
        return False

    stdout = result.stdout.strip()
    if not stdout:
        stderr_preview = result.stderr.strip()[:500]
        _fail(
            "Grader produced no output on stdout"
            + (f"\n stderr: {stderr_preview}" if stderr_preview else "")
        )
        return False

    try:
        output = json.loads(stdout)
    except json.JSONDecodeError as exc:
        _fail(f"Grader stdout is not valid JSON: {exc}\n stdout: {stdout[:200]}")
        return False

    # Valid JSON can still be a list, string or number; the field checks
    # below need a mapping.
    if not isinstance(output, dict):
        _fail(f"Grader output must be a JSON object, got {type(output).__name__}")
        return False

    # Validate score
    score = output.get("score")
    if score is None:
        _fail("Output missing required 'score' field")
        return False
    # bool is a subclass of int, so JSON true/false must be excluded by hand.
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        _fail(f"'score' must be a number, got {type(score).__name__}: {score}")
        return False
    # Chained comparison so that NaN (which json.loads accepts) is rejected.
    if not 0.0 <= score <= 1.0:
        _fail(f"'score' must be in [0.0, 1.0], got {score}")
        return False

    # Validate status
    status = output.get("status")
    # The isinstance guard keeps unhashable values (lists, objects) from
    # raising TypeError in the set-membership test.
    if status is not None and (
        not isinstance(status, str) or status not in VALID_STATUSES
    ):
        _fail(
            f"'status' must be one of {sorted(VALID_STATUSES)} or null, "
            f"got '{status}'"
        )
        return False

    # Validate details type
    details = output.get("details")
    if details is not None and not isinstance(details, dict):
        _fail(
            f"'details' must be a dict or null, "
            f"got {type(details).__name__}: {details!r}"
        )
        return False

    # Validate per_invocation_scores type
    per_inv = output.get("per_invocation_scores")
    if per_inv is not None and not isinstance(per_inv, list):
        _fail(
            f"'per_invocation_scores' must be a list, "
            f"got {type(per_inv).__name__}"
        )
        return False

    # Full Pydantic validation via the SDK if available. The import is kept
    # apart from the validation call so that an ImportError raised while
    # validating is not mistaken for "SDK not installed".
    try:
        from agentevals_grader_sdk import EvalResult
    except ImportError:
        _ok("Output JSON structure valid (SDK not installed, skipped Pydantic check)")
    else:
        try:
            EvalResult.model_validate(output)
        except Exception as exc:
            # Presumably pydantic's ValidationError; the SDK is optional, so
            # the exception type cannot be named here -- TODO confirm.
            _fail(f"Output fails EvalResult validation: {exc}")
            return False
        _ok("Output validates against EvalResult schema (Pydantic)")

    _ok(f"Smoke run passed (score={score})")
    return True
222+
223+
224+
def validate_grader(grader_dir: Path) -> bool:
    """Validate one grader directory: manifest, then syntax, then smoke run.

    The stages run in order and stop at the first one that fails.

    Args:
        grader_dir: Directory holding the grader to validate.

    Returns:
        True when every stage passed, False otherwise.
    """
    separator = "─" * 50
    print(f"\nValidating: {grader_dir}")
    print(separator)

    manifest = validate_manifest(grader_dir)
    if manifest is None:
        return False

    # all() short-circuits, so the smoke run is skipped once syntax fails.
    stages = (validate_syntax, validate_smoke_run)
    return all(stage(grader_dir, manifest) for stage in stages)
240+
241+
242+
def main() -> None:
    """CLI entry point: validate every grader directory named in sys.argv.

    Arguments that are not directories, or that lack a grader.yaml, are
    skipped with a note on stderr. A summary is printed at the end.

    Exits with status 2 when no argument was given or nothing was validated,
    and with status 1 when at least one grader failed.
    """
    args = sys.argv[1:]
    if not args:
        print(f"Usage: {sys.argv[0]} <grader_dir> [<grader_dir> ...]", file=sys.stderr)
        sys.exit(2)

    results: dict[str, bool] = {}
    for candidate in map(Path, args):
        if not candidate.is_dir():
            reason = "not a directory"
        elif not (candidate / "grader.yaml").exists():
            # NOTE(review): a grader directory without a manifest is skipped
            # here rather than failed -- confirm this is intended for CI.
            reason = "no grader.yaml"
        else:
            results[str(candidate)] = validate_grader(candidate)
            continue
        print(f"\nSkipping {candidate} ({reason})", file=sys.stderr)

    print(f"\n{'=' * 50}")
    print("Summary:")
    for name, passed in results.items():
        label = "PASS" if passed else "FAIL"
        print(f" [{label}] {name}")

    if not results:
        print(" No graders found to validate.")
        sys.exit(2)

    print()
    failures = [name for name, passed in results.items() if not passed]
    if failures:
        print(f"{len(failures)} of {len(results)} grader(s) failed.")
        sys.exit(1)
    print(f"All {len(results)} grader(s) passed.")
279+
280+
281+
if __name__ == "__main__":
282+
main()

0 commit comments

Comments
 (0)