Skip to content

Commit 49595e8

Browse files
committed
add a couple of simple evaluators
Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
1 parent b7b23d0 commit 49595e8

18 files changed

Lines changed: 456 additions & 13 deletions

File tree

.github/workflows/validate-evaluators.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ jobs:
2020

2121
- name: Install dependencies
2222
run: |
23-
pip install pyyaml
24-
# TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI
25-
pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py"
23+
pip install pyyaml agentevals-evaluator-sdk
2624
2725
- name: Discover and validate all evaluators
2826
run: |

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.venv/

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,13 @@ author: your-github-username
107107
Run the validation script to catch issues before submitting:
108108
109109
```bash
110-
pip install agentevals-grader-sdk pyyaml
110+
pip install pyyaml agentevals-evaluator-sdk
111111
python scripts/validate_evaluator.py evaluators/my_evaluator
112112
```
113113

114114
This checks:
115115
- **Manifest schema** -- required fields, entrypoint exists, name matches directory
116-
- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
116+
- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator
117117
- **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
118118

119119
You can also test with a full eval run:

evaluators/contains/contains.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Substring containment evaluator.
2+
3+
Scores each invocation 1.0 if final_response contains the configured substring,
4+
otherwise 0.0.
5+
6+
Config:
7+
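substring (str): Required for scoring; if omitted or empty, the evaluator is a no-op (1.0). Leading and trailing whitespace is ignored.
case_insensitive (bool, default False): Compare case-insensitively.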
8+
9+
Usage in eval_config.yaml:
10+
config:
11+
substring: "expected phrase"
12+
"""
13+
14+
from __future__ import annotations
15+
16+
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
17+
18+
19+
@evaluator
def contains(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response contains a substring.

    Config keys read from ``input.config``:
        substring: Text to look for. Non-string values (for example a number
            parsed from YAML) are converted with ``str()``, and surrounding
            whitespace is stripped. When missing or empty the check is
            skipped and every invocation scores 1.0.
        case_insensitive: When truthy, needle and response are both
            lowercased before comparing. Defaults to False.

    Returns:
        An ``EvalResult`` whose ``per_invocation_scores`` holds 1.0 (found) or
        0.0 (not found) per invocation and whose ``score`` is their mean
        (0.0 when there are no invocations). ``details`` lists the failing
        invocations under ``"issues"``, or is None when all passed.
    """
    # Coerce to str so a non-string config value cannot crash on .strip().
    needle = str(input.config.get("substring") or "").strip()
    if not needle:
        return EvalResult(
            score=1.0,
            per_invocation_scores=[1.0] * len(input.invocations),
            details={"note": "no substring configured; skipping check"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))

    def normalize(value: str) -> str:
        # Single normalizer for needle and haystack keeps the loop branch-free.
        return value.lower() if case_insensitive else value

    needle_cmp = normalize(needle)

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        # A missing response is treated as empty text, i.e. never a match.
        if needle_cmp in normalize(inv.final_response or ""):
            scores.append(1.0)
        else:
            scores.append(0.0)
            issues.append(f"{inv.invocation_id}: response does not contain {needle!r}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    contains.run()

evaluators/contains/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: contains
2+
description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
3+
language: python
4+
entrypoint: contains.py
5+
tags: [string, contains]
6+
author: agentevals-dev

evaluators/equals/equals.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""Exact string match evaluator.
2+
3+
Config:
4+
expected (str): If omitted, no-op (1.0).
5+
case_insensitive (bool, default True): Compare normalized strings.
6+
strip (bool, default True): Strip whitespace before compare.
7+
8+
Usage:
9+
config:
10+
expected: "4"
11+
"""
12+
13+
from __future__ import annotations
14+
15+
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
16+
17+
18+
@evaluator
def equals(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response equals ``expected``.

    Config keys read from ``input.config``:
        expected: Reference value, converted with ``str()``. When absent
            (None) the check is skipped and every invocation scores 1.0.
        case_insensitive: Lowercase both sides before comparing. Defaults to
            True.
        strip: Strip surrounding whitespace from both sides before comparing.
            Defaults to True.

    Returns:
        An ``EvalResult`` with one 1.0/0.0 entry per invocation, their mean as
        ``score`` (0.0 when there are no invocations), and the mismatches
        listed under ``details["issues"]`` (``details`` is None when every
        invocation matched).
    """
    expected = input.config.get("expected")
    if expected is None:
        return EvalResult(
            score=1.0,
            per_invocation_scores=[1.0] * len(input.invocations),
            details={"note": "no expected string configured; skipping check"},
        )

    ignore_case = bool(input.config.get("case_insensitive", True))
    trim = bool(input.config.get("strip", True))

    def canonical(value: str) -> str:
        # Apply the configured normalizations: strip first, then lowercase.
        if trim:
            value = value.strip()
        if ignore_case:
            value = value.lower()
        return value

    target = canonical(str(expected))
    per_invocation: list[float] = []
    mismatches: list[str] = []

    for inv in input.invocations:
        # A missing response is compared as the empty string.
        matched = canonical(inv.final_response or "") == target
        per_invocation.append(1.0 if matched else 0.0)
        if not matched:
            # Report the raw (un-normalized) values to ease debugging.
            mismatches.append(
                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
            )

    mean = sum(per_invocation) / len(per_invocation) if per_invocation else 0.0
    details = {"issues": mismatches} if mismatches else None
    return EvalResult(
        score=mean,
        per_invocation_scores=per_invocation,
        details=details,
    )


if __name__ == "__main__":
    equals.run()

evaluators/equals/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: equals
2+
description: Scores whether each final response matches a configured expected string (case-insensitive and whitespace-stripped by default)
3+
language: python
4+
entrypoint: equals.py
5+
tags: [string, equals]
6+
author: agentevals-dev

evaluators/is_json/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: is_json
2+
description: Scores whether each final response parses as JSON (optional markdown code fence extraction)
3+
language: python
4+
entrypoint: is_json.py
5+
tags: [json, structured]
6+
author: agentevals-dev

evaluators/is_json/is_json.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""JSON parse check evaluator.
2+
3+
Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks.
4+
5+
Config:
6+
require_json (bool, default False): If False, evaluator is a no-op (1.0).
7+
extract_markdown_fence (bool, default True): Strip ```json fences if present.
8+
9+
Usage:
10+
config:
11+
require_json: true
12+
"""
13+
14+
from __future__ import annotations
15+
16+
import json
17+
import re
18+
19+
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
20+
21+
_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE)
22+
23+
24+
def _parse_json_payload(text: str, extract_fence: bool) -> object:
25+
raw = (text or "").strip()
26+
if extract_fence:
27+
m = _FENCE.match(raw)
28+
if m:
29+
raw = m.group(1).strip()
30+
return json.loads(raw)
31+
32+
33+
@evaluator
def is_json(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response parses as JSON.

    Config keys read from ``input.config``:
        require_json: When falsy or absent the check is skipped and every
            invocation scores 1.0.
        extract_markdown_fence: Passed to ``_parse_json_payload`` to control
            whether a surrounding markdown code fence is unwrapped before
            parsing. Defaults to True.

    Returns:
        An ``EvalResult`` with one 1.0/0.0 entry per invocation, their mean as
        ``score`` (0.0 when there are no invocations), and parse failures
        listed under ``details["issues"]`` (``details`` is None when every
        response parsed).
    """
    if not input.config.get("require_json"):
        return EvalResult(
            score=1.0,
            per_invocation_scores=[1.0] * len(input.invocations),
            details={"note": "require_json not set; skipping check"},
        )

    extract_fence = bool(input.config.get("extract_markdown_fence", True))

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        try:
            _parse_json_payload(inv.final_response or "", extract_fence)
        except (json.JSONDecodeError, TypeError, ValueError, RecursionError) as exc:
            # RecursionError: json.loads raises it for extremely deep nesting;
            # score that response as invalid instead of crashing the whole run.
            scores.append(0.0)
            issues.append(f"{inv.invocation_id}: not valid JSON ({exc})")
        else:
            scores.append(1.0)

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )


if __name__ == "__main__":
    is_json.run()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: levenshtein_ratio
2+
description: Scores similarity of each response to a reference string using normalized Levenshtein distance
3+
language: python
4+
entrypoint: levenshtein_ratio.py
5+
tags: [string, levenshtein]
6+
author: agentevals-dev

0 commit comments

Comments
 (0)