Skip to content

Commit f66a520

Browse files
authored
Merge pull request #3 from agentevals-dev/peterj/addsomeevaluators
add a couple of simple evaluators
2 parents b7b23d0 + ae67759 commit f66a520

18 files changed

Lines changed: 474 additions & 14 deletions

File tree

.github/workflows/validate-evaluators.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ jobs:
2020

2121
- name: Install dependencies
2222
run: |
23-
pip install pyyaml
24-
# TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI
25-
pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py"
23+
pip install pyyaml agentevals-evaluator-sdk
2624
2725
- name: Discover and validate all evaluators
2826
run: |

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.venv/

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,13 @@ author: your-github-username
107107
Run the validation script to catch issues before submitting:
108108
109109
```bash
110-
pip install agentevals-grader-sdk pyyaml
110+
pip install pyyaml agentevals-evaluator-sdk
111111
python scripts/validate_evaluator.py evaluators/my_evaluator
112112
```
113113

114114
This checks:
115115
- **Manifest schema** -- required fields, entrypoint exists, name matches directory
116-
- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
116+
- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator
117117
- **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
118118

119119
You can also test with a full eval run:

evaluators/contains/contains.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Substring containment evaluator.
2+
3+
Scores each invocation 1.0 if final_response contains the configured substring,
4+
otherwise 0.0.
5+
6+
Config:
7+
substring (str): Required. If omitted or blank, returns NOT_EVALUATED.
case_insensitive (bool, default False): Compare lowercased text.
8+
9+
Usage in eval_config.yaml:
10+
config:
11+
substring: "expected phrase"
12+
"""
13+
14+
from __future__ import annotations
15+
16+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
17+
18+
19+
@evaluator
def contains(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response contains a substring.

    Config keys read from ``input.config``:
        substring: Text to search for. Surrounding whitespace is stripped and
            non-string values (e.g. a YAML number) are converted with ``str``.
            When the key is missing or the value is blank, nothing is scored
            and the result carries ``EvalStatus.NOT_EVALUATED``.
        case_insensitive: When truthy, both sides are lowercased before the
            containment check. Defaults to False.

    Returns:
        An ``EvalResult`` whose ``per_invocation_scores`` holds 1.0 for every
        response that contains the substring and 0.0 otherwise, and whose
        ``score`` is their mean (0.0 when there are no invocations).
        ``details`` lists one message per failing invocation, or is None when
        every invocation passed.
    """
    raw = input.config.get("substring")
    # Config values come from YAML, so `substring: 42` arrives as an int;
    # coerce it rather than failing on `.strip()`.
    substring = "" if raw is None else str(raw).strip()
    if not substring:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * len(input.invocations),
            details={"reason": "missing config: substring"},
        )

    case_insensitive = bool(input.config.get("case_insensitive", False))
    # Normalize the needle once; each response is normalized inside the loop.
    needle = substring.lower() if case_insensitive else substring

    scores: list[float] = []
    issues: list[str] = []

    for inv in input.invocations:
        haystack = inv.final_response or ""
        if case_insensitive:
            haystack = haystack.lower()
        if needle in haystack:
            scores.append(1.0)
        else:
            scores.append(0.0)
            # Report the substring as configured, not its lowercased form.
            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")

    overall = sum(scores) / len(scores) if scores else 0.0
    return EvalResult(
        score=overall,
        per_invocation_scores=scores,
        details={"issues": issues} if issues else None,
    )
56+
57+
58+
if __name__ == "__main__":
59+
contains.run()

evaluators/contains/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: contains
2+
description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
3+
language: python
4+
entrypoint: contains.py
5+
tags: [string, contains]
6+
author: agentevals-dev

evaluators/equals/equals.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""Exact string match evaluator.
2+
3+
Config:
4+
expected (str): Required. If omitted, returns NOT_EVALUATED.
5+
case_insensitive (bool, default False): Compare lowercased strings.
6+
strip (bool, default True): Strip whitespace before compare.
7+
8+
Usage:
9+
config:
10+
expected: "4"
11+
"""
12+
13+
from __future__ import annotations
14+
15+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
16+
17+
18+
@evaluator
def equals(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response equals ``expected``.

    Config keys read from ``input.config``:
        expected: Value to compare against; converted with ``str``. When the
            key is missing (or None) nothing is scored and the result carries
            ``EvalStatus.NOT_EVALUATED``.
        case_insensitive: When truthy, both sides are lowercased. Defaults to
            False.
        strip: When truthy, surrounding whitespace is removed from both sides.
            Defaults to True.

    Returns:
        An ``EvalResult`` whose ``per_invocation_scores`` holds 1.0 for each
        matching response and 0.0 otherwise, and whose ``score`` is their mean
        (0.0 when there are no invocations). ``details`` lists one message per
        mismatch, or is None when every invocation matched.
    """
    target = input.config.get("expected")
    if target is None:
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * len(input.invocations),
            details={"reason": "missing config: expected"},
        )

    ignore_case = bool(input.config.get("case_insensitive", False))
    trim = bool(input.config.get("strip", True))

    def canonical(text: str) -> str:
        # Apply the configured normalizations: trim first, then lowercase.
        if trim:
            text = text.strip()
        if ignore_case:
            text = text.lower()
        return text

    wanted = canonical(str(target))
    per_invocation: list[float] = []
    mismatches: list[str] = []

    for inv in input.invocations:
        if canonical(inv.final_response or "") == wanted:
            per_invocation.append(1.0)
            continue
        per_invocation.append(0.0)
        # Messages show the raw configured and received values, not the
        # normalized ones.
        mismatches.append(
            f"{inv.invocation_id}: expected {target!r}, got {inv.final_response!r}"
        )

    mean = sum(per_invocation) / len(per_invocation) if per_invocation else 0.0
    return EvalResult(
        score=mean,
        per_invocation_scores=per_invocation,
        details={"issues": mismatches} if mismatches else None,
    )
57+
58+
59+
if __name__ == "__main__":
60+
equals.run()

evaluators/equals/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: equals
2+
description: Scores whether each final response exactly matches a configured expected string
3+
language: python
4+
entrypoint: equals.py
5+
tags: [string, equals]
6+
author: agentevals-dev

evaluators/is_json/evaluator.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: is_json
2+
description: Scores whether each final response parses as JSON (optional markdown code fence extraction)
3+
language: python
4+
entrypoint: is_json.py
5+
tags: [json, structured]
6+
author: agentevals-dev

evaluators/is_json/is_json.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""JSON parse check evaluator.
2+
3+
Tries to parse final_response as JSON. Optionally unwraps a response that consists entirely of one fenced ``` or ```json block.
4+
5+
Config:
6+
extract_markdown_fence (bool, default True): Strip a surrounding ``` or ```json fence when the whole response is a single fenced block.
7+
8+
Usage:
9+
config:
10+
extract_markdown_fence: true
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import json
16+
import re
17+
18+
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
19+
20+
_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE)
21+
22+
23+
def _parse_json_payload(text: str, extract_fence: bool) -> object:
24+
raw = (text or "").strip()
25+
if extract_fence:
26+
m = _FENCE.match(raw)
27+
if m:
28+
raw = m.group(1).strip()
29+
return json.loads(raw)
30+
31+
32+
@evaluator
def is_json(input: EvalInput) -> EvalResult:
    """Score each invocation by whether its final response parses as JSON.

    Config keys read from ``input.config``:
        extract_markdown_fence: When truthy, the parsing helper is asked to
            unwrap a Markdown code fence before parsing. Defaults to True.

    Returns:
        An ``EvalResult`` whose ``per_invocation_scores`` holds 1.0 for each
        response that parsed and 0.0 otherwise, and whose ``score`` is their
        mean (0.0 when there are no invocations). ``details`` lists one message
        per failure, including the parser error, or is None when all parsed.
    """
    unwrap_fence = bool(input.config.get("extract_markdown_fence", True))

    per_invocation: list[float] = []
    failures: list[str] = []

    for inv in input.invocations:
        try:
            _parse_json_payload(inv.final_response or "", unwrap_fence)
        except (json.JSONDecodeError, TypeError, ValueError) as err:
            per_invocation.append(0.0)
            failures.append(f"{inv.invocation_id}: not valid JSON ({err})")
        else:
            per_invocation.append(1.0)

    mean = sum(per_invocation) / len(per_invocation) if per_invocation else 0.0
    return EvalResult(
        score=mean,
        per_invocation_scores=per_invocation,
        details={"issues": failures} if failures else None,
    )
53+
54+
55+
if __name__ == "__main__":
56+
is_json.run()
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: levenshtein_ratio
2+
description: Scores similarity of each response to a reference string using normalized Levenshtein distance
3+
language: python
4+
entrypoint: levenshtein_ratio.py
5+
tags: [string, levenshtein]
6+
author: agentevals-dev

0 commit comments

Comments
 (0)