Skip to content

Commit 83f2d28

Browse files
committed
feat: add StringCheckGrader support for OpenAI Evals backend
1 parent 43bc581 commit 83f2d28

6 files changed

Lines changed: 198 additions & 39 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ evaluators:
240240
threshold: 0.7
241241
```
242242
243-
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`).
243+
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Supported grader types: `text_similarity` and `string_check`.
244244

245245
See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators.
246246

docs/custom-evaluators.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317317
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318318
| `rouge_l` | Longest common subsequence overlap (F-measure) |
319319

320+
### String Check Grader
321+
322+
Checks the agent response against a fixed reference string: equality, inequality, or substring containment (case-sensitive or case-insensitive). No eval set is needed.
323+
324+
```yaml
325+
evaluators:
326+
- name: response_contains_hello
327+
type: openai_eval
328+
threshold: 0.8
329+
grader:
330+
type: string_check
331+
reference: "hello"
332+
operation: ilike
333+
```
334+
335+
The `operation` field controls how the check is applied:
336+
337+
| Operation | Description |
338+
|---|---|
339+
| `eq` | Exact match (case-sensitive) |
340+
| `ne` | Does not equal (case-sensitive) |
341+
| `like` | Contains the reference (case-sensitive) |
342+
| `ilike` | Contains the reference (case-insensitive) |
343+
344+
Each invocation either passes or fails. The `threshold` field is not used by `string_check`, so the `threshold: 0.8` in the example above has no effect and can be omitted.
345+
320346
### How it works
321347

322348
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the responses as JSONL items, polls for results, and cleans up. For `text_similarity`, the agent's response and the golden reference are both placed in the `item` namespace; for `string_check`, only the agent's response is submitted, because the reference comes from the grader config. In both cases `include_sample_schema: false` is set, so OpenAI only grades the provided text without generating any model outputs.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Eval config using OpenAI Evals API graders.
2+
# Requires OPENAI_API_KEY to be set.
3+
#
4+
# Run with:
5+
# agentevals run samples/helm.json \
6+
# --config examples/custom_evaluators/eval_config_openai_eval.yaml
7+
8+
evaluators:
9+
- name: response_contains_hello
10+
type: openai_eval
11+
grader:
12+
type: string_check
13+
reference: "hello"
14+
operation: ilike

src/agentevals/config.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
5454
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
5555

5656

57+
_VALID_STRING_CHECK_OPERATIONS = frozenset({"eq", "ne", "like", "ilike"})
58+
59+
_SUPPORTED_GRADER_TYPES = frozenset({"string_check", "text_similarity"})
60+
5761
_VALID_SIMILARITY_METRICS = frozenset(
5862
{
5963
"fuzzy_match",
@@ -84,13 +88,21 @@ class OpenAIEvalDef(BaseModel):
8488
@classmethod
8589
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8690
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
91+
if grader_type == "text_similarity":
92+
metric = v.get("evaluation_metric")
93+
if not metric:
94+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
95+
if metric not in _VALID_SIMILARITY_METRICS:
96+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
97+
elif grader_type == "string_check":
98+
for field in ("reference", "operation"):
99+
if not v.get(field):
100+
raise ValueError(f"'{field}' is required for string_check grader")
101+
op = v["operation"]
102+
if op not in _VALID_STRING_CHECK_OPERATIONS:
103+
raise ValueError(f"Invalid operation '{op}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
104+
else:
105+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}")
94106
return v
95107

96108

src/agentevals/openai_eval_backend.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.
2-
3-
Builds testing criteria from the evaluator config, submits invocation pairs
4-
as JSONL items, polls for completion, and maps per-item results back to a
5-
MetricResult.
6-
"""
1+
"""OpenAI Evals API backend."""
72

83
from __future__ import annotations
94

@@ -31,13 +26,14 @@
3126
"required": ["actual_response", "expected_response"],
3227
}
3328

29+
_ACTUAL_ONLY_SCHEMA = {
30+
"type": "object",
31+
"properties": {"actual_response": {"type": "string"}},
32+
"required": ["actual_response"],
33+
}
34+
3435

3536
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
36-
"""Build the OpenAI testing_criteria dict from the evaluator config.
37-
38-
Each grader type produces a different shape. Extend this function
39-
when adding support for new OpenAI grader types.
40-
"""
4137
grader = evaluator_def.grader
4238
grader_type = grader["type"]
4339

@@ -51,28 +47,30 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5147
"pass_threshold": evaluator_def.threshold,
5248
}
5349

50+
if grader_type == "string_check":
51+
return {
52+
"type": "string_check",
53+
"name": evaluator_def.name,
54+
"input": "{{ item.actual_response }}",
55+
"reference": grader["reference"],
56+
"operation": grader["operation"],
57+
}
58+
5459
raise ValueError(f"Unsupported grader type: {grader_type}")
5560

5661

5762
def _build_jsonl_items(
5863
actual_invocations: list[Invocation],
5964
expected_invocations: list[Invocation],
65+
*,
66+
include_expected: bool = True,
6067
) -> list[dict[str, Any]]:
6168
items = []
6269
for i, actual_inv in enumerate(actual_invocations):
63-
actual_text = _content_to_text(actual_inv.final_response)
64-
if i < len(expected_invocations):
65-
expected_text = _content_to_text(expected_invocations[i].final_response)
66-
else:
67-
expected_text = ""
68-
items.append(
69-
{
70-
"item": {
71-
"actual_response": actual_text,
72-
"expected_response": expected_text,
73-
}
74-
}
75-
)
70+
item: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
71+
if include_expected:
72+
item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
73+
items.append({"item": item})
7674
return items
7775

7876

@@ -111,13 +109,15 @@ async def evaluate_openai_eval(
111109
error="OPENAI_API_KEY environment variable is not set.",
112110
)
113111

114-
if expected_invocations is None:
112+
grader_type = evaluator_def.grader.get("type")
113+
needs_expected = grader_type == "text_similarity"
114+
if needs_expected and expected_invocations is None:
115115
return MetricResult(
116116
metric_name=evaluator_def.name,
117-
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
117+
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
118118
)
119119

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
120+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
121121
if not items:
122122
return MetricResult(
123123
metric_name=evaluator_def.name,
@@ -130,12 +130,13 @@ async def evaluate_openai_eval(
130130
try:
131131
client = await asyncio.to_thread(_get_openai_client)
132132

133+
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
133134
eval_obj = await asyncio.to_thread(
134135
client.evals.create,
135-
name=f"agentevals-{evaluator_def.name}",
136+
name=f"agentevals-openai-{evaluator_def.name}",
136137
data_source_config={
137138
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
139+
"item_schema": item_schema,
139140
"include_sample_schema": False,
140141
},
141142
testing_criteria=[testing_criteria],
@@ -146,7 +147,7 @@ async def evaluate_openai_eval(
146147
run = await asyncio.to_thread(
147148
client.evals.runs.create,
148149
eval_id=eval_id,
149-
name=f"agentevals-run-{evaluator_def.name}",
150+
name=f"agentevals-openai-run-{evaluator_def.name}",
150151
data_source={
151152
"type": "jsonl",
152153
"source": {
@@ -225,12 +226,16 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225226
total = result_counts.total if result_counts else 0
226227
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227228

229+
grader = evaluator_def.grader
228230
details: dict[str, Any] = {
229231
"openai_eval_id": eval_id,
230232
"openai_run_id": run_id,
231-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232233
"result_counts": {"passed": passed, "failed": failed, "total": total},
233234
}
235+
if grader["type"] == "text_similarity":
236+
details["evaluation_metric"] = grader.get("evaluation_metric")
237+
elif grader["type"] == "string_check":
238+
details["operation"] = grader.get("operation")
234239
per_criteria = getattr(run, "per_testing_criteria_results", None)
235240
if per_criteria:
236241
details["per_testing_criteria"] = [

tests/test_openai_eval_backend.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import pytest
2+
from unittest.mock import MagicMock
3+
4+
from agentevals.config import OpenAIEvalDef
5+
from agentevals.openai_eval_backend import (
6+
_build_jsonl_items,
7+
_build_testing_criteria,
8+
evaluate_openai_eval,
9+
)
10+
11+
12+
def _string_check_grader(**overrides):
13+
base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
14+
base.update(overrides)
15+
return base
16+
17+
18+
def _invocation(text: str):
19+
inv = MagicMock()
20+
inv.final_response.parts = [MagicMock(text=text)]
21+
return inv
22+
23+
24+
class TestOpenAIEvalDefValidation:
    """Validation rules that OpenAIEvalDef applies to its ``grader`` mapping.

    Failure cases expect ``ValueError`` rather than bare ``Exception``:
    pydantic's ``ValidationError`` is a ``ValueError`` subclass, so the
    narrower type still matches while no longer accepting unrelated errors
    whose message happens to contain the matched text.
    """

    def test_text_similarity_valid(self):
        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
        assert d.grader["type"] == "text_similarity"

    def test_text_similarity_missing_metric(self):
        with pytest.raises(ValueError, match="evaluation_metric"):
            OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})

    def test_text_similarity_bad_metric(self):
        with pytest.raises(ValueError, match="Unknown evaluation_metric"):
            OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bad"})

    def test_string_check_valid(self):
        d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
        assert d.grader["type"] == "string_check"

    @pytest.mark.parametrize("field", ["reference", "operation"])
    def test_string_check_missing_required(self, field):
        # Setting the field to None exercises the "required" check for it.
        with pytest.raises(ValueError, match=field):
            OpenAIEvalDef(name="sc", grader=_string_check_grader(**{field: None}))

    def test_string_check_bad_operation(self):
        with pytest.raises(ValueError, match="Invalid operation"):
            OpenAIEvalDef(name="sc", grader=_string_check_grader(operation="bad"))

    def test_unsupported_grader_type(self):
        with pytest.raises(ValueError, match="Unsupported grader type"):
            OpenAIEvalDef(name="x", grader={"type": "unknown"})
53+
54+
55+
class TestBuildTestingCriteria:
    """Shape checks for the dict returned by ``_build_testing_criteria``."""

    def test_text_similarity_shape(self):
        evaluator_def = OpenAIEvalDef(
            name="sim",
            grader={"type": "text_similarity", "evaluation_metric": "bleu"},
            threshold=0.7,
        )
        criteria = _build_testing_criteria(evaluator_def)
        assert criteria["type"] == "text_similarity"
        assert criteria["evaluation_metric"] == "bleu"
        assert criteria["pass_threshold"] == 0.7
        assert "{{ item.actual_response }}" in criteria["input"]
        assert "{{ item.expected_response }}" in criteria["reference"]

    def test_string_check_shape(self):
        grader = _string_check_grader(reference="ok", operation="eq")
        evaluator_def = OpenAIEvalDef(name="sc", grader=grader)
        criteria = _build_testing_criteria(evaluator_def)
        assert criteria["type"] == "string_check"
        assert criteria["reference"] == "ok"
        assert criteria["operation"] == "eq"
        assert "{{ item.actual_response }}" in criteria["input"]
72+
73+
74+
class TestBuildJsonlItems:
    """Checks that ``include_expected`` controls the ``expected_response`` key."""

    def test_includes_expected_when_requested(self):
        actual = [_invocation("hi")]
        expected = [_invocation("bye")]
        rows = _build_jsonl_items(actual, expected, include_expected=True)
        first_item = rows[0]["item"]
        assert "expected_response" in first_item

    def test_excludes_expected_for_string_check(self):
        actual = [_invocation("hi")]
        rows = _build_jsonl_items(actual, [], include_expected=False)
        first_item = rows[0]["item"]
        assert "expected_response" not in first_item
82+
83+
84+
class TestEvaluateOpenAIEval:
    """Precondition checks of ``evaluate_openai_eval``.

    NOTE(review): these coroutine tests carry no asyncio marker; presumably
    the project runs pytest-asyncio in auto mode. Confirm against the
    pytest configuration.
    """

    async def test_no_api_key_returns_error(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        evaluator_def = OpenAIEvalDef(name="sc", grader=_string_check_grader())
        outcome = await evaluate_openai_eval(evaluator_def, [], [])
        error_text = outcome.error or ""
        assert "OPENAI_API_KEY" in error_text

    async def test_text_similarity_requires_expected(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        evaluator_def = OpenAIEvalDef(
            name="sim",
            grader={"type": "text_similarity", "evaluation_metric": "bleu"},
        )
        outcome = await evaluate_openai_eval(evaluator_def, [_invocation("hi")], None)
        error_text = outcome.error or ""
        assert "expected invocations" in error_text

    async def test_string_check_does_not_require_expected(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        # The client factory is stubbed to return None. This test only checks
        # that the golden-set precondition error is not what comes back.
        # NOTE(review): it does not assert that grading succeeds. Confirm
        # that is the intended coverage.
        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
        evaluator_def = OpenAIEvalDef(name="sc", grader=_string_check_grader())
        outcome = await evaluate_openai_eval(evaluator_def, [_invocation("hi")], None)
        error_text = outcome.error or ""
        assert "expected invocations" not in error_text

0 commit comments

Comments
 (0)