Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ evaluators:
threshold: 0.7
```

Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`).
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Supported grader types: `text_similarity`, `label_model`, and `string_check`.

See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators.

Expand Down
26 changes: 26 additions & 0 deletions docs/custom-evaluators.md
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,32 @@ evaluators:

The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`.

### String Check Grader

Checks whether the agent response equals, does not equal, or contains a fixed reference string. No eval set is needed.

```yaml
evaluators:
- name: response_contains_hello
type: openai_eval
threshold: 0.8
grader:
type: string_check
reference: "hello"
operation: ilike
```

The `operation` field controls how the check is applied:

| Operation | Description |
|---|---|
| `eq` | Exact match (case-sensitive) |
| `ne` | Does not equal (case-sensitive) |
| `like` | Contains the reference (case-sensitive) |
| `ilike` | Contains the reference (case-insensitive) |

Each invocation either passes or fails. The `threshold` field is not used by `string_check` and may be omitted.

### How it works

Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the agent's responses as JSONL items, polls for results, and cleans up. Each item carries the agent's response and, only for graders that compare against a golden reference (such as `text_similarity`), the expected response. Both are placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
Expand Down
6 changes: 6 additions & 0 deletions examples/custom_evaluators/eval_config_openai_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ evaluators:
content: "Rate this response: {{ item.actual_response }}"
labels: [good, bad]
passing_labels: [good]
- name: response_contains_hello
type: openai_eval
grader:
type: string_check
reference: "hello"
operation: ilike
13 changes: 12 additions & 1 deletion src/agentevals/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")


# Values accepted for the `operation` field of a `string_check` grader;
# grader validation raises ValueError for anything outside this set.
_VALID_STRING_CHECK_OPERATIONS = frozenset({"eq", "ne", "like", "ilike"})

# Grader types listed (sorted) in the "Unsupported grader type" error message.
_SUPPORTED_GRADER_TYPES = frozenset({"string_check", "text_similarity", "label_model"})

_VALID_SIMILARITY_METRICS = frozenset(
{
"fuzzy_match",
Expand Down Expand Up @@ -113,8 +117,15 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
if invalid:
raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
elif grader_type == "string_check":
for field in ("reference", "operation"):
if not v.get(field):
raise ValueError(f"'{field}' is required for string_check grader")
op = v["operation"]
if op not in _VALID_STRING_CHECK_OPERATIONS:
raise ValueError(f"Invalid operation '{op}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
else:
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}")
return v


Expand Down
36 changes: 18 additions & 18 deletions src/agentevals/openai_eval_backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.

Builds testing criteria from the evaluator config, submits invocation pairs
as JSONL items, polls for completion, and maps per-item results back to a
MetricResult.
"""
"""OpenAI Evals API backend."""

from __future__ import annotations

Expand Down Expand Up @@ -39,11 +34,6 @@


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.

Each grader type produces a different shape. Extend this function
when adding support for new OpenAI grader types.
"""
grader = evaluator_def.grader
grader_type = grader["type"]

Expand All @@ -67,12 +57,22 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"passing_labels": grader["passing_labels"],
}

if grader_type == "string_check":
return {
"type": "string_check",
"name": evaluator_def.name,
"input": "{{ item.actual_response }}",
"reference": grader["reference"],
"operation": grader["operation"],
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
*,
include_expected: bool = True,
) -> list[dict[str, Any]]:
items = []
Expand Down Expand Up @@ -123,16 +123,14 @@ async def evaluate_openai_eval(
)

grader_type = evaluator_def.grader["type"]

if grader_type == "text_similarity" and expected_invocations is None:
needs_expected = grader_type == "text_similarity"
if needs_expected and expected_invocations is None:
return MetricResult(
metric_name=evaluator_def.name,
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
)

items = _build_jsonl_items(
actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
)
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
if not items:
return MetricResult(
metric_name=evaluator_def.name,
Expand All @@ -145,7 +143,7 @@ async def evaluate_openai_eval(
try:
client = await asyncio.to_thread(_get_openai_client)

item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
eval_obj = await asyncio.to_thread(
client.evals.create,
name=f"agentevals-openai-{evaluator_def.name}",
Expand Down Expand Up @@ -252,6 +250,8 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
elif grader["type"] == "label_model":
details["model"] = grader.get("model")
details["passing_labels"] = grader.get("passing_labels")
elif grader["type"] == "string_check":
details["operation"] = grader.get("operation")
per_criteria = getattr(run, "per_testing_criteria_results", None)
if per_criteria:
details["per_testing_criteria"] = [
Expand Down
41 changes: 38 additions & 3 deletions tests/test_openai_eval_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
from unittest.mock import MagicMock

import pytest

from agentevals.config import OpenAIEvalDef
from agentevals.openai_eval_backend import (
_build_jsonl_items,
Expand All @@ -21,6 +22,12 @@ def _label_grader(**overrides):
return base


def _string_check_grader(**overrides):
base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
base.update(overrides)
return base


def _invocation(text: str):
inv = MagicMock()
inv.final_response.parts = [MagicMock(text=text)]
Expand Down Expand Up @@ -55,6 +62,19 @@ def test_label_model_passing_labels_not_in_labels(self):
with pytest.raises(Exception, match="passing_labels"):
OpenAIEvalDef(name="lm", grader=grader)

def test_string_check_valid(self):
    """A well-formed string_check grader config passes validation."""
    evaluator = OpenAIEvalDef(name="sc", grader=_string_check_grader())
    grader_type = evaluator.grader["type"]
    assert grader_type == "string_check"

@pytest.mark.parametrize("field", ["reference", "operation"])
def test_string_check_missing_required(self, field):
    """Blanking out a required string_check field is rejected, naming the field."""
    incomplete_grader = _string_check_grader(**{field: None})
    with pytest.raises(Exception, match=field):
        OpenAIEvalDef(name="sc", grader=incomplete_grader)

def test_string_check_bad_operation(self):
    """An operation outside the allowed set is rejected with 'Invalid operation'."""
    bad_grader = _string_check_grader(operation="bad")
    with pytest.raises(Exception, match="Invalid operation"):
        OpenAIEvalDef(name="sc", grader=bad_grader)

def test_unsupported_grader_type(self):
with pytest.raises(Exception, match="Unsupported grader type"):
OpenAIEvalDef(name="x", grader={"type": "unknown"})
Expand All @@ -80,13 +100,21 @@ def test_label_model_shape(self):
assert c["passing_labels"] == ["good"]
assert c["input"] == grader["input"]

def test_string_check_shape(self):
    """The string_check criteria carry the type, reference, operation and response template."""
    grader_cfg = _string_check_grader(reference="ok", operation="eq")
    criteria = _build_testing_criteria(OpenAIEvalDef(name="sc", grader=grader_cfg))
    observed = (criteria["type"], criteria["reference"], criteria["operation"])
    assert observed == ("string_check", "ok", "eq")
    # The grader input must template in the agent's actual response.
    assert "{{ item.actual_response }}" in criteria["input"]


class TestBuildJsonlItems:
def test_text_similarity_includes_expected(self):
def test_includes_expected_when_requested(self):
items = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True)
assert "expected_response" in items[0]["item"]

def test_label_model_excludes_expected(self):
def test_excludes_expected_when_not_requested(self):
items = _build_jsonl_items([_invocation("hello")], [], include_expected=False)
assert "expected_response" not in items[0]["item"]

Expand Down Expand Up @@ -114,3 +142,10 @@ async def test_label_model_does_not_require_expected(self, monkeypatch):
d = OpenAIEvalDef(name="lm", grader=_label_grader())
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
assert "expected invocations" not in (result.error or "")

async def test_string_check_does_not_require_expected(self, monkeypatch):
    """string_check must not fail with the 'expected invocations' error when no golden set is given."""
    # Provide a dummy key and stub out client creation so no real OpenAI client is built.
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")
    monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
    evaluator = OpenAIEvalDef(name="sc", grader=_string_check_grader())
    actual = [_invocation("hi")]
    outcome = await evaluate_openai_eval(evaluator, actual, None)
    error_text = outcome.error or ""
    assert "expected invocations" not in error_text