Skip to content

Commit ae67759

Browse files
committed
pr feedback
Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
1 parent 49595e8 commit ae67759

8 files changed

Lines changed: 64 additions & 47 deletions

File tree

evaluators/contains/contains.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
otherwise 0.0.
55
66
Config:
7-
substring (str): Required for scoring; if omitted, the evaluator is a no-op (1.0).
7+
substring (str): Required. If omitted, empty, or whitespace-only, returns NOT_EVALUATED.
88
99
Usage in eval_config.yaml:
1010
config:
@@ -13,37 +13,39 @@
1313

1414
from __future__ import annotations
1515

16-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
16+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
1717

1818

1919
@evaluator
2020
def contains(input: EvalInput) -> EvalResult:
21-
needle = (input.config.get("substring") or "").strip()
22-
if not needle:
21+
substring = (input.config.get("substring") or "").strip()
22+
n = len(input.invocations)
23+
if not substring:
2324
return EvalResult(
24-
score=1.0,
25-
per_invocation_scores=[1.0] * len(input.invocations),
26-
details={"note": "no substring configured; skipping check"},
25+
score=0.0,
26+
status=EvalStatus.NOT_EVALUATED,
27+
per_invocation_scores=[None] * n,
28+
details={"reason": "missing config: substring"},
2729
)
2830

2931
case_insensitive = bool(input.config.get("case_insensitive", False))
30-
haystack_fn = str.lower if case_insensitive else lambda s: s
31-
needle_cmp = haystack_fn(needle)
32+
normalize = str.lower if case_insensitive else lambda s: s
33+
substring_cmp = normalize(substring)
3234

3335
scores: list[float] = []
3436
issues: list[str] = []
3537

3638
for inv in input.invocations:
37-
text = (inv.final_response or "")
39+
response_text = inv.final_response or ""
3840
if case_insensitive:
39-
ok = needle_cmp in haystack_fn(text)
41+
ok = substring_cmp in normalize(response_text)
4042
else:
41-
ok = needle in text
43+
ok = substring in response_text
4244
if ok:
4345
scores.append(1.0)
4446
else:
4547
scores.append(0.0)
46-
issues.append(f"{inv.invocation_id}: response does not contain {needle!r}")
48+
issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")
4749

4850
overall = sum(scores) / len(scores) if scores else 0.0
4951
return EvalResult(

evaluators/contains/evaluator.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ name: contains
22
description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
33
language: python
44
entrypoint: contains.py
5-
tags: [string, contains,]
5+
tags: [string, contains]
66
author: agentevals-dev

evaluators/equals/equals.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Exact string match evaluator.
22
33
Config:
4-
expected (str): If omitted, no-op (1.0).
4+
expected (str): Required. If omitted, returns NOT_EVALUATED.
55
case_insensitive (bool, default False): Compare normalized strings.
66
strip (bool, default True): Strip whitespace before compare.
77
@@ -12,20 +12,22 @@
1212

1313
from __future__ import annotations
1414

15-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
15+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
1616

1717

1818
@evaluator
1919
def equals(input: EvalInput) -> EvalResult:
2020
expected = input.config.get("expected")
2121
if expected is None:
22+
n = len(input.invocations)
2223
return EvalResult(
23-
score=1.0,
24-
per_invocation_scores=[1.0] * len(input.invocations),
25-
details={"note": "no expected string configured; skipping check"},
24+
score=0.0,
25+
status=EvalStatus.NOT_EVALUATED,
26+
per_invocation_scores=[None] * n,
27+
details={"reason": "missing config: expected"},
2628
)
2729

28-
case_insensitive = bool(input.config.get("case_insensitive", True))
30+
case_insensitive = bool(input.config.get("case_insensitive", False))
2931
strip = bool(input.config.get("strip", True))
3032

3133
def norm(s: str) -> str:

evaluators/is_json/is_json.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33
Tries to parse final_response as JSON. Optionally extracts fenced ```json ... ``` blocks.
44
55
Config:
6-
require_json (bool, default False): If False, evaluator is a no-op (1.0).
76
extract_markdown_fence (bool, default True): Strip ```json fences if present.
87
98
Usage:
109
config:
11-
require_json: true
10+
extract_markdown_fence: true
1211
"""
1312

1413
from __future__ import annotations
@@ -32,13 +31,6 @@ def _parse_json_payload(text: str, extract_fence: bool) -> object:
3231

3332
@evaluator
3433
def is_json(input: EvalInput) -> EvalResult:
35-
if not input.config.get("require_json"):
36-
return EvalResult(
37-
score=1.0,
38-
per_invocation_scores=[1.0] * len(input.invocations),
39-
details={"note": "require_json not set; skipping check"},
40-
)
41-
4234
extract_fence = bool(input.config.get("extract_markdown_fence", True))
4335

4436
scores: list[float] = []

evaluators/levenshtein_ratio/levenshtein_ratio.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].
44
55
Config:
6-
expected (str): If omitted, no-op (1.0).
6+
expected (str): Required. If omitted, returns NOT_EVALUATED.
77
case_insensitive (bool, default False): Compare lowercased strings.
88
99
Usage:
@@ -13,7 +13,7 @@
1313

1414
from __future__ import annotations
1515

16-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
16+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
1717

1818

1919
def _levenshtein(a: str, b: str) -> int:
@@ -38,10 +38,12 @@ def _levenshtein(a: str, b: str) -> int:
3838
def levenshtein_ratio(input: EvalInput) -> EvalResult:
3939
expected = input.config.get("expected")
4040
if expected is None:
41+
n = len(input.invocations)
4142
return EvalResult(
42-
score=1.0,
43-
per_invocation_scores=[1.0] * len(input.invocations),
44-
details={"note": "no expected string configured; skipping check"},
43+
score=0.0,
44+
status=EvalStatus.NOT_EVALUATED,
45+
per_invocation_scores=[None] * n,
46+
details={"reason": "missing config: expected"},
4547
)
4648

4749
case_insensitive = bool(input.config.get("case_insensitive", False))

evaluators/regex_match/regex_match.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Regex on final response evaluator.
22
33
Config:
4-
pattern (str): If omitted, no-op (1.0).
4+
pattern (str): Required. If omitted or invalid, returns NOT_EVALUATED.
55
flags (str, optional): "IGNORECASE" | "MULTILINE" | "DOTALL" — combined with |.
66
77
Usage:
@@ -14,7 +14,7 @@
1414

1515
import re
1616

17-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
17+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
1818

1919
_FLAG_MAP = {
2020
"IGNORECASE": re.IGNORECASE,
@@ -26,11 +26,13 @@
2626
@evaluator
2727
def regex_match(input: EvalInput) -> EvalResult:
2828
pattern = input.config.get("pattern")
29+
n = len(input.invocations)
2930
if not pattern:
3031
return EvalResult(
31-
score=1.0,
32-
per_invocation_scores=[1.0] * len(input.invocations),
33-
details={"note": "no pattern configured; skipping check"},
32+
score=0.0,
33+
status=EvalStatus.NOT_EVALUATED,
34+
per_invocation_scores=[None] * n,
35+
details={"reason": "missing config: pattern"},
3436
)
3537

3638
flag_names = input.config.get("flags")
@@ -51,8 +53,9 @@ def regex_match(input: EvalInput) -> EvalResult:
5153
except re.error as exc:
5254
return EvalResult(
5355
score=0.0,
54-
per_invocation_scores=[0.0] * len(input.invocations),
55-
details={"error": f"invalid regex: {exc}"},
56+
status=EvalStatus.NOT_EVALUATED,
57+
per_invocation_scores=[None] * n,
58+
details={"reason": "invalid regex pattern", "error": str(exc)},
5659
)
5760

5861
scores: list[float] = []

evaluators/tool_sequence_match/tool_sequence_match.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Compares the ordered list of tool names in each invocation to config.
44
55
Config:
6-
expected_tool_names (list[str]): If omitted or empty, no-op (1.0).
6+
expected_tool_names (list[str]): Required, non-empty list. If omitted, empty, or not a list, returns NOT_EVALUATED.
77
require_order (bool, default True): If False, compares multisets (same counts per name).
88
99
Usage:
@@ -16,17 +16,26 @@
1616

1717
from collections import Counter
1818

19-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
19+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
2020

2121

2222
@evaluator
2323
def tool_sequence_match(input: EvalInput) -> EvalResult:
2424
expected = input.config.get("expected_tool_names")
25+
n = len(input.invocations)
26+
if expected is None or not isinstance(expected, list):
27+
return EvalResult(
28+
score=0.0,
29+
status=EvalStatus.NOT_EVALUATED,
30+
per_invocation_scores=[None] * n,
31+
details={"reason": "missing or invalid config: expected_tool_names (need a list of names)"},
32+
)
2533
if not expected:
2634
return EvalResult(
27-
score=1.0,
28-
per_invocation_scores=[1.0] * len(input.invocations),
29-
details={"note": "no expected_tool_names configured; skipping check"},
35+
score=0.0,
36+
status=EvalStatus.NOT_EVALUATED,
37+
per_invocation_scores=[None] * n,
38+
details={"reason": "missing or empty config: expected_tool_names"},
3039
)
3140

3241
want = [str(x) for x in expected]

scripts/validate_evaluator.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
158158
if not stdout:
159159
stderr_preview = result.stderr.strip()[:500]
160160
_fail(
161-
f"Evaluator produced no output on stdout"
161+
"Evaluator produced no output on stdout"
162162
+ (f"\n stderr: {stderr_preview}" if stderr_preview else "")
163163
)
164164
return False
@@ -208,6 +208,13 @@ def validate_smoke_run(evaluator_dir: Path, manifest: dict) -> bool:
208208
f"got {type(per_inv).__name__}"
209209
)
210210
return False
211+
for i, x in enumerate(per_inv):
212+
if x is not None and not isinstance(x, (int, float)):
213+
_fail(
214+
f"'per_invocation_scores[{i}]' must be a number or null, "
215+
f"got {type(x).__name__}"
216+
)
217+
return False
211218

212219
# Full Pydantic validation via the SDK if available
213220
try:

0 commit comments

Comments
 (0)