Merge pull request #3 from agentevals-dev/peterj/addsomeevaluators

peterj · web-flow · commit f66a520b9094 · 2026-03-23T10:41:02.000+01:00
add a couple of simple evaluators
diff --git a/.github/workflows/validate-evaluators.yaml b/.github/workflows/validate-evaluators.yaml
@@ -20,9 +20,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install pyyaml
-          # TODO: switch to `pip install agentevals-grader-sdk` once published to PyPI
-          pip install "agentevals-grader-sdk @ git+https://github.com/agentevals-dev/agentevals.git#subdirectory=packages/grader-sdk-py"
+          pip install pyyaml agentevals-evaluator-sdk
 
       - name: Discover and validate all evaluators
         run: |
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.venv/
diff --git a/README.md b/README.md
@@ -107,13 +107,13 @@ author: your-github-username
 Run the validation script to catch issues before submitting:
 
 ```bash
-pip install agentevals-grader-sdk pyyaml
+pip install pyyaml agentevals-evaluator-sdk
 python scripts/validate_evaluator.py evaluators/my_evaluator
 ```
 
 This checks:
 - **Manifest schema** -- required fields, entrypoint exists, name matches directory
-- **Syntax and imports** -- compiles cleanly, uses `@grader` decorator
+- **Syntax and imports** -- compiles cleanly, uses `@evaluator` decorator
 - **Smoke run** -- runs the evaluator with synthetic input and validates the `EvalResult` output (correct types for `score`, `details`, `status`, etc.)
 
 You can also test with a full eval run:
diff --git a/evaluators/contains/contains.py b/evaluators/contains/contains.py
@@ -0,0 +1,59 @@
+"""Substring containment evaluator.
+
+Scores each invocation 1.0 if final_response contains the configured substring,
+otherwise 0.0.
+
+Config:
+  substring (str): Required. If omitted, empty, or whitespace-only, returns NOT_EVALUATED.
+  case_insensitive (bool, default False): Compare lowercased strings.
+
+Usage in eval_config.yaml:
+    config:
+      substring: "expected phrase"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+@evaluator
+def contains(input: EvalInput) -> EvalResult:
+    substring = (input.config.get("substring") or "").strip()
+    n = len(input.invocations)
+    if not substring:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: substring"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", False))
+    normalize = str.lower if case_insensitive else lambda s: s
+    substring_cmp = normalize(substring)
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        response_text = inv.final_response or ""
+        if case_insensitive:
+            ok = substring_cmp in normalize(response_text)
+        else:
+            ok = substring in response_text
+        if ok:
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(f"{inv.invocation_id}: response does not contain {substring!r}")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    contains.run()
diff --git a/evaluators/contains/evaluator.yaml b/evaluators/contains/evaluator.yaml
@@ -0,0 +1,6 @@
+name: contains
+description: Scores whether each final response contains a configured substring (case-sensitive or case-insensitive)
+language: python
+entrypoint: contains.py
+tags: [string, contains]
+author: agentevals-dev
diff --git a/evaluators/equals/equals.py b/evaluators/equals/equals.py
@@ -0,0 +1,60 @@
+"""Exact string match evaluator.
+
+Config:
+  expected (str): Required. If omitted, returns NOT_EVALUATED.
+  case_insensitive (bool, default False): Compare lowercased strings.
+  strip (bool, default True): Strip whitespace before compare.
+
+Usage:
+    config:
+      expected: "4"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+@evaluator
+def equals(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected")
+    if expected is None:
+        n = len(input.invocations)
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: expected"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", False))
+    strip = bool(input.config.get("strip", True))
+
+    def norm(s: str) -> str:
+        t = s.strip() if strip else s
+        return t.lower() if case_insensitive else t
+
+    exp = norm(str(expected))
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        got = norm(inv.final_response or "")
+        if got == exp:
+            scores.append(1.0)
+        else:
+            scores.append(0.0)
+            issues.append(
+                f"{inv.invocation_id}: expected {expected!r}, got {inv.final_response!r}"
+            )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    equals.run()
diff --git a/evaluators/equals/evaluator.yaml b/evaluators/equals/evaluator.yaml
@@ -0,0 +1,6 @@
+name: equals
+description: Scores whether each final response exactly matches a configured expected string
+language: python
+entrypoint: equals.py
+tags: [string, equals]
+author: agentevals-dev
diff --git a/evaluators/is_json/evaluator.yaml b/evaluators/is_json/evaluator.yaml
@@ -0,0 +1,6 @@
+name: is_json
+description: Scores whether each final response parses as JSON (optional markdown code fence extraction)
+language: python
+entrypoint: is_json.py
+tags: [json, structured]
+author: agentevals-dev
diff --git a/evaluators/is_json/is_json.py b/evaluators/is_json/is_json.py
@@ -0,0 +1,56 @@
+"""JSON parse check evaluator.
+
+Tries to parse final_response as JSON. Optionally unwraps a response that consists entirely of one fenced ``` or ```json block.
+
+Config:
+  extract_markdown_fence (bool, default True): Strip ```json fences if present.
+
+Usage:
+    config:
+      extract_markdown_fence: true
+"""
+
+from __future__ import annotations
+
+import json
+import re
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+
+_FENCE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?```\s*$", re.DOTALL | re.IGNORECASE)
+
+
+def _parse_json_payload(text: str, extract_fence: bool) -> object:
+    raw = (text or "").strip()
+    if extract_fence:
+        m = _FENCE.match(raw)
+        if m:
+            raw = m.group(1).strip()
+    return json.loads(raw)
+
+
+@evaluator
+def is_json(input: EvalInput) -> EvalResult:
+    extract_fence = bool(input.config.get("extract_markdown_fence", True))
+
+    scores: list[float] = []
+    issues: list[str] = []
+
+    for inv in input.invocations:
+        try:
+            _parse_json_payload(inv.final_response or "", extract_fence)
+            scores.append(1.0)
+        except (json.JSONDecodeError, TypeError, ValueError) as exc:
+            scores.append(0.0)
+            issues.append(f"{inv.invocation_id}: not valid JSON ({exc})")
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"issues": issues} if issues else None,
+    )
+
+
+if __name__ == "__main__":
+    is_json.run()
diff --git a/evaluators/levenshtein_ratio/evaluator.yaml b/evaluators/levenshtein_ratio/evaluator.yaml
@@ -0,0 +1,6 @@
+name: levenshtein_ratio
+description: Scores similarity of each response to a reference string using normalized Levenshtein distance
+language: python
+entrypoint: levenshtein_ratio.py
+tags: [string, levenshtein]
+author: agentevals-dev
diff --git a/evaluators/levenshtein_ratio/levenshtein_ratio.py b/evaluators/levenshtein_ratio/levenshtein_ratio.py
@@ -0,0 +1,82 @@
+"""Normalized Levenshtein similarity evaluator.
+
+Score for an invocation is 1.0 - (edit_distance / max(len(a), len(b), 1)), clamped to [0, 1].
+
+Config:
+  expected (str): Required. If omitted, returns NOT_EVALUATED.
+  case_insensitive (bool, default False): Compare lowercased strings.
+
+Usage:
+    config:
+      expected: "reference answer"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _levenshtein(a: str, b: str) -> int:
+    """Classic O(nm) edit distance."""
+    if len(a) < len(b):
+        a, b = b, a
+    if not b:
+        return len(a)
+    prev = list(range(len(b) + 1))
+    for i, ca in enumerate(a):
+        cur = [i + 1]
+        for j, cb in enumerate(b):
+            ins = prev[j + 1] + 1
+            delete = cur[j] + 1
+            sub = prev[j] + (ca != cb)
+            cur.append(min(ins, delete, sub))
+        prev = cur
+    return prev[-1]
+
+
+@evaluator
+def levenshtein_ratio(input: EvalInput) -> EvalResult:
+    expected = input.config.get("expected")
+    if expected is None:
+        n = len(input.invocations)
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "missing config: expected"},
+        )
+
+    case_insensitive = bool(input.config.get("case_insensitive", False))
+    ref = str(expected)
+    if case_insensitive:
+        ref = ref.lower()
+
+    scores: list[float] = []
+    details_rows: list[dict] = []
+
+    for inv in input.invocations:
+        got = inv.final_response or ""
+        a, b = (got.lower(), ref) if case_insensitive else (got, ref)
+        dist = _levenshtein(a, b)
+        denom = max(len(a), len(b), 1)
+        sim = 1.0 - (dist / denom)
+        sim = max(0.0, min(1.0, sim))
+        scores.append(sim)
+        details_rows.append(
+            {
+                "invocation_id": inv.invocation_id,
+                "distance": dist,
+                "similarity": sim,
+            }
+        )
+
+    overall = sum(scores) / len(scores) if scores else 0.0
+    return EvalResult(
+        score=overall,
+        per_invocation_scores=scores,
+        details={"per_invocation": details_rows},
+    )
+
+
+if __name__ == "__main__":
+    levenshtein_ratio.run()
diff --git a/evaluators/regex_match/evaluator.yaml b/evaluators/regex_match/evaluator.yaml
@@ -0,0 +1,6 @@
+name: regex_match
+description: Scores whether each final response matches a configured regular expression
+language: python
+entrypoint: regex_match.py
+tags: [regex]
+author: agentevals-dev
diff --git a/evaluators/regex_match/regex_match.py b/evaluators/regex_match/regex_match.py
diff --git a/evaluators/tool_coverage/tool_coverage.py b/evaluators/tool_coverage/tool_coverage.py
diff --git a/evaluators/tool_sequence_match/evaluator.yaml b/evaluators/tool_sequence_match/evaluator.yaml
diff --git a/evaluators/tool_sequence_match/tool_sequence_match.py b/evaluators/tool_sequence_match/tool_sequence_match.py
diff --git a/scripts/test_input.json b/scripts/test_input.json
diff --git a/scripts/validate_evaluator.py b/scripts/validate_evaluator.py