fix: address PR agentevals-dev#7 review — zero-value bugs, division guards, attribute access, e2e tests

henrikrexed · Paperclip-Paperclip · henrikrexed · commit 25ae145b9896 · 2026-04-09T12:03:08.000+02:00
- token_efficiency: replace `or` with `is None` checks so zero token counts
  are not silently dropped in favor of fallback keys
- time_efficiency: guard against max_duration <= 0 (division by zero), fix
  zero duration_s being skipped via `or` fallback
- tool_efficiency: switch from dict get/isinstance to direct attribute access
  matching SDK conventions, add NOT_EVALUATED for empty invocations, guard
  max_tool_calls=0
- All three evaluators: use `issues` key in details for consistency with
  existing evaluators (e.g. response_quality)
- Add 48 end-to-end tests covering happy paths, edge cases, zero-value
  regressions, config overrides, and multi-invocation averaging

Co-Authored-By: Paperclip <noreply@paperclip.ing>
diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py
@@ -14,7 +14,9 @@ def _extract_duration(inv) -> float | None:
     if not isinstance(perf, dict):
         return None
 
-    d = perf.get("duration_s") or perf.get("duration")
+    d = perf.get("duration_s")
+    if d is None:
+        d = perf.get("duration")
     if d is not None:
         return float(d)
     return None
@@ -36,7 +38,10 @@ def time_efficiency(input: EvalInput) -> EvalResult:
             continue
 
         has_data = True
-        score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
+        if max_duration <= 0:
+            score = 0.0
+        else:
+            score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
         scores.append(score)
         details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")
 
@@ -48,7 +53,7 @@ def time_efficiency(input: EvalInput) -> EvalResult:
         )
 
     overall = sum(scores) / len(scores) if scores else 0.0
-    return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
 
 
 if __name__ == "__main__":
diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py
@@ -14,10 +14,14 @@ def _extract_tokens(inv) -> dict | None:
     if not isinstance(perf, dict):
         return None
 
-    input_t = perf.get("input_tokens") or perf.get("prompt_tokens")
-    output_t = perf.get("output_tokens") or perf.get("completion_tokens")
+    input_t = perf.get("input_tokens")
+    if input_t is None:
+        input_t = perf.get("prompt_tokens")
+    output_t = perf.get("output_tokens")
+    if output_t is None:
+        output_t = perf.get("completion_tokens")
     if input_t is not None or output_t is not None:
-        return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)}
+        return {"input_tokens": int(input_t if input_t is not None else 0), "output_tokens": int(output_t if output_t is not None else 0)}
 
     return None
 
@@ -56,7 +60,7 @@ def token_efficiency(input: EvalInput) -> EvalResult:
         )
 
     overall = sum(scores) / len(scores) if scores else 0.0
-    return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items})
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
 
 
 if __name__ == "__main__":
diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py
@@ -8,23 +8,20 @@
 """
 
 import json
-from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
 
 
 def _call_signature(call) -> str:
-    name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "")
-    args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {})
     try:
-        args_str = json.dumps(args, sort_keys=True, default=str)
+        args_str = json.dumps(call.args, sort_keys=True, default=str)
     except (TypeError, ValueError):
-        args_str = str(args)
-    return f"{name}::{args_str}"
+        args_str = str(call.args)
+    return f"{call.name}::{args_str}"
 
 
 def _is_error_response(response) -> bool:
     """Check if a tool response indicates an error via its status field."""
-    status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "")
-    return str(status).lower() in ("error", "failed", "failure")
+    return str(response.status or "").lower() in ("error", "failed", "failure")
 
 
 @evaluator
@@ -63,7 +60,7 @@ def tool_efficiency(input: EvalInput) -> EvalResult:
         useful = max(0, total - dupes - errors)
 
         efficiency = useful / total
-        budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
+        budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls) if max_tool_calls > 0 else 0.0
         score = max(0.0, min(1.0, efficiency * budget_factor))
         scores.append(score)
 
@@ -72,8 +69,15 @@ def tool_efficiency(input: EvalInput) -> EvalResult:
         if errors: parts.append(f"errors={errors}")
         details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")
 
-    overall = sum(scores) / len(scores) if scores else 0.0
-    return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})
+    if not scores:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            details={"reason": "no invocations to evaluate"},
+        )
+
+    overall = sum(scores) / len(scores)
+    return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
 
 
 if __name__ == "__main__":
diff --git a/tests/test_time_efficiency.py b/tests/test_time_efficiency.py
@@ -0,0 +1,152 @@
+"""End-to-end tests for the time_efficiency evaluator."""
+
+import json
+import subprocess
+import sys
+
+import pytest
+
+EVALUATOR = "evaluators/time_efficiency/time_efficiency.py"
+
+
+def _run(payload: dict) -> dict:
+    result = subprocess.run(
+        [sys.executable, EVALUATOR],
+        input=json.dumps(payload),
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, f"stderr: {result.stderr}"
+    return json.loads(result.stdout)
+
+
+def _make_input(invocations, config=None):
+    return {
+        "protocol_version": "1.0",
+        "metric_name": "time_efficiency",
+        "threshold": 0.5,
+        "config": config or {},
+        "invocations": invocations,
+    }
+
+
+def _inv(inv_id, perf=None):
+    return {"invocation_id": inv_id, "performance_metrics": perf}
+
+
+class TestTimeEfficiencyBasic:
+    def test_no_invocations(self):
+        result = _run(_make_input([]))
+        assert result["status"] == "NOT_EVALUATED"
+        assert result["score"] == 0.0
+
+    def test_no_duration_data(self):
+        result = _run(_make_input([_inv("inv-1", {})]))
+        assert result["status"] == "NOT_EVALUATED"
+
+    def test_no_perf_metrics(self):
+        result = _run(_make_input([_inv("inv-1", None)]))
+        assert result["status"] == "NOT_EVALUATED"
+
+    def test_zero_duration_perfect_score(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 0}),
+        ]))
+        assert result["score"] == 1.0
+
+    def test_half_budget(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 60}),
+        ]))
+        # 1.0 - (60 / 120) = 0.5
+        assert result["score"] == pytest.approx(0.5)
+
+    def test_over_budget_clamps_to_zero(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 200}),
+        ]))
+        assert result["score"] == 0.0
+
+
+class TestTimeEfficiencyZeroGuard:
+    """Regression: division by zero when max_duration_s = 0."""
+
+    def test_zero_max_duration_no_crash(self):
+        result = _run(_make_input(
+            [_inv("inv-1", {"duration_s": 10})],
+            config={"max_duration_s": 0},
+        ))
+        assert result["score"] == 0.0
+
+    def test_negative_max_duration_no_crash(self):
+        result = _run(_make_input(
+            [_inv("inv-1", {"duration_s": 10})],
+            config={"max_duration_s": -5},
+        ))
+        assert result["score"] == 0.0
+
+    def test_zero_duration_with_zero_max(self):
+        result = _run(_make_input(
+            [_inv("inv-1", {"duration_s": 0})],
+            config={"max_duration_s": 0},
+        ))
+        assert result["score"] == 0.0
+
+
+class TestTimeEfficiencyZeroValues:
+    """Regression: `or` operator previously dropped zero duration values."""
+
+    def test_zero_duration_s_not_dropped(self):
+        """duration_s=0 should be used, not fall back to duration key."""
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 0, "duration": 999}),
+        ]))
+        # duration_s=0 gives score 1.0; if it fell back to 999, score would be 0.0
+        assert result["score"] == 1.0
+
+
+class TestTimeEfficiencyAliases:
+    def test_duration_alias(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration": 60}),
+        ]))
+        assert result["score"] == pytest.approx(0.5)
+
+    def test_duration_s_takes_precedence(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 0, "duration": 120}),
+        ]))
+        assert result["score"] == 1.0
+
+
+class TestTimeEfficiencyConfig:
+    def test_custom_max_duration(self):
+        result = _run(_make_input(
+            [_inv("inv-1", {"duration_s": 5})],
+            config={"max_duration_s": 10},
+        ))
+        assert result["score"] == pytest.approx(0.5)
+
+
+class TestTimeEfficiencyMultipleInvocations:
+    def test_average_across_invocations(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 0}),    # score = 1.0
+            _inv("inv-2", {"duration_s": 120}),   # score = 0.0
+        ]))
+        assert result["score"] == pytest.approx(0.5)
+        assert len(result["per_invocation_scores"]) == 2
+
+    def test_mixed_with_missing(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 0}),  # score = 1.0
+            _inv("inv-2", None),                # score = 0.0 (missing)
+        ]))
+        assert result["status"] is None
+        assert result["score"] == pytest.approx(0.5)
+
+    def test_uses_issues_key_in_details(self):
+        result = _run(_make_input([
+            _inv("inv-1", {"duration_s": 10}),
+        ]))
+        assert "issues" in result["details"]
diff --git a/tests/test_token_efficiency.py b/tests/test_token_efficiency.py
diff --git a/tests/test_tool_efficiency.py b/tests/test_tool_efficiency.py