Skip to content

Commit 25ae145

Browse files
fix: address PR agentevals-dev#7 review — zero-value bugs, division guards, attribute access, e2e tests
- token_efficiency: replace `or` with `is None` checks so zero token counts are not silently dropped in favor of fallback keys - time_efficiency: guard against max_duration <= 0 (division by zero), fix zero duration_s being skipped via `or` fallback - tool_efficiency: switch from dict get/isinstance to direct attribute access matching SDK conventions, add NOT_EVALUATED for empty invocations, guard max_tool_calls=0 - All three evaluators: use `issues` key in details for consistency with existing evaluators (e.g. response_quality) - Add 48 end-to-end tests covering happy paths, edge cases, zero-value regressions, config overrides, and multi-invocation averaging Co-Authored-By: Paperclip <noreply@paperclip.ing>
1 parent c5d1684 commit 25ae145

6 files changed

Lines changed: 526 additions & 18 deletions

File tree

evaluators/time_efficiency/time_efficiency.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ def _extract_duration(inv) -> float | None:
1414
if not isinstance(perf, dict):
1515
return None
1616

17-
d = perf.get("duration_s") or perf.get("duration")
17+
d = perf.get("duration_s")
18+
if d is None:
19+
d = perf.get("duration")
1820
if d is not None:
1921
return float(d)
2022
return None
@@ -36,7 +38,10 @@ def time_efficiency(input: EvalInput) -> EvalResult:
3638
continue
3739

3840
has_data = True
39-
score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
41+
if max_duration <= 0:
42+
score = 0.0
43+
else:
44+
score = max(0.0, min(1.0, 1.0 - (duration / max_duration)))
4045
scores.append(score)
4146
details_items.append(f"{inv.invocation_id}: {duration:.1f}s / {max_duration:.1f}s")
4247

@@ -48,7 +53,7 @@ def time_efficiency(input: EvalInput) -> EvalResult:
4853
)
4954

5055
overall = sum(scores) / len(scores) if scores else 0.0
51-
return EvalResult(score=overall, per_invocation_scores=scores, details={"time_details": details_items})
56+
return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
5257

5358

5459
if __name__ == "__main__":

evaluators/token_efficiency/token_efficiency.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,14 @@ def _extract_tokens(inv) -> dict | None:
1414
if not isinstance(perf, dict):
1515
return None
1616

17-
input_t = perf.get("input_tokens") or perf.get("prompt_tokens")
18-
output_t = perf.get("output_tokens") or perf.get("completion_tokens")
17+
input_t = perf.get("input_tokens")
18+
if input_t is None:
19+
input_t = perf.get("prompt_tokens")
20+
output_t = perf.get("output_tokens")
21+
if output_t is None:
22+
output_t = perf.get("completion_tokens")
1923
if input_t is not None or output_t is not None:
20-
return {"input_tokens": int(input_t or 0), "output_tokens": int(output_t or 0)}
24+
return {"input_tokens": int(input_t if input_t is not None else 0), "output_tokens": int(output_t if output_t is not None else 0)}
2125

2226
return None
2327

@@ -56,7 +60,7 @@ def token_efficiency(input: EvalInput) -> EvalResult:
5660
)
5761

5862
overall = sum(scores) / len(scores) if scores else 0.0
59-
return EvalResult(score=overall, per_invocation_scores=scores, details={"token_details": details_items})
63+
return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
6064

6165

6266
if __name__ == "__main__":

evaluators/tool_efficiency/tool_efficiency.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,20 @@
88
"""
99

1010
import json
11-
from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator
11+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
1212

1313

1414
def _call_signature(call) -> str:
15-
name = call.get("name", "") if isinstance(call, dict) else getattr(call, "name", "")
16-
args = call.get("args", {}) if isinstance(call, dict) else getattr(call, "args", {})
1715
try:
18-
args_str = json.dumps(args, sort_keys=True, default=str)
16+
args_str = json.dumps(call.args, sort_keys=True, default=str)
1917
except (TypeError, ValueError):
20-
args_str = str(args)
21-
return f"{name}::{args_str}"
18+
args_str = str(call.args)
19+
return f"{call.name}::{args_str}"
2220

2321

2422
def _is_error_response(response) -> bool:
2523
"""Check if a tool response indicates an error via its status field."""
26-
status = response.get("status", "") if isinstance(response, dict) else getattr(response, "status", "")
27-
return str(status).lower() in ("error", "failed", "failure")
24+
return str(response.status or "").lower() in ("error", "failed", "failure")
2825

2926

3027
@evaluator
@@ -63,7 +60,7 @@ def tool_efficiency(input: EvalInput) -> EvalResult:
6360
useful = max(0, total - dupes - errors)
6461

6562
efficiency = useful / total
66-
budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls)
63+
budget_factor = max(0.0, 1.0 - max(0, total - max_tool_calls) / max_tool_calls) if max_tool_calls > 0 else 0.0
6764
score = max(0.0, min(1.0, efficiency * budget_factor))
6865
scores.append(score)
6966

@@ -72,8 +69,15 @@ def tool_efficiency(input: EvalInput) -> EvalResult:
7269
if errors: parts.append(f"errors={errors}")
7370
details_items.append(f"{inv.invocation_id}: {', '.join(parts)}")
7471

75-
overall = sum(scores) / len(scores) if scores else 0.0
76-
return EvalResult(score=overall, per_invocation_scores=scores, details={"tool_details": details_items})
72+
if not scores:
73+
return EvalResult(
74+
score=0.0,
75+
status=EvalStatus.NOT_EVALUATED,
76+
details={"reason": "no invocations to evaluate"},
77+
)
78+
79+
overall = sum(scores) / len(scores)
80+
return EvalResult(score=overall, per_invocation_scores=scores, details={"issues": details_items})
7781

7882

7983
if __name__ == "__main__":

tests/test_time_efficiency.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""End-to-end tests for the time_efficiency evaluator."""
2+
3+
import json
4+
import subprocess
5+
import sys
6+
7+
import pytest
8+
9+
EVALUATOR = "evaluators/time_efficiency/time_efficiency.py"
10+
11+
12+
def _run(payload: dict) -> dict:
13+
result = subprocess.run(
14+
[sys.executable, EVALUATOR],
15+
input=json.dumps(payload),
16+
capture_output=True,
17+
text=True,
18+
)
19+
assert result.returncode == 0, f"stderr: {result.stderr}"
20+
return json.loads(result.stdout)
21+
22+
23+
def _make_input(invocations, config=None):
24+
return {
25+
"protocol_version": "1.0",
26+
"metric_name": "time_efficiency",
27+
"threshold": 0.5,
28+
"config": config or {},
29+
"invocations": invocations,
30+
}
31+
32+
33+
def _inv(inv_id, perf=None):
34+
return {"invocation_id": inv_id, "performance_metrics": perf}
35+
36+
37+
class TestTimeEfficiencyBasic:
38+
def test_no_invocations(self):
39+
result = _run(_make_input([]))
40+
assert result["status"] == "NOT_EVALUATED"
41+
assert result["score"] == 0.0
42+
43+
def test_no_duration_data(self):
44+
result = _run(_make_input([_inv("inv-1", {})]))
45+
assert result["status"] == "NOT_EVALUATED"
46+
47+
def test_no_perf_metrics(self):
48+
result = _run(_make_input([_inv("inv-1", None)]))
49+
assert result["status"] == "NOT_EVALUATED"
50+
51+
def test_zero_duration_perfect_score(self):
52+
result = _run(_make_input([
53+
_inv("inv-1", {"duration_s": 0}),
54+
]))
55+
assert result["score"] == 1.0
56+
57+
def test_half_budget(self):
58+
result = _run(_make_input([
59+
_inv("inv-1", {"duration_s": 60}),
60+
]))
61+
# 1.0 - (60 / 120) = 0.5
62+
assert result["score"] == pytest.approx(0.5)
63+
64+
def test_over_budget_clamps_to_zero(self):
65+
result = _run(_make_input([
66+
_inv("inv-1", {"duration_s": 200}),
67+
]))
68+
assert result["score"] == 0.0
69+
70+
71+
class TestTimeEfficiencyZeroGuard:
72+
"""Regression: division by zero when max_duration_s = 0."""
73+
74+
def test_zero_max_duration_no_crash(self):
75+
result = _run(_make_input(
76+
[_inv("inv-1", {"duration_s": 10})],
77+
config={"max_duration_s": 0},
78+
))
79+
assert result["score"] == 0.0
80+
81+
def test_negative_max_duration_no_crash(self):
82+
result = _run(_make_input(
83+
[_inv("inv-1", {"duration_s": 10})],
84+
config={"max_duration_s": -5},
85+
))
86+
assert result["score"] == 0.0
87+
88+
def test_zero_duration_with_zero_max(self):
89+
result = _run(_make_input(
90+
[_inv("inv-1", {"duration_s": 0})],
91+
config={"max_duration_s": 0},
92+
))
93+
assert result["score"] == 0.0
94+
95+
96+
class TestTimeEfficiencyZeroValues:
97+
"""Regression: `or` operator previously dropped zero duration values."""
98+
99+
def test_zero_duration_s_not_dropped(self):
100+
"""duration_s=0 should be used, not fall back to duration key."""
101+
result = _run(_make_input([
102+
_inv("inv-1", {"duration_s": 0, "duration": 999}),
103+
]))
104+
# duration_s=0 gives score 1.0; if it fell back to 999, score would be 0.0
105+
assert result["score"] == 1.0
106+
107+
108+
class TestTimeEfficiencyAliases:
109+
def test_duration_alias(self):
110+
result = _run(_make_input([
111+
_inv("inv-1", {"duration": 60}),
112+
]))
113+
assert result["score"] == pytest.approx(0.5)
114+
115+
def test_duration_s_takes_precedence(self):
116+
result = _run(_make_input([
117+
_inv("inv-1", {"duration_s": 0, "duration": 120}),
118+
]))
119+
assert result["score"] == 1.0
120+
121+
122+
class TestTimeEfficiencyConfig:
123+
def test_custom_max_duration(self):
124+
result = _run(_make_input(
125+
[_inv("inv-1", {"duration_s": 5})],
126+
config={"max_duration_s": 10},
127+
))
128+
assert result["score"] == pytest.approx(0.5)
129+
130+
131+
class TestTimeEfficiencyMultipleInvocations:
132+
def test_average_across_invocations(self):
133+
result = _run(_make_input([
134+
_inv("inv-1", {"duration_s": 0}), # score = 1.0
135+
_inv("inv-2", {"duration_s": 120}), # score = 0.0
136+
]))
137+
assert result["score"] == pytest.approx(0.5)
138+
assert len(result["per_invocation_scores"]) == 2
139+
140+
def test_mixed_with_missing(self):
141+
result = _run(_make_input([
142+
_inv("inv-1", {"duration_s": 0}), # score = 1.0
143+
_inv("inv-2", None), # score = 0.0 (missing)
144+
]))
145+
assert result["status"] is None
146+
assert result["score"] == pytest.approx(0.5)
147+
148+
def test_uses_issues_key_in_details(self):
149+
result = _run(_make_input([
150+
_inv("inv-1", {"duration_s": 10}),
151+
]))
152+
assert "issues" in result["details"]

0 commit comments

Comments
 (0)