Skip to content

Commit fc8d8ce

Browse files
add performance evaluators
1 parent ba94de6 commit fc8d8ce

9 files changed

Lines changed: 570 additions & 0 deletions

File tree

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# time_efficiency
2+
3+
Scores how quickly an agent resolved a task relative to a time budget. Catches agents that produce correct answers but take too long for production use.
4+
5+
## How it works
6+
7+
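Reads latency percentiles from the trace's `performance_metrics` (`latency.<source>.<percentile>`, reported in milliseconds) and converts them to seconds. If that nested value is absent, a flat `duration_s` or `duration_ms` field is used instead. The result is scored against a time budget in seconds: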
8+
9+
```
10+
score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
11+
```
12+
13+
You choose which percentile (`p50`, `p95`, `p99`) and which latency category (`overall`, `llm_calls`, `tool_executions`) to score against. For example, scoring against `p95` of `llm_calls` catches slow LLM responses specifically.
14+
15+
This is a **trace-level** metric. Returns `NOT_EVALUATED` when no latency data is available or when an invalid percentile/source is configured.
16+
17+
## Config
18+
19+
| Option | Type | Default | Description |
20+
|---|---|---|---|
21+
| `max_duration_s` | float | 120 | Time budget in seconds |
22+
| `latency_percentile` | str | `"p50"` | Percentile to score: `"p50"`, `"p95"`, `"p99"` |
23+
| `latency_source` | str | `"overall"` | Latency category: `"overall"`, `"llm_calls"`, `"tool_executions"` |
24+
25+
## Example
26+
27+
```yaml
28+
evaluators:
29+
- name: time_efficiency
30+
type: remote
31+
source: github
32+
ref: evaluators/time_efficiency/time_efficiency.py
33+
threshold: 0.5
34+
config:
35+
max_duration_s: 60
36+
latency_percentile: p95
37+
latency_source: overall
38+
```
39+
40+
## Output details
41+
42+
```json
43+
{
44+
"duration_s": 4.164,
45+
"max_duration_s": 60,
46+
"utilization": "6.9%",
47+
"source": "latency.overall.p95"
48+
}
49+
```
50+
51+
Requires `agentevals-evaluator-sdk >= 0.1.1`.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: time_efficiency
2+
description: Scores how quickly the agent resolved relative to a time budget
3+
language: python
4+
entrypoint: time_efficiency.py
5+
tags: [performance, time, latency, efficiency, budget]
6+
author: agentevals-dev
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
"""Community evaluator: time_efficiency
2+
3+
Scores resolution time relative to a budget. Reads latency from
4+
the trace's performance_metrics.
5+
6+
Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
7+
8+
Returns NOT_EVALUATED when no latency data is available.
9+
10+
Config options:
11+
max_duration_s (float): Time budget in seconds (default: 120)
12+
latency_percentile (str): Which percentile to score against:
13+
"p50" (default), "p95", "p99"
14+
latency_source (str): Latency category:
15+
"overall" (default), "llm_calls", "tool_executions"
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
21+
22+
# Values accepted for the `latency_percentile` config option.
_VALID_PERCENTILES = ("p50", "p95", "p99")
# Values accepted for the `latency_source` config option; also the categories
# reported in the `latency_breakdown_s` detail.
_VALID_SOURCES = ("overall", "llm_calls", "tool_executions")
24+
25+
26+
def _extract_duration_s(perf: dict, percentile: str, source: str) -> tuple[float | None, str]:
27+
"""Extract duration in seconds from a performance_metrics dict.
28+
29+
Returns (duration_seconds, description_of_source).
30+
31+
Supports:
32+
nested (agentevals): latency.<source>.<percentile> in milliseconds
33+
flat: duration_s (seconds) or duration_ms (milliseconds)
34+
"""
35+
latency_block = perf.get("latency")
36+
if isinstance(latency_block, dict):
37+
source_block = latency_block.get(source)
38+
if isinstance(source_block, dict):
39+
ms_value = source_block.get(percentile)
40+
if ms_value is not None:
41+
return float(ms_value) / 1000.0, f"latency.{source}.{percentile}"
42+
43+
duration_s = perf.get("duration_s")
44+
if duration_s is not None:
45+
return float(duration_s), "duration_s"
46+
47+
duration_ms = perf.get("duration_ms")
48+
if duration_ms is not None:
49+
return float(duration_ms) / 1000.0, "duration_ms"
50+
51+
return None, "no latency data found"
52+
53+
54+
def _get_perf(input: EvalInput) -> dict | None:
55+
"""Return the first non-None performance_metrics from any invocation."""
56+
for inv in input.invocations:
57+
if isinstance(inv.performance_metrics, dict):
58+
return inv.performance_metrics
59+
return None
60+
61+
62+
@evaluator
def time_efficiency(input: EvalInput) -> EvalResult:
    """Score the trace's latency against a time budget.

    Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1). A
    non-positive budget disables scoring: the score is 1.0 and utilization
    is reported as "n/a".

    Config (read from ``input.config``):
        max_duration_s: Time budget in seconds (default 120.0).
        latency_percentile: One of ``_VALID_PERCENTILES`` (default "p50").
        latency_source: One of ``_VALID_SOURCES`` (default "overall").

    Returns:
        An EvalResult whose ``per_invocation_scores`` is all ``None`` (the
        score applies to the whole trace). The status is NOT_EVALUATED, with
        a ``reason`` in ``details``, when the config is invalid or no
        latency data can be found. A null or non-numeric ``max_duration_s``
        is reported this way too, instead of raising ``TypeError``.
    """
    n = len(input.invocations)

    def _not_evaluated(reason: str) -> EvalResult:
        # Single place that builds the "could not score" result.
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": reason},
        )

    max_duration = input.config.get("max_duration_s", 120.0)
    percentile = input.config.get("latency_percentile", "p50")
    source = input.config.get("latency_source", "overall")

    if percentile not in _VALID_PERCENTILES:
        return _not_evaluated(
            f"invalid latency_percentile '{percentile}', must be one of {_VALID_PERCENTILES}"
        )
    if source not in _VALID_SOURCES:
        return _not_evaluated(
            f"invalid latency_source '{source}', must be one of {_VALID_SOURCES}"
        )

    # The budget comes from user config and may be null or a non-numeric
    # string; comparing that with 0 below would raise TypeError.
    try:
        budget_s = float(max_duration)
    except (TypeError, ValueError):
        budget_s = float("nan")
    if budget_s != budget_s:  # NaN: unparseable, or explicitly not-a-number
        return _not_evaluated(f"invalid max_duration_s {max_duration!r}, must be a number")

    perf = _get_perf(input)
    if perf is None:
        return _not_evaluated("no performance_metrics available")

    duration_s, source_desc = _extract_duration_s(perf, percentile, source)
    if duration_s is None:
        return _not_evaluated(source_desc)

    score = max(0.0, min(1.0, 1.0 - duration_s / budget_s)) if budget_s > 0 else 1.0

    # Informational: the chosen percentile for every latency category present.
    breakdown = {}
    latency_block = perf.get("latency")
    if isinstance(latency_block, dict):
        for src in _VALID_SOURCES:
            src_block = latency_block.get(src)
            if not isinstance(src_block, dict):
                continue
            val = src_block.get(percentile)
            if val is None:
                continue
            try:
                breakdown[src] = round(float(val) / 1000.0, 3)
            except (TypeError, ValueError):
                # A malformed sibling value must not fail the evaluation.
                continue

    details: dict = {
        "duration_s": round(duration_s, 3),
        # Echo the budget exactly as configured.
        "max_duration_s": max_duration,
        "utilization": f"{duration_s / budget_s * 100:.1f}%" if budget_s > 0 else "n/a",
        "source": source_desc,
    }
    if breakdown:
        details["latency_breakdown_s"] = breakdown

    return EvalResult(
        score=score,
        per_invocation_scores=[None] * n,
        details=details,
    )
128+
129+
130+
# Script entry point: when this file is executed directly, hand control to the
# decorated evaluator's run() method.
# NOTE(review): run() is presumably attached by the SDK's @evaluator decorator — confirm.
if __name__ == "__main__":
    time_efficiency.run()
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# token_efficiency
2+
3+
Scores how efficiently an agent used tokens relative to a budget. Useful for catching runaway token consumption — real benchmarks show 8x variation across agent solutions for the same task.
4+
5+
## How it works
6+
7+
Reads token counts from the trace's `performance_metrics`. Scores input and output tokens separately against their budgets and returns the worse (lower) of the two scores:
8+
9+
```
10+
input_score = clamp(1.0 - input_tokens / max_input_tokens, 0, 1)
11+
output_score = clamp(1.0 - output_tokens / max_output_tokens, 0, 1)
12+
score = min(input_score, output_score)
13+
```
14+
15+
A score of 1.0 means zero tokens used; 0.0 means at or over budget. Because the final score is the lower of the two, with a threshold of 0.3 the agent must use less than 70% of both the input and the output budget to pass.
16+
17+
This is a **trace-level** metric — per-invocation scores are not applicable (token counts come from the full trace).
18+
19+
Returns `NOT_EVALUATED` when no token data is available in the trace.
20+
21+
## Config
22+
23+
| Option | Type | Default | Description |
24+
|---|---|---|---|
25+
| `max_input_tokens` | int | 150000 | Input (prompt) token budget |
26+
| `max_output_tokens` | int | 50000 | Output (completion) token budget |
27+
28+
## Example
29+
30+
```yaml
31+
evaluators:
32+
- name: token_efficiency
33+
type: remote
34+
source: github
35+
ref: evaluators/token_efficiency/token_efficiency.py
36+
threshold: 0.3
37+
config:
38+
max_input_tokens: 100000
39+
max_output_tokens: 30000
40+
```
41+
42+
## Output details
43+
44+
```json
45+
{
46+
"input_tokens": 75000,
47+
"output_tokens": 10000,
48+
"max_input_tokens": 100000,
49+
"max_output_tokens": 30000,
50+
"input_utilization": "75.0%",
51+
"output_utilization": "33.3%",
52+
"input_score": 0.25,
53+
"output_score": 0.6667
54+
}
55+
```
56+
57+
Requires `agentevals-evaluator-sdk >= 0.1.1`.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
name: token_efficiency
2+
description: Scores how efficiently the agent used tokens relative to a budget
3+
language: python
4+
entrypoint: token_efficiency.py
5+
tags: [performance, tokens, efficiency, budget]
6+
author: agentevals-dev
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""Community evaluator: token_efficiency
2+
3+
Scores token usage relative to a budget. Reads token counts from
4+
the trace's performance_metrics.
5+
6+
Score formula (per dimension):
7+
input_score = clamp(1.0 - total_prompt / max_input_tokens, 0, 1)
8+
output_score = clamp(1.0 - total_output / max_output_tokens, 0, 1)
9+
score = min(input_score, output_score)
10+
11+
Returns NOT_EVALUATED when no token data is available.
12+
13+
Config options:
14+
max_input_tokens (int): Input token budget (default: 150000)
15+
max_output_tokens (int): Output token budget (default: 50000)
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
21+
22+
23+
def _extract_tokens(perf: dict) -> dict[str, int] | None:
24+
"""Extract token counts from a performance_metrics dict.
25+
26+
Supports two layouts:
27+
nested (agentevals default): {"tokens": {"total_prompt": N, "total_output": N}}
28+
flat (custom harness): {"input_tokens": N, "output_tokens": N}
29+
"""
30+
tokens_block = perf.get("tokens")
31+
if isinstance(tokens_block, dict):
32+
total_prompt = tokens_block.get("total_prompt")
33+
total_output = tokens_block.get("total_output")
34+
if total_prompt is not None or total_output is not None:
35+
return {
36+
"input_tokens": int(total_prompt) if total_prompt is not None else 0,
37+
"output_tokens": int(total_output) if total_output is not None else 0,
38+
}
39+
40+
input_t = perf.get("input_tokens")
41+
if input_t is None:
42+
input_t = perf.get("prompt_tokens")
43+
output_t = perf.get("output_tokens")
44+
if output_t is None:
45+
output_t = perf.get("completion_tokens")
46+
47+
if input_t is not None or output_t is not None:
48+
return {
49+
"input_tokens": int(input_t) if input_t is not None else 0,
50+
"output_tokens": int(output_t) if output_t is not None else 0,
51+
}
52+
53+
return None
54+
55+
56+
def _get_perf(input: EvalInput) -> dict | None:
57+
"""Return the first non-None performance_metrics from any invocation."""
58+
for inv in input.invocations:
59+
if isinstance(inv.performance_metrics, dict):
60+
return inv.performance_metrics
61+
return None
62+
63+
64+
@evaluator
def token_efficiency(input: EvalInput) -> EvalResult:
    """Score the trace's token usage against input and output budgets.

    Per dimension, score = clamp(1.0 - tokens / budget, 0, 1); the final
    score is the lower of the input and output scores. A non-positive budget
    disables that dimension: its score is 1.0 and its utilization is "n/a".

    Config (read from ``input.config``):
        max_input_tokens: Input token budget (default 150000).
        max_output_tokens: Output token budget (default 50000).

    Returns:
        An EvalResult whose ``per_invocation_scores`` is all ``None`` (the
        score applies to the whole trace). The status is NOT_EVALUATED, with
        a ``reason`` in ``details``, when a budget is not a number or no
        token data can be found. A null or non-numeric budget is reported
        this way instead of raising ``TypeError``.
    """
    n = len(input.invocations)

    def _not_evaluated(reason: str) -> EvalResult:
        # Single place that builds the "could not score" result.
        return EvalResult(
            score=0.0,
            status=EvalStatus.NOT_EVALUATED,
            per_invocation_scores=[None] * n,
            details={"reason": reason},
        )

    def _as_budget(value: object) -> float | None:
        # Budgets come from user config and may be null or non-numeric.
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if number != number else number  # reject NaN

    max_input = input.config.get("max_input_tokens", 150_000)
    max_output = input.config.get("max_output_tokens", 50_000)

    input_budget = _as_budget(max_input)
    if input_budget is None:
        return _not_evaluated(f"invalid max_input_tokens {max_input!r}, must be a number")
    output_budget = _as_budget(max_output)
    if output_budget is None:
        return _not_evaluated(f"invalid max_output_tokens {max_output!r}, must be a number")

    perf = _get_perf(input)
    if perf is None:
        return _not_evaluated("no performance_metrics available")

    tokens = _extract_tokens(perf)
    if tokens is None:
        return _not_evaluated("no token data in performance_metrics")

    input_tokens = tokens["input_tokens"]
    output_tokens = tokens["output_tokens"]

    input_score = max(0.0, min(1.0, 1.0 - input_tokens / input_budget)) if input_budget > 0 else 1.0
    output_score = max(0.0, min(1.0, 1.0 - output_tokens / output_budget)) if output_budget > 0 else 1.0
    # The worse dimension decides the overall score.
    score = min(input_score, output_score)

    return EvalResult(
        score=score,
        per_invocation_scores=[None] * n,
        details={
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            # Echo the budgets exactly as configured.
            "max_input_tokens": max_input,
            "max_output_tokens": max_output,
            "input_utilization": f"{input_tokens / input_budget * 100:.1f}%" if input_budget > 0 else "n/a",
            "output_utilization": f"{output_tokens / output_budget * 100:.1f}%" if output_budget > 0 else "n/a",
            "input_score": round(input_score, 4),
            "output_score": round(output_score, 4),
        },
    )
109+
110+
111+
# Script entry point: when this file is executed directly, hand control to the
# decorated evaluator's run() method.
# NOTE(review): run() is presumably attached by the SDK's @evaluator decorator — confirm.
if __name__ == "__main__":
    token_efficiency.run()

0 commit comments

Comments
 (0)