add performance evaluators

krisztianfekete · krisztianfekete · commit fc8d8ce27f67 · 2026-04-14T14:31:58.000+02:00
diff --git a/evaluators/time_efficiency/README.md b/evaluators/time_efficiency/README.md
@@ -0,0 +1,51 @@
+# time_efficiency
+
+Scores how quickly an agent resolved a task relative to a time budget. Catches agents that produce correct answers but take too long for production use.
+
+## How it works
+
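+Reads latency percentiles (reported in milliseconds under `latency.<source>.<percentile>`) from the trace's `performance_metrics`. If that entry is missing, it falls back to a flat `duration_s` (seconds) or `duration_ms` (milliseconds) value. The resulting duration, converted to seconds, is scored against a time budget: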
+
+```
+score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+```
+
+You choose which percentile (`p50`, `p95`, `p99`) and which latency category (`overall`, `llm_calls`, `tool_executions`) to score against. For example, scoring against `p95` of `llm_calls` catches slow LLM responses specifically.
+
+This is a **trace-level** metric. Returns `NOT_EVALUATED` when no latency data is available or when an invalid percentile/source is configured.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_duration_s` | float | 120 | Time budget in seconds |
+| `latency_percentile` | str | `"p50"` | Percentile to score: `"p50"`, `"p95"`, `"p99"` |
+| `latency_source` | str | `"overall"` | Latency category: `"overall"`, `"llm_calls"`, `"tool_executions"` |
+
+## Example
+
+```yaml
+evaluators:
+  - name: time_efficiency
+    type: remote
+    source: github
+    ref: evaluators/time_efficiency/time_efficiency.py
+    threshold: 0.5
+    config:
+      max_duration_s: 60
+      latency_percentile: p95
+      latency_source: overall
+```
+
+## Output details
+
+```json
+{
+  "duration_s": 4.164,
+  "max_duration_s": 60,
+  "utilization": "6.9%",
+  "source": "latency.overall.p95"
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: time_efficiency
+description: Scores how quickly the agent resolved relative to a time budget
+language: python
+entrypoint: time_efficiency.py
+tags: [performance, time, latency, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py
@@ -0,0 +1,131 @@
+"""Community evaluator: time_efficiency
+
+Scores resolution time relative to a budget.  Reads latency from
+the trace's performance_metrics.
+
+Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+
+Returns NOT_EVALUATED when no latency data is available.
+
+Config options:
+  max_duration_s       (float): Time budget in seconds (default: 120)
+  latency_percentile   (str):   Which percentile to score against:
+                                "p50" (default), "p95", "p99"
+  latency_source       (str):   Latency category:
+                                "overall" (default), "llm_calls", "tool_executions"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+# Percentile keys accepted for the ``latency_percentile`` config option.
+_VALID_PERCENTILES = ("p50", "p95", "p99")
+# Latency categories accepted for the ``latency_source`` config option; the
+# evaluator also iterates over these to build its per-category breakdown.
+_VALID_SOURCES = ("overall", "llm_calls", "tool_executions")
+
+
+def _extract_duration_s(perf: dict, percentile: str, source: str) -> tuple[float | None, str]:
+    """Extract duration in seconds from a performance_metrics dict.
+
+    Returns (duration_seconds, description_of_source).
+
+    Supports:
+      nested (agentevals): latency.<source>.<percentile> in milliseconds
+      flat:                duration_s (seconds) or duration_ms (milliseconds)
+    """
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        source_block = latency_block.get(source)
+        if isinstance(source_block, dict):
+            ms_value = source_block.get(percentile)
+            if ms_value is not None:
+                return float(ms_value) / 1000.0, f"latency.{source}.{percentile}"
+
+    duration_s = perf.get("duration_s")
+    if duration_s is not None:
+        return float(duration_s), "duration_s"
+
+    duration_ms = perf.get("duration_ms")
+    if duration_ms is not None:
+        return float(duration_ms) / 1000.0, "duration_ms"
+
+    return None, "no latency data found"
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def time_efficiency(input: EvalInput) -> EvalResult:
+    max_duration = input.config.get("max_duration_s", 120.0)
+    percentile = input.config.get("latency_percentile", "p50")
+    source = input.config.get("latency_source", "overall")
+    n = len(input.invocations)
+
+    if percentile not in _VALID_PERCENTILES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_percentile '{percentile}', must be one of {_VALID_PERCENTILES}"},
+        )
+    if source not in _VALID_SOURCES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_source '{source}', must be one of {_VALID_SOURCES}"},
+        )
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    duration_s, source_desc = _extract_duration_s(perf, percentile, source)
+    if duration_s is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": source_desc},
+        )
+
+    score = max(0.0, min(1.0, 1.0 - duration_s / max_duration)) if max_duration > 0 else 1.0
+
+    breakdown = {}
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        for src in _VALID_SOURCES:
+            src_block = latency_block.get(src)
+            if isinstance(src_block, dict):
+                val = src_block.get(percentile)
+                if val is not None:
+                    breakdown[src] = round(float(val) / 1000.0, 3)
+
+    details: dict = {
+        "duration_s": round(duration_s, 3),
+        "max_duration_s": max_duration,
+        "utilization": f"{duration_s / max_duration * 100:.1f}%" if max_duration > 0 else "n/a",
+        "source": source_desc,
+    }
+    if breakdown:
+        details["latency_breakdown_s"] = breakdown
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details=details,
+    )
+
+
+# Script entry point: call the evaluator's run() (presumably attached by the
+# SDK's @evaluator decorator — confirm against the SDK).
+if __name__ == "__main__":
+    time_efficiency.run()
diff --git a/evaluators/token_efficiency/README.md b/evaluators/token_efficiency/README.md
@@ -0,0 +1,57 @@
+# token_efficiency
+
+Scores how efficiently an agent used tokens relative to a budget. Useful for catching runaway token consumption — real benchmarks show 8x variation across agent solutions for the same task.
+
+## How it works
+
+Reads token counts from the trace's `performance_metrics`. Scores input and output tokens separately against their budgets, returns the worst of the two:
+
+```
+input_score  = clamp(1.0 - input_tokens / max_input_tokens,  0, 1)
+output_score = clamp(1.0 - output_tokens / max_output_tokens, 0, 1)
+score = min(input_score, output_score)
+```
+
+A score of 1.0 means zero tokens used; 0.0 means at or over budget. Because the final score is the lower of the two, a threshold of 0.3 means the agent must stay within roughly 70% of both the input and the output budget to pass.
+
+This is a **trace-level** metric — per-invocation scores are not applicable (token counts come from the full trace).
+
+Returns `NOT_EVALUATED` when no token data is available in the trace.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_input_tokens` | int | 150000 | Input (prompt) token budget |
+| `max_output_tokens` | int | 50000 | Output (completion) token budget |
+
+## Example
+
+```yaml
+evaluators:
+  - name: token_efficiency
+    type: remote
+    source: github
+    ref: evaluators/token_efficiency/token_efficiency.py
+    threshold: 0.3
+    config:
+      max_input_tokens: 100000
+      max_output_tokens: 30000
+```
+
+## Output details
+
+```json
+{
+  "input_tokens": 75000,
+  "output_tokens": 10000,
+  "max_input_tokens": 100000,
+  "max_output_tokens": 30000,
+  "input_utilization": "75.0%",
+  "output_utilization": "33.3%",
+  "input_score": 0.25,
+  "output_score": 0.6667
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: token_efficiency
+description: Scores how efficiently the agent used tokens relative to a budget
+language: python
+entrypoint: token_efficiency.py
+tags: [performance, tokens, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py
@@ -0,0 +1,112 @@
+"""Community evaluator: token_efficiency
+
+Scores token usage relative to a budget.  Reads token counts from
+the trace's performance_metrics.
+
+Score formula (per dimension):
+  input_score  = clamp(1.0 - total_prompt / max_input_tokens,  0, 1)
+  output_score = clamp(1.0 - total_output / max_output_tokens, 0, 1)
+  score = min(input_score, output_score)
+
+Returns NOT_EVALUATED when no token data is available.
+
+Config options:
+  max_input_tokens  (int): Input token budget  (default: 150000)
+  max_output_tokens (int): Output token budget  (default: 50000)
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _extract_tokens(perf: dict) -> dict[str, int] | None:
+    """Extract token counts from a performance_metrics dict.
+
+    Supports two layouts:
+      nested (agentevals default): {"tokens": {"total_prompt": N, "total_output": N}}
+      flat   (custom harness):     {"input_tokens": N, "output_tokens": N}
+    """
+    tokens_block = perf.get("tokens")
+    if isinstance(tokens_block, dict):
+        total_prompt = tokens_block.get("total_prompt")
+        total_output = tokens_block.get("total_output")
+        if total_prompt is not None or total_output is not None:
+            return {
+                "input_tokens": int(total_prompt) if total_prompt is not None else 0,
+                "output_tokens": int(total_output) if total_output is not None else 0,
+            }
+
+    input_t = perf.get("input_tokens")
+    if input_t is None:
+        input_t = perf.get("prompt_tokens")
+    output_t = perf.get("output_tokens")
+    if output_t is None:
+        output_t = perf.get("completion_tokens")
+
+    if input_t is not None or output_t is not None:
+        return {
+            "input_tokens": int(input_t) if input_t is not None else 0,
+            "output_tokens": int(output_t) if output_t is not None else 0,
+        }
+
+    return None
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def token_efficiency(input: EvalInput) -> EvalResult:
+    max_input = input.config.get("max_input_tokens", 150_000)
+    max_output = input.config.get("max_output_tokens", 50_000)
+    n = len(input.invocations)
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    tokens = _extract_tokens(perf)
+    if tokens is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no token data in performance_metrics"},
+        )
+
+    input_tokens = tokens["input_tokens"]
+    output_tokens = tokens["output_tokens"]
+
+    input_score = max(0.0, min(1.0, 1.0 - input_tokens / max_input)) if max_input > 0 else 1.0
+    output_score = max(0.0, min(1.0, 1.0 - output_tokens / max_output)) if max_output > 0 else 1.0
+    score = min(input_score, output_score)
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details={
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "max_input_tokens": max_input,
+            "max_output_tokens": max_output,
+            "input_utilization": f"{input_tokens / max_input * 100:.1f}%" if max_input > 0 else "n/a",
+            "output_utilization": f"{output_tokens / max_output * 100:.1f}%" if max_output > 0 else "n/a",
+            "input_score": round(input_score, 4),
+            "output_score": round(output_score, 4),
+        },
+    )
+
+
+# Script entry point: call the evaluator's run() (presumably attached by the
+# SDK's @evaluator decorator — confirm against the SDK).
+if __name__ == "__main__":
+    token_efficiency.run()
diff --git a/evaluators/tool_efficiency/README.md b/evaluators/tool_efficiency/README.md
diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml
diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py