Skip to content

Commit 93171ca

Browse files
authored
fix(core): capture judge/evaluator token usage in reports (#63)
Move `collect_all_usage()` in `Benchmark._execute_task_repetition` to run after `evaluate()`, so token usage from evaluator-owned models (LLM judges) is captured in `report["usage"]` and aggregated into `Benchmark.usage`. Previously the snapshot fired before evaluators ran, so judge tokens showed as zero in per-task reports and were missing from run-level cost totals.
1 parent 1c52f23 commit 93171ca

3 files changed

Lines changed: 165 additions & 8 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2424
- Fixed MACS real-data tests passing `{"environment_data": task.environment_data}` instead of `task.environment_data` directly, which caused `setup_state` to silently receive an empty tools list. (PR: #58)
2525
- Benchmark reports from `Benchmark.run()` now have a consistent schema across every outcome. Setup failures, setup timeouts, and unexpected worker failures in parallel runs previously produced reports missing the `usage` and `task` keys (with empty `traces`/`config`). Every report now always includes `task_id`, `repeat_idx`, `status`, `error`, `traces`, `config`, `usage`, `eval`, and `task`, and `report["error"]` is always populated whenever `status` is not `SUCCESS`. (PR: #61)
2626
- `fail_on_setup_error`, `fail_on_task_error`, and `fail_on_evaluation_error` now abort a parallel `Benchmark.run()` the same way they abort a sequential run. Previously a parallel run swallowed the failure into a degraded report and kept going. (PR: #61)
27+
- Token usage and cost for LLM judges and other evaluator-owned models are now correctly captured in per-task reports (`report["usage"]`) and in `benchmark.usage` / `benchmark.usage_by_component`. Previously these entries showed zero tokens because the usage snapshot was taken before evaluators ran. Affects every benchmark that registers a model in `setup_evaluators` (e.g. ConVerse, MultiAgentBench). (PR: #63)
2728

2829
### Removed
2930

maseval/core/benchmark.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1254,12 +1254,11 @@ def _execute_task_repetition(
12541254

12551255
final_answers = None
12561256

1257-
# 3. Collect traces, configs, and usage (always attempt this)
1257+
# 3. Collect traces and configs (always attempt this)
12581258
execution_usage: Optional[Dict[str, Any]] = None
12591259
try:
12601260
execution_configs = self.collect_all_configs()
12611261
execution_traces = self.collect_all_traces()
1262-
execution_usage = self.collect_all_usage()
12631262
# Store in context for potential timeout errors
12641263
context.set_collected_traces(execution_traces)
12651264
except Exception as e:
@@ -1272,11 +1271,6 @@ def _execute_task_repetition(
12721271
"error": f"Failed to collect traces: {e}",
12731272
"error_type": type(e).__name__,
12741273
}
1275-
if execution_usage is None:
1276-
execution_usage = {
1277-
"error": f"Failed to collect usage: {e}",
1278-
"error_type": type(e).__name__,
1279-
}
12801274

12811275
# 4. Evaluate (skip if task execution failed)
12821276
if execution_status == TaskExecutionStatus.SUCCESS:
@@ -1311,7 +1305,16 @@ def _execute_task_repetition(
13111305
# Task execution failed, so skip evaluation
13121306
eval_results = None
13131307

1314-
# 5. Build report — all keys always present for consistent schema
1308+
# 5. Collect usage after evaluate() so judge/evaluator-owned model tokens are captured.
1309+
try:
1310+
execution_usage = self.collect_all_usage()
1311+
except Exception as e:
1312+
execution_usage = {
1313+
"error": f"Failed to collect usage: {e}",
1314+
"error_type": type(e).__name__,
1315+
}
1316+
1317+
# 6. Build report — all keys always present for consistent schema
13151318
report = self._build_report(
13161319
task,
13171320
repeat_idx,

tests/test_core/test_benchmark/test_usage_collection.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import pytest
88
from maseval import TaskQueue
9+
from maseval.core.exceptions import AgentError
10+
from conftest import DummyBenchmark, DummyModelAdapter
911

1012

1113
@pytest.mark.core
@@ -96,3 +98,154 @@ def test_usage_property_returns_total(self):
9698
total = benchmark.usage
9799
assert total is not None
98100
# cost may be None if DummyModelAdapter doesn't provide usage
101+
102+
103+
# ---------------------------------------------------------------------------
104+
# Regression tests for issue #60: judge (evaluator) token usage collection
105+
# ---------------------------------------------------------------------------
106+
107+
108+
class _JudgeEvaluator:
109+
"""Minimal evaluator that invokes a model adapter at evaluate-time.
110+
111+
Not an ``Evaluator`` subclass — it implements the duck-typed
112+
interface (``filter_traces`` + ``__call__``) that ``DummyBenchmark.evaluate``
113+
expects of each evaluator it iterates over (see ``tests/conftest.py``).
114+
"""
115+
116+
def __init__(self, model):
117+
self.model = model
118+
119+
def filter_traces(self, traces):
120+
return traces
121+
122+
def __call__(self, traces, final_answer=None):
123+
# Invokes the judge at evaluate-time (not at setup).
124+
self.model.chat([{"role": "user", "content": "judge this"}])
125+
return {"score": 1.0, "passed": True}
126+
127+
128+
def _make_judge_benchmark(judge_usage):
129+
"""Build a JudgeBenchmark whose setup_evaluators registers a judge model.
130+
131+
The judge model is created with the provided per-call usage dict. Each
132+
call to the model appends one usage record, so a single evaluator
133+
invocation produces exactly one record's worth of tokens.
134+
"""
135+
136+
class JudgeBenchmark(DummyBenchmark):
137+
def setup_evaluators(self, environment, task, agents, user, seed_generator):
138+
judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage)
139+
self.register("models", "judge_model", judge_model)
140+
return [_JudgeEvaluator(model=judge_model)]
141+
142+
return JudgeBenchmark()
143+
144+
145+
@pytest.mark.core
146+
class TestBenchmarkJudgeUsage:
147+
"""Tests that judge token usage reaches both per-task reports and ``benchmark.usage``."""
148+
149+
def test_judge_model_usage_captured_in_report(self):
150+
"""A judge model invoked during evaluate() has non-zero usage in
151+
report['usage']['models']['judge_model']."""
152+
judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
153+
tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
154+
benchmark = _make_judge_benchmark(judge_usage)
155+
156+
reports = benchmark.run(tasks, agent_data={"model": "test"})
157+
158+
models = reports[0]["usage"]["models"]
159+
assert "judge_model" in models, f"judge_model not registered; got: {list(models)}"
160+
judge_entry = models["judge_model"]
161+
assert judge_entry["input_tokens"] == 100
162+
assert judge_entry["output_tokens"] == 50
163+
assert judge_entry["total_tokens"] == 150
164+
165+
def test_judge_model_usage_aggregated_in_benchmark_total(self):
166+
"""``benchmark.usage`` includes judge tokens, and
167+
``benchmark.usage_by_component`` has a non-zero ``models:judge_model``
168+
entry."""
169+
judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
170+
tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
171+
benchmark = _make_judge_benchmark(judge_usage)
172+
173+
benchmark.run(tasks, agent_data={"model": "test"})
174+
175+
assert benchmark.usage.input_tokens >= 100
176+
assert benchmark.usage.output_tokens >= 50
177+
178+
by_component = benchmark.usage_by_component
179+
assert "models:judge_model" in by_component, f"keys: {list(by_component)}"
180+
assert by_component["models:judge_model"].input_tokens == 100
181+
assert by_component["models:judge_model"].output_tokens == 50
182+
183+
def test_agent_usage_captured_when_evaluation_raises(self):
184+
"""When evaluate() raises and fail_on_evaluation_error=False, the
185+
report still carries a real usage dict (usage collection runs after evaluate())."""
186+
187+
class RaisingEvaluator:
188+
def __init__(self, task, environment, user):
189+
_ = task, environment, user
190+
191+
def filter_traces(self, traces):
192+
return traces
193+
194+
def __call__(self, traces, final_answer=None):
195+
raise RuntimeError("boom — simulated evaluator failure")
196+
197+
class RaisingJudgeBenchmark(DummyBenchmark):
198+
def setup_evaluators(self, environment, task, agents, user, seed_generator):
199+
# Register an agent-side model with usage so we can assert
200+
# that pre-evaluate usage still survives the eval failure.
201+
agent_model = DummyModelAdapter(
202+
model_id="agent_model",
203+
usage={"input_tokens": 42, "output_tokens": 7, "total_tokens": 49},
204+
)
205+
self.register("models", "agent_model", agent_model)
206+
# Drive the model once so it has a usage record.
207+
agent_model.chat([{"role": "user", "content": "hi"}])
208+
return [RaisingEvaluator(task, environment, user)]
209+
210+
tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
211+
benchmark = RaisingJudgeBenchmark(fail_on_evaluation_error=False)
212+
213+
reports = benchmark.run(tasks, agent_data={"model": "test"})
214+
215+
report = reports[0]
216+
assert report["status"] == "evaluation_failed"
217+
usage = report["usage"]
218+
assert "error" not in usage, f"usage became an error dict: {usage}"
219+
assert usage["models"]["agent_model"]["input_tokens"] == 42
220+
assert usage["models"]["agent_model"]["output_tokens"] == 7
221+
222+
def test_agent_usage_captured_when_execution_raises(self):
223+
"""When run_agents raises (execution failure) and fail_on_task_error
224+
is False, the report still carries a real usage dict. Evaluate is
225+
skipped, but usage collection (step 5) still runs."""
226+
227+
class FailingAgentBenchmark(DummyBenchmark):
228+
def setup_agents(self, agent_data, environment, task, user, seed_generator):
229+
agent_model = DummyModelAdapter(
230+
model_id="agent_model",
231+
usage={"input_tokens": 11, "output_tokens": 3, "total_tokens": 14},
232+
)
233+
self.register("models", "agent_model", agent_model)
234+
# Drive the model once so it has a usage record.
235+
agent_model.chat([{"role": "user", "content": "hi"}])
236+
return super().setup_agents(agent_data, environment, task, user, seed_generator)
237+
238+
def run_agents(self, agents, task, environment, query):
239+
raise AgentError("simulated agent failure", component="agent")
240+
241+
tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
242+
benchmark = FailingAgentBenchmark(fail_on_task_error=False)
243+
244+
reports = benchmark.run(tasks, agent_data={"model": "test"})
245+
246+
report = reports[0]
247+
assert report["status"] == "agent_error"
248+
usage = report["usage"]
249+
assert "error" not in usage, f"usage became an error dict: {usage}"
250+
assert usage["models"]["agent_model"]["input_tokens"] == 11
251+
assert usage["models"]["agent_model"]["output_tokens"] == 3

0 commit comments

Comments
 (0)