|
6 | 6 |
|
7 | 7 | import pytest |
8 | 8 | from maseval import TaskQueue |
| 9 | +from maseval.core.exceptions import AgentError |
| 10 | +from conftest import DummyBenchmark, DummyModelAdapter |
9 | 11 |
|
10 | 12 |
|
11 | 13 | @pytest.mark.core |
@@ -96,3 +98,154 @@ def test_usage_property_returns_total(self): |
96 | 98 | total = benchmark.usage |
97 | 99 | assert total is not None |
98 | 100 | # cost may be None if DummyModelAdapter doesn't provide usage |
| 101 | + |
| 102 | + |
| 103 | +# --------------------------------------------------------------------------- |
| 104 | +# Regression tests for issue #60: judge (evaluator) token usage collection |
| 105 | +# --------------------------------------------------------------------------- |
| 106 | + |
| 107 | + |
class _JudgeEvaluator:
    """Minimal evaluator that invokes a model adapter at evaluate-time.

    Not an ``Evaluator`` subclass — it implements the duck-typed
    interface (``filter_traces`` + ``__call__``) that ``DummyBenchmark.evaluate``
    iterates over (see ``tests/conftest.py``).
    """

    def __init__(self, model):
        """Store the judge ``model``; it only needs a ``chat(messages)`` method."""
        self.model = model

    def filter_traces(self, traces):
        """Return ``traces`` unchanged (no filtering is applied)."""
        return traces

    def __call__(self, traces, final_answer=None):
        """Call the judge model once and return a fixed passing result.

        ``traces`` and ``final_answer`` are accepted for interface
        compatibility but are not inspected.
        """
        # Invokes the judge at evaluate-time (not at setup).
        self.model.chat([{"role": "user", "content": "judge this"}])
        return {"score": 1.0, "passed": True}
| 126 | + |
| 127 | + |
def _make_judge_benchmark(judge_usage):
    """Build a JudgeBenchmark whose setup_evaluators registers a judge model.

    The judge model is created with the provided per-call usage dict. Each
    call to the model appends one usage record, so a single evaluator
    invocation produces exactly one record's worth of tokens.

    Args:
        judge_usage: Usage dict handed to ``DummyModelAdapter`` as ``usage``
            (the tests in this module pass ``input_tokens``,
            ``output_tokens`` and ``total_tokens``).

    Returns:
        A fresh ``JudgeBenchmark`` instance constructed with no arguments.
    """

    class JudgeBenchmark(DummyBenchmark):
        def setup_evaluators(self, environment, task, agents, user, seed_generator):
            # The model is created and registered here, but it is only
            # *called* later, when the returned evaluator is invoked.
            judge_model = DummyModelAdapter(model_id="judge", usage=judge_usage)
            self.register("models", "judge_model", judge_model)
            return [_JudgeEvaluator(model=judge_model)]

    return JudgeBenchmark()
| 143 | + |
| 144 | + |
@pytest.mark.core
class TestBenchmarkJudgeUsage:
    """Tests that judge token usage reaches both per-task reports and ``benchmark.usage``."""

    def test_judge_model_usage_captured_in_report(self):
        """A judge model invoked during evaluate() has non-zero usage in
        report['usage']['models']['judge_model']."""
        judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
        benchmark = _make_judge_benchmark(judge_usage)

        reports = benchmark.run(tasks, agent_data={"model": "test"})

        models = reports[0]["usage"]["models"]
        assert "judge_model" in models, f"judge_model not registered; got: {list(models)}"
        # The judge is called exactly once, so the entry equals one usage record.
        judge_entry = models["judge_model"]
        assert judge_entry["input_tokens"] == 100
        assert judge_entry["output_tokens"] == 50
        assert judge_entry["total_tokens"] == 150

    def test_judge_model_usage_aggregated_in_benchmark_total(self):
        """``benchmark.usage`` includes judge tokens, and
        ``benchmark.usage_by_component`` has a non-zero ``models:judge_model``
        entry."""
        judge_usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150}
        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
        benchmark = _make_judge_benchmark(judge_usage)

        benchmark.run(tasks, agent_data={"model": "test"})

        # Lower bounds only: the grand total may also contain usage from
        # components other than the judge.
        assert benchmark.usage.input_tokens >= 100
        assert benchmark.usage.output_tokens >= 50

        # The per-component entry is judge-only, so it is checked exactly.
        by_component = benchmark.usage_by_component
        assert "models:judge_model" in by_component, f"keys: {list(by_component)}"
        assert by_component["models:judge_model"].input_tokens == 100
        assert by_component["models:judge_model"].output_tokens == 50

    def test_agent_usage_captured_when_evaluation_raises(self):
        """When evaluate() raises and fail_on_evaluation_error=False, the
        report still carries a real usage dict (step 5 runs after step 4)."""

        class RaisingEvaluator:
            def __init__(self, task, environment, user):
                # Arguments are accepted but intentionally unused.
                _ = task, environment, user

            def filter_traces(self, traces):
                return traces

            def __call__(self, traces, final_answer=None):
                raise RuntimeError("boom — simulated evaluator failure")

        class RaisingJudgeBenchmark(DummyBenchmark):
            def setup_evaluators(self, environment, task, agents, user, seed_generator):
                # Register an agent-side model with usage so we can assert
                # that pre-evaluate usage still survives the eval failure.
                agent_model = DummyModelAdapter(
                    model_id="agent_model",
                    usage={"input_tokens": 42, "output_tokens": 7, "total_tokens": 49},
                )
                self.register("models", "agent_model", agent_model)
                # Drive the model once so it has a usage record.
                agent_model.chat([{"role": "user", "content": "hi"}])
                return [RaisingEvaluator(task, environment, user)]

        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
        benchmark = RaisingJudgeBenchmark(fail_on_evaluation_error=False)

        reports = benchmark.run(tasks, agent_data={"model": "test"})

        report = reports[0]
        assert report["status"] == "evaluation_failed"
        usage = report["usage"]
        # Guard against usage collection being replaced by an error payload.
        assert "error" not in usage, f"usage became an error dict: {usage}"
        assert usage["models"]["agent_model"]["input_tokens"] == 42
        assert usage["models"]["agent_model"]["output_tokens"] == 7

    def test_agent_usage_captured_when_execution_raises(self):
        """When run_agents raises (execution failure) and fail_on_task_error
        is False, the report still carries a real usage dict. Evaluate is
        skipped, but step 5 still runs."""

        class FailingAgentBenchmark(DummyBenchmark):
            def setup_agents(self, agent_data, environment, task, user, seed_generator):
                agent_model = DummyModelAdapter(
                    model_id="agent_model",
                    usage={"input_tokens": 11, "output_tokens": 3, "total_tokens": 14},
                )
                self.register("models", "agent_model", agent_model)
                # Drive the model once so it has a usage record.
                agent_model.chat([{"role": "user", "content": "hi"}])
                return super().setup_agents(agent_data, environment, task, user, seed_generator)

            def run_agents(self, agents, task, environment, query):
                # Fail after setup so the usage recorded above already exists.
                raise AgentError("simulated agent failure", component="agent")

        tasks = TaskQueue.from_list([{"query": "Test", "environment_data": {}}])
        benchmark = FailingAgentBenchmark(fail_on_task_error=False)

        reports = benchmark.run(tasks, agent_data={"model": "test"})

        report = reports[0]
        assert report["status"] == "agent_error"
        usage = report["usage"]
        # Guard against usage collection being replaced by an error payload.
        assert "error" not in usage, f"usage became an error dict: {usage}"
        assert usage["models"]["agent_model"]["input_tokens"] == 11
        assert usage["models"]["agent_model"]["output_tokens"] == 3
0 commit comments