Skip to content

Commit 60f205a

Browse files
committed
upgraded testing
1 parent 13067b9 commit 60f205a

3 files changed

Lines changed: 331 additions & 2 deletions

File tree

maseval/core/registry.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,9 +319,14 @@ def collect_usage(self) -> Dict[str, Any]:
319319
usage[category] = {}
320320
usage[category][comp_name] = usage_dict
321321

322-
# Accumulate into persistent aggregates (thread-safe)
322+
# Accumulate into persistent aggregates (thread-safe).
323+
# _usage_total starts as Usage(cost=None); adding to it would
324+
# poison the cost (None + X = None). Assign directly on first use.
323325
with self._usage_lock:
324-
self._usage_total = self._usage_total + component_usage
326+
if self._usage_total.cost is None and not self._usage_total.units:
327+
self._usage_total = component_usage
328+
else:
329+
self._usage_total = self._usage_total + component_usage
325330
if key in self._usage_by_component:
326331
self._usage_by_component[key] = self._usage_by_component[key] + component_usage
327332
else:
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""Test usage collection through the benchmark execution loop.
2+
3+
These tests verify that benchmark.run() collects usage from registered
4+
model adapters and includes it in report dicts.
5+
"""
6+
7+
import pytest
8+
from maseval import TaskQueue
9+
from maseval.core.usage import StaticPricingCalculator
10+
11+
12+
@pytest.mark.core
class TestBenchmarkUsageCollection:
    """Tests for usage collection during benchmark execution.

    Each test runs a ``DummyBenchmark`` (imported from ``conftest``) through
    ``benchmark.run()`` and then inspects the returned report dicts or the
    ``benchmark.usage`` property.
    """

    @staticmethod
    def _run(benchmark, *queries):
        """Run ``benchmark`` over one empty-environment task per query.

        Args:
            benchmark: Benchmark instance exposing ``run(tasks, agent_data=...)``.
            *queries: Query strings; one task is created for each.

        Returns:
            Whatever ``benchmark.run()`` returns (indexed as a sequence of
            report dicts by the tests below).
        """
        tasks = TaskQueue.from_list([{"query": query, "environment_data": {}} for query in queries])
        # A fresh agent_data dict per call so runs never share mutable state.
        return benchmark.run(tasks, agent_data={"model": "test"})

    def test_usage_in_report(self):
        """Benchmark run includes a 'usage' key in each report."""
        from conftest import DummyBenchmark

        reports = self._run(DummyBenchmark(), "Test")

        assert "usage" in reports[0]
        usage = reports[0]["usage"]
        for expected_key in ("metadata", "models", "agents"):
            assert expected_key in usage

    def test_usage_has_correct_structure(self):
        """Usage metadata carries 'total_components' and 'timestamp'."""
        from conftest import DummyBenchmark

        reports = self._run(DummyBenchmark(), "Test")

        usage = reports[0]["usage"]
        # Checked first so a missing key fails as an assertion, not a KeyError.
        assert "metadata" in usage
        metadata = usage["metadata"]
        assert "total_components" in metadata
        assert "timestamp" in metadata

    def test_model_with_usage_appears_in_report(self):
        """A benchmark whose model adapter reports usage still yields a 'usage' entry.

        NOTE: this only verifies that the usage structure exists. It does not
        assert the adapter's token counts, because DummyBenchmark doesn't
        register the model via register(); the counts would only show up if
        the benchmark hooked the adapter up itself.
        """
        from conftest import DummyModelAdapter, DummyBenchmark

        class UsageBenchmark(DummyBenchmark):
            def get_model_adapter(self, model_id, **kwargs):
                return DummyModelAdapter(
                    model_id=model_id,
                    usage={
                        "input_tokens": 100,
                        "output_tokens": 50,
                        "total_tokens": 150,
                    },
                )

        reports = self._run(UsageBenchmark(), "Test")

        assert "usage" in reports[0]

    def test_usage_persists_across_task_repetitions(self):
        """Every task in a multi-task run produces a report containing 'usage'."""
        from conftest import DummyBenchmark

        benchmark = DummyBenchmark()
        self._run(benchmark, "Task 1", "Task 2")

        # Both tasks should have produced reports with usage
        assert len(benchmark.reports) == 2
        for report in benchmark.reports:
            assert "usage" in report

    def test_usage_property_returns_total(self):
        """benchmark.usage is available (not None) after a run."""
        from conftest import DummyBenchmark

        benchmark = DummyBenchmark()
        self._run(benchmark, "Test")

        # usage property should return a Usage object (even if empty);
        # its cost may be None if DummyModelAdapter doesn't provide usage,
        # so only presence is asserted here.
        total = benchmark.usage
        assert total is not None

tests/test_core/test_registry.py

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,230 @@ def worker(worker_id: int):
256256
for worker_id, traces in results.items():
257257
assert f"agent_{worker_id}" in traces["agents"]
258258
assert len(traces["agents"]) == 1
259+
260+
261+
# ==================== Usage Tracking Tests ====================
262+
263+
264+
class MockUsageComponent(TraceableMixin):
    """Traceable test component that exposes ``gather_usage`` by duck typing only.

    NOTE: this class inherits ``TraceableMixin`` alone; it does NOT inherit
    ``UsageTrackableMixin``. Use ``UsageAwareComponent`` when a test needs a
    component that carries the usage mixin.

    Args:
        name: Identifier echoed back by ``gather_traces``.
        cost: Cost reported by ``gather_usage``.
        input_tokens: Input token count reported by ``gather_usage``.
        output_tokens: Output token count reported by ``gather_usage``.
    """

    def __init__(self, name: str, cost: float = 0.0, input_tokens: int = 0, output_tokens: int = 0):
        super().__init__()
        self._name = name
        self._cost = cost
        self._input_tokens = input_tokens
        self._output_tokens = output_tokens

    def gather_traces(self) -> Dict[str, Any]:
        """Return a minimal trace dict holding only the component name."""
        return {"name": self._name}

    def gather_usage(self):
        """Return a ``TokenUsage`` built from the constructor values.

        ``total_tokens`` is the sum of the input and output token counts.
        """
        from maseval.core.usage import TokenUsage

        return TokenUsage(
            cost=self._cost,
            input_tokens=self._input_tokens,
            output_tokens=self._output_tokens,
            total_tokens=self._input_tokens + self._output_tokens,
        )
285+
286+
287+
class MockBrokenUsageComponent(TraceableMixin):
    """Traceable test double whose usage collection always fails.

    ``gather_traces`` succeeds with an empty dict, while ``gather_usage``
    unconditionally raises ``RuntimeError``.
    """

    def __init__(self):
        super().__init__()

    def gather_traces(self) -> Dict[str, Any]:
        """Return an empty trace dict."""
        empty_traces: Dict[str, Any] = {}
        return empty_traces

    def gather_usage(self):
        """Always raise ``RuntimeError`` to simulate a failing usage source."""
        failure = RuntimeError("Usage collection failed")
        raise failure
298+
299+
300+
# UsageTrackableMixin is required by the usage-aware components defined below.
301+
from maseval.core.usage import UsageTrackableMixin
302+
303+
304+
class UsageAwareComponent(TraceableMixin, UsageTrackableMixin):
    """Test component that supports both tracing and usage tracking.

    Args:
        cost: Cost reported by ``gather_usage``.
        input_tokens: Input token count reported by ``gather_usage``.
        output_tokens: Output token count reported by ``gather_usage``.
    """

    def __init__(self, cost: float = 0.0, input_tokens: int = 0, output_tokens: int = 0):
        # Only TraceableMixin's initializer is invoked explicitly here.
        TraceableMixin.__init__(self)
        self._cost = cost
        self._input_tokens = input_tokens
        self._output_tokens = output_tokens

    def gather_traces(self) -> Dict[str, Any]:
        """Return a fixed marker dict showing the component was traced."""
        return {"traced": True}

    def gather_usage(self):
        """Return a ``TokenUsage`` whose total is input plus output tokens."""
        from maseval.core.usage import TokenUsage

        tokens_in = self._input_tokens
        tokens_out = self._output_tokens
        return TokenUsage(
            cost=self._cost,
            input_tokens=tokens_in,
            output_tokens=tokens_out,
            total_tokens=tokens_in + tokens_out,
        )
324+
325+
326+
class BrokenUsageComponent(TraceableMixin, UsageTrackableMixin):
    """Usage-trackable test double whose ``gather_usage`` always raises.

    Tracing works (it yields an empty dict); usage collection fails with
    ``RuntimeError`` so error handling in callers can be exercised.
    """

    def __init__(self):
        # Only TraceableMixin's initializer is invoked explicitly here.
        TraceableMixin.__init__(self)

    def gather_traces(self) -> Dict[str, Any]:
        """Return an empty trace dict."""
        empty_traces: Dict[str, Any] = {}
        return empty_traces

    def gather_usage(self):
        """Always raise ``RuntimeError`` to simulate a failing usage source."""
        failure = RuntimeError("Usage collection failed")
        raise failure
337+
338+
339+
@pytest.mark.core
class TestRegistryUsageCollection:
    """Tests for usage tracking through the component registry.

    Float costs are always compared with ``pytest.approx`` so the assertions
    stay valid whether a value was passed through unchanged or accumulated.
    """

    @staticmethod
    def _register_model(registry, cost, input_tokens, output_tokens):
        """Register a new ``UsageAwareComponent`` as ``models:main_model``.

        Args:
            registry: The ``ComponentRegistry`` to register into.
            cost: Cost the component will report.
            input_tokens: Input token count the component will report.
            output_tokens: Output token count the component will report.

        Returns:
            The component instance that was registered.
        """
        component = UsageAwareComponent(cost=cost, input_tokens=input_tokens, output_tokens=output_tokens)
        registry.register("models", "main_model", component)
        return component

    def test_register_usage_trackable_component(self):
        """UsageTrackableMixin component is registered in the usage registry."""
        registry = ComponentRegistry()

        component = self._register_model(registry, cost=0.05, input_tokens=100, output_tokens=50)

        assert "models:main_model" in registry._usage_registry
        assert registry._usage_registry["models:main_model"] is component

    def test_non_usage_component_not_in_usage_registry(self):
        """Components without UsageTrackableMixin are NOT in the usage registry."""
        registry = ComponentRegistry()
        component = MockTraceableComponent("test")

        registry.register("agents", "my_agent", component)

        assert "agents:my_agent" in registry._trace_registry
        assert "agents:my_agent" not in registry._usage_registry

    def test_collect_usage_basic(self):
        """collect_usage returns structured dict with usage from registered components."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=500, output_tokens=200)

        usage = registry.collect_usage()

        assert "metadata" in usage
        assert "models" in usage
        assert "main_model" in usage["models"]

        model_usage = usage["models"]["main_model"]
        assert model_usage["cost"] == pytest.approx(0.10)
        assert model_usage["input_tokens"] == 500
        assert model_usage["output_tokens"] == 200
        assert model_usage["total_tokens"] == 700

    def test_collect_usage_multiple_components(self):
        """Multiple components across categories are all collected."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=500, output_tokens=200)
        tool = UsageAwareComponent(cost=0.05, input_tokens=0, output_tokens=0)
        registry.register("tools", "search_tool", tool)

        usage = registry.collect_usage()

        assert "main_model" in usage["models"]
        assert "search_tool" in usage["tools"]
        assert usage["models"]["main_model"]["cost"] == pytest.approx(0.10)
        assert usage["tools"]["search_tool"]["cost"] == pytest.approx(0.05)

    def test_collect_usage_injects_grouping_fields(self):
        """Registry injects category and component_name into usage records."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=100, output_tokens=50)

        usage = registry.collect_usage()

        model_usage = usage["models"]["main_model"]
        assert model_usage["category"] == "models"
        assert model_usage["component_name"] == "main_model"

    def test_total_usage_accumulates(self):
        """total_usage property reflects accumulated usage across collect_usage calls."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=100, output_tokens=50)

        # First collection
        registry.collect_usage()
        assert registry.total_usage.cost == pytest.approx(0.10)

        # Clear and re-register (simulates next repetition)
        registry.clear()
        self._register_model(registry, cost=0.20, input_tokens=200, output_tokens=100)

        # Second collection
        registry.collect_usage()
        assert registry.total_usage.cost == pytest.approx(0.30)

    def test_usage_by_component_accumulates(self):
        """usage_by_component accumulates per key across repetitions."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=100, output_tokens=50)
        registry.collect_usage()

        # Clear and re-register for second repetition
        registry.clear()
        self._register_model(registry, cost=0.20, input_tokens=200, output_tokens=100)
        registry.collect_usage()

        by_comp = registry.usage_by_component
        assert "models:main_model" in by_comp

        total = by_comp["models:main_model"]
        assert total.input_tokens == 300
        assert total.output_tokens == 150
        assert total.cost == pytest.approx(0.30)

    def test_usage_persists_across_clear(self):
        """clear() does NOT reset total_usage or usage_by_component."""
        registry = ComponentRegistry()
        self._register_model(registry, cost=0.10, input_tokens=100, output_tokens=50)
        registry.collect_usage()

        # Clear only removes per-repetition state
        registry.clear()

        assert registry.total_usage.cost == pytest.approx(0.10)
        assert "models:main_model" in registry.usage_by_component

    def test_collect_usage_handles_error_gracefully(self):
        """If gather_usage raises, the error is captured in the usage dict."""
        registry = ComponentRegistry()
        registry.register("models", "bad_model", BrokenUsageComponent())

        usage = registry.collect_usage()

        assert "bad_model" in usage["models"]
        failed_entry = usage["models"]["bad_model"]
        assert "error" in failed_entry
        assert "RuntimeError" in failed_entry["error_type"]

    def test_collect_usage_empty_registry(self):
        """collect_usage with no components returns empty structure."""
        registry = ComponentRegistry()

        usage = registry.collect_usage()

        assert usage["metadata"]["total_components"] == 0
        assert usage["models"] == {}
        assert usage["agents"] == {}

0 commit comments

Comments
 (0)