Skip to content

Commit ebc0961

Browse files
committed
improved testing
1 parent 0667cc0 commit ebc0961

6 files changed

Lines changed: 1447 additions & 5 deletions

File tree

maseval/benchmark/multiagentbench/adapters/marble_adapter.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ def __init__(
4343
self._profile = getattr(marble_agent, "profile", "")
4444
self._communication_log: List[Dict[str, Any]] = []
4545
self._action_log: List[Dict[str, Any]] = []
46-
super().__init__(callbacks=callbacks)
46+
super().__init__(agent_instance=marble_agent, name=agent_id, callbacks=callbacks)
47+
# Initialize message history
48+
from maseval import MessageHistory
49+
50+
self.messages = MessageHistory()
4751

4852
@property
4953
def agent_id(self) -> str:
@@ -95,8 +99,8 @@ def _run_agent(self, query: str) -> str:
9599
)
96100

97101
# Update message history
98-
self._messages.add_message(role="user", content=query)
99-
self._messages.add_message(role="assistant", content=result)
102+
self.messages.add_message(role="user", content=query)
103+
self.messages.add_message(role="assistant", content=result)
100104

101105
return result
102106

tests/test_benchmarks/test_multiagentbench/test_benchmark.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
"""Tests for MultiAgentBench benchmark classes."""
22

3+
import pytest
4+
from typing import Any, Dict
5+
from unittest.mock import MagicMock, patch
36

47
from maseval import Task
58
from maseval.benchmark.multiagentbench import (
9+
MultiAgentBenchBenchmark,
10+
MarbleMultiAgentBenchBenchmark,
611
MultiAgentBenchEnvironment,
712
MultiAgentBenchEvaluator,
813
)
@@ -250,3 +255,221 @@ def test_evaluator_domain_from_task(
250255

251256
evaluator = evaluators[0]
252257
assert evaluator.domain == "bargaining"
258+
259+
260+
class TestMarbleMultiAgentBenchBenchmark:
    """Tests for MarbleMultiAgentBenchBenchmark class."""

    @pytest.fixture
    def marble_benchmark_class(self):
        """Create a concrete MarbleMultiAgentBenchBenchmark class.

        The returned subclass only supplies ``get_model_adapter``, which
        hands back a ``DummyModelAdapter`` with a single canned response so
        no real model is contacted.
        """
        from conftest import DummyModelAdapter

        class ConcreteMarbleBenchmark(MarbleMultiAgentBenchBenchmark):
            def get_model_adapter(self, model_id, **kwargs):
                adapter = DummyModelAdapter(
                    model_id=model_id,
                    responses=['{"rating": 4}'],
                )
                register_name = kwargs.get("register_name")
                if register_name:
                    try:
                        self.register("models", register_name, adapter)
                    except ValueError:
                        # Deliberate best-effort registration: presumably
                        # ValueError signals the name is already registered
                        # (TODO confirm against ``register``); the adapter
                        # is still returned either way.
                        pass
                return adapter

        return ConcreteMarbleBenchmark

    def test_setup_agents_raises_import_error(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """setup_agents should raise ImportError when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        with pytest.raises(ImportError, match="MARBLE is not available"):
            benchmark.setup_agents({}, env, sample_research_task, None)

    def test_create_marble_env_raises_import_error(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """_create_marble_env should raise ImportError when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)

        with pytest.raises(ImportError, match="MARBLE is not available"):
            benchmark._create_marble_env(sample_research_task)

    def test_setup_agent_graph_silently_fails(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """_setup_agent_graph should not raise when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)

        # Should not raise, just return silently; the test passes as long as
        # the call completes without an exception.
        benchmark._setup_agent_graph({}, sample_research_task, None)

    def test_run_agents_returns_structured_output(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """run_agents should return structured output with agent_results."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        # Create mock agents
        mock_agent1 = MagicMock()
        mock_agent1.run.return_value = "Result from agent1"
        mock_agent1.agent_id = "agent1"

        mock_agent2 = MagicMock()
        mock_agent2.run.return_value = "Result from agent2"
        mock_agent2.agent_id = "agent2"
        mock_agent2.get_serialized_messages.return_value = "Communication log"

        result = benchmark.run_agents(
            [mock_agent1, mock_agent2],
            sample_research_task,
            env,
            sample_research_task.query,
        )

        assert "agent_results" in result
        assert "communications" in result
        assert "coordination_mode" in result
        assert len(result["agent_results"]) == 2
        # Results are expected in the same order the agents were passed in.
        assert result["agent_results"][0]["agent_id"] == "agent1"
        assert result["agent_results"][1]["agent_id"] == "agent2"

    def test_run_agents_collects_communications(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """run_agents should collect communications from agents."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        # Create mock agent with get_serialized_messages
        mock_agent = MagicMock()
        mock_agent.run.return_value = "Result"
        mock_agent.agent_id = "agent1"
        mock_agent.get_serialized_messages.return_value = "Hello from agent1"

        result = benchmark.run_agents(
            [mock_agent],
            sample_research_task,
            env,
            sample_research_task.query,
        )

        assert "Hello from agent1" in result["communications"]
376+
377+
378+
class TestBenchmarkWithDifferentCoordinationModes:
    """Exercise run_agents under each coordination mode."""

    def test_run_agents_with_cooperative_mode(
        self,
        benchmark_instance,
        sample_research_task: Task,
    ):
        """run_agents should work with cooperative coordination."""
        # The shared sample_research_task fixture is cooperative by default.
        task = sample_research_task
        environment = benchmark_instance.setup_environment({}, task)
        agents, _ = benchmark_instance.setup_agents({}, environment, task, None)

        outputs = benchmark_instance.run_agents(agents, task, environment, task.query)

        assert len(outputs) == 2

    def test_run_agents_with_star_mode(self, benchmark_instance):
        """run_agents should work with star coordination."""
        # One hub ("central") coordinating a single worker.
        star_agents = [
            {"agent_id": "central", "profile": "Central coordinator"},
            {"agent_id": "worker1", "profile": "Worker 1"},
        ]
        star_relationships = [["central", "worker1", "coordinates"]]
        star_config = {
            "scenario": "research",
            "task_id": 1,
            "agents": star_agents,
            "coordinate_mode": "star",
            "relationships": star_relationships,
            "environment": {"max_iterations": 10},
            "task": {"content": "Research task", "output_format": "5Q"},
            "max_iterations": 10,
        }
        star_task = Task(
            id="test_star",
            query="Research task",
            environment_data=star_config,
            evaluation_data={"model_id": "gpt-4o-mini"},
            metadata={"domain": "research"},
        )

        environment = benchmark_instance.setup_environment({}, star_task)
        agents, _ = benchmark_instance.setup_agents({}, environment, star_task, None)

        outputs = benchmark_instance.run_agents(
            agents,
            star_task,
            environment,
            star_task.query,
        )

        assert len(outputs) == 2
429+
430+
431+
class TestBenchmarkWithEmptyAgents:
    """Edge cases where there are no agents to set up or run."""

    def test_run_agents_with_empty_list(
        self,
        benchmark_instance,
        sample_research_task: Task,
    ):
        """run_agents should handle empty agent list."""
        task = sample_research_task
        environment = benchmark_instance.setup_environment({}, task)

        # No agents are passed, so no results are expected back.
        outputs = benchmark_instance.run_agents([], task, environment, task.query)

        assert outputs == []

    def test_setup_agents_with_no_agents_in_task(self, benchmark_instance):
        """setup_agents should handle task with no agents."""
        agentless_config = {
            "scenario": "research",
            "task_id": 1,
            "agents": [],  # No agents
            "coordinate_mode": "cooperative",
            "relationships": [],
            "environment": {"max_iterations": 10},
            "task": {"content": "Research task"},
            "max_iterations": 10,
        }
        agentless_task = Task(
            id="test_no_agents",
            query="Research task",
            environment_data=agentless_config,
            evaluation_data={"model_id": "gpt-4o-mini"},
            metadata={"domain": "research"},
        )

        environment = benchmark_instance.setup_environment({}, agentless_task)
        created = benchmark_instance.setup_agents({}, environment, agentless_task, None)
        agents_list, agents_dict = created

        # Both returned containers should come back empty.
        assert len(agents_list) == 0
        assert len(agents_dict) == 0

0 commit comments

Comments
 (0)