|
1 | 1 | """Tests for MultiAgentBench benchmark classes.""" |
2 | 2 |
|
| 3 | +import pytest |
| 4 | +from typing import Any, Dict |
| 5 | +from unittest.mock import MagicMock, patch |
3 | 6 |
|
4 | 7 | from maseval import Task |
5 | 8 | from maseval.benchmark.multiagentbench import ( |
| 9 | + MultiAgentBenchBenchmark, |
| 10 | + MarbleMultiAgentBenchBenchmark, |
6 | 11 | MultiAgentBenchEnvironment, |
7 | 12 | MultiAgentBenchEvaluator, |
8 | 13 | ) |
@@ -250,3 +255,221 @@ def test_evaluator_domain_from_task( |
250 | 255 |
|
251 | 256 | evaluator = evaluators[0] |
252 | 257 | assert evaluator.domain == "bargaining" |
| 258 | + |
| 259 | + |
class TestMarbleMultiAgentBenchBenchmark:
    """Tests for MarbleMultiAgentBenchBenchmark class.

    NOTE(review): the ImportError / silent-failure tests below rely on MARBLE
    not being importable in the test environment; nothing in these tests
    forces that condition. Confirm the suite is never run with MARBLE
    installed, or patch the availability check explicitly.
    """

    @pytest.fixture
    def marble_benchmark_class(self):
        """Create a concrete MarbleMultiAgentBenchBenchmark class.

        The returned subclass implements ``get_model_adapter`` using the
        ``DummyModelAdapter`` from conftest, constructed with a single canned
        response. The class itself (not an instance) is returned so each test
        can instantiate it with its own arguments.
        """
        from conftest import DummyModelAdapter

        class ConcreteMarbleBenchmark(MarbleMultiAgentBenchBenchmark):
            def get_model_adapter(self, model_id, **kwargs):
                """Build a dummy adapter, registering it when ``register_name`` is given."""
                adapter = DummyModelAdapter(
                    model_id=model_id,
                    responses=['{"rating": 4}'],
                )
                register_name = kwargs.get("register_name")
                if register_name:
                    try:
                        self.register("models", register_name, adapter)
                    except ValueError:
                        # Deliberate best-effort registration: presumably the
                        # name is already registered -- TODO confirm. The
                        # adapter is still returned to the caller either way.
                        pass
                return adapter

        return ConcreteMarbleBenchmark

    def test_setup_agents_raises_import_error(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """setup_agents should raise ImportError when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        with pytest.raises(ImportError, match="MARBLE is not available"):
            benchmark.setup_agents({}, env, sample_research_task, None)

    def test_create_marble_env_raises_import_error(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """_create_marble_env should raise ImportError when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)

        with pytest.raises(ImportError, match="MARBLE is not available"):
            benchmark._create_marble_env(sample_research_task)

    def test_setup_agent_graph_silently_fails(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """_setup_agent_graph should not raise when MARBLE not available."""
        benchmark = marble_benchmark_class(progress_bar=False)

        # Should not raise, just return silently. The test passes as long as
        # the call completes; no return value is checked.
        benchmark._setup_agent_graph({}, sample_research_task, None)

    def test_run_agents_returns_structured_output(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """run_agents should return structured output with agent_results."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        # Create mock agents. Only agent2 has get_serialized_messages
        # configured; agent1 falls back to MagicMock's default return value.
        mock_agent1 = MagicMock()
        mock_agent1.run.return_value = "Result from agent1"
        mock_agent1.agent_id = "agent1"

        mock_agent2 = MagicMock()
        mock_agent2.run.return_value = "Result from agent2"
        mock_agent2.agent_id = "agent2"
        mock_agent2.get_serialized_messages.return_value = "Communication log"

        result = benchmark.run_agents(
            [mock_agent1, mock_agent2],
            sample_research_task,
            env,
            sample_research_task.query,
        )

        assert "agent_results" in result
        assert "communications" in result
        assert "coordination_mode" in result
        # Results are expected in the same order the agents were passed in.
        assert len(result["agent_results"]) == 2
        assert result["agent_results"][0]["agent_id"] == "agent1"
        assert result["agent_results"][1]["agent_id"] == "agent2"

    def test_run_agents_collects_communications(
        self,
        marble_benchmark_class,
        sample_research_task: Task,
    ):
        """run_agents should collect communications from agents."""
        benchmark = marble_benchmark_class(progress_bar=False)
        env = benchmark.setup_environment({}, sample_research_task)

        # Create mock agent with get_serialized_messages
        mock_agent = MagicMock()
        mock_agent.run.return_value = "Result"
        mock_agent.agent_id = "agent1"
        mock_agent.get_serialized_messages.return_value = "Hello from agent1"

        result = benchmark.run_agents(
            [mock_agent],
            sample_research_task,
            env,
            sample_research_task.query,
        )

        assert "Hello from agent1" in result["communications"]
| 376 | + |
| 377 | + |
class TestBenchmarkWithDifferentCoordinationModes:
    """Exercise run_agents under each coordination mode."""

    def test_run_agents_with_cooperative_mode(
        self,
        benchmark_instance,
        sample_research_task: Task,
    ):
        """Cooperative coordination: run_agents yields two results."""
        # sample_research_task uses cooperative mode by default
        task = sample_research_task
        environment = benchmark_instance.setup_environment({}, task)
        agents, _agents_by_id = benchmark_instance.setup_agents({}, environment, task, None)

        outcome = benchmark_instance.run_agents(agents, task, environment, task.query)

        assert len(outcome) == 2

    def test_run_agents_with_star_mode(self, benchmark_instance):
        """Star coordination: run_agents yields two results."""
        # One central coordinator plus a single worker, wired in star mode.
        star_task = Task(
            id="test_star",
            query="Research task",
            environment_data={
                "scenario": "research",
                "task_id": 1,
                "agents": [
                    {"agent_id": "central", "profile": "Central coordinator"},
                    {"agent_id": "worker1", "profile": "Worker 1"},
                ],
                "coordinate_mode": "star",
                "relationships": [["central", "worker1", "coordinates"]],
                "environment": {"max_iterations": 10},
                "task": {"content": "Research task", "output_format": "5Q"},
                "max_iterations": 10,
            },
            evaluation_data={"model_id": "gpt-4o-mini"},
            metadata={"domain": "research"},
        )

        environment = benchmark_instance.setup_environment({}, star_task)
        agents, _agents_by_id = benchmark_instance.setup_agents({}, environment, star_task, None)

        outcome = benchmark_instance.run_agents(
            agents,
            star_task,
            environment,
            star_task.query,
        )

        assert len(outcome) == 2
| 429 | + |
| 430 | + |
class TestBenchmarkWithEmptyAgents:
    """Edge cases where no agents are supplied or configured."""

    def test_run_agents_with_empty_list(
        self,
        benchmark_instance,
        sample_research_task: Task,
    ):
        """Passing an empty agent list to run_agents gives an empty list back."""
        task = sample_research_task
        environment = benchmark_instance.setup_environment({}, task)

        outcome = benchmark_instance.run_agents([], task, environment, task.query)

        assert outcome == []

    def test_setup_agents_with_no_agents_in_task(self, benchmark_instance):
        """A task whose config lists no agents produces empty agent collections."""
        agentless_task = Task(
            id="test_no_agents",
            query="Research task",
            environment_data={
                "scenario": "research",
                "task_id": 1,
                "agents": [],  # No agents
                "coordinate_mode": "cooperative",
                "relationships": [],
                "environment": {"max_iterations": 10},
                "task": {"content": "Research task"},
                "max_iterations": 10,
            },
            evaluation_data={"model_id": "gpt-4o-mini"},
            metadata={"domain": "research"},
        )

        environment = benchmark_instance.setup_environment({}, agentless_task)
        created_list, created_map = benchmark_instance.setup_agents(
            {},
            environment,
            agentless_task,
            None,
        )

        # Both returned collections must be empty.
        assert len(created_list) == 0
        assert len(created_map) == 0
0 commit comments