From 4d533da0fb1aa365274d008d05adb7cd7a65f2c1 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 10:35:38 +0000
Subject: [PATCH 01/34] added dataloading infrastructure

---
 CHANGELOG.md                                  |   1 +
 maseval/benchmark/macs/.gitignore             |   3 +
 maseval/benchmark/macs/__init__.py            |  48 ++
 maseval/benchmark/macs/data_loader.py         | 508 ++++++++++++
 tests/test_benchmarks/test_macs/__init__.py   |   1 +
 .../test_macs/test_data_loader.py             | 764 ++++++++++++++++++
 6 files changed, 1325 insertions(+)
 create mode 100644 maseval/benchmark/macs/.gitignore
 create mode 100644 maseval/benchmark/macs/__init__.py
 create mode 100644 maseval/benchmark/macs/data_loader.py
 create mode 100644 tests/test_benchmarks/test_macs/__init__.py
 create mode 100644 tests/test_benchmarks/test_macs/test_data_loader.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d26f755d..eb89e532 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- MACS Benchmark: Multi-Agent Collaboration Scenarios benchmark (PR: #13)
 - [LlamaIndex](https://github.com/run-llama/llama_index) integration: `LlamaIndexAgentAdapter` and `LlamaIndexUser` for evaluating LlamaIndex workflow-based agents (PR: #7)
   - Supports async workflow execution with proper event loop handling
 - Added a new example: The `5_a_day_benchmark` (PR: #10)
diff --git a/maseval/benchmark/macs/.gitignore b/maseval/benchmark/macs/.gitignore
new file mode 100644
index 00000000..0bfef8c7
--- /dev/null
+++ b/maseval/benchmark/macs/.gitignore
@@ -0,0 +1,3 @@
+(# Ignore generated data files created by the aws_collab data processor)
+**/*.json
+**/*.txt
diff --git a/maseval/benchmark/macs/__init__.py b/maseval/benchmark/macs/__init__.py
new file mode 100644
index 00000000..ce833ac3
--- /dev/null
+++ b/maseval/benchmark/macs/__init__.py
@@ -0,0 +1,48 @@
+"""MACS Benchmark - Multi-Agent Collaboration Scenarios.
+
+This module provides the AWS MACS benchmark for evaluating multi-agent
+collaboration in enterprise applications.
+
+Reference:
+    Paper: https://arxiv.org/abs/2412.05449
+    Data: https://github.com/aws-samples/multiagent-collab-scenario-benchmark
+"""
+
+from .macs import (
+    MACSBenchmark,
+    MACSEnvironment,
+    MACSEvaluator,
+    MACSGenericTool,
+    compute_benchmark_metrics,
+)
+from .data_loader import (
+    load_tasks,
+    load_agent_config,
+    ensure_data_exists,
+    process_data,
+    download_original_data,
+    download_prompt_templates,
+    restructure_data,
+    VALID_DOMAINS,
+    DEFAULT_DATA_DIR,
+)
+
+__all__ = [
+    # Core classes
+    "MACSBenchmark",
+    "MACSEnvironment",
+    "MACSEvaluator",
+    "MACSGenericTool",
+    # Data loading
+    "load_tasks",
+    "load_agent_config",
+    "ensure_data_exists",
+    "process_data",
+    "download_original_data",
+    "download_prompt_templates",
+    "restructure_data",
+    "VALID_DOMAINS",
+    "DEFAULT_DATA_DIR",
+    # Utilities
+    "compute_benchmark_metrics",
+]
diff --git a/maseval/benchmark/macs/data_loader.py b/maseval/benchmark/macs/data_loader.py
new file mode 100644
index 00000000..7acfc416
--- /dev/null
+++ b/maseval/benchmark/macs/data_loader.py
@@ -0,0 +1,508 @@
+"""Data loading utilities for MACS benchmark.
+
+This module provides functions to:
+1. Download original data from AWS GitHub to data/original/
+2. Restructure data into data/restructured/ (tasks.json, agents.json per domain)
+3. Load restructured data for use in benchmarks
+
+No side effects on import. Data download/processing must be explicitly called.
+"""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from urllib.error import HTTPError, URLError
+from urllib.request import urlopen
+from uuid import UUID
+
+from maseval import Task, TaskCollection
+
+
+# =============================================================================
+# Constants
+# =============================================================================
+
+DEFAULT_DATA_DIR = Path(__file__).parent / "data"
+VALID_DOMAINS = ("travel", "mortgage", "software")
+
+# AWS Multi-Agent Collaboration Scenarios benchmark data
+# Source: https://github.com/aws-samples/multiagent-collab-scenario-benchmark
+URLS = {
+    "data": {
+        "software": {
+            "agents": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/software/agents.json",
+            "scenarios": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/software/scenarios_30.json",
+        },
+        "travel": {
+            "agents": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/travel/agents.json",
+            "scenarios": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/travel/scenarios_30.json",
+        },
+        "mortgage": {
+            "agents": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/mortgage/agents.json",
+            "scenarios": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/datasets/mortgage/scenarios_30.json",
+        },
+    },
+    "evaluation": {
+        "prompt_templates": "https://raw.githubusercontent.com/aws-samples/multiagent-collab-scenario-benchmark/refs/heads/main/src/prompt_templates.py",
+    },
+}
+
+
+# =============================================================================
+# Download Functions
+# =============================================================================
+
+
+def download_file(url: str, timeout: int = 15) -> str:
+    """Download a file from URL and return as text."""
+    try:
+        with urlopen(url, timeout=timeout) as resp:
+            raw = resp.read()
+            return raw.decode("utf-8") if isinstance(raw, bytes) else raw
+    except (HTTPError, URLError) as e:
+        raise RuntimeError(f"Failed to download from {url}: {e}") from e
+
+
+def download_json(url: str) -> object:
+    """Download and parse JSON from URL."""
+    text = download_file(url)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(f"Failed to decode JSON from {url}: {e}") from e
+
+
+def download_original_data(
+    data_dir: Optional[Path] = None,
+    domain: Optional[str] = None,
+    verbose: int = 1,
+) -> Path:
+    """Download original data from AWS GitHub to data/original/.
+
+    Args:
+        data_dir: Base data directory (default: module's data/)
+        domain: Specific domain to download, or None for all
+        verbose: 0=silent, 1=summary, 2=detailed
+
+    Returns:
+        Path to the original data directory
+    """
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    original_dir = data_dir / "original"
+
+    domains = [domain] if domain else list(URLS["data"].keys())
+
+    for d in domains:
+        if d not in URLS["data"]:
+            raise ValueError(f"Unknown domain: {d}")
+
+        domain_dir = original_dir / d
+        domain_dir.mkdir(parents=True, exist_ok=True)
+
+        for name, url in URLS["data"][d].items():
+            content = download_json(url)
+            out_path = domain_dir / f"{name}.json"
+            with out_path.open("w") as f:
+                json.dump(content, f, indent=2)
+            if verbose >= 2:
+                print(f"Downloaded {url} -> {out_path}")
+
+    if verbose >= 1:
+        print(f"Downloaded original data to {original_dir}")
+
+    return original_dir
+
+
+def download_prompt_templates(
+    data_dir: Optional[Path] = None,
+    verbose: int = 1,
+) -> Path:
+    """Download prompt templates from AWS GitHub.
+
+    Args:
+        data_dir: Base data directory (default: module's data/)
+        verbose: 0=silent, 1=summary, 2=detailed
+
+    Returns:
+        Path to the prompt_templates directory
+    """
+    import ast
+
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    templates_dir = data_dir.parent / "prompt_templates"
+    templates_dir.mkdir(parents=True, exist_ok=True)
+
+    url = URLS["evaluation"]["prompt_templates"]
+    text = download_file(url)
+
+    # Parse Python file to extract prompt constants
+    tree = ast.parse(text)
+    values: Dict[str, str] = {}
+
+    VARS = {
+        "USER_GSR_PROMPT": "user",
+        "SYSTEM_GSR_PROMPT": "system",
+        "ISSUES_PROMPT": "issues",
+    }
+
+    def _const_str(node) -> Optional[str]:
+        if isinstance(node, ast.Constant) and isinstance(node.value, str):
+            return node.value
+        if isinstance(node, ast.JoinedStr):
+            parts = []
+            for part in node.values:
+                if isinstance(part, ast.Constant) and isinstance(part.value, str):
+                    parts.append(part.value)
+                else:
+                    return None
+            return "".join(parts)
+        return None
+
+    def _eval_node(node):
+        s = _const_str(node)
+        if s is not None:
+            return s
+        if isinstance(node, ast.Name):
+            return values.get(node.id)
+        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
+            left_val = _eval_node(node.left)
+            right_val = _eval_node(node.right)
+            if left_val is None or right_val is None:
+                return None
+            return left_val + right_val
+        return None
+
+    for node in tree.body:
+        if not isinstance(node, ast.Assign) or len(node.targets) != 1:
+            continue
+        target = node.targets[0]
+        if isinstance(target, ast.Name):
+            val = _eval_node(node.value)
+            if val is not None:
+                values[target.id] = val
+
+    # Escape placeholders for template use
+    def _escape_placeholders(s: str) -> str:
+        if not s:
+            return ""
+        s = s.replace("{{", "__DBL_OPEN__").replace("}}", "__DBL_CLOSE__")
+        s = s.replace("{", "{{").replace("}", "}}")
+        s = s.replace("__DBL_OPEN__", "{{").replace("__DBL_CLOSE__", "}}")
+        return s
+
+    # Write template files
+    for var_name, file_key in VARS.items():
+        content = _escape_placeholders(values.get(var_name, ""))
+        out_path = templates_dir / f"{file_key}.txt"
+        with out_path.open("w") as f:
+            f.write(content)
+        if verbose >= 2:
+            print(f"Wrote {out_path}")
+
+    if verbose >= 1:
+        print(f"Downloaded prompt templates to {templates_dir}")
+
+    return templates_dir
+
+
+# =============================================================================
+# Restructuring Functions
+# =============================================================================
+
+
+def _dedupe_tools_by_name(tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Deduplicate tools by tool_name, raising on conflicts."""
+    grouped: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+    no_name: List[Dict[str, Any]] = []
+
+    for t in tools:
+        if not isinstance(t, dict):
+            no_name.append(t)
+            continue
+        name = t.get("tool_name")
+        if not name:
+            no_name.append(t)
+        else:
+            grouped[name].append(t)
+
+    deduped = list(no_name)
+    for name, items in grouped.items():
+        if len(items) == 1:
+            deduped.append(items[0])
+        else:
+            first = items[0]
+            for other in items[1:]:
+                if first != other:
+                    raise ValueError(f"Conflicting tools for tool_name='{name}'")
+            deduped.append(first)
+
+    return deduped
+
+
+def _create_tools_list(agents_obj: object) -> List[Dict[str, Any]]:
+    """Extract and deduplicate tools from agents data."""
+    tools: List[Dict[str, Any]] = []
+
+    if isinstance(agents_obj, dict) and isinstance(agents_obj.get("agents"), list):
+        agents_list = agents_obj["agents"]
+    elif isinstance(agents_obj, list):
+        agents_list = agents_obj
+    else:
+        return tools
+
+    for agent in agents_list:
+        if not isinstance(agent, dict):
+            continue
+        for t in agent.get("tools", []):
+            if isinstance(t, dict):
+                tools.append(t)
+
+    return _dedupe_tools_by_name(tools)
+
+
+def _create_agents_list(agents_obj: object) -> Dict[str, Any]:
+    """Create agents config with tool names only (not full tool dicts)."""
+
+    def _process_agent(agent: Dict[str, Any]) -> Dict[str, Any]:
+        a_copy = {k: v for k, v in agent.items() if k != "tools"}
+        tool_names = [t.get("tool_name") for t in agent.get("tools", []) if isinstance(t, dict) and t.get("tool_name")]
+        a_copy["tools"] = tool_names
+        return a_copy
+
+    if isinstance(agents_obj, dict) and isinstance(agents_obj.get("agents"), list):
+        processed = [_process_agent(a) for a in agents_obj["agents"] if isinstance(a, dict)]
+        out: Dict[str, Any] = {"agents": processed}
+        if "primary_agent_id" in agents_obj:
+            out["primary_agent_id"] = agents_obj["primary_agent_id"]
+        if "human_id" in agents_obj:
+            out["human_id"] = agents_obj["human_id"]
+        return out
+
+    return {}
+
+
+def _create_tasks_list(scenarios_obj: object, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Convert scenarios to task format."""
+    tasks: List[Dict[str, Any]] = []
+
+    if isinstance(scenarios_obj, dict) and isinstance(scenarios_obj.get("scenarios"), list):
+        scenarios_list = scenarios_obj["scenarios"]
+    elif isinstance(scenarios_obj, list):
+        scenarios_list = scenarios_obj
+    else:
+        return tasks
+
+    for idx, scen in enumerate(scenarios_list, start=1):
+        if not isinstance(scen, dict):
+            continue
+
+        query = scen.get("input_problem") or scen.get("query") or ""
+        # Use existing ID if present, otherwise generate sequential task ID
+        tid = scen.get("id") or scen.get("uuid") or f"task-{idx:06d}"
+
+        task = {
+            "id": tid,
+            "query": query,
+            "environment_data": {"tools": tools},
+            "evaluation_data": {"assertions": scen.get("assertions", [])},
+            "metadata": {k: v for k, v in scen.items() if k not in ("input_problem", "query", "id", "uuid", "assertions")},
+        }
+        tasks.append(task)
+
+    return tasks
+
+
+def restructure_data(
+    data_dir: Optional[Path] = None,
+    domain: Optional[str] = None,
+    verbose: int = 1,
+) -> Path:
+    """Restructure original data into tasks.json and agents.json per domain.
+
+    Reads from data/original/, writes to data/restructured/.
+
+    Args:
+        data_dir: Base data directory (default: module's data/)
+        domain: Specific domain to restructure, or None for all
+        verbose: 0=silent, 1=summary, 2=detailed
+
+    Returns:
+        Path to the restructured data directory
+    """
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    original_dir = data_dir / "original"
+    restructured_dir = data_dir / "restructured"
+
+    domains = [domain] if domain else list(VALID_DOMAINS)
+
+    for d in domains:
+        orig_domain_dir = original_dir / d
+        if not orig_domain_dir.exists():
+            raise FileNotFoundError(f"Original data not found: {orig_domain_dir}")
+
+        # Load original data
+        with (orig_domain_dir / "agents.json").open() as f:
+            agents_data = json.load(f)
+        with (orig_domain_dir / "scenarios.json").open() as f:
+            scenarios_data = json.load(f)
+
+        # Restructure
+        tools = _create_tools_list(agents_data)
+        agents = _create_agents_list(agents_data)
+        tasks = _create_tasks_list(scenarios_data, tools)
+
+        # Save restructured data
+        out_domain_dir = restructured_dir / d
+        out_domain_dir.mkdir(parents=True, exist_ok=True)
+
+        with (out_domain_dir / "agents.json").open("w") as f:
+            json.dump(agents, f, indent=2)
+        with (out_domain_dir / "tasks.json").open("w") as f:
+            json.dump(tasks, f, indent=2)
+
+        if verbose >= 2:
+            print(f"Restructured {d}: {len(agents.get('agents', []))} agents, {len(tasks)} tasks")
+
+    if verbose >= 1:
+        print(f"Restructured data to {restructured_dir}")
+
+    return restructured_dir
+
+
+def ensure_data_exists(
+    data_dir: Optional[Path] = None,
+    force_download: bool = False,
+    verbose: int = 1,
+) -> Path:
+    """Ensure restructured data exists, downloading if needed.
+
+    Args:
+        data_dir: Base data directory (default: module's data/)
+        force_download: If True, re-download even if data exists
+        verbose: 0=silent, 1=summary, 2=detailed
+
+    Returns:
+        Path to the restructured data directory
+    """
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    restructured_dir = data_dir / "restructured"
+
+    # Check if all domains exist
+    all_exist = all((restructured_dir / d / "tasks.json").exists() and (restructured_dir / d / "agents.json").exists() for d in VALID_DOMAINS)
+
+    if all_exist and not force_download:
+        return restructured_dir
+
+    # Download and restructure
+    download_original_data(data_dir, verbose=verbose)
+    download_prompt_templates(data_dir, verbose=verbose)
+    restructure_data(data_dir, verbose=verbose)
+
+    return restructured_dir
+
+
+def process_data(verbose: int = 1) -> Path:
+    """Download and process all MACS data. Convenience wrapper.
+
+    Args:
+        verbose: 0=silent, 1=summary, 2=detailed
+
+    Returns:
+        Path to the restructured data directory
+    """
+    return ensure_data_exists(force_download=True, verbose=verbose)
+
+
+# =============================================================================
+# Data Loading Functions (for use in benchmarks)
+# =============================================================================
+
+
+def load_tasks(
+    domain: str,
+    data_dir: Optional[Path] = None,
+    limit: Optional[int] = None,
+) -> TaskCollection:
+    """Load tasks for a MACS domain.
+
+    Args:
+        domain: One of "travel", "mortgage", or "software"
+        data_dir: Base data directory (default: module's data/).
+                  Tasks are loaded from data_dir/restructured/{domain}/tasks.json
+        limit: Maximum number of tasks to load
+
+    Returns:
+        TaskCollection containing Task objects
+
+    Raises:
+        ValueError: If domain is not valid
+        FileNotFoundError: If tasks.json doesn't exist
+    """
+    if domain not in VALID_DOMAINS:
+        raise ValueError(f"Invalid domain '{domain}'. Must be one of {VALID_DOMAINS}")
+
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    tasks_path = data_dir / "restructured" / domain / "tasks.json"
+
+    with tasks_path.open() as f:
+        tasks_list: List[Dict[str, Any]] = json.load(f)
+
+    if limit:
+        tasks_list = tasks_list[:limit]
+
+    tasks = []
+    for t in tasks_list:
+        task_kwargs: Dict[str, Any] = {
+            "query": t["query"],
+            "environment_data": t.get("environment_data", {}),
+            "evaluation_data": t.get("evaluation_data", {}),
+            "metadata": t.get("metadata", {}),
+        }
+        if t.get("id"):
+            task_kwargs["id"] = UUID(t["id"])
+        tasks.append(Task(**task_kwargs))
+
+    return TaskCollection(tasks)
+
+
+def load_agent_config(
+    domain: str,
+    data_dir: Optional[Path] = None,
+) -> Dict[str, Any]:
+    """Load agent hierarchy configuration for a domain.
+
+    The returned configuration contains:
+    - agents: List of agent specifications with agent_id, agent_name,
+              agent_instruction, reachable_agents, and tools (as names)
+    - primary_agent_id: ID of the supervisor/orchestrator agent
+    - human_id: ID used for human/user in trajectories
+
+    Args:
+        domain: One of "travel", "mortgage", or "software"
+        data_dir: Base data directory (default: module's data/).
+                  Config is loaded from data_dir/restructured/{domain}/agents.json
+
+    Returns:
+        Dict with agent configuration
+
+    Raises:
+        ValueError: If domain is not valid
+        FileNotFoundError: If agents.json doesn't exist
+    """
+    if domain not in VALID_DOMAINS:
+        raise ValueError(f"Invalid domain '{domain}'. Must be one of {VALID_DOMAINS}")
+
+    data_dir = Path(data_dir) if data_dir else DEFAULT_DATA_DIR
+    config_path = data_dir / "restructured" / domain / "agents.json"
+
+    with config_path.open() as f:
+        return json.load(f)
+
+
+# =============================================================================
+# CLI Entry Point
+# =============================================================================
+
+if __name__ == "__main__":
+    process_data(verbose=2)
diff --git a/tests/test_benchmarks/test_macs/__init__.py b/tests/test_benchmarks/test_macs/__init__.py
new file mode 100644
index 00000000..c9acba91
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/__init__.py
@@ -0,0 +1 @@
+"""Tests for MACS benchmark."""
diff --git a/tests/test_benchmarks/test_macs/test_data_loader.py b/tests/test_benchmarks/test_macs/test_data_loader.py
new file mode 100644
index 00000000..38ffe337
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_data_loader.py
@@ -0,0 +1,764 @@
+"""Unit tests for MACS data_loader module."""
+
+import json
+import pytest
+from pathlib import Path
+from typing import Any, Dict
+from unittest.mock import patch, MagicMock
+from tempfile import TemporaryDirectory
+from urllib.error import URLError, HTTPError
+
+from maseval.benchmark.macs.data_loader import (
+    DEFAULT_DATA_DIR,
+    VALID_DOMAINS,
+    URLS,
+    download_file,
+    download_json,
+    download_original_data,
+    download_prompt_templates,
+    restructure_data,
+    ensure_data_exists,
+    load_tasks,
+    load_agent_config,
+    _dedupe_tools_by_name,
+    _create_tools_list,
+    _create_agents_list,
+    _create_tasks_list,
+)
+
+
+# =============================================================================
+# Sample Data Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def sample_agents_data() -> Dict[str, Any]:
+    """Sample agents.json data matching AWS format."""
+    return {
+        "agents": [
+            {
+                "agent_id": "supervisor",
+                "agent_name": "Supervisor Agent",
+                "agent_instruction": "Coordinate other agents.",
+                "reachable_agents": ["worker_1"],
+                "tools": [
+                    {"tool_name": "route_to_agent", "tool_type": "router"},
+                    {"tool_name": "common_tool", "description": "Shared tool"},
+                ],
+            },
+            {
+                "agent_id": "worker_1",
+                "agent_name": "Worker Agent",
+                "agent_instruction": "Do work.",
+                "reachable_agents": [],
+                "tools": [
+                    {"tool_name": "do_work", "tool_type": "action"},
+                    {"tool_name": "common_tool", "description": "Shared tool"},  # Duplicate
+                ],
+            },
+        ],
+        "primary_agent_id": "supervisor",
+        "human_id": "user",
+    }
+
+
+@pytest.fixture
+def sample_scenarios_data() -> Dict[str, Any]:
+    """Sample scenarios.json data matching AWS format."""
+    return {
+        "scenarios": [
+            {
+                "id": "11111111-1111-1111-1111-111111111111",
+                "input_problem": "Book a flight to New York",
+                "assertions": [
+                    {"type": "user_side", "content": "Flight booked"},
+                    {"type": "system_side", "content": "Database updated"},
+                ],
+                "category": "travel",
+                "complexity": "simple",
+            },
+            {
+                "id": "22222222-2222-2222-2222-222222222222",
+                "input_problem": "Cancel my reservation",
+                "assertions": [
+                    {"type": "user_side", "content": "Reservation cancelled"},
+                ],
+                "category": "travel",
+                "complexity": "simple",
+            },
+        ]
+    }
+
+
+@pytest.fixture
+def temp_data_dir() -> Path:
+    """Create a temporary directory for test data."""
+    with TemporaryDirectory() as tmpdir:
+        yield Path(tmpdir)
+
+
+# =============================================================================
+# Unit Tests: Helper Functions
+# =============================================================================
+
+
+class TestDedupeToolsByName:
+    """Tests for _dedupe_tools_by_name function."""
+
+    def test_empty_list(self):
+        """Empty list returns empty list."""
+        assert _dedupe_tools_by_name([]) == []
+
+    def test_no_duplicates(self):
+        """Tools without duplicates are preserved."""
+        tools = [
+            {"tool_name": "a", "param": 1},
+            {"tool_name": "b", "param": 2},
+        ]
+        result = _dedupe_tools_by_name(tools)
+        assert len(result) == 2
+        assert {t["tool_name"] for t in result} == {"a", "b"}
+
+    def test_exact_duplicates_deduped(self):
+        """Identical duplicate tools are deduplicated."""
+        tools = [
+            {"tool_name": "a", "param": 1},
+            {"tool_name": "a", "param": 1},  # Exact duplicate
+        ]
+        result = _dedupe_tools_by_name(tools)
+        assert len(result) == 1
+        assert result[0]["tool_name"] == "a"
+
+    def test_conflicting_duplicates_raise(self):
+        """Conflicting tools with same name raise ValueError."""
+        tools = [
+            {"tool_name": "a", "param": 1},
+            {"tool_name": "a", "param": 2},  # Different params!
+        ]
+        with pytest.raises(ValueError, match="Conflicting tools"):
+            _dedupe_tools_by_name(tools)
+
+    def test_tools_without_name_preserved(self):
+        """Tools without tool_name are preserved as-is."""
+        tools = [
+            {"tool_name": "a"},
+            {"no_name": "here"},
+            {},
+        ]
+        result = _dedupe_tools_by_name(tools)
+        assert len(result) == 3
+
+
+class TestCreateToolsList:
+    """Tests for _create_tools_list function."""
+
+    def test_dict_with_agents_key(self, sample_agents_data):
+        """Extract tools from dict with 'agents' key."""
+        tools = _create_tools_list(sample_agents_data)
+        tool_names = {t["tool_name"] for t in tools}
+        assert "route_to_agent" in tool_names
+        assert "do_work" in tool_names
+        assert "common_tool" in tool_names
+        # Duplicates should be deduped
+        assert sum(1 for t in tools if t.get("tool_name") == "common_tool") == 1
+
+    def test_list_of_agents(self, sample_agents_data):
+        """Extract tools from list of agents directly."""
+        tools = _create_tools_list(sample_agents_data["agents"])
+        assert len(tools) == 3
+
+    def test_empty_input(self):
+        """Empty or invalid input returns empty list."""
+        assert _create_tools_list({}) == []
+        assert _create_tools_list([]) == []
+        assert _create_tools_list(None) == []
+
+
+class TestCreateAgentsList:
+    """Tests for _create_agents_list function."""
+
+    def test_basic_conversion(self, sample_agents_data):
+        """Converts agents and replaces tool dicts with names."""
+        result = _create_agents_list(sample_agents_data)
+
+        assert "agents" in result
+        assert result["primary_agent_id"] == "supervisor"
+        assert result["human_id"] == "user"
+
+        agents = result["agents"]
+        assert len(agents) == 2
+
+        supervisor = next(a for a in agents if a["agent_id"] == "supervisor")
+        assert supervisor["agent_name"] == "Supervisor Agent"
+        assert supervisor["tools"] == ["route_to_agent", "common_tool"]
+        assert "agent_instruction" in supervisor
+
+    def test_empty_input(self):
+        """Empty input returns empty dict."""
+        assert _create_agents_list({}) == {}
+        assert _create_agents_list([]) == {}
+
+
+class TestCreateTasksList:
+    """Tests for _create_tasks_list function."""
+
+    def test_basic_conversion(self, sample_scenarios_data, sample_agents_data):
+        """Converts scenarios to task format."""
+        tools = _create_tools_list(sample_agents_data)
+        tasks = _create_tasks_list(sample_scenarios_data, tools)
+
+        assert len(tasks) == 2
+
+        task1 = tasks[0]
+        assert task1["id"] == "11111111-1111-1111-1111-111111111111"
+        assert task1["query"] == "Book a flight to New York"
+        assert "tools" in task1["environment_data"]
+        assert "assertions" in task1["evaluation_data"]
+        assert task1["metadata"]["category"] == "travel"
+
+    def test_list_of_scenarios(self, sample_scenarios_data, sample_agents_data):
+        """Also works with list of scenarios directly."""
+        tools = _create_tools_list(sample_agents_data)
+        tasks = _create_tasks_list(sample_scenarios_data["scenarios"], tools)
+        assert len(tasks) == 2
+
+    def test_empty_input(self):
+        """Empty input returns empty list."""
+        assert _create_tasks_list({}, []) == []
+        assert _create_tasks_list([], []) == []
+
+
+# =============================================================================
+# Unit Tests: Download Functions (with mocking)
+# =============================================================================
+
+
+class TestDownloadFunctions:
+    """Tests for download functions using mocks."""
+
+    def test_download_file_success(self):
+        """download_file returns text content."""
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = b'{"key": "value"}'
+        mock_resp.__enter__ = MagicMock(return_value=mock_resp)
+        mock_resp.__exit__ = MagicMock(return_value=False)
+
+        with patch("maseval.benchmark.macs.data_loader.urlopen", return_value=mock_resp):
+            result = download_file("http://example.com/test.json")
+            assert result == '{"key": "value"}'
+
+    def test_download_json_success(self):
+        """download_json returns parsed JSON."""
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = b'{"key": "value"}'
+        mock_resp.__enter__ = MagicMock(return_value=mock_resp)
+        mock_resp.__exit__ = MagicMock(return_value=False)
+
+        with patch("maseval.benchmark.macs.data_loader.urlopen", return_value=mock_resp):
+            result = download_json("http://example.com/test.json")
+            assert result == {"key": "value"}
+
+    def test_download_json_invalid_json(self):
+        """download_json raises on invalid JSON."""
+        mock_resp = MagicMock()
+        mock_resp.read.return_value = b"not valid json"
+        mock_resp.__enter__ = MagicMock(return_value=mock_resp)
+        mock_resp.__exit__ = MagicMock(return_value=False)
+
+        with patch("maseval.benchmark.macs.data_loader.urlopen", return_value=mock_resp):
+            with pytest.raises(RuntimeError, match="Failed to decode JSON"):
+                download_json("http://example.com/test.json")
+
+
+class TestDownloadOriginalData:
+    """Tests for download_original_data function."""
+
+    def test_downloads_all_domains(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Downloads data for all domains."""
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            result = download_original_data(data_dir=temp_data_dir, verbose=0)
+
+        assert result == temp_data_dir / "original"
+        for domain in VALID_DOMAINS:
+            assert (result / domain / "agents.json").exists()
+            assert (result / domain / "scenarios.json").exists()
+
+    def test_downloads_single_domain(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Downloads data for a single domain."""
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            result = download_original_data(data_dir=temp_data_dir, domain="travel", verbose=0)
+
+        assert (result / "travel" / "agents.json").exists()
+        # Other domains should NOT exist
+        assert not (result / "mortgage").exists()
+        assert not (result / "software").exists()
+
+    def test_invalid_domain_raises(self, temp_data_dir):
+        """Invalid domain raises ValueError."""
+        with pytest.raises(ValueError, match="Unknown domain"):
+            download_original_data(data_dir=temp_data_dir, domain="invalid", verbose=0)
+
+
+# =============================================================================
+# Unit Tests: Restructure Functions
+# =============================================================================
+
+
+class TestRestructureData:
+    """Tests for restructure_data function."""
+
+    def test_restructures_all_domains(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Restructures data for all domains."""
+        # Set up original data
+        for domain in VALID_DOMAINS:
+            orig_dir = temp_data_dir / "original" / domain
+            orig_dir.mkdir(parents=True)
+            with (orig_dir / "agents.json").open("w") as f:
+                json.dump(sample_agents_data, f)
+            with (orig_dir / "scenarios.json").open("w") as f:
+                json.dump(sample_scenarios_data, f)
+
+        result = restructure_data(data_dir=temp_data_dir, verbose=0)
+
+        assert result == temp_data_dir / "restructured"
+        for domain in VALID_DOMAINS:
+            assert (result / domain / "agents.json").exists()
+            assert (result / domain / "tasks.json").exists()
+
+            # Verify content
+            with (result / domain / "tasks.json").open() as f:
+                tasks = json.load(f)
+            assert len(tasks) == 2
+            assert tasks[0]["query"] == "Book a flight to New York"
+
+            with (result / domain / "agents.json").open() as f:
+                agents = json.load(f)
+            assert len(agents["agents"]) == 2
+            assert agents["primary_agent_id"] == "supervisor"
+
+    def test_missing_original_raises(self, temp_data_dir):
+        """Missing original data raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError, match="Original data not found"):
+            restructure_data(data_dir=temp_data_dir, domain="travel", verbose=0)
+
+
+class TestEnsureDataExists:
+    """Tests for ensure_data_exists function."""
+
+    def test_skips_download_if_exists(self, temp_data_dir):
+        """Skips download if restructured data already exists."""
+        # Create fake restructured data
+        for domain in VALID_DOMAINS:
+            rest_dir = temp_data_dir / "restructured" / domain
+            rest_dir.mkdir(parents=True)
+            (rest_dir / "tasks.json").write_text("[]")
+            (rest_dir / "agents.json").write_text("{}")
+
+        # Should not call download
+        with patch("maseval.benchmark.macs.data_loader.download_original_data") as mock_download:
+            result = ensure_data_exists(data_dir=temp_data_dir, verbose=0)
+            mock_download.assert_not_called()
+
+        assert result == temp_data_dir / "restructured"
+
+    def test_downloads_if_missing(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Downloads if restructured data missing."""
+        # No data exists
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        def mock_download_file(url: str, timeout=15):
+            # Return minimal Python file for prompt templates
+            return "USER_GSR_PROMPT = 'test'\nSYSTEM_GSR_PROMPT = 'test'\nISSUES_PROMPT = 'test'"
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            with patch("maseval.benchmark.macs.data_loader.download_file", side_effect=mock_download_file):
+                result = ensure_data_exists(data_dir=temp_data_dir, verbose=0)
+
+        assert result == temp_data_dir / "restructured"
+        for domain in VALID_DOMAINS:
+            assert (result / domain / "tasks.json").exists()
+
+    def test_force_download_redownloads(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """force_download=True redownloads even if data exists."""
+        # Create fake restructured data
+        for domain in VALID_DOMAINS:
+            rest_dir = temp_data_dir / "restructured" / domain
+            rest_dir.mkdir(parents=True)
+            (rest_dir / "tasks.json").write_text("[]")
+            (rest_dir / "agents.json").write_text("{}")
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        def mock_download_file(url: str, timeout=15):
+            return "USER_GSR_PROMPT = 'test'\nSYSTEM_GSR_PROMPT = 'test'\nISSUES_PROMPT = 'test'"
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            with patch("maseval.benchmark.macs.data_loader.download_file", side_effect=mock_download_file):
+                result = ensure_data_exists(data_dir=temp_data_dir, force_download=True, verbose=0)
+
+        # Should have new data with 2 tasks
+        with (result / "travel" / "tasks.json").open() as f:
+            tasks = json.load(f)
+        assert len(tasks) == 2
+
+
+# =============================================================================
+# Unit Tests: Load Functions
+# =============================================================================
+
+
+class TestLoadTasks:
+    """Tests for load_tasks function."""
+
+    def test_loads_tasks(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Loads tasks from restructured data."""
+        # Create restructured data
+        tools = _create_tools_list(sample_agents_data)
+        tasks_list = _create_tasks_list(sample_scenarios_data, tools)
+
+        rest_dir = temp_data_dir / "restructured" / "travel"
+        rest_dir.mkdir(parents=True)
+        with (rest_dir / "tasks.json").open("w") as f:
+            json.dump(tasks_list, f)
+
+        collection = load_tasks("travel", data_dir=temp_data_dir)
+
+        assert len(collection) == 2
+        task = collection[0]
+        assert task.query == "Book a flight to New York"
+        assert "tools" in task.environment_data
+        assert "assertions" in task.evaluation_data
+
+    def test_limit_parameter(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Limit parameter restricts number of tasks."""
+        tools = _create_tools_list(sample_agents_data)
+        tasks_list = _create_tasks_list(sample_scenarios_data, tools)
+
+        rest_dir = temp_data_dir / "restructured" / "travel"
+        rest_dir.mkdir(parents=True)
+        with (rest_dir / "tasks.json").open("w") as f:
+            json.dump(tasks_list, f)
+
+        collection = load_tasks("travel", data_dir=temp_data_dir, limit=1)
+        assert len(collection) == 1
+
+    def test_invalid_domain_raises(self, temp_data_dir):
+        """Invalid domain raises ValueError."""
+        with pytest.raises(ValueError, match="Invalid domain"):
+            load_tasks("invalid", data_dir=temp_data_dir)
+
+    def test_missing_file_raises(self, temp_data_dir):
+        """Missing tasks.json raises FileNotFoundError."""
+        with pytest.raises(FileNotFoundError):
+            load_tasks("travel", data_dir=temp_data_dir)
+
+
+class TestLoadAgentConfig:
+    """Tests for load_agent_config function."""
+
+    def test_loads_config(self, temp_data_dir, sample_agents_data):
+        """Loads agent config from restructured data."""
+        agents = _create_agents_list(sample_agents_data)
+
+        rest_dir = temp_data_dir / "restructured" / "travel"
+        rest_dir.mkdir(parents=True)
+        with (rest_dir / "agents.json").open("w") as f:
+            json.dump(agents, f)
+
+        config = load_agent_config("travel", data_dir=temp_data_dir)
+
+        assert "agents" in config
+        assert config["primary_agent_id"] == "supervisor"
+        assert len(config["agents"]) == 2
+
+        supervisor = next(a for a in config["agents"] if a["agent_id"] == "supervisor")
+        assert supervisor["tools"] == ["route_to_agent", "common_tool"]
+
+    def test_invalid_domain_raises(self, temp_data_dir):
+        """Invalid domain raises ValueError."""
+        with pytest.raises(ValueError, match="Invalid domain"):
+            load_agent_config("invalid", data_dir=temp_data_dir)
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestDataLoaderIntegration:
+    """Integration tests for the full data loading pipeline."""
+
+    def test_full_pipeline(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """Test the complete download → restructure → load pipeline."""
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        def mock_download_file(url: str, timeout=15):
+            return "USER_GSR_PROMPT = 'test'\nSYSTEM_GSR_PROMPT = 'test'\nISSUES_PROMPT = 'test'"
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            with patch("maseval.benchmark.macs.data_loader.download_file", side_effect=mock_download_file):
+                # Step 1: Download
+                download_original_data(data_dir=temp_data_dir, verbose=0)
+                download_prompt_templates(data_dir=temp_data_dir, verbose=0)
+
+                # Verify original data exists
+                assert (temp_data_dir / "original" / "travel" / "agents.json").exists()
+                assert (temp_data_dir / "original" / "travel" / "scenarios.json").exists()
+
+                # Step 2: Restructure
+                restructure_data(data_dir=temp_data_dir, verbose=0)
+
+                # Verify restructured data exists
+                assert (temp_data_dir / "restructured" / "travel" / "tasks.json").exists()
+                assert (temp_data_dir / "restructured" / "travel" / "agents.json").exists()
+
+                # Step 3: Load
+                tasks = load_tasks("travel", data_dir=temp_data_dir)
+                config = load_agent_config("travel", data_dir=temp_data_dir)
+
+                # Verify loaded data
+                assert len(tasks) == 2
+                assert tasks[0].query == "Book a flight to New York"
+                assert len(config["agents"]) == 2
+
+    def test_urls_structure(self):
+        """Verify URLS constant has expected structure."""
+        assert "data" in URLS
+        assert "evaluation" in URLS
+
+        for domain in VALID_DOMAINS:
+            assert domain in URLS["data"]
+            assert "agents" in URLS["data"][domain]
+            assert "scenarios" in URLS["data"][domain]
+
+        assert "prompt_templates" in URLS["evaluation"]
+
+
+# =============================================================================
+# Connection Error Handling Tests
+# =============================================================================
+
+
+class TestConnectionErrorHandling:
+    """Tests for graceful handling of network errors."""
+
+    def test_download_file_http_error(self):
+        """download_file raises RuntimeError on HTTP error."""
+        with patch("maseval.benchmark.macs.data_loader.urlopen") as mock_urlopen:
+            mock_urlopen.side_effect = HTTPError(
+                url="http://example.com/test.json",
+                code=404,
+                msg="Not Found",
+                hdrs=None,
+                fp=None,
+            )
+            with pytest.raises(RuntimeError, match="Failed to download"):
+                download_file("http://example.com/test.json")
+
+    def test_download_file_url_error(self):
+        """download_file raises RuntimeError on URL/network error."""
+        with patch("maseval.benchmark.macs.data_loader.urlopen") as mock_urlopen:
+            mock_urlopen.side_effect = URLError("Connection refused")
+            with pytest.raises(RuntimeError, match="Failed to download"):
+                download_file("http://example.com/test.json")
+
+    def test_download_file_timeout(self):
+        """download_file raises RuntimeError on timeout."""
+        with patch("maseval.benchmark.macs.data_loader.urlopen") as mock_urlopen:
+            mock_urlopen.side_effect = URLError("timed out")
+            with pytest.raises(RuntimeError, match="Failed to download"):
+                download_file("http://example.com/test.json")
+
+    def test_download_original_data_network_error(self, temp_data_dir):
+        """download_original_data propagates network errors gracefully."""
+        with patch("maseval.benchmark.macs.data_loader.download_json") as mock_download:
+            mock_download.side_effect = RuntimeError("Network unreachable")
+            with pytest.raises(RuntimeError, match="Network unreachable"):
+                download_original_data(data_dir=temp_data_dir, domain="travel", verbose=0)
+
+    def test_ensure_data_exists_network_error(self, temp_data_dir):
+        """ensure_data_exists propagates network errors when data missing."""
+        with patch("maseval.benchmark.macs.data_loader.download_json") as mock_download:
+            mock_download.side_effect = RuntimeError("Connection failed")
+            with pytest.raises(RuntimeError, match="Connection failed"):
+                ensure_data_exists(data_dir=temp_data_dir, verbose=0)
+
+
+# =============================================================================
+# Data Location Tests
+# =============================================================================
+
+
+class TestDataLocation:
+    """Tests for custom and default data location handling."""
+
+    def test_default_data_dir_is_module_relative(self):
+        """DEFAULT_DATA_DIR points to module's data directory."""
+        assert DEFAULT_DATA_DIR.name == "data"
+        assert DEFAULT_DATA_DIR.parent.name == "macs"
+        assert DEFAULT_DATA_DIR.parent.parent.name == "benchmark"
+
+    def test_load_tasks_uses_default_location(self, sample_agents_data, sample_scenarios_data):
+        """load_tasks uses DEFAULT_DATA_DIR when data_dir not specified."""
+        # Create mock data in default location
+        tools = _create_tools_list(sample_agents_data)
+        tasks_list = _create_tasks_list(sample_scenarios_data, tools)
+
+        rest_dir = DEFAULT_DATA_DIR / "restructured" / "travel"
+        rest_dir.mkdir(parents=True, exist_ok=True)
+        tasks_path = rest_dir / "tasks.json"
+
+        # Write test data
+        with tasks_path.open("w") as f:
+            json.dump(tasks_list, f)
+
+        try:
+            # Load without specifying data_dir
+            collection = load_tasks("travel")
+            assert len(collection) == 2
+        finally:
+            # Cleanup
+            tasks_path.unlink(missing_ok=True)
+            # Don't remove dirs as other tests may use them
+
+    def test_load_tasks_custom_location(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """load_tasks correctly uses custom data_dir."""
+        tools = _create_tools_list(sample_agents_data)
+        tasks_list = _create_tasks_list(sample_scenarios_data, tools)
+
+        rest_dir = temp_data_dir / "restructured" / "travel"
+        rest_dir.mkdir(parents=True)
+        with (rest_dir / "tasks.json").open("w") as f:
+            json.dump(tasks_list, f)
+
+        # Load from custom location
+        collection = load_tasks("travel", data_dir=temp_data_dir)
+        assert len(collection) == 2
+
+        # Verify it didn't touch default location
+        default_tasks = DEFAULT_DATA_DIR / "restructured" / "nonexistent_test_domain" / "tasks.json"
+        assert not default_tasks.exists()
+
+    def test_load_agent_config_custom_location(self, temp_data_dir, sample_agents_data):
+        """load_agent_config correctly uses custom data_dir."""
+        agents = _create_agents_list(sample_agents_data)
+
+        rest_dir = temp_data_dir / "restructured" / "mortgage"
+        rest_dir.mkdir(parents=True)
+        with (rest_dir / "agents.json").open("w") as f:
+            json.dump(agents, f)
+
+        config = load_agent_config("mortgage", data_dir=temp_data_dir)
+        assert config["primary_agent_id"] == "supervisor"
+
+    def test_download_original_data_custom_location(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """download_original_data saves to custom data_dir."""
+
+        def mock_download_json(url: str):
+            if "agents" in url:
+                return sample_agents_data
+            if "scenarios" in url:
+                return sample_scenarios_data
+            raise ValueError(f"Unexpected URL: {url}")
+
+        with patch("maseval.benchmark.macs.data_loader.download_json", side_effect=mock_download_json):
+            result = download_original_data(data_dir=temp_data_dir, domain="software", verbose=0)
+
+        # Verify data is in custom location
+        assert result == temp_data_dir / "original"
+        assert (temp_data_dir / "original" / "software" / "agents.json").exists()
+
+        # Verify default location is untouched
+        assert not (DEFAULT_DATA_DIR / "original" / "software_test_marker").exists()
+
+    def test_restructure_uses_custom_location(self, temp_data_dir, sample_agents_data, sample_scenarios_data):
+        """restructure_data reads from and writes to custom data_dir."""
+        # Setup original data in custom location
+        orig_dir = temp_data_dir / "original" / "travel"
+        orig_dir.mkdir(parents=True)
+        with (orig_dir / "agents.json").open("w") as f:
+            json.dump(sample_agents_data, f)
+        with (orig_dir / "scenarios.json").open("w") as f:
+            json.dump(sample_scenarios_data, f)
+
+        # Restructure
+        result = restructure_data(data_dir=temp_data_dir, domain="travel", verbose=0)
+
+        # Verify output is in custom location
+        assert result == temp_data_dir / "restructured"
+        assert (temp_data_dir / "restructured" / "travel" / "tasks.json").exists()
+        assert (temp_data_dir / "restructured" / "travel" / "agents.json").exists()
+
+
+# =============================================================================
+# Sequential ID Generation Tests
+# =============================================================================
+
+
+class TestSequentialIdGeneration:
+    """Tests for task ID generation."""
+
+    def test_generates_sequential_ids_when_missing(self):
+        """_create_tasks_list generates sequential IDs when not present."""
+        scenarios = {
+            "scenarios": [
+                {"input_problem": "Task 1", "assertions": []},
+                {"input_problem": "Task 2", "assertions": []},
+                {"input_problem": "Task 3", "assertions": []},
+            ]
+        }
+        tasks = _create_tasks_list(scenarios, [])
+
+        assert tasks[0]["id"] == "task-000001"
+        assert tasks[1]["id"] == "task-000002"
+        assert tasks[2]["id"] == "task-000003"
+
+    def test_preserves_existing_ids(self):
+        """_create_tasks_list preserves existing IDs."""
+        scenarios = {
+            "scenarios": [
+                {"id": "custom-id-1", "input_problem": "Task 1", "assertions": []},
+                {"uuid": "uuid-based-2", "input_problem": "Task 2", "assertions": []},
+                {"input_problem": "Task 3", "assertions": []},  # No ID
+            ]
+        }
+        tasks = _create_tasks_list(scenarios, [])
+
+        assert tasks[0]["id"] == "custom-id-1"
+        assert tasks[1]["id"] == "uuid-based-2"
+        assert tasks[2]["id"] == "task-000003"  # Generated

From 389119a3388ee910ea77903471e565e28311588f Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 11:02:24 +0000
Subject: [PATCH 02/34] intiial implementation of benchmark.

---
 examples/macs_benchmark.py                    | 399 +++++++++++
 maseval/benchmark/macs/data_loader.py         |  12 +-
 maseval/benchmark/macs/macs.py                | 641 ++++++++++++++++++
 .../test_macs/test_data_loader.py             |  42 +-
 4 files changed, 1070 insertions(+), 24 deletions(-)
 create mode 100644 examples/macs_benchmark.py
 create mode 100644 maseval/benchmark/macs/macs.py

diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark.py
new file mode 100644
index 00000000..83087830
--- /dev/null
+++ b/examples/macs_benchmark.py
@@ -0,0 +1,399 @@
+"""MACS Benchmark Example.
+
+This example demonstrates running the AWS Multi-Agent Collaboration Scenarios (MACS)
+benchmark with either smolagents or langgraph frameworks.
+
+The MACS benchmark evaluates multi-agent collaboration in three enterprise domains:
+- Travel: 10 agents, 52 tools (flight booking, hotels, weather, etc.)
+- Mortgage: 6 agents, 35 tools (loan processing, document handling, etc.)
+- Software: 8 agents, 4 tools (code review, issue tracking, etc.)
+
+Reference:
+    Paper: https://arxiv.org/abs/2412.05449
+    Data: https://github.com/aws-samples/multiagent-collab-scenario-benchmark
+
+Usage:
+    # Run with smolagents
+    python examples/macs_benchmark.py --framework smolagents --domain travel --limit 5
+
+    # Run with langgraph
+    python examples/macs_benchmark.py --framework langgraph --domain travel --limit 5
+"""
+
+import argparse
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Tuple
+
+from google.genai import Client as GoogleGenAIClient
+
+from maseval import AgentAdapter, Environment, Task, User
+from maseval.core.callbacks.result_logger import FileResultLogger
+from maseval.core.config import ConfigurableMixin
+from maseval.core.tracing import TraceableMixin
+from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
+
+from maseval.benchmark.macs import (
+    MACSBenchmark,
+    MACSGenericTool,
+    compute_benchmark_metrics,
+    ensure_data_exists,
+    load_agent_config,
+    load_tasks,
+)
+
+
+# =============================================================================
+# Model Setup
+# =============================================================================
+
+
+def create_model(model_id: str = "gemini-2.5-flash") -> GoogleGenAIModelAdapter:
+    """Create a Google GenAI model adapter.
+
+    Args:
+        model_id: Model identifier (default: gemini-2.5-flash)
+
+    Returns:
+        Configured GoogleGenAIModelAdapter
+    """
+    api_key = os.getenv("GOOGLE_API_KEY")
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY environment variable is required")
+
+    client = GoogleGenAIClient(api_key=api_key)
+    return GoogleGenAIModelAdapter(client, model_id=model_id)
+
+
+# =============================================================================
+# Smolagents Implementation
+# =============================================================================
+
+
+def _create_smolagents_benchmark():
+    """Create smolagents-specific benchmark class."""
+    from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel
+    from maseval.interface.agents.smolagents import SmolAgentAdapter
+
+    class SmolagentsToolWrapper(SmolagentsTool, ConfigurableMixin, TraceableMixin):
+        """Smolagents wrapper for MACSGenericTool."""
+
+        skip_forward_signature_validation = True
+
+        def __init__(self, generic_tool: MACSGenericTool):
+            self.generic_tool = generic_tool
+            self.name = generic_tool.name
+            self.description = generic_tool.description
+            self.inputs = generic_tool.inputs
+            self.output_type = generic_tool.output_type
+            super().__init__()
+
+        def forward(self, **kwargs) -> str:
+            return self.generic_tool(**kwargs)
+
+        def gather_traces(self) -> Dict[str, Any]:
+            return self.generic_tool.gather_traces()
+
+        def gather_config(self) -> Dict[str, Any]:
+            return self.generic_tool.gather_config()
+
+    class SmolagentsMACSBenchmark(MACSBenchmark):
+        """MACS Benchmark implementation for smolagents."""
+
+        def setup_agents(
+            self,
+            agent_data: Dict[str, Any],
+            environment: Environment,
+            task: Task,
+            user: Optional[User],
+        ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+            """Create smolagents agents."""
+            # Get tools from environment
+            generic_tools = environment.create_tools()
+            wrapped_tools = [SmolagentsToolWrapper(t) for t in generic_tools]
+
+            # Create smolagents model
+            # Use OpenAI-compatible API with Gemini via AI Studio
+            smol_model = OpenAIServerModel(
+                model_id="gemini-2.5-flash",
+                api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
+                api_key=os.getenv("GOOGLE_API_KEY"),
+            )
+
+            # Get primary agent config
+            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+            agents_config = agent_data.get("agents", [])
+            primary_config = next(
+                (a for a in agents_config if a.get("agent_id") == primary_agent_id), agents_config[0] if agents_config else {}
+            )
+
+            # Create agent
+            agent = ToolCallingAgent(
+                tools=list(wrapped_tools),  # type: ignore[arg-type]
+                model=smol_model,
+                max_steps=10,
+                name=primary_config.get("agent_name", "MACS Agent"),
+                description=primary_config.get("agent_instruction", "Multi-agent collaboration agent"),
+            )
+
+            # Wrap with adapter
+            adapter = SmolAgentAdapter(agent, name=primary_agent_id)
+
+            return [adapter], {primary_agent_id: adapter}
+
+    return SmolagentsMACSBenchmark
+
+
+# =============================================================================
+# LangGraph Implementation
+# =============================================================================
+
+
+def _create_langgraph_benchmark():
+    """Create langgraph-specific benchmark class."""
+    from langchain_core.tools import StructuredTool
+    from langchain_google_genai import ChatGoogleGenerativeAI
+    from langgraph.graph import StateGraph
+    from langgraph.graph.message import add_messages
+    from langgraph.prebuilt import ToolNode, tools_condition
+    from typing_extensions import TypedDict, Annotated
+    from maseval.interface.agents.langgraph import LangGraphAgentAdapter
+
+    class LanggraphToolWrapper(ConfigurableMixin, TraceableMixin):
+        """LangGraph wrapper for MACSGenericTool."""
+
+        def __init__(self, generic_tool: MACSGenericTool):
+            self.generic_tool = generic_tool
+            self.name = generic_tool.name
+            self.tool = StructuredTool.from_function(
+                func=generic_tool.__call__,
+                name=generic_tool.name,
+                description=generic_tool.description,
+            )
+
+        def __call__(self, *args, **kwargs):
+            return self.tool(*args, **kwargs)
+
+        def gather_traces(self) -> Dict[str, Any]:
+            return self.generic_tool.gather_traces()
+
+        def gather_config(self) -> Dict[str, Any]:
+            return self.generic_tool.gather_config()
+
+    class LanggraphMACSBenchmark(MACSBenchmark):
+        """MACS Benchmark implementation for langgraph."""
+
+        def setup_agents(
+            self,
+            agent_data: Dict[str, Any],
+            environment: Environment,
+            task: Task,
+            user: Optional[User],
+        ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+            """Create langgraph agents."""
+            # Get tools from environment
+            generic_tools = environment.create_tools()
+            wrapped_tools = [LanggraphToolWrapper(t) for t in generic_tools]
+            langchain_tools = [w.tool for w in wrapped_tools]
+
+            # Create LangChain model with tools
+            llm = ChatGoogleGenerativeAI(
+                model="gemini-2.5-flash",
+                google_api_key=os.getenv("GOOGLE_API_KEY"),
+            )
+            llm_with_tools = llm.bind_tools(langchain_tools)
+
+            # Define state
+            class State(TypedDict):
+                messages: Annotated[list, add_messages]
+
+            # Build graph
+            def chatbot(state: State):
+                return {"messages": [llm_with_tools.invoke(state["messages"])]}
+
+            graph = StateGraph(State)
+            graph.add_node("chatbot", chatbot)
+            graph.add_node("tools", ToolNode(tools=langchain_tools))
+
+            graph.add_conditional_edges("chatbot", tools_condition)
+            graph.add_edge("tools", "chatbot")
+            graph.set_entry_point("chatbot")
+
+            compiled_graph = graph.compile()
+
+            # Get primary agent config
+            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+
+            # Wrap with adapter
+            adapter = LangGraphAgentAdapter(compiled_graph, name=primary_agent_id)
+
+            return [adapter], {primary_agent_id: adapter}
+
+    return LanggraphMACSBenchmark
+
+
+# =============================================================================
+# Main Entry Point
+# =============================================================================
+
+
+def get_benchmark_class(framework: Literal["smolagents", "langgraph"]) -> type:
+    """Get the benchmark class for the specified framework.
+
+    Args:
+        framework: Either "smolagents" or "langgraph"
+
+    Returns:
+        The appropriate MACSBenchmark subclass
+    """
+    if framework == "smolagents":
+        return _create_smolagents_benchmark()
+    elif framework == "langgraph":
+        return _create_langgraph_benchmark()
+    else:
+        raise ValueError(f"Unsupported framework: {framework}. Choose 'smolagents' or 'langgraph'.")
+
+
+def run_benchmark(
+    framework: Literal["smolagents", "langgraph"],
+    domain: Literal["travel", "mortgage", "software"],
+    limit: Optional[int] = None,
+    n_task_repeats: int = 1,
+    output_dir: Optional[Path] = None,
+) -> Dict[str, Any]:
+    """Run the MACS benchmark.
+
+    Args:
+        framework: Agent framework to use
+        domain: MACS domain (travel, mortgage, or software)
+        limit: Maximum number of tasks to run (None for all)
+        n_task_repeats: Number of times to repeat each task
+        output_dir: Directory for results (default: examples/results/)
+
+    Returns:
+        Summary metrics from the benchmark run
+    """
+    # Setup output directory
+    if output_dir is None:
+        output_dir = Path(__file__).parent / "results"
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Ensure data is downloaded
+    print("Ensuring MACS data is available...")
+    ensure_data_exists(verbose=1)
+
+    # Create model for tool simulation and evaluation
+    model = create_model("gemini-2.5-flash")
+
+    # Load data
+    print(f"Loading {domain} domain tasks...")
+    tasks = load_tasks(domain, limit=limit)
+    agent_config = load_agent_config(domain)
+    print(f"Loaded {len(tasks)} tasks")
+
+    # Setup callback for logging results
+    logger = FileResultLogger(
+        output_dir=output_dir,
+        filename_pattern=f"{domain}_{framework}_{{timestamp}}.jsonl",
+    )
+
+    # Get benchmark class and instantiate
+    BenchmarkClass = get_benchmark_class(framework)
+    benchmark = BenchmarkClass(
+        agent_data=agent_config,
+        model=model,
+        callbacks=[logger],
+        n_task_repeats=n_task_repeats,
+    )
+
+    # Run benchmark
+    print(f"\nRunning {framework} benchmark on {domain} domain...")
+    results = benchmark.run(tasks=tasks)
+
+    # Compute summary metrics
+    summary = compute_benchmark_metrics(results)
+
+    # Print summary
+    print("\n" + "=" * 50)
+    print("BENCHMARK SUMMARY")
+    print("=" * 50)
+    print(f"Framework: {framework}")
+    print(f"Domain: {domain}")
+    print(f"Total Tasks: {summary['total_tasks']}")
+    print(f"Successful Tasks (Overall GSR=1.0): {summary['successful_tasks']}")
+    print(f"Success Rate: {summary['success_rate']:.2%}")
+
+    print("\nMean Metrics:")
+    for metric, value in summary["mean_metrics"].items():
+        print(f"  {metric:<25} {value:.4f}")
+
+    print(f"\nResults saved to: {output_dir}")
+    print("=" * 50)
+
+    return summary
+
+
+def main():
+    """Parse arguments and run the benchmark."""
+    parser = argparse.ArgumentParser(
+        description="Run the MACS benchmark with smolagents or langgraph.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Run with smolagents on travel domain
+    python examples/macs_benchmark.py --framework smolagents --domain travel
+
+    # Run with langgraph on mortgage domain, limited to 5 tasks
+    python examples/macs_benchmark.py --framework langgraph --domain mortgage --limit 5
+
+    # Run with 3 repetitions per task
+    python examples/macs_benchmark.py --framework smolagents --domain software --repeats 3
+        """,
+    )
+
+    parser.add_argument(
+        "--framework",
+        type=str,
+        required=True,
+        choices=["smolagents", "langgraph"],
+        help="Agent framework to use",
+    )
+    parser.add_argument(
+        "--domain",
+        type=str,
+        required=True,
+        choices=["travel", "mortgage", "software"],
+        help="MACS domain to evaluate",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Maximum number of tasks to run (default: all)",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=1,
+        help="Number of times to repeat each task (default: 1)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Output directory for results (default: examples/results/)",
+    )
+
+    args = parser.parse_args()
+
+    run_benchmark(
+        framework=args.framework,
+        domain=args.domain,
+        limit=args.limit,
+        n_task_repeats=args.repeats,
+        output_dir=args.output_dir,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/maseval/benchmark/macs/data_loader.py b/maseval/benchmark/macs/data_loader.py
index 7acfc416..0e3bdabe 100644
--- a/maseval/benchmark/macs/data_loader.py
+++ b/maseval/benchmark/macs/data_loader.py
@@ -14,7 +14,6 @@
 from typing import Any, Dict, List, Optional
 from urllib.error import HTTPError, URLError
 from urllib.request import urlopen
-from uuid import UUID
 
 from maseval import Task, TaskCollection
 
@@ -283,7 +282,7 @@ def _process_agent(agent: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def _create_tasks_list(scenarios_obj: object, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Convert scenarios to task format."""
+    """Convert scenarios to task format with sequential IDs."""
     tasks: List[Dict[str, Any]] = []
 
     if isinstance(scenarios_obj, dict) and isinstance(scenarios_obj.get("scenarios"), list):
@@ -298,15 +297,15 @@ def _create_tasks_list(scenarios_obj: object, tools: List[Dict[str, Any]]) -> Li
             continue
 
         query = scen.get("input_problem") or scen.get("query") or ""
-        # Use existing ID if present, otherwise generate sequential task ID
-        tid = scen.get("id") or scen.get("uuid") or f"task-{idx:06d}"
+        # Always generate sequential task ID
+        tid = f"task-{idx:06d}"
 
         task = {
             "id": tid,
             "query": query,
             "environment_data": {"tools": tools},
             "evaluation_data": {"assertions": scen.get("assertions", [])},
-            "metadata": {k: v for k, v in scen.items() if k not in ("input_problem", "query", "id", "uuid", "assertions")},
+            "metadata": {k: v for k, v in scen.items() if k not in ("input_problem", "query", "assertions")},
         }
         tasks.append(task)
 
@@ -459,8 +458,9 @@ def load_tasks(
             "evaluation_data": t.get("evaluation_data", {}),
             "metadata": t.get("metadata", {}),
         }
+        # Store task ID in metadata (format: task-NNNNNN)
         if t.get("id"):
-            task_kwargs["id"] = UUID(t["id"])
+            task_kwargs["metadata"]["task_id"] = t["id"]
         tasks.append(Task(**task_kwargs))
 
     return TaskCollection(tasks)
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
new file mode 100644
index 00000000..4a5eefdd
--- /dev/null
+++ b/maseval/benchmark/macs/macs.py
@@ -0,0 +1,641 @@
+"""MACS Benchmark - Multi-Agent Collaboration Scenarios.
+
+Framework-agnostic implementation of the AWS MACS benchmark for evaluating
+multi-agent collaboration in enterprise applications.
+
+Reference: https://arxiv.org/abs/2412.05449
+Dataset: https://github.com/aws-samples/multiagent-collab-scenario-benchmark
+
+Usage:
+    from maseval.benchmark.macs import (
+        MACSBenchmark, MACSEnvironment, MACSEvaluator, MACSGenericTool,
+        load_tasks, load_agent_config,
+    )
+
+    # Load data
+    tasks = load_tasks("travel", limit=5)
+    agent_config = load_agent_config("travel")
+
+    # Create your framework-specific benchmark subclass
+    class MyMACSBenchmark(MACSBenchmark):
+        def setup_agents(self, agent_data, environment, task, user):
+            # Your framework-specific agent creation
+            ...
+
+    # Run
+    benchmark = MyMACSBenchmark(agent_data=agent_config, model=my_model)
+    results = benchmark.run(tasks)
+"""
+
+import json
+from abc import abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
+
+from maseval import (
+    AgentAdapter,
+    Benchmark,
+    Environment,
+    Evaluator,
+    MessageHistory,
+    ModelAdapter,
+    Task,
+    ToolInvocationHistory,
+    ToolLLMSimulator,
+    User,
+)
+from maseval.core.config import ConfigurableMixin
+from maseval.core.tracing import TraceableMixin
+
+
+# =============================================================================
+# Tool
+# =============================================================================
+
+
+class MACSGenericTool(TraceableMixin, ConfigurableMixin):
+    """Framework-agnostic tool with LLM-based response simulation.
+
+    This tool does not inherit from any framework-specific Tool class.
+    Users wrap it for their framework using composition. Example for smolagents:
+
+        class MySmolagentsTool(smolagents.Tool):
+            skip_forward_signature_validation = True
+
+            def __init__(self, generic_tool: MACSGenericTool):
+                self.generic_tool = generic_tool
+                self.name = generic_tool.name
+                self.description = generic_tool.description
+                self.inputs = generic_tool.inputs
+                self.output_type = "string"
+                super().__init__()
+
+            def forward(self, **kwargs) -> str:
+                return self.generic_tool(**kwargs)
+    """
+
+    def __init__(self, spec: Dict[str, Any], model: ModelAdapter):
+        """Initialize tool from specification.
+
+        Args:
+            spec: Tool specification with 'name', 'description', 'input_schema'
+            model: ModelAdapter for LLM-based response simulation
+        """
+        super().__init__()
+        self.name = spec["name"]
+        self.description = spec.get("description", "")
+        self.input_schema = spec.get("input_schema", {})
+        self.output_type = "string"
+        self.history = ToolInvocationHistory()
+
+        # Convert schema to inputs format
+        self.inputs = self._schema_to_inputs(self.input_schema)
+
+        # Create simulator
+        self.simulator = ToolLLMSimulator(
+            model=model,
+            tool_name=self.name,
+            tool_description=self.description,
+            tool_inputs=self.inputs,
+        )
+
+    @staticmethod
+    def _schema_to_inputs(schema: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert JSON schema to inputs format."""
+        inputs = {}
+        for k, prop in schema.get("properties", {}).items():
+            dtype = prop.get("data_type") or prop.get("type", "string")
+            inputs[k] = {
+                "type": dtype if isinstance(dtype, str) else "string",
+                "description": prop.get("description", ""),
+            }
+        return inputs
+
+    def __call__(self, **kwargs) -> str:
+        """Execute the tool with simulated response."""
+        response, details = self.simulator(actual_inputs=kwargs)
+        self.history.add_invocation(
+            inputs=kwargs,
+            outputs=response,
+            status="success",
+            meta=details,
+        )
+        return response
+
+    def gather_traces(self) -> Dict[str, Any]:
+        """Gather execution traces."""
+        return {
+            **super().gather_traces(),
+            "name": self.name,
+            "invocations": self.history.to_list(),
+        }
+
+    def gather_config(self) -> Dict[str, Any]:
+        """Gather configuration."""
+        return {
+            **super().gather_config(),
+            "name": self.name,
+            "description": self.description,
+            "input_schema": self.input_schema,
+        }
+
+    def __repr__(self) -> str:
+        args = ", ".join(f"{k}: {v['type']}" for k, v in self.inputs.items())
+        return f"{self.__class__.__name__}({self.name}({args}) -> {self.output_type})"
+
+
+# =============================================================================
+# Evaluator
+# =============================================================================
+
+
+class MACSEvaluator(Evaluator):
+    """LLM-based assertion evaluator for GSR metrics.
+
+    Follows AWS paper methodology for Goal Success Rate (GSR) evaluation:
+    - user: Evaluates user-observable behaviors (conversation only)
+    - system: Evaluates internal behaviors (tool calls, agent actions)
+    """
+
+    DEFAULT_TEMPLATES_DIR = Path(__file__).parent / "prompt_templates"
+
+    def __init__(
+        self,
+        model: ModelAdapter,
+        task: Task,
+        gsr_type: Literal["user", "system"] = "user",
+        template: Optional[str] = None,
+    ):
+        """Initialize the evaluator.
+
+        Args:
+            model: ModelAdapter for LLM evaluation
+            task: Task being evaluated (contains assertions)
+            gsr_type: Either "user" or "system"
+            template: Optional custom prompt template (uses default if None)
+        """
+        # Note: base Evaluator.__init__ does nothing, so we skip calling it
+        # to avoid needing a real Environment instance
+        self.model = model
+        self.task = task
+        self.gsr_type = gsr_type
+
+        # Load template
+        if template is None:
+            template_file = "user.txt" if gsr_type == "user" else "system.txt"
+            template_path = self.DEFAULT_TEMPLATES_DIR / template_file
+            self.template = template_path.read_text()
+        else:
+            self.template = template
+
+    def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:
+        """Filter traces based on gsr_type.
+
+        For user evaluation: only user-observable messages
+        For system evaluation: full traces including tool invocations
+        """
+        if self.gsr_type == "user":
+            user_trace = traces.get("user", {})
+            return {"messages": MessageHistory(user_trace.get("history", []))}
+        else:
+            # System gets everything
+            return traces
+
+    def __call__(
+        self,
+        traces: Dict[str, Any],
+        final_answer: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Evaluate the trace against assertions.
+
+        Args:
+            traces: Filtered traces dict containing 'messages' and optionally 'tool_traces'
+            final_answer: Final answer from agents (unused in MACS evaluation)
+
+        Returns:
+            Dict with: gsr, partial_gsr, report (list of assertion judgments)
+        """
+        # Extract message history and tool traces from the traces dict
+        trace = traces.get("messages", MessageHistory())
+        tool_traces = traces.get("tool_traces")
+
+        # Parse assertions for this evaluation type
+        all_assertions = self.task.evaluation_data.get("assertions", [])
+        assertions = self._parse_assertions(all_assertions)
+
+        if not assertions:
+            return {"gsr": 1.0, "partial_gsr": 1.0, "report": []}
+
+        # Format conversation history
+        history = self._format_conversation_history(trace)
+
+        # Get scenario description
+        scenario = self.task.metadata.get("scenario", "")
+        if not scenario:
+            raise ValueError("Task metadata must include 'scenario' for GSR evaluation")
+
+        # Build prompt
+        if self.gsr_type == "user":
+            prompt = (
+                self.template.replace("{{scenario}}", scenario).replace("{{history}}", history).replace("{{assertions}}", "\n".join(assertions))
+            )
+        else:
+            invocations = self._format_tool_invocations(tool_traces or {})
+            prompt = (
+                self.template.replace("{{scenario}}", scenario)
+                .replace("{{history}}", history)
+                .replace("{{invocations}}", invocations)
+                .replace("{{assertions}}", "\n".join(assertions))
+            )
+
+        # Get LLM judgment
+        response = self.model.generate(prompt).strip()
+        response = response.strip("```").strip("json").strip()
+
+        try:
+            report = json.loads(response)
+
+            # Handle wrapped responses
+            for key in ["assertions", "results"]:
+                if isinstance(report, dict) and key in report:
+                    report = report[key]
+                    break
+
+            if isinstance(report, dict):
+                report = [report]
+
+            gsr, partial_gsr = self._compute_gsr(report)
+
+            for item in report:
+                item["assertion_type"] = self.gsr_type
+
+            return {"gsr": gsr, "partial_gsr": partial_gsr, "report": report}
+
+        except json.JSONDecodeError as e:
+            return {
+                "gsr": 0.0,
+                "partial_gsr": 0.0,
+                "report": [],
+                "error": f"JSON decode error: {e}",
+                "raw_response": response,
+            }
+
+    def _parse_assertions(self, assertions: List[str]) -> List[str]:
+        """Parse assertions and filter by type."""
+        parsed = []
+        user_prefix, system_prefix = "user:", "agent:"
+
+        for assertion in assertions:
+            assertion = assertion.strip()
+
+            if self.gsr_type == "user":
+                if assertion.lower().startswith(user_prefix):
+                    parsed.append(assertion[len(user_prefix) :].strip())
+                elif not assertion.lower().startswith(system_prefix):
+                    # No prefix means user assertion (AWS default)
+                    parsed.append(assertion)
+            else:
+                if assertion.lower().startswith(system_prefix):
+                    parsed.append(assertion[len(system_prefix) :].strip())
+
+        return parsed
+
+    def _format_conversation_history(self, trace: MessageHistory) -> str:
+        """Format conversation history for the prompt."""
+        lines = []
+        for msg in trace:
+            role = msg.get("role", "unknown")
+            content = msg.get("content", "")
+
+            if isinstance(content, list):
+                content = " ".join(item.get("text", "") if isinstance(item, dict) else str(item) for item in content)
+
+            lines.append(f"{role}: {content}")
+
+        return "\n".join(lines)
+
+    def _format_tool_invocations(self, tool_traces: Dict[str, Any]) -> str:
+        """Format tool invocations for system-side evaluation."""
+        lines = []
+
+        for tool_name, tool_data in tool_traces.items():
+            invocations = tool_data.get("invocations", [])
+            for inv in invocations:
+                lines.append(
+                    f"Tool: {tool_name}\n"
+                    f"  Inputs: {inv.get('inputs', {})}\n"
+                    f"  Outputs: {inv.get('outputs', '')}\n"
+                    f"  Status: {inv.get('status', 'Unknown')}"
+                )
+
+        return "\n".join(lines) if lines else "No tool invocations recorded"
+
+    def _compute_gsr(self, report: List[Dict[str, Any]]) -> Tuple[float, float]:
+        """Compute GSR metrics.
+
+        Returns:
+            (gsr, partial_gsr) where:
+            - gsr: 1.0 if all assertions True, else 0.0
+            - partial_gsr: Percentage of True assertions
+        """
+        if not report:
+            return 1.0, 1.0
+
+        true_count = sum(1 for item in report if str(item.get("answer", "")).lower() == "true")
+        total = len(report)
+
+        gsr = 1.0 if true_count == total else 0.0
+        partial_gsr = true_count / total if total > 0 else 1.0
+
+        return gsr, partial_gsr
+
+
+# =============================================================================
+# Environment
+# =============================================================================
+
+
+class MACSEnvironment(Environment):
+    """Unified environment for all MACS domains.
+
+    Creates MACSGenericTool instances from task's environment_data.
+    Users can override to convert tools to their framework format.
+    """
+
+    def __init__(
+        self,
+        task_data: Dict[str, Any],
+        model: ModelAdapter,
+        callbacks: Optional[List[Any]] = None,
+    ):
+        """Initialize environment.
+
+        Args:
+            task_data: Task data containing environment_data with tool specs
+            model: ModelAdapter for tool simulation
+            callbacks: Optional callbacks
+        """
+        self._model = model
+        super().__init__(task_data, callbacks)
+
+    def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Initialize state from task data."""
+        return {
+            "tool_specs": task_data.get("environment_data", {}).get("tools", []),
+        }
+
+    def create_tools(self) -> List[MACSGenericTool]:
+        """Create framework-agnostic tools from specifications."""
+        tools = []
+        seen = set()
+
+        for tool_group in self.state["tool_specs"]:
+            for action in tool_group.get("actions", []):
+                name = action.get("name")
+                if name and name not in seen:
+                    tools.append(MACSGenericTool(action, self._model))
+                    seen.add(name)
+
+        return tools
+
+    def get_tools_by_group(self, group_names: List[str]) -> List[MACSGenericTool]:
+        """Get tools belonging to specified tool groups.
+
+        Args:
+            group_names: List of tool group names (e.g., ["Weather", "BookFlight"])
+
+        Returns:
+            List of tools from those groups
+        """
+        result = []
+        for tool_group in self.state["tool_specs"]:
+            if tool_group.get("tool_name") in group_names:
+                for action in tool_group.get("actions", []):
+                    name = action.get("name")
+                    if name and name in self._tools_dict:
+                        result.append(self._tools_dict[name])
+        return result
+
+
+# =============================================================================
+# Benchmark
+# =============================================================================
+
+
+class MACSBenchmark(Benchmark):
+    """MACS Benchmark - Framework-agnostic base class.
+
+    This base class handles:
+    - Environment setup with MACSEnvironment
+    - Dual evaluator setup (user-side + system-side)
+    - GSR metric aggregation
+
+    Users must subclass and implement setup_agents() for their framework.
+    """
+
+    def __init__(
+        self,
+        agent_data: Dict[str, Any],
+        model: ModelAdapter,
+        data_dir: Optional[Path] = None,
+        callbacks: Optional[List[Any]] = None,
+        n_task_repeats: int = 1,
+        **kwargs: Any,
+    ):
+        """Initialize benchmark.
+
+        Args:
+            agent_data: Agent configuration from load_agent_config()
+            model: ModelAdapter for tool simulation and evaluation
+            data_dir: Optional custom data directory
+            callbacks: Benchmark callbacks
+            n_task_repeats: Repetitions per task
+        """
+        self._model = model
+        self._data_dir = Path(data_dir) if data_dir else (Path(__file__).parent / "data")
+        super().__init__(agent_data, callbacks, n_task_repeats, **kwargs)
+
+    def setup_environment(
+        self,
+        agent_data: Dict[str, Any],
+        task: Task,
+    ) -> MACSEnvironment:
+        """Create environment for a task."""
+        return MACSEnvironment(
+            task_data={"environment_data": task.environment_data},
+            model=self._model,
+        )
+
+    def setup_user(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+    ) -> Optional[User]:
+        """Create user simulator. Override for multi-turn evaluation."""
+        return None
+
+    @abstractmethod
+    def setup_agents(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+        user: Optional[User],
+    ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+        """Create agents for this task. Must be implemented by subclass.
+
+        Args:
+            agent_data: Agent configuration with hierarchy spec
+            environment: MACSEnvironment with tools
+            task: Current task
+            user: Optional user simulator
+
+        Returns:
+            Tuple of (ordered agent list, agent dict keyed by ID)
+        """
+        pass
+
+    def setup_evaluators(
+        self,
+        environment: Environment,
+        task: Task,
+        agents: Sequence[AgentAdapter],
+        user: Optional[User],
+    ) -> Sequence[Evaluator]:
+        """Create user-side and system-side evaluators."""
+        return [
+            MACSEvaluator(self._model, task, gsr_type="user"),
+            MACSEvaluator(self._model, task, gsr_type="system"),
+        ]
+
+    def run_agents(
+        self,
+        agents: Sequence[AgentAdapter],
+        task: Task,
+        environment: Environment,
+    ) -> Any:
+        """Execute agents and return final answer."""
+        answers = [agent.run(task.query) for agent in agents]
+        return answers[0] if len(answers) == 1 else answers
+
+    def evaluate(
+        self,
+        evaluators: Sequence[Evaluator],
+        agents: Dict[str, AgentAdapter],
+        final_answer: Any,
+        traces: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        """Evaluate using both evaluators and aggregate GSR metrics.
+
+        Returns AWS paper format:
+        - user_gsr, system_gsr, overall_gsr, supervisor_gsr
+        - user_partial_gsr, system_partial_gsr, overall_partial_gsr
+        - report: Combined assertion judgments
+        """
+        # Get agent traces - primary agent's messages
+        primary_agent_id = list(agents.keys())[0]
+        agent_trace = traces.get("agents", {}).get(primary_agent_id, {})
+        all_messages = MessageHistory(agent_trace.get("messages", []))
+
+        # For user-side evaluation: filter to user-observable messages only
+        # (user queries and assistant responses - not tool calls)
+        user_messages = MessageHistory([msg for msg in all_messages if msg.get("role") in ("user", "assistant")])
+
+        tool_traces = traces.get("tools", {})
+
+        # Run evaluators with properly structured traces dict
+        results = []
+        for evaluator in evaluators:
+            if isinstance(evaluator, MACSEvaluator) and evaluator.gsr_type == "system":
+                eval_traces = {"messages": all_messages, "tool_traces": tool_traces}
+            else:
+                eval_traces = {"messages": user_messages}
+            result = evaluator(eval_traces, final_answer)
+            results.append(result)
+
+        # Combine results
+        user_result = results[0] if results else {"gsr": 0.0, "partial_gsr": 0.0, "report": []}
+        system_result = results[1] if len(results) > 1 else {"gsr": 0.0, "partial_gsr": 0.0, "report": []}
+
+        combined_report = user_result.get("report", []) + system_result.get("report", [])
+
+        # Compute overall metrics per AWS paper
+        overall_gsr = 1.0 if (user_result.get("gsr", 0.0) == 1.0 and system_result.get("gsr", 0.0) == 1.0) else 0.0
+
+        # Supervisor GSR: success if overall passes OR user-side passes
+        supervisor_gsr = 1.0 if (overall_gsr == 1.0 or user_result.get("gsr", 0.0) == 1.0) else 0.0
+
+        # Overall partial GSR
+        if combined_report:
+            total_true = sum(1 for item in combined_report if str(item.get("answer", "")).lower() == "true")
+            overall_partial_gsr = total_true / len(combined_report)
+        else:
+            overall_partial_gsr = 1.0
+
+        return [
+            {
+                "user_gsr": user_result.get("gsr", 0.0),
+                "user_partial_gsr": user_result.get("partial_gsr", 0.0),
+                "system_gsr": system_result.get("gsr", 0.0),
+                "system_partial_gsr": system_result.get("partial_gsr", 0.0),
+                "overall_gsr": overall_gsr,
+                "overall_partial_gsr": overall_partial_gsr,
+                "supervisor_gsr": supervisor_gsr,
+                "report": combined_report,
+            }
+        ]
+
+
+# =============================================================================
+# Utility Functions
+# =============================================================================
+
+
+def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """Compute summary metrics across all benchmark results.
+
+    Args:
+        results: List of result dicts from benchmark.run()
+
+    Returns:
+        Dict with total_tasks, successful_tasks, success_rate, mean_metrics
+    """
+    if not results:
+        return {
+            "total_tasks": 0,
+            "successful_tasks": 0,
+            "success_rate": 0.0,
+            "mean_metrics": {},
+        }
+
+    total_tasks = len(results)
+    metric_sums: Dict[str, float] = {}
+    metric_counts: Dict[str, int] = {}
+    successful_tasks = 0
+
+    for res in results:
+        evals = res.get("eval") or []
+        found_success = False
+
+        for entry in evals:
+            for k, v in entry.items():
+                if isinstance(v, (int, float)):
+                    metric_sums[k] = metric_sums.get(k, 0.0) + v
+                    metric_counts[k] = metric_counts.get(k, 0) + 1
+
+            if not found_success and entry.get("overall_gsr", 0.0) == 1.0:
+                found_success = True
+
+        if found_success:
+            successful_tasks += 1
+
+    success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0.0
+    mean_metrics = {k: metric_sums[k] / metric_counts[k] if metric_counts[k] else 0.0 for k in metric_sums}
+
+    return {
+        "total_tasks": total_tasks,
+        "successful_tasks": successful_tasks,
+        "success_rate": success_rate,
+        "mean_metrics": mean_metrics,
+    }
diff --git a/tests/test_benchmarks/test_macs/test_data_loader.py b/tests/test_benchmarks/test_macs/test_data_loader.py
index 38ffe337..effe22f6 100644
--- a/tests/test_benchmarks/test_macs/test_data_loader.py
+++ b/tests/test_benchmarks/test_macs/test_data_loader.py
@@ -65,24 +65,24 @@ def sample_agents_data() -> Dict[str, Any]:
 
 @pytest.fixture
 def sample_scenarios_data() -> Dict[str, Any]:
-    """Sample scenarios.json data matching AWS format."""
+    """Sample scenarios.json data matching AWS format (no IDs - they get generated)."""
     return {
         "scenarios": [
             {
-                "id": "11111111-1111-1111-1111-111111111111",
+                "scenario": "Bicycle tour planning",
                 "input_problem": "Book a flight to New York",
                 "assertions": [
-                    {"type": "user_side", "content": "Flight booked"},
-                    {"type": "system_side", "content": "Database updated"},
+                    "user: Flight booked",
+                    "agent: Database updated",
                 ],
                 "category": "travel",
                 "complexity": "simple",
             },
             {
-                "id": "22222222-2222-2222-2222-222222222222",
+                "scenario": "Reservation cancellation",
                 "input_problem": "Cancel my reservation",
                 "assertions": [
-                    {"type": "user_side", "content": "Reservation cancelled"},
+                    "user: Reservation cancelled",
                 ],
                 "category": "travel",
                 "complexity": "simple",
@@ -204,18 +204,19 @@ class TestCreateTasksList:
     """Tests for _create_tasks_list function."""
 
     def test_basic_conversion(self, sample_scenarios_data, sample_agents_data):
-        """Converts scenarios to task format."""
+        """Converts scenarios to task format with sequential IDs."""
         tools = _create_tools_list(sample_agents_data)
         tasks = _create_tasks_list(sample_scenarios_data, tools)
 
         assert len(tasks) == 2
 
         task1 = tasks[0]
-        assert task1["id"] == "11111111-1111-1111-1111-111111111111"
+        assert task1["id"] == "task-000001"  # Sequential ID generated
         assert task1["query"] == "Book a flight to New York"
         assert "tools" in task1["environment_data"]
         assert "assertions" in task1["evaluation_data"]
         assert task1["metadata"]["category"] == "travel"
+        assert task1["metadata"]["scenario"] == "Bicycle tour planning"
 
     def test_list_of_scenarios(self, sample_scenarios_data, sample_agents_data):
         """Also works with list of scenarios directly."""
@@ -733,8 +734,8 @@ def test_restructure_uses_custom_location(self, temp_data_dir, sample_agents_dat
 class TestSequentialIdGeneration:
     """Tests for task ID generation."""
 
-    def test_generates_sequential_ids_when_missing(self):
-        """_create_tasks_list generates sequential IDs when not present."""
+    def test_generates_sequential_ids(self):
+        """_create_tasks_list always generates sequential IDs."""
         scenarios = {
             "scenarios": [
                 {"input_problem": "Task 1", "assertions": []},
@@ -748,17 +749,22 @@ def test_generates_sequential_ids_when_missing(self):
         assert tasks[1]["id"] == "task-000002"
         assert tasks[2]["id"] == "task-000003"
 
-    def test_preserves_existing_ids(self):
-        """_create_tasks_list preserves existing IDs."""
+    def test_sequential_ids_ignore_original_ids(self):
+        """_create_tasks_list ignores any existing id/uuid fields in scenarios."""
         scenarios = {
             "scenarios": [
-                {"id": "custom-id-1", "input_problem": "Task 1", "assertions": []},
-                {"uuid": "uuid-based-2", "input_problem": "Task 2", "assertions": []},
-                {"input_problem": "Task 3", "assertions": []},  # No ID
+                {"id": "should-be-ignored", "input_problem": "Task 1", "assertions": []},
+                {"uuid": "also-ignored", "input_problem": "Task 2", "assertions": []},
+                {"input_problem": "Task 3", "assertions": []},
             ]
         }
         tasks = _create_tasks_list(scenarios, [])
 
-        assert tasks[0]["id"] == "custom-id-1"
-        assert tasks[1]["id"] == "uuid-based-2"
-        assert tasks[2]["id"] == "task-000003"  # Generated
+        # All get sequential IDs regardless of original id/uuid fields
+        assert tasks[0]["id"] == "task-000001"
+        assert tasks[1]["id"] == "task-000002"
+        assert tasks[2]["id"] == "task-000003"
+
+        # Original id/uuid are preserved in metadata
+        assert tasks[0]["metadata"].get("id") == "should-be-ignored"
+        assert tasks[1]["metadata"].get("uuid") == "also-ignored"

From ae806fa32549411784667d8c94fd883daf663dab Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 12:02:27 +0000
Subject: [PATCH 03/34] updated benchmark

---
 BENCHMARKS.md                      |   2 +-
 examples/macs_benchmark.py         | 345 ++++++++++++++++++++++++-----
 maseval/benchmark/macs/__init__.py |   2 +
 maseval/benchmark/macs/macs.py     | 211 +++++++++++++++++-
 4 files changed, 502 insertions(+), 58 deletions(-)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index 2e51ac9c..10f2ba38 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -4,7 +4,7 @@ This document provides detailed information, sources, and licensing for all benc
 
 ---
 
-## 1. AWS Multi-Agent Collaboration Scenario
+## 1. Multi-Agent Collaboriation Scenario Benchmark (MACS Benchmark)
 
 This benchmark is designed to test and evaluate the collaborative problem-solving capabilities of multi-agent systems. The implementation in this library provides the necessary code to set up and run these scenarios.
 
diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark.py
index 83087830..be376ce8 100644
--- a/examples/macs_benchmark.py
+++ b/examples/macs_benchmark.py
@@ -4,9 +4,13 @@
 benchmark with either smolagents or langgraph frameworks.
 
 The MACS benchmark evaluates multi-agent collaboration in three enterprise domains:
-- Travel: 10 agents, 52 tools (flight booking, hotels, weather, etc.)
-- Mortgage: 6 agents, 35 tools (loan processing, document handling, etc.)
-- Software: 8 agents, 4 tools (code review, issue tracking, etc.)
+- Travel: 10 agents, 52 tools (flight booking, hotels, weather, etc.) - 2-level hierarchy
+- Mortgage: 6 agents, 35 tools (loan processing, document handling, etc.) - 2-level hierarchy
+- Software: 8 agents, 4 tools (code review, issue tracking, etc.) - 3-level hierarchy
+
+Agent Hierarchy:
+    Travel/Mortgage: supervisor -> specialist agents
+    Software: supervisor -> deploy_agent -> infrastructure_agent, application_agent
 
 Reference:
     Paper: https://arxiv.org/abs/2412.05449
@@ -36,6 +40,7 @@
 from maseval.benchmark.macs import (
     MACSBenchmark,
     MACSGenericTool,
+    MACSUserSimulator,
     compute_benchmark_metrics,
     ensure_data_exists,
     load_agent_config,
@@ -71,8 +76,8 @@ def create_model(model_id: str = "gemini-2.5-flash") -> GoogleGenAIModelAdapter:
 
 
 def _create_smolagents_benchmark():
-    """Create smolagents-specific benchmark class."""
-    from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel
+    """Create smolagents-specific benchmark class with multi-agent hierarchy."""
+    from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
     from maseval.interface.agents.smolagents import SmolAgentAdapter
 
     class SmolagentsToolWrapper(SmolagentsTool, ConfigurableMixin, TraceableMixin):
@@ -97,8 +102,43 @@ def gather_traces(self) -> Dict[str, Any]:
         def gather_config(self) -> Dict[str, Any]:
             return self.generic_tool.gather_config()
 
+    class SmolagentsMACSUser(MACSUserSimulator):
+        """MACS User Simulator with smolagents tool integration."""
+
+        def get_tool(self):
+            """Return a smolagents-compatible user input tool."""
+            # Create a simple smolagents tool that wraps simulate_response
+            user = self
+
+            class UserInputTool(SmolagentsTool):
+                name = "user_input"
+                description = "Ask the user a question to clarify their request or get additional information."
+                inputs = {"question": {"type": "string", "description": "The question to ask the user."}}
+                output_type = "string"
+
+                def forward(self, question: str) -> str:
+                    return user.simulate_response(question)
+
+            return UserInputTool()
+
     class SmolagentsMACSBenchmark(MACSBenchmark):
-        """MACS Benchmark implementation for smolagents."""
+        """MACS Benchmark implementation for smolagents with multi-agent hierarchy."""
+
+        def setup_user(
+            self,
+            agent_data: Dict[str, Any],
+            environment: Environment,
+            task: Task,
+        ) -> SmolagentsMACSUser:
+            """Create smolagents-compatible user simulator."""
+            scenario = task.metadata.get("scenario", "")
+
+            return SmolagentsMACSUser(
+                name="Simulated User",
+                model=self._model,
+                scenario=scenario,
+                initial_prompt=task.query,
+            )
 
         def setup_agents(
             self,
@@ -107,37 +147,86 @@ def setup_agents(
             task: Task,
             user: Optional[User],
         ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-            """Create smolagents agents."""
-            # Get tools from environment
-            generic_tools = environment.create_tools()
-            wrapped_tools = [SmolagentsToolWrapper(t) for t in generic_tools]
+            """Create smolagents multi-agent hierarchy.
 
+            Implements the exact agent topology from agents.json:
+            - Travel/Mortgage: 2-level hierarchy (supervisor -> specialists)
+            - Software: 3-level hierarchy (supervisor -> deploy_agent -> infra/app agents)
+            """
             # Create smolagents model
-            # Use OpenAI-compatible API with Gemini via AI Studio
             smol_model = OpenAIServerModel(
                 model_id="gemini-2.5-flash",
                 api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
                 api_key=os.getenv("GOOGLE_API_KEY"),
             )
 
-            # Get primary agent config
-            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+            # Build agent lookup
             agents_config = agent_data.get("agents", [])
-            primary_config = next(
-                (a for a in agents_config if a.get("agent_id") == primary_agent_id), agents_config[0] if agents_config else {}
-            )
+            agent_lookup = {a["agent_id"]: a for a in agents_config}
+            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
-            # Create agent
-            agent = ToolCallingAgent(
-                tools=list(wrapped_tools),  # type: ignore[arg-type]
-                model=smol_model,
-                max_steps=10,
-                name=primary_config.get("agent_name", "MACS Agent"),
-                description=primary_config.get("agent_instruction", "Multi-agent collaboration agent"),
-            )
+            # Wrap all generic tools for smolagents
+            generic_tools = environment.create_tools()
+            tool_lookup = {t.name: SmolagentsToolWrapper(t) for t in generic_tools}
+
+            # Helper to get tools for an agent
+            def get_agent_tools(agent_spec: Dict[str, Any]) -> List[Any]:
+                """Get wrapped tools for an agent based on tool group names."""
+                tool_groups = agent_spec.get("tools", [])
+                tools = []
+                for tool_group in tool_groups:
+                    # Find actions in this tool group
+                    for tool_spec in environment.state.get("tool_specs", []):
+                        if tool_spec.get("tool_name") == tool_group:
+                            for action in tool_spec.get("actions", []):
+                                action_name = action.get("name")
+                                if action_name and action_name in tool_lookup:
+                                    tools.append(tool_lookup[action_name])
+                return tools
+
+            # Recursive function to build agent hierarchy
+            def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
+                """Build an agent with its sub-agents (managed_agents)."""
+                agent_spec = agent_lookup.get(agent_id, {})
+
+                # Get this agent's tools
+                agent_tools = get_agent_tools(agent_spec)
+                agent_tools.append(FinalAnswerTool())
+
+                # Build managed agents from reachable_agents
+                managed_agents = []
+                reachable = agent_spec.get("reachable_agents", [])
+
+                for reachable_spec in reachable:
+                    sub_agent_id = reachable_spec.get("agent_id")
+                    if sub_agent_id and sub_agent_id in agent_lookup:
+                        sub_agent = build_agent(sub_agent_id, depth + 1)
+                        managed_agents.append(sub_agent)
+
+                # Create the agent
+                agent = ToolCallingAgent(
+                    model=smol_model,
+                    tools=agent_tools,
+                    managed_agents=managed_agents if managed_agents else None,
+                    name=agent_spec.get("agent_name", agent_id),
+                    description=agent_spec.get("agent_instruction", ""),
+                    max_steps=15,  # Allow more steps for multi-agent coordination
+                    verbosity_level=0,
+                )
+
+                return agent
+
+            # Build the primary agent with full hierarchy
+            primary_agent = build_agent(primary_agent_id)
+
+            # Add user tool to primary agent if user simulator is available
+            if user and hasattr(user, "get_tool"):
+                user_tool = user.get_tool()
+                if user_tool:
+                    primary_agent.tools.append(user_tool)  # type: ignore[attr-defined]
 
             # Wrap with adapter
-            adapter = SmolAgentAdapter(agent, name=primary_agent_id)
+            adapter = SmolAgentAdapter(primary_agent, name=primary_agent_id)
 
             return [adapter], {primary_agent_id: adapter}
 
@@ -150,10 +239,11 @@ def setup_agents(
 
 
 def _create_langgraph_benchmark():
-    """Create langgraph-specific benchmark class."""
+    """Create langgraph-specific benchmark class with multi-agent hierarchy."""
     from langchain_core.tools import StructuredTool
+    from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
     from langchain_google_genai import ChatGoogleGenerativeAI
-    from langgraph.graph import StateGraph
+    from langgraph.graph import StateGraph, END
     from langgraph.graph.message import add_messages
     from langgraph.prebuilt import ToolNode, tools_condition
     from typing_extensions import TypedDict, Annotated
@@ -180,8 +270,40 @@ def gather_traces(self) -> Dict[str, Any]:
         def gather_config(self) -> Dict[str, Any]:
             return self.generic_tool.gather_config()
 
+    class LangGraphMACSUser(MACSUserSimulator):
+        """MACS User Simulator with LangGraph tool integration."""
+
+        def get_tool(self):
+            """Return a LangGraph-compatible user input tool."""
+
+            def user_input(question: str) -> str:
+                """Ask the user a question and get their response."""
+                return self.simulate_response(question)
+
+            return StructuredTool.from_function(
+                func=user_input,
+                name="user_input",
+                description="Ask the user a question. Use this to clarify requirements or get additional information.",
+            )
+
     class LanggraphMACSBenchmark(MACSBenchmark):
-        """MACS Benchmark implementation for langgraph."""
+        """MACS Benchmark implementation for langgraph with multi-agent hierarchy."""
+
+        def setup_user(
+            self,
+            agent_data: Dict[str, Any],
+            environment: Environment,
+            task: Task,
+        ) -> "LangGraphMACSUser":
+            """Create langgraph-compatible user simulator."""
+            scenario = task.metadata.get("scenario", "")
+
+            return LangGraphMACSUser(
+                name="Simulated User",
+                model=self._model,
+                scenario=scenario,
+                initial_prompt=task.query,
+            )
 
         def setup_agents(
             self,
@@ -190,40 +312,156 @@ def setup_agents(
             task: Task,
             user: Optional[User],
         ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-            """Create langgraph agents."""
-            # Get tools from environment
-            generic_tools = environment.create_tools()
-            wrapped_tools = [LanggraphToolWrapper(t) for t in generic_tools]
-            langchain_tools = [w.tool for w in wrapped_tools]
+            """Create langgraph multi-agent hierarchy.
 
-            # Create LangChain model with tools
+            Uses subgraphs to implement the agent hierarchy from agents.json.
+            """
+            # Create LangChain model
             llm = ChatGoogleGenerativeAI(
                 model="gemini-2.5-flash",
                 google_api_key=os.getenv("GOOGLE_API_KEY"),
             )
-            llm_with_tools = llm.bind_tools(langchain_tools)
 
-            # Define state
-            class State(TypedDict):
-                messages: Annotated[list, add_messages]
+            # Build agent lookup
+            agents_config = agent_data.get("agents", [])
+            agent_lookup = {a["agent_id"]: a for a in agents_config}
+            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+
+            # Wrap all generic tools
+            generic_tools = environment.create_tools()
+            tool_lookup = {t.name: LanggraphToolWrapper(t) for t in generic_tools}
 
-            # Build graph
-            def chatbot(state: State):
-                return {"messages": [llm_with_tools.invoke(state["messages"])]}
+            # State type for langgraph
+            class AgentState(TypedDict):
+                messages: Annotated[list, add_messages]
 
-            graph = StateGraph(State)
-            graph.add_node("chatbot", chatbot)
-            graph.add_node("tools", ToolNode(tools=langchain_tools))
+            # Helper to get langchain tools for an agent
+            def get_agent_langchain_tools(agent_spec: Dict[str, Any]) -> List:
+                """Get langchain tools for an agent."""
+                tool_groups = agent_spec.get("tools", [])
+                tools = []
+                for tool_group in tool_groups:
+                    for tool_spec in environment.state.get("tool_specs", []):
+                        if tool_spec.get("tool_name") == tool_group:
+                            for action in tool_spec.get("actions", []):
+                                action_name = action.get("name")
+                                if action_name and action_name in tool_lookup:
+                                    tools.append(tool_lookup[action_name].tool)
+                return tools
+
+            # Build a simple graph for leaf agents (no sub-agents)
+            def build_leaf_agent_graph(agent_id: str):
+                """Build a compiled graph for a leaf agent."""
+                agent_spec = agent_lookup.get(agent_id, {})
+                agent_tools = get_agent_langchain_tools(agent_spec)
+                agent_name = agent_spec.get("agent_name", agent_id)
+                agent_instruction = agent_spec.get("agent_instruction", "")
+
+                if agent_tools:
+                    llm_with_tools = llm.bind_tools(agent_tools)
+                else:
+                    llm_with_tools = llm
+
+                def call_agent(state: AgentState):
+                    messages = state["messages"]
+                    # Add system message
+                    has_system = any(isinstance(m, SystemMessage) for m in messages)
+                    if not has_system:
+                        system_msg = SystemMessage(content=f"You are {agent_name}. {agent_instruction}")
+                        messages = [system_msg] + list(messages)
+                    response = llm_with_tools.invoke(messages)
+                    return {"messages": [response]}
+
+                graph = StateGraph(AgentState)
+                graph.add_node("agent", call_agent)
+
+                if agent_tools:
+                    graph.add_node("tools", ToolNode(agent_tools))
+                    graph.add_conditional_edges("agent", tools_condition)
+                    graph.add_edge("tools", "agent")
+                else:
+                    graph.add_edge("agent", END)
+
+                graph.set_entry_point("agent")
+                return graph.compile()
+
+            # For the primary agent, we build a graph that can delegate to sub-agents
+            primary_spec = agent_lookup.get(primary_agent_id, {})
+            primary_tools = get_agent_langchain_tools(primary_spec)
+
+            # Add user tool if available
+            if user and hasattr(user, "get_tool"):
+                user_tool = user.get_tool()
+                if user_tool:
+                    primary_tools.append(user_tool)
+
+            # Create sub-agent tools (delegation)
+            reachable = primary_spec.get("reachable_agents", [])
+            for reachable_spec in reachable:
+                sub_agent_id = reachable_spec.get("agent_id")
+                if sub_agent_id and sub_agent_id in agent_lookup:
+                    sub_spec = agent_lookup[sub_agent_id]
+                    sub_graph = build_leaf_agent_graph(sub_agent_id)
+
+                    # Create a tool that invokes the sub-agent
+                    def make_sub_agent_tool(graph, name, description):
+                        def invoke_sub_agent(query: str) -> str:
+                            """Delegate to sub-agent."""
+                            result = graph.invoke({"messages": [HumanMessage(content=query)]})
+                            # Get last AI message
+                            for msg in reversed(result.get("messages", [])):
+                                if isinstance(msg, AIMessage):
+                                    content = msg.content
+                                    if isinstance(content, str):
+                                        return content
+                                    # Handle list content (e.g., multimodal responses)
+                                    return str(content)
+                            return "No response from sub-agent"
+
+                        return StructuredTool.from_function(
+                            func=invoke_sub_agent,
+                            name=name,
+                            description=description,
+                        )
+
+                    sub_tool = make_sub_agent_tool(
+                        sub_graph,
+                        sub_spec.get("agent_name", sub_agent_id),
+                        reachable_spec.get("scenario", sub_spec.get("agent_instruction", "")),
+                    )
+                    primary_tools.append(sub_tool)
+
+            # Build primary agent graph
+            if primary_tools:
+                llm_with_tools = llm.bind_tools(primary_tools)
+            else:
+                llm_with_tools = llm
+
+            primary_name = primary_spec.get("agent_name", primary_agent_id)
+            primary_instruction = primary_spec.get("agent_instruction", "")
+
+            def call_primary(state: AgentState):
+                messages = state["messages"]
+                has_system = any(isinstance(m, SystemMessage) for m in messages)
+                if not has_system:
+                    system_msg = SystemMessage(content=f"You are {primary_name}. {primary_instruction}")
+                    messages = [system_msg] + list(messages)
+                response = llm_with_tools.invoke(messages)
+                return {"messages": [response]}
+
+            graph = StateGraph(AgentState)
+            graph.add_node("chatbot", call_primary)
+
+            if primary_tools:
+                graph.add_node("tools", ToolNode(tools=primary_tools))
+                graph.add_conditional_edges("chatbot", tools_condition)
+                graph.add_edge("tools", "chatbot")
+            else:
+                graph.add_edge("chatbot", END)
 
-            graph.add_conditional_edges("chatbot", tools_condition)
-            graph.add_edge("tools", "chatbot")
             graph.set_entry_point("chatbot")
-
             compiled_graph = graph.compile()
 
-            # Get primary agent config
-            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
-
             # Wrap with adapter
             adapter = LangGraphAgentAdapter(compiled_graph, name=primary_agent_id)
 
@@ -289,7 +527,12 @@ def run_benchmark(
     print(f"Loading {domain} domain tasks...")
     tasks = load_tasks(domain, limit=limit)
     agent_config = load_agent_config(domain)
-    print(f"Loaded {len(tasks)} tasks")
+
+    # Print agent hierarchy info
+    agents_count = len(agent_config.get("agents", []))
+    primary_agent_id = agent_config.get("primary_agent_id", "unknown")
+    print(f"Loaded {len(tasks)} tasks with {agents_count}-agent hierarchy")
+    print(f"Primary agent: {primary_agent_id}")
 
     # Setup callback for logging results
     logger = FileResultLogger(
diff --git a/maseval/benchmark/macs/__init__.py b/maseval/benchmark/macs/__init__.py
index ce833ac3..358fe476 100644
--- a/maseval/benchmark/macs/__init__.py
+++ b/maseval/benchmark/macs/__init__.py
@@ -13,6 +13,7 @@
     MACSEnvironment,
     MACSEvaluator,
     MACSGenericTool,
+    MACSUserSimulator,
     compute_benchmark_metrics,
 )
 from .data_loader import (
@@ -33,6 +34,7 @@
     "MACSEnvironment",
     "MACSEvaluator",
     "MACSGenericTool",
+    "MACSUserSimulator",
     # Data loading
     "load_tasks",
     "load_agent_config",
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index 4a5eefdd..a04b2aac 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -350,6 +350,175 @@ def _compute_gsr(self, report: List[Dict[str, Any]]) -> Tuple[float, float]:
         return gsr, partial_gsr
 
 
+# =============================================================================
+# User Simulator
+# =============================================================================
+
+
+class MACSUserSimulator(User):
+    """MACS-specific user simulator with conversation limits.
+
+    Extends the base User class with MACS-specific behavior:
+    - Maximum 5 turns of interaction (as per MACS paper)
+    - </stop> token detection for natural conversation ending
+    - User profile and scenario-aware responses
+
+    The simulator maintains a conversation history and uses an LLM to generate
+    responses that are consistent with the user's profile and scenario.
+
+    Note: This is a base class. Framework-specific subclasses should override
+    get_tool() to return a compatible tool (e.g., SmolAgentUserSimulationInputTool).
+    """
+
+    DEFAULT_MAX_TURNS = 5
+    STOP_TOKEN = "</stop>"
+
+    def __init__(
+        self,
+        model: ModelAdapter,
+        scenario: str,
+        initial_prompt: str,
+        name: str = "Simulated User",
+        template: Optional[str] = None,
+        max_turns: int = DEFAULT_MAX_TURNS,
+    ):
+        """Initialize MACS user simulator.
+
+        Args:
+            model: ModelAdapter for LLM-based response generation
+            scenario: Full scenario text (contains goals and user background)
+            initial_prompt: The initial query to the agent
+            name: User name for identification (default: "Simulated User")
+            template: Optional custom prompt template (uses base User's default)
+            max_turns: Maximum conversation turns (default: 5, per MACS paper)
+        """
+        # Extract user profile from scenario text
+        user_profile = self._extract_user_profile(scenario)
+
+        super().__init__(
+            name=name,
+            model=model,
+            user_profile=user_profile,
+            scenario=scenario,
+            initial_prompt=initial_prompt,
+            template=template,
+        )
+        self.max_turns = max_turns
+        self._turn_count = 0
+        self._stopped = False
+
+    def get_tool(self) -> Any:
+        """Return a tool for agent interaction.
+
+        This base implementation raises NotImplementedError.
+        Framework-specific subclasses should override this method.
+
+        For smolagents, use SmolAgentMACSUser which provides a smolagents-compatible tool.
+        For langgraph, use LangGraphMACSUser which provides a langchain-compatible tool.
+
+        Raises:
+            NotImplementedError: Always, as this must be implemented by subclass.
+        """
+        raise NotImplementedError(
+            "MACSUserSimulator.get_tool() must be overridden by framework-specific subclass. "
+            "Use SmolAgentMACSUser for smolagents or LangGraphMACSUser for langgraph."
+        )
+
+    @property
+    def is_done(self) -> bool:
+        """Check if the conversation should end.
+
+        Returns True if:
+        - Maximum turns reached
+        - User responded with </stop> token
+        """
+        return self._stopped or self._turn_count >= self.max_turns
+
+    def simulate_response(self, question: str) -> str:
+        """Simulate a user response, respecting turn limits.
+
+        Args:
+            question: The assistant's question/message
+
+        Returns:
+            The simulated user response, or empty string if done
+        """
+        if self.is_done:
+            return ""
+
+        # Use parent's simulate_response which handles LLM generation
+        response = super().simulate_response(question)
+
+        # Check for stop token
+        if self.STOP_TOKEN in response.lower():
+            self._stopped = True
+            # Clean up the response
+            response = response.replace(self.STOP_TOKEN, "").strip()
+            if not response:
+                response = "Thank you, that's all I needed!"
+
+        self._turn_count += 1
+        return response
+
+    def reset(self) -> None:
+        """Reset the conversation state for a new interaction."""
+        self._turn_count = 0
+        self._stopped = False
+        # Keep only the initial user message
+        if len(self.messages) > 0:
+            initial = self.messages[0]
+            self.messages = MessageHistory([initial])
+
+    @staticmethod
+    def _extract_user_profile(scenario: str) -> Dict[str, Any]:
+        """Extract user profile from scenario text.
+
+        The MACS scenarios contain user background info after "Background:" marker.
+
+        Args:
+            scenario: Full scenario text with goals and background
+
+        Returns:
+            Dict with user profile fields
+        """
+        profile: Dict[str, Any] = {}
+
+        # Find the Background section
+        if "Background:" in scenario:
+            background_section = scenario.split("Background:")[-1].strip()
+
+            # Parse bullet points (* User's name is ...)
+            for line in background_section.split("\n"):
+                line = line.strip().lstrip("*").strip()
+                if line.lower().startswith("user"):
+                    # Try to extract key-value pairs
+                    if " is " in line.lower():
+                        key_part, value_part = line.split(" is ", 1)
+                        key = key_part.lower().replace("user's ", "").replace("user ", "").strip()
+                        profile[key] = value_part.strip().rstrip(".")
+                    elif " has " in line.lower():
+                        key_part, value_part = line.split(" has ", 1)
+                        key = key_part.lower().replace("user's ", "").replace("user ", "").strip()
+                        profile[key] = value_part.strip().rstrip(".")
+
+        # Include full scenario as fallback context
+        profile["full_scenario"] = scenario
+
+        return profile
+
+    def gather_traces(self) -> Dict[str, Any]:
+        """Gather traces with MACS-specific information."""
+        traces = super().gather_traces()
+        traces.update(
+            {
+                "max_turns": self.max_turns,
+                "turns_used": self._turn_count,
+                "stopped_by_user": self._stopped,
+            }
+        )
+        return traces
+
+
 # =============================================================================
 # Environment
 # =============================================================================
@@ -416,6 +585,18 @@ def get_tools_by_group(self, group_names: List[str]) -> List[MACSGenericTool]:
                         result.append(self._tools_dict[name])
         return result
 
+    def get_tools_for_agent(self, agent_spec: Dict[str, Any]) -> List[MACSGenericTool]:
+        """Get tools for a specific agent based on its configuration.
+
+        Args:
+            agent_spec: Agent specification dict with 'tools' key containing tool group names
+
+        Returns:
+            List of MACSGenericTool instances assigned to this agent
+        """
+        tool_groups = agent_spec.get("tools", [])
+        return self.get_tools_by_group(tool_groups)
+
 
 # =============================================================================
 # Benchmark
@@ -437,7 +618,6 @@ def __init__(
         self,
         agent_data: Dict[str, Any],
         model: ModelAdapter,
-        data_dir: Optional[Path] = None,
         callbacks: Optional[List[Any]] = None,
         n_task_repeats: int = 1,
         **kwargs: Any,
@@ -447,12 +627,10 @@ def __init__(
         Args:
             agent_data: Agent configuration from load_agent_config()
             model: ModelAdapter for tool simulation and evaluation
-            data_dir: Optional custom data directory
             callbacks: Benchmark callbacks
             n_task_repeats: Repetitions per task
         """
         self._model = model
-        self._data_dir = Path(data_dir) if data_dir else (Path(__file__).parent / "data")
         super().__init__(agent_data, callbacks, n_task_repeats, **kwargs)
 
     def setup_environment(
@@ -471,9 +649,30 @@ def setup_user(
         agent_data: Dict[str, Any],
         environment: Environment,
         task: Task,
-    ) -> Optional[User]:
-        """Create user simulator. Override for multi-turn evaluation."""
-        return None
+    ) -> MACSUserSimulator:
+        """Create MACS user simulator.
+
+        Creates a MACSUserSimulator with scenario and query from the task.
+        The user profile is automatically extracted from the scenario text.
+
+        Note: MACSUserSimulator.get_tool() raises NotImplementedError.
+        Framework-specific subclasses in examples should wrap this user
+        or override setup_user() to return a user with get_tool() implemented.
+
+        Args:
+            agent_data: Agent configuration
+            environment: The task environment
+            task: Current task with scenario and user profile
+
+        Returns:
+            MACSUserSimulator instance
+        """
+        scenario = task.metadata.get("scenario", "")
+        return MACSUserSimulator(
+            model=self._model,
+            scenario=scenario,
+            initial_prompt=task.query,
+        )
 
     @abstractmethod
     def setup_agents(

From a9aef7c3b48a648fce6b320dbb9ade9b6f596484 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 12:53:38 +0000
Subject: [PATCH 04/34] updated benchmark implementation

---
 examples/macs_benchmark.py         | 743 +++++++++++++++--------------
 maseval/benchmark/macs/__init__.py |   4 +-
 maseval/benchmark/macs/macs.py     |  14 +-
 3 files changed, 407 insertions(+), 354 deletions(-)

diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark.py
index be376ce8..32c267b0 100644
--- a/examples/macs_benchmark.py
+++ b/examples/macs_benchmark.py
@@ -17,11 +17,14 @@
     Data: https://github.com/aws-samples/multiagent-collab-scenario-benchmark
 
 Usage:
-    # Run with smolagents
-    python examples/macs_benchmark.py --framework smolagents --domain travel --limit 5
+    # Run with smolagents on travel domain
+    uv run python examples/macs_benchmark.py --framework smolagents --domain travel --limit 5
+
+    # Run with langgraph on mortgage domain
+    uv run python examples/macs_benchmark.py --framework langgraph --domain mortgage --limit 5
 
-    # Run with langgraph
-    python examples/macs_benchmark.py --framework langgraph --domain travel --limit 5
+    # Run a single task by ID for debugging
+    uv run python examples/macs_benchmark.py --framework smolagents --domain travel --task-id task_001
 """
 
 import argparse
@@ -29,18 +32,34 @@
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Tuple
 
+# Third-party imports (both frameworks will be installed)
 from google.genai import Client as GoogleGenAIClient
 
+# smolagents imports
+from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
+
+# langgraph imports
+from langchain_core.tools import StructuredTool
+from langchain_core.messages import SystemMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langgraph.graph import StateGraph, END
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+from typing_extensions import TypedDict, Annotated
+
+# MASEval imports
 from maseval import AgentAdapter, Environment, Task, User
 from maseval.core.callbacks.result_logger import FileResultLogger
 from maseval.core.config import ConfigurableMixin
 from maseval.core.tracing import TraceableMixin
+from maseval.interface.agents.smolagents import SmolAgentAdapter
+from maseval.interface.agents.langgraph import LangGraphAgentAdapter
 from maseval.interface.inference.google_genai import GoogleGenAIModelAdapter
 
 from maseval.benchmark.macs import (
     MACSBenchmark,
     MACSGenericTool,
-    MACSUserSimulator,
+    MACSUser,
     compute_benchmark_metrics,
     ensure_data_exists,
     load_agent_config,
@@ -75,162 +94,156 @@ def create_model(model_id: str = "gemini-2.5-flash") -> GoogleGenAIModelAdapter:
 # =============================================================================
 
 
-def _create_smolagents_benchmark():
-    """Create smolagents-specific benchmark class with multi-agent hierarchy."""
-    from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
-    from maseval.interface.agents.smolagents import SmolAgentAdapter
-
-    class SmolagentsToolWrapper(SmolagentsTool, ConfigurableMixin, TraceableMixin):
-        """Smolagents wrapper for MACSGenericTool."""
-
-        skip_forward_signature_validation = True
-
-        def __init__(self, generic_tool: MACSGenericTool):
-            self.generic_tool = generic_tool
-            self.name = generic_tool.name
-            self.description = generic_tool.description
-            self.inputs = generic_tool.inputs
-            self.output_type = generic_tool.output_type
-            super().__init__()
-
-        def forward(self, **kwargs) -> str:
-            return self.generic_tool(**kwargs)
-
-        def gather_traces(self) -> Dict[str, Any]:
-            return self.generic_tool.gather_traces()
-
-        def gather_config(self) -> Dict[str, Any]:
-            return self.generic_tool.gather_config()
-
-    class SmolagentsMACSUser(MACSUserSimulator):
-        """MACS User Simulator with smolagents tool integration."""
-
-        def get_tool(self):
-            """Return a smolagents-compatible user input tool."""
-            # Create a simple smolagents tool that wraps simulate_response
-            user = self
-
-            class UserInputTool(SmolagentsTool):
-                name = "user_input"
-                description = "Ask the user a question to clarify their request or get additional information."
-                inputs = {"question": {"type": "string", "description": "The question to ask the user."}}
-                output_type = "string"
-
-                def forward(self, question: str) -> str:
-                    return user.simulate_response(question)
+class SmolagentsToolWrapper(SmolagentsTool, ConfigurableMixin, TraceableMixin):
+    """Smolagents wrapper for MACSGenericTool."""
+
+    skip_forward_signature_validation = True
+
+    def __init__(self, generic_tool: MACSGenericTool):
+        self.generic_tool = generic_tool
+        self.name = generic_tool.name
+        self.description = generic_tool.description
+        self.inputs = generic_tool.inputs
+        self.output_type = generic_tool.output_type
+        super().__init__()
+
+    def forward(self, **kwargs) -> str:
+        return self.generic_tool(**kwargs)
+
+    def gather_traces(self) -> Dict[str, Any]:
+        return self.generic_tool.gather_traces()
+
+    def gather_config(self) -> Dict[str, Any]:
+        return self.generic_tool.gather_config()
+
+
+class SmolagentsMACSUser(MACSUser):
+    """MACS User with smolagents tool integration."""
+
+    def get_tool(self):
+        """Return a smolagents-compatible user input tool."""
+        user = self
+
+        class UserInputTool(SmolagentsTool):
+            name = "user_input"
+            description = "Ask the user a question to clarify their request or get additional information."
+            inputs = {"question": {"type": "string", "description": "The question to ask the user."}}
+            output_type = "string"
+
+            def forward(self, question: str) -> str:
+                return user.simulate_response(question)
+
+        return UserInputTool()
+
+
+class SmolagentsMACSBenchmark(MACSBenchmark):
+    """MACS Benchmark implementation for smolagents with multi-agent hierarchy."""
+
+    def setup_user(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+    ) -> SmolagentsMACSUser:
+        """Create smolagents-compatible user simulator."""
+        scenario = task.metadata.get("scenario", "")
+
+        return SmolagentsMACSUser(
+            name="Simulated User",
+            model=self._model,
+            scenario=scenario,
+            initial_prompt=task.query,
+        )
+
+    def setup_agents(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+        user: Optional[User],
+    ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+        """Create smolagents multi-agent hierarchy.
+
+        Implements the exact agent topology from agents.json:
+        - Travel/Mortgage: 2-level hierarchy (supervisor -> specialists)
+        - Software: 3-level hierarchy (supervisor -> deploy_agent -> infra/app agents)
+        """
+        # Create smolagents model
+        smol_model = OpenAIServerModel(
+            model_id="gemini-2.5-flash",
+            api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
+            api_key=os.getenv("GOOGLE_API_KEY"),
+        )
+
+        # Build agent lookup
+        agents_config = agent_data.get("agents", [])
+        agent_lookup = {a["agent_id"]: a for a in agents_config}
+        primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+
+        # Wrap all generic tools for smolagents
+        generic_tools = environment.create_tools()
+        tool_lookup = {t.name: SmolagentsToolWrapper(t) for t in generic_tools}
+
+        # Helper to get tools for an agent
+        def get_agent_tools(agent_spec: Dict[str, Any]) -> List[Any]:
+            """Get wrapped tools for an agent based on tool group names."""
+            tool_groups = agent_spec.get("tools", [])
+            tools = []
+            for tool_group in tool_groups:
+                # Find actions in this tool group
+                for tool_spec in environment.state.get("tool_specs", []):
+                    if tool_spec.get("tool_name") == tool_group:
+                        for action in tool_spec.get("actions", []):
+                            action_name = action.get("name")
+                            if action_name and action_name in tool_lookup:
+                                tools.append(tool_lookup[action_name])
+            return tools
+
+        # Recursive function to build agent hierarchy
+        def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
+            """Build an agent with its sub-agents (managed_agents)."""
+            agent_spec = agent_lookup.get(agent_id, {})
+
+            # Get this agent's tools
+            agent_tools = get_agent_tools(agent_spec)
+            agent_tools.append(FinalAnswerTool())
+
+            # Build managed agents from reachable_agents
+            managed_agents = []
+            reachable = agent_spec.get("reachable_agents", [])
 
-            return UserInputTool()
-
-    class SmolagentsMACSBenchmark(MACSBenchmark):
-        """MACS Benchmark implementation for smolagents with multi-agent hierarchy."""
-
-        def setup_user(
-            self,
-            agent_data: Dict[str, Any],
-            environment: Environment,
-            task: Task,
-        ) -> SmolagentsMACSUser:
-            """Create smolagents-compatible user simulator."""
-            scenario = task.metadata.get("scenario", "")
-
-            return SmolagentsMACSUser(
-                name="Simulated User",
-                model=self._model,
-                scenario=scenario,
-                initial_prompt=task.query,
-            )
-
-        def setup_agents(
-            self,
-            agent_data: Dict[str, Any],
-            environment: Environment,
-            task: Task,
-            user: Optional[User],
-        ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-            """Create smolagents multi-agent hierarchy.
-
-            Implements the exact agent topology from agents.json:
-            - Travel/Mortgage: 2-level hierarchy (supervisor -> specialists)
-            - Software: 3-level hierarchy (supervisor -> deploy_agent -> infra/app agents)
-            """
-            # Create smolagents model
-            smol_model = OpenAIServerModel(
-                model_id="gemini-2.5-flash",
-                api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
-                api_key=os.getenv("GOOGLE_API_KEY"),
+            for reachable_spec in reachable:
+                sub_agent_id = reachable_spec.get("agent_id")
+                if sub_agent_id and sub_agent_id in agent_lookup:
+                    sub_agent = build_agent(sub_agent_id, depth + 1)
+                    managed_agents.append(sub_agent)
+
+            # Create the agent
+            agent = ToolCallingAgent(
+                model=smol_model,
+                tools=agent_tools,
+                managed_agents=managed_agents if managed_agents else None,
+                name=agent_spec.get("agent_name", agent_id),
+                description=agent_spec.get("agent_instruction", ""),
+                max_steps=15,  # Allow more steps for multi-agent coordination
+                verbosity_level=0,
             )
 
-            # Build agent lookup
-            agents_config = agent_data.get("agents", [])
-            agent_lookup = {a["agent_id"]: a for a in agents_config}
-            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
-
-            # Wrap all generic tools for smolagents
-            generic_tools = environment.create_tools()
-            tool_lookup = {t.name: SmolagentsToolWrapper(t) for t in generic_tools}
-
-            # Helper to get tools for an agent
-            def get_agent_tools(agent_spec: Dict[str, Any]) -> List[Any]:
-                """Get wrapped tools for an agent based on tool group names."""
-                tool_groups = agent_spec.get("tools", [])
-                tools = []
-                for tool_group in tool_groups:
-                    # Find actions in this tool group
-                    for tool_spec in environment.state.get("tool_specs", []):
-                        if tool_spec.get("tool_name") == tool_group:
-                            for action in tool_spec.get("actions", []):
-                                action_name = action.get("name")
-                                if action_name and action_name in tool_lookup:
-                                    tools.append(tool_lookup[action_name])
-                return tools
-
-            # Recursive function to build agent hierarchy
-            def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
-                """Build an agent with its sub-agents (managed_agents)."""
-                agent_spec = agent_lookup.get(agent_id, {})
-
-                # Get this agent's tools
-                agent_tools = get_agent_tools(agent_spec)
-                agent_tools.append(FinalAnswerTool())
-
-                # Build managed agents from reachable_agents
-                managed_agents = []
-                reachable = agent_spec.get("reachable_agents", [])
-
-                for reachable_spec in reachable:
-                    sub_agent_id = reachable_spec.get("agent_id")
-                    if sub_agent_id and sub_agent_id in agent_lookup:
-                        sub_agent = build_agent(sub_agent_id, depth + 1)
-                        managed_agents.append(sub_agent)
-
-                # Create the agent
-                agent = ToolCallingAgent(
-                    model=smol_model,
-                    tools=agent_tools,
-                    managed_agents=managed_agents if managed_agents else None,
-                    name=agent_spec.get("agent_name", agent_id),
-                    description=agent_spec.get("agent_instruction", ""),
-                    max_steps=15,  # Allow more steps for multi-agent coordination
-                    verbosity_level=0,
-                )
-
-                return agent
-
-            # Build the primary agent with full hierarchy
-            primary_agent = build_agent(primary_agent_id)
+            return agent
 
-            # Add user tool to primary agent if user simulator is available
-            if user and hasattr(user, "get_tool"):
-                user_tool = user.get_tool()
-                if user_tool:
-                    primary_agent.tools.append(user_tool)  # type: ignore[attr-defined]
+        # Build the primary agent with full hierarchy
+        primary_agent = build_agent(primary_agent_id)
 
-            # Wrap with adapter
-            adapter = SmolAgentAdapter(primary_agent, name=primary_agent_id)
+        # Add user tool to primary agent if user simulator is available
+        if user and hasattr(user, "get_tool"):
+            user_tool = user.get_tool()
+            if user_tool:
+                primary_agent.tools[user_tool.name] = user_tool
 
-            return [adapter], {primary_agent_id: adapter}
+        # Wrap with adapter
+        adapter = SmolAgentAdapter(primary_agent, name=primary_agent_id)
 
-    return SmolagentsMACSBenchmark
+        return [adapter], {primary_agent_id: adapter}
 
 
 # =============================================================================
@@ -238,185 +251,135 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
 # =============================================================================
 
 
-def _create_langgraph_benchmark():
-    """Create langgraph-specific benchmark class with multi-agent hierarchy."""
-    from langchain_core.tools import StructuredTool
-    from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
-    from langchain_google_genai import ChatGoogleGenerativeAI
-    from langgraph.graph import StateGraph, END
-    from langgraph.graph.message import add_messages
-    from langgraph.prebuilt import ToolNode, tools_condition
-    from typing_extensions import TypedDict, Annotated
-    from maseval.interface.agents.langgraph import LangGraphAgentAdapter
-
-    class LanggraphToolWrapper(ConfigurableMixin, TraceableMixin):
-        """LangGraph wrapper for MACSGenericTool."""
-
-        def __init__(self, generic_tool: MACSGenericTool):
-            self.generic_tool = generic_tool
-            self.name = generic_tool.name
-            self.tool = StructuredTool.from_function(
-                func=generic_tool.__call__,
-                name=generic_tool.name,
-                description=generic_tool.description,
-            )
-
-        def __call__(self, *args, **kwargs):
-            return self.tool(*args, **kwargs)
-
-        def gather_traces(self) -> Dict[str, Any]:
-            return self.generic_tool.gather_traces()
+class LangGraphToolWrapper(ConfigurableMixin, TraceableMixin):
+    """LangGraph wrapper for MACSGenericTool."""
+
+    def __init__(self, generic_tool: MACSGenericTool):
+        self.generic_tool = generic_tool
+        self.tool = StructuredTool.from_function(
+            func=generic_tool,
+            name=generic_tool.name,
+            description=generic_tool.description,
+        )
+
+    def __call__(self, *args, **kwargs):
+        return self.tool(*args, **kwargs)
+
+    def gather_traces(self) -> Dict[str, Any]:
+        return self.generic_tool.gather_traces()
+
+    def gather_config(self) -> Dict[str, Any]:
+        return self.generic_tool.gather_config()
+
+
+class LangGraphMACSUser(MACSUser):
+    """MACS User with LangGraph tool integration."""
+
+    def get_tool(self):
+        """Return a LangGraph-compatible user input tool."""
+
+        def user_input(question: str) -> str:
+            """Ask the user a question and get their response."""
+            return self.simulate_response(question)
+
+        return StructuredTool.from_function(
+            func=user_input,
+            name="user_input",
+            description="Ask the user a question. Use this to clarify requirements or get additional information.",
+        )
+
+
+# LangGraph agent state
+class AgentState(TypedDict):
+    messages: Annotated[list, add_messages]
+
+
+class LangGraphMACSBenchmark(MACSBenchmark):
+    """MACS Benchmark implementation for langgraph with multi-agent hierarchy."""
+
+    def setup_user(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+    ) -> LangGraphMACSUser:
+        """Create langgraph-compatible user simulator."""
+        scenario = task.metadata.get("scenario", "")
+
+        return LangGraphMACSUser(
+            name="Simulated User",
+            model=self._model,
+            scenario=scenario,
+            initial_prompt=task.query,
+        )
+
+    def setup_agents(
+        self,
+        agent_data: Dict[str, Any],
+        environment: Environment,
+        task: Task,
+        user: Optional[User],
+    ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+        """Create langgraph multi-agent hierarchy.
+
+        Uses subgraphs to implement the agent hierarchy from agents.json.
+        """
+        # Create LangChain model
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-2.5-flash",
+            google_api_key=os.getenv("GOOGLE_API_KEY"),
+        )
+
+        # Build agent lookup
+        agents_config = agent_data.get("agents", [])
+        agent_lookup = {a["agent_id"]: a for a in agents_config}
+        primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
+
+        # Wrap all generic tools
+        generic_tools = environment.create_tools()
+        tool_lookup = {t.name: LangGraphToolWrapper(t) for t in generic_tools}
+
+        # Helper to get tools for an agent
+        def get_agent_tools(agent_spec: Dict[str, Any]) -> List[StructuredTool]:
+            """Get wrapped tools for an agent based on tool group names."""
+            tool_groups = agent_spec.get("tools", [])
+            tools = []
+            for tool_group in tool_groups:
+                for tool_spec in environment.state.get("tool_specs", []):
+                    if tool_spec.get("tool_name") == tool_group:
+                        for action in tool_spec.get("actions", []):
+                            action_name = action.get("name")
+                            if action_name and action_name in tool_lookup:
+                                tools.append(tool_lookup[action_name].tool)
+            return tools
+
+        # Build agent graph recursively
+        def build_agent_graph(agent_id: str) -> StateGraph:
+            """Build a LangGraph for an agent with potential sub-agents."""
+            agent_spec = agent_lookup.get(agent_id, {})
+
+            # Get this agent's tools
+            agent_tools = get_agent_tools(agent_spec)
+
+            # Build sub-agent tools from reachable_agents
+            reachable = agent_spec.get("reachable_agents", [])
 
-        def gather_config(self) -> Dict[str, Any]:
-            return self.generic_tool.gather_config()
-
-    class LangGraphMACSUser(MACSUserSimulator):
-        """MACS User Simulator with LangGraph tool integration."""
-
-        def get_tool(self):
-            """Return a LangGraph-compatible user input tool."""
-
-            def user_input(question: str) -> str:
-                """Ask the user a question and get their response."""
-                return self.simulate_response(question)
-
-            return StructuredTool.from_function(
-                func=user_input,
-                name="user_input",
-                description="Ask the user a question. Use this to clarify requirements or get additional information.",
-            )
-
-    class LanggraphMACSBenchmark(MACSBenchmark):
-        """MACS Benchmark implementation for langgraph with multi-agent hierarchy."""
-
-        def setup_user(
-            self,
-            agent_data: Dict[str, Any],
-            environment: Environment,
-            task: Task,
-        ) -> "LangGraphMACSUser":
-            """Create langgraph-compatible user simulator."""
-            scenario = task.metadata.get("scenario", "")
-
-            return LangGraphMACSUser(
-                name="Simulated User",
-                model=self._model,
-                scenario=scenario,
-                initial_prompt=task.query,
-            )
-
-        def setup_agents(
-            self,
-            agent_data: Dict[str, Any],
-            environment: Environment,
-            task: Task,
-            user: Optional[User],
-        ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-            """Create langgraph multi-agent hierarchy.
-
-            Uses subgraphs to implement the agent hierarchy from agents.json.
-            """
-            # Create LangChain model
-            llm = ChatGoogleGenerativeAI(
-                model="gemini-2.5-flash",
-                google_api_key=os.getenv("GOOGLE_API_KEY"),
-            )
-
-            # Build agent lookup
-            agents_config = agent_data.get("agents", [])
-            agent_lookup = {a["agent_id"]: a for a in agents_config}
-            primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
-
-            # Wrap all generic tools
-            generic_tools = environment.create_tools()
-            tool_lookup = {t.name: LanggraphToolWrapper(t) for t in generic_tools}
-
-            # State type for langgraph
-            class AgentState(TypedDict):
-                messages: Annotated[list, add_messages]
-
-            # Helper to get langchain tools for an agent
-            def get_agent_langchain_tools(agent_spec: Dict[str, Any]) -> List:
-                """Get langchain tools for an agent."""
-                tool_groups = agent_spec.get("tools", [])
-                tools = []
-                for tool_group in tool_groups:
-                    for tool_spec in environment.state.get("tool_specs", []):
-                        if tool_spec.get("tool_name") == tool_group:
-                            for action in tool_spec.get("actions", []):
-                                action_name = action.get("name")
-                                if action_name and action_name in tool_lookup:
-                                    tools.append(tool_lookup[action_name].tool)
-                return tools
-
-            # Build a simple graph for leaf agents (no sub-agents)
-            def build_leaf_agent_graph(agent_id: str):
-                """Build a compiled graph for a leaf agent."""
-                agent_spec = agent_lookup.get(agent_id, {})
-                agent_tools = get_agent_langchain_tools(agent_spec)
-                agent_name = agent_spec.get("agent_name", agent_id)
-                agent_instruction = agent_spec.get("agent_instruction", "")
-
-                if agent_tools:
-                    llm_with_tools = llm.bind_tools(agent_tools)
-                else:
-                    llm_with_tools = llm
-
-                def call_agent(state: AgentState):
-                    messages = state["messages"]
-                    # Add system message
-                    has_system = any(isinstance(m, SystemMessage) for m in messages)
-                    if not has_system:
-                        system_msg = SystemMessage(content=f"You are {agent_name}. {agent_instruction}")
-                        messages = [system_msg] + list(messages)
-                    response = llm_with_tools.invoke(messages)
-                    return {"messages": [response]}
-
-                graph = StateGraph(AgentState)
-                graph.add_node("agent", call_agent)
-
-                if agent_tools:
-                    graph.add_node("tools", ToolNode(agent_tools))
-                    graph.add_conditional_edges("agent", tools_condition)
-                    graph.add_edge("tools", "agent")
-                else:
-                    graph.add_edge("agent", END)
-
-                graph.set_entry_point("agent")
-                return graph.compile()
-
-            # For the primary agent, we build a graph that can delegate to sub-agents
-            primary_spec = agent_lookup.get(primary_agent_id, {})
-            primary_tools = get_agent_langchain_tools(primary_spec)
-
-            # Add user tool if available
-            if user and hasattr(user, "get_tool"):
-                user_tool = user.get_tool()
-                if user_tool:
-                    primary_tools.append(user_tool)
-
-            # Create sub-agent tools (delegation)
-            reachable = primary_spec.get("reachable_agents", [])
             for reachable_spec in reachable:
                 sub_agent_id = reachable_spec.get("agent_id")
                 if sub_agent_id and sub_agent_id in agent_lookup:
                     sub_spec = agent_lookup[sub_agent_id]
-                    sub_graph = build_leaf_agent_graph(sub_agent_id)
+                    sub_graph = build_agent_graph(sub_agent_id).compile()
 
                     # Create a tool that invokes the sub-agent
                     def make_sub_agent_tool(graph, name, description):
                         def invoke_sub_agent(query: str) -> str:
-                            """Delegate to sub-agent."""
+                            """Delegate task to sub-agent."""
+                            from langchain_core.messages import HumanMessage
+
                             result = graph.invoke({"messages": [HumanMessage(content=query)]})
-                            # Get last AI message
-                            for msg in reversed(result.get("messages", [])):
-                                if isinstance(msg, AIMessage):
-                                    content = msg.content
-                                    if isinstance(content, str):
-                                        return content
-                                    # Handle list content (e.g., multimodal responses)
-                                    return str(content)
-                            return "No response from sub-agent"
+                            if result["messages"]:
+                                return result["messages"][-1].content
+                            return "No response from sub-agent."
 
                         return StructuredTool.from_function(
                             func=invoke_sub_agent,
@@ -429,45 +392,115 @@ def invoke_sub_agent(query: str) -> str:
                         sub_spec.get("agent_name", sub_agent_id),
                         reachable_spec.get("scenario", sub_spec.get("agent_instruction", "")),
                     )
-                    primary_tools.append(sub_tool)
+                    agent_tools.append(sub_tool)
 
-            # Build primary agent graph
-            if primary_tools:
-                llm_with_tools = llm.bind_tools(primary_tools)
+            # Build this agent's graph
+            agent_name = agent_spec.get("agent_name", agent_id)
+            agent_instruction = agent_spec.get("agent_instruction", "")
+
+            if agent_tools:
+                llm_with_tools = llm.bind_tools(agent_tools)
             else:
                 llm_with_tools = llm
 
-            primary_name = primary_spec.get("agent_name", primary_agent_id)
-            primary_instruction = primary_spec.get("agent_instruction", "")
-
-            def call_primary(state: AgentState):
+            def call_agent(state: AgentState):
                 messages = state["messages"]
                 has_system = any(isinstance(m, SystemMessage) for m in messages)
                 if not has_system:
-                    system_msg = SystemMessage(content=f"You are {primary_name}. {primary_instruction}")
+                    system_msg = SystemMessage(content=f"You are {agent_name}. {agent_instruction}")
                     messages = [system_msg] + list(messages)
                 response = llm_with_tools.invoke(messages)
                 return {"messages": [response]}
 
             graph = StateGraph(AgentState)
-            graph.add_node("chatbot", call_primary)
+            graph.add_node("chatbot", call_agent)
 
-            if primary_tools:
-                graph.add_node("tools", ToolNode(tools=primary_tools))
+            if agent_tools:
+                graph.add_node("tools", ToolNode(tools=agent_tools))
                 graph.add_conditional_edges("chatbot", tools_condition)
                 graph.add_edge("tools", "chatbot")
             else:
                 graph.add_edge("chatbot", END)
 
             graph.set_entry_point("chatbot")
-            compiled_graph = graph.compile()
+            return graph
+
+        # Build primary agent graph
+        primary_spec = agent_lookup.get(primary_agent_id, {})
+        primary_tools: List[StructuredTool] = get_agent_tools(primary_spec)
+
+        # Add user tool if available
+        if user and hasattr(user, "get_tool"):
+            user_tool = user.get_tool()
+            if user_tool:
+                primary_tools.append(user_tool)
+
+        # Build sub-agent tools for primary agent
+        reachable = primary_spec.get("reachable_agents", [])
+        for reachable_spec in reachable:
+            sub_agent_id = reachable_spec.get("agent_id")
+            if sub_agent_id and sub_agent_id in agent_lookup:
+                sub_spec = agent_lookup[sub_agent_id]
+                sub_graph = build_agent_graph(sub_agent_id).compile()
+
+                def make_sub_agent_tool(graph, name, description):
+                    def invoke_sub_agent(query: str) -> str:
+                        """Delegate task to sub-agent."""
+                        from langchain_core.messages import HumanMessage
+
+                        result = graph.invoke({"messages": [HumanMessage(content=query)]})
+                        if result["messages"]:
+                            return result["messages"][-1].content
+                        return "No response from sub-agent."
+
+                    return StructuredTool.from_function(
+                        func=invoke_sub_agent,
+                        name=name,
+                        description=description,
+                    )
 
-            # Wrap with adapter
-            adapter = LangGraphAgentAdapter(compiled_graph, name=primary_agent_id)
+                sub_tool = make_sub_agent_tool(
+                    sub_graph,
+                    sub_spec.get("agent_name", sub_agent_id),
+                    reachable_spec.get("scenario", sub_spec.get("agent_instruction", "")),
+                )
+                primary_tools.append(sub_tool)
+
+        # Build primary agent graph
+        if primary_tools:
+            llm_with_tools = llm.bind_tools(primary_tools)
+        else:
+            llm_with_tools = llm
+
+        primary_name = primary_spec.get("agent_name", primary_agent_id)
+        primary_instruction = primary_spec.get("agent_instruction", "")
+
+        def call_primary(state: AgentState):
+            messages = state["messages"]
+            has_system = any(isinstance(m, SystemMessage) for m in messages)
+            if not has_system:
+                system_msg = SystemMessage(content=f"You are {primary_name}. {primary_instruction}")
+                messages = [system_msg] + list(messages)
+            response = llm_with_tools.invoke(messages)
+            return {"messages": [response]}
+
+        graph = StateGraph(AgentState)
+        graph.add_node("chatbot", call_primary)
+
+        if primary_tools:
+            graph.add_node("tools", ToolNode(tools=primary_tools))
+            graph.add_conditional_edges("chatbot", tools_condition)
+            graph.add_edge("tools", "chatbot")
+        else:
+            graph.add_edge("chatbot", END)
+
+        graph.set_entry_point("chatbot")
+        compiled_graph = graph.compile()
 
-            return [adapter], {primary_agent_id: adapter}
+        # Wrap with adapter
+        adapter = LangGraphAgentAdapter(compiled_graph, name=primary_agent_id)
 
-    return LanggraphMACSBenchmark
+        return [adapter], {primary_agent_id: adapter}
 
 
 # =============================================================================
@@ -485,9 +518,9 @@ def get_benchmark_class(framework: Literal["smolagents", "langgraph"]) -> type:
         The appropriate MACSBenchmark subclass
     """
     if framework == "smolagents":
-        return _create_smolagents_benchmark()
+        return SmolagentsMACSBenchmark
     elif framework == "langgraph":
-        return _create_langgraph_benchmark()
+        return LangGraphMACSBenchmark
     else:
         raise ValueError(f"Unsupported framework: {framework}. Choose 'smolagents' or 'langgraph'.")
 
@@ -496,6 +529,7 @@ def run_benchmark(
     framework: Literal["smolagents", "langgraph"],
     domain: Literal["travel", "mortgage", "software"],
     limit: Optional[int] = None,
+    task_id: Optional[str] = None,
     n_task_repeats: int = 1,
     output_dir: Optional[Path] = None,
 ) -> Dict[str, Any]:
@@ -505,6 +539,7 @@ def run_benchmark(
         framework: Agent framework to use
         domain: MACS domain (travel, mortgage, or software)
         limit: Maximum number of tasks to run (None for all)
+        task_id: Specific task ID to run (for debugging)
         n_task_repeats: Number of times to repeat each task
         output_dir: Directory for results (default: examples/results/)
 
@@ -526,6 +561,14 @@ def run_benchmark(
     # Load data
     print(f"Loading {domain} domain tasks...")
     tasks = load_tasks(domain, limit=limit)
+
+    # Filter to specific task if requested
+    if task_id:
+        tasks = [t for t in tasks if str(t.id) == task_id]
+        if not tasks:
+            raise ValueError(f"Task with ID '{task_id}' not found in {domain} domain")
+        print(f"Running single task: {task_id}")
+
     agent_config = load_agent_config(domain)
 
     # Print agent hierarchy info
@@ -584,13 +627,16 @@ def main():
         epilog="""
 Examples:
     # Run with smolagents on travel domain
-    python examples/macs_benchmark.py --framework smolagents --domain travel
+    uv run python examples/macs_benchmark.py --framework smolagents --domain travel
 
     # Run with langgraph on mortgage domain, limited to 5 tasks
-    python examples/macs_benchmark.py --framework langgraph --domain mortgage --limit 5
+    uv run python examples/macs_benchmark.py --framework langgraph --domain mortgage --limit 5
+
+    # Run a single task by ID for debugging
+    uv run python examples/macs_benchmark.py --framework smolagents --domain travel --task-id task_001
 
     # Run with 3 repetitions per task
-    python examples/macs_benchmark.py --framework smolagents --domain software --repeats 3
+    uv run python examples/macs_benchmark.py --framework smolagents --domain software --repeats 3
         """,
     )
 
@@ -614,6 +660,12 @@ def main():
         default=None,
         help="Maximum number of tasks to run (default: all)",
     )
+    parser.add_argument(
+        "--task-id",
+        type=str,
+        default=None,
+        help="Run a single task by ID (for debugging)",
+    )
     parser.add_argument(
         "--repeats",
         type=int,
@@ -633,6 +685,7 @@ def main():
         framework=args.framework,
         domain=args.domain,
         limit=args.limit,
+        task_id=args.task_id,
         n_task_repeats=args.repeats,
         output_dir=args.output_dir,
     )
diff --git a/maseval/benchmark/macs/__init__.py b/maseval/benchmark/macs/__init__.py
index 358fe476..f3dd47c8 100644
--- a/maseval/benchmark/macs/__init__.py
+++ b/maseval/benchmark/macs/__init__.py
@@ -13,7 +13,7 @@
     MACSEnvironment,
     MACSEvaluator,
     MACSGenericTool,
-    MACSUserSimulator,
+    MACSUser,
     compute_benchmark_metrics,
 )
 from .data_loader import (
@@ -34,7 +34,7 @@
     "MACSEnvironment",
     "MACSEvaluator",
     "MACSGenericTool",
-    "MACSUserSimulator",
+    "MACSUser",
     # Data loading
     "load_tasks",
     "load_agent_config",
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index a04b2aac..6c3cc840 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -355,7 +355,7 @@ def _compute_gsr(self, report: List[Dict[str, Any]]) -> Tuple[float, float]:
 # =============================================================================
 
 
-class MACSUserSimulator(User):
+class MACSUser(User):
     """MACS-specific user simulator with conversation limits.
 
     Extends the base User class with MACS-specific behavior:
@@ -420,7 +420,7 @@ def get_tool(self) -> Any:
             NotImplementedError: Always, as this must be implemented by subclass.
         """
         raise NotImplementedError(
-            "MACSUserSimulator.get_tool() must be overridden by framework-specific subclass. "
+            "MACSUser.get_tool() must be overridden by framework-specific subclass. "
             "Use SmolAgentMACSUser for smolagents or LangGraphMACSUser for langgraph."
         )
 
@@ -649,13 +649,13 @@ def setup_user(
         agent_data: Dict[str, Any],
         environment: Environment,
         task: Task,
-    ) -> MACSUserSimulator:
+    ) -> MACSUser:
         """Create MACS user simulator.
 
-        Creates a MACSUserSimulator with scenario and query from the task.
+        Creates a MACSUser with scenario and query from the task.
         The user profile is automatically extracted from the scenario text.
 
-        Note: MACSUserSimulator.get_tool() raises NotImplementedError.
+        Note: MACSUser.get_tool() raises NotImplementedError.
         Framework-specific subclasses in examples should wrap this user
         or override setup_user() to return a user with get_tool() implemented.
 
@@ -665,10 +665,10 @@ def setup_user(
             task: Current task with scenario and user profile
 
         Returns:
-            MACSUserSimulator instance
+            MACSUser instance
         """
         scenario = task.metadata.get("scenario", "")
-        return MACSUserSimulator(
+        return MACSUser(
             model=self._model,
             scenario=scenario,
             initial_prompt=task.query,

From 6aa222ce6dd7bfb6d8d9f82de2dab9e4070727a1 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 13:15:46 +0000
Subject: [PATCH 05/34] updated docs with new benchmark

---
 docs/benchmark/index.md       | 11 +++++++++
 docs/benchmark/macs.md        | 42 +++++++++++++++++++++++++++++++++++
 docs/examples/index.md        |  9 ++++++++
 docs/guides/index.md          |  8 +++++++
 docs/reference/index.md       |  9 ++++++++
 maseval/__init__.py           |  4 ++++
 maseval/benchmark/README.md   |  1 +
 maseval/benchmark/__init__.py | 31 ++++++++++++++++++++++++++
 mkdocs.yml                    | 16 +++++++------
 9 files changed, 124 insertions(+), 7 deletions(-)
 create mode 100644 docs/benchmark/index.md
 create mode 100644 docs/benchmark/macs.md
 create mode 100644 docs/examples/index.md
 create mode 100644 docs/guides/index.md
 create mode 100644 docs/reference/index.md
 create mode 100644 maseval/benchmark/README.md
 create mode 100644 maseval/benchmark/__init__.py

diff --git a/docs/benchmark/index.md b/docs/benchmark/index.md
new file mode 100644
index 00000000..2a2bb181
--- /dev/null
+++ b/docs/benchmark/index.md
@@ -0,0 +1,11 @@
+# Benchmarks
+
+MASEval includes pre-implemented benchmarks for evaluating multi-agent systems.
+
+## Adding Custom Benchmarks
+
+You can also create your own benchmarks by subclassing the [`Benchmark`](../reference/benchmark.md) class. See the [Five-a-Day example](../examples/five_a_day_benchmark.ipynb) for a complete walkthrough.
+
+## Licensing
+
+For detailed source and licensing information for each benchmark's data, see [BENCHMARKS.md](https://github.com/parameterlab/MASEval/blob/main/BENCHMARKS.md).
diff --git a/docs/benchmark/macs.md b/docs/benchmark/macs.md
new file mode 100644
index 00000000..88f289b8
--- /dev/null
+++ b/docs/benchmark/macs.md
@@ -0,0 +1,42 @@
+# MACS: Multi-Agent Collaboration Scenarios
+
+The **Multi-Agent Collaboration Scenarios (MACS)** benchmark evaluates how well multi-agent systems collaborate to solve complex enterprise tasks across multiple domains.
+
+## Overview
+
+[Multi-Agent Collaboration Scenarios (MACS)](https://arxiv.org/abs/2412.05449) is designed to test collaborative problem-solving in realistic enterprise scenarios. The benchmark includes tasks spanning multiple domains such as travel planning, retail, and more. Each task involves multiple agents that must coordinate their actions to achieve user goals.
+
+Check out the [BENCHMARKS.md](https://github.com/parameterlab/MASEval/blob/main/BENCHMARKS.md) file for more information including licenses.
+
+## Quick Start
+
+```python
+from maseval.benchmark.macs import (
+    MACSBenchmark, MACSEnvironment, MACSEvaluator, MACSGenericTool,
+    load_tasks, load_agent_config,
+)
+
+# Load data
+tasks = load_tasks("travel", limit=5)
+agent_config = load_agent_config("travel")
+
+# Create your framework-specific benchmark subclass
+class MyMACSBenchmark(MACSBenchmark):
+    def setup_agents(self, agent_data, environment, task, user):
+        # Your framework-specific agent creation
+        ...
+
+# Run
+benchmark = MyMACSBenchmark(agent_data=agent_config, model=my_model)
+results = benchmark.run(tasks)
+```
+
+::: maseval.benchmark.macs.MACSBenchmark
+
+::: maseval.benchmark.macs.MACSUser
+
+::: maseval.benchmark.macs.MACSEnvironment
+
+::: maseval.benchmark.macs.MACSEvaluator
+
+::: maseval.benchmark.macs.MACSGenericTool
diff --git a/docs/examples/index.md b/docs/examples/index.md
new file mode 100644
index 00000000..a1c58768
--- /dev/null
+++ b/docs/examples/index.md
@@ -0,0 +1,9 @@
+# Examples
+
+Learn MASEval through hands-on examples covering common use cases and benchmarks.
+
+| Example                                                                                                                             | Description                                             |
+| ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| [Tutorial](tutorial.ipynb)                                                                                                          | Introduction to MASEval's core concepts and basic usage |
+| [Five-a-Day Benchmark](five_a_day_benchmark.ipynb)                                                                                  | Building a custom benchmark from scratch                |
+| [Multi-Agent Collaboration Scenario Benchmark (MACS)](https://github.com/parameterlab/MASEval/blob/main/examples/macs_benchmark.py) | An adaptation of the `maseval.benchmark.MACSBenchmark`. |
diff --git a/docs/guides/index.md b/docs/guides/index.md
new file mode 100644
index 00000000..531e81dc
--- /dev/null
+++ b/docs/guides/index.md
@@ -0,0 +1,8 @@
+# Guides
+
+Guides provide an in-depth exploration of MASEval's features and best practices.
+
+| Guide                                          | Description                                                   |
+| ---------------------------------------------- | ------------------------------------------------------------- |
+| [Message Tracing](message-tracing.md)          | Capture and inspect agent conversations during benchmark runs |
+| [Configuration Gathering](config-gathering.md) | Collect and export configuration for reproducibility          |
diff --git a/docs/reference/index.md b/docs/reference/index.md
new file mode 100644
index 00000000..617d96cc
--- /dev/null
+++ b/docs/reference/index.md
@@ -0,0 +1,9 @@
+# Reference
+
+The reference contains three subsections.
+
+**Core:** Reference for the core `maseval` functionality.
+
+**Interfaces:** Reference for the interfaces that are implemented. For example, for different agentic frameworks.
+
+**Benchmarks:** Reference for implementations of standard benchmarks.
diff --git a/maseval/__init__.py b/maseval/__init__.py
index c6232626..e81693a2 100644
--- a/maseval/__init__.py
+++ b/maseval/__init__.py
@@ -2,6 +2,10 @@
 
 Expose a small, stable surface area for users to import core abstractions directly from `maseval`,
 for example: `from maseval import Task, Benchmark`.
+
+Core library sits in the top namespace for easy access.
+Interfaces sit in the `maseval.interface` submodule.
+Benchmarks sit in the `maseval.benchmark` submodule.
 """
 
 from .core.task import Task, TaskCollection
diff --git a/maseval/benchmark/README.md b/maseval/benchmark/README.md
new file mode 100644
index 00000000..d7de5173
--- /dev/null
+++ b/maseval/benchmark/README.md
@@ -0,0 +1 @@
+In here, we implement standard benchmarks using the `maseval.core` features. The users of the library can then adapt these to their needs.
diff --git a/maseval/benchmark/__init__.py b/maseval/benchmark/__init__.py
new file mode 100644
index 00000000..30afe399
--- /dev/null
+++ b/maseval/benchmark/__init__.py
@@ -0,0 +1,31 @@
+"""MASEval Benchmarks.
+
+This module provides benchmark implementations for evaluating multi-agent systems.
+
+Available benchmarks:
+    - macs: Multi-Agent Collaboration Scenarios (AWS MACS benchmark)
+"""
+
+from .macs import (
+    MACSBenchmark,
+    MACSEnvironment,
+    MACSEvaluator,
+    MACSGenericTool,
+    MACSUser,
+    compute_benchmark_metrics,
+    load_tasks,
+    load_agent_config,
+    ensure_data_exists,
+)
+
+__all__ = [
+    "MACSBenchmark",
+    "MACSEnvironment",
+    "MACSEvaluator",
+    "MACSGenericTool",
+    "MACSUser",
+    "compute_benchmark_metrics",
+    "load_tasks",
+    "load_agent_config",
+    "ensure_data_exists",
+]
diff --git a/mkdocs.yml b/mkdocs.yml
index 4abbbd79..6864f49d 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -34,6 +34,7 @@ theme:
   features:
     - navigation.sections
     - content.code.copy
+    - navigation.indexes
 
 extra_css:
   - css/mkdocstrings.css
@@ -80,17 +81,19 @@ plugins:
               show_private: false
 
 nav:
+  - Home: index.md
   - Getting Started:
-      - Home: index.md
       - Quickstart: getting-started/quickstart.md
       - FAQ: getting-started/faq.md
   - Guides:
+      - guides/index.md
       - Message Tracing: guides/message-tracing.md
   - Examples:
-      - Example 1: examples/tutorial.ipynb
-      - Example 2: examples/five_a_day_benchmark.ipynb
-      # - AWS Collaboration Benchmark: examples/amazon_collab.ipynb
+      - examples/index.md
+      - Tiny Tutorial: examples/tutorial.ipynb
+      - 5-A-Day Benchmark: examples/five_a_day_benchmark.ipynb
   - Reference:
+      - reference/index.md
       - Core:
           - Agent: reference/agent.md
           - Benchmark: reference/benchmark.md
@@ -113,6 +116,5 @@ nav:
               - HuggingFace: interface/inference/huggingface.md
               - LiteLLM: interface/inference/litellm.md
               - OpenAI: interface/inference/openai.md
-      # - Benchmarks:
-      # - GAIA: benchmarks/gaia.md
-      # - AgentBench: benchmarks/agentbench.md
+      - Benchmarks:
+          - MACS: benchmark/macs.md

From da47e960d91f65426c831ed93cb8eaeefee6cded Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 21:21:13 +0000
Subject: [PATCH 06/34] refactored environment tool storage

---
 CHANGELOG.md                                  |  4 ++
 .../five_a_day_benchmark.ipynb                | 18 +++---
 .../five_a_day_benchmark.py                   | 57 +++++++++--------
 .../five_a_day_benchmark/tools/__init__.py    | 18 +++---
 examples/introduction/tutorial.ipynb          | 10 +--
 examples/macs_benchmark.py                    | 62 ++++++++-----------
 maseval/benchmark/macs/macs.py                | 62 +++++++++----------
 maseval/core/environment.py                   | 43 ++++++++-----
 tests/conftest.py                             |  4 +-
 tests/test_core/test_environment.py           | 18 +++---
 10 files changed, 151 insertions(+), 145 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb89e532..9820c5c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- **BREAKING:** `Environment.create_tools()` now returns `Dict[str, Any]` instead of `list`
+  - `get_tools()` returns a dict keyed by tool name
+  - Added `get_tool(name)` method for single-tool lookup
+  - Removed internal `_tools_dict` attribute (tools dict is now the source of truth)
 - Documentation formatting improved. Added darkmode and links to `Github` (PR: #11).
 - `FileResultLogger` now accepts `pathlib.Path` for argument `output_dir` and has an `overwrite` argument to prevent overwriting of existing logs files.
 - `Benchmark` class now has a `fail_on_setup_error` flag that raises errors observed during setup of task (PR: #10)
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
index fc439e8b..8f8c81bc 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
@@ -389,7 +389,7 @@
     "\n",
     "    primary_agent_id = agent_data[\"primary_agent_id\"]\n",
     "    agents_specs = agent_data[\"agents\"]\n",
-    "    all_tool_adapters = environment.get_tools()\n",
+    "    all_tool_adapters = environment.get_tools()  # Now returns Dict[str, Any]\n",
     "\n",
     "    # Build specialists first\n",
     "    specialist_agents = []\n",
@@ -400,7 +400,7 @@
     "        seed = agent_spec.get(\"seed\")\n",
     "        model = get_model(model_id, temperature, seed)\n",
     "        spec_tool_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, agent_spec[\"tools\"])\n",
-    "        spec_tools = [adapter.tool for adapter in spec_tool_adapters]\n",
+    "        spec_tools = [adapter.tool for adapter in spec_tool_adapters.values()]\n",
     "        spec_tools.append(FinalAnswerTool())\n",
     "\n",
     "        specialist = ToolCallingAgent(\n",
@@ -484,9 +484,9 @@
     "\n",
     "        return env_data\n",
     "\n",
-    "    def create_tools(self) -> list:\n",
-    "        \"\"\"Create and convert tools to framework-specific format.\"\"\"\n",
-    "        tools_list = []\n",
+    "    def create_tools(self) -> Dict[str, Any]:\n",
+    "        \"\"\"Create and convert tools to framework-specific format, keyed by name.\"\"\"\n",
+    "        tools_dict: Dict[str, Any] = {}\n",
     "\n",
     "        # Map tool names to their collection classes\n",
     "        tool_mapping = {\n",
@@ -510,9 +510,10 @@
     "                # Get base tools and convert to framework format\n",
     "                for base_tool in tool_instance.get_sub_tools():\n",
     "                    framework_tool = base_tool.to_smolagents()\n",
-    "                    tools_list.append(framework_tool)\n",
+    "                    tool_key = getattr(base_tool, \"name\", None) or str(type(base_tool).__name__)\n",
+    "                    tools_dict[tool_key] = framework_tool\n",
     "\n",
-    "        return tools_list"
+    "        return tools_dict"
    ]
   },
   {
@@ -631,8 +632,7 @@
     "        environment = FiveADayEnvironment(task_data)\n",
     "\n",
     "        # Register all tools for tracing\n",
-    "        for tool_adapter in environment.get_tools():\n",
-    "            tool_name = getattr(tool_adapter, \"name\", str(type(tool_adapter).__name__))\n",
+    "        for tool_name, tool_adapter in environment.get_tools().items():\n",
     "            self.register(\"tools\", tool_name, tool_adapter)\n",
     "\n",
     "        return environment\n",
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.py b/examples/five_a_day_benchmark/five_a_day_benchmark.py
index d47f6f8a..1c7c5115 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.py
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.py
@@ -159,15 +159,13 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
 
         return env_data
 
-    def create_tools(self) -> list:
+    def create_tools(self) -> Dict[str, Any]:
         """Create tool instances from environment_data and convert to framework-specific types.
 
-        The base Environment class stores tools in self._tools_dict for tracing.
-
         Returns:
-            List of framework-specific tool objects (smolagents Tool, LangChain StructuredTool, etc.)
+            Dict mapping tool names to framework-specific tool objects
         """
-        tools_list = []
+        tools_dict: Dict[str, Any] = {}
 
         # Map tool names to tool collection classes and their initialization data
         tool_mapping = {
@@ -199,9 +197,11 @@ def create_tools(self) -> list:
                 # Convert each base tool to framework-specific tool
                 for base_tool in base_tools:
                     framework_tool = self._convert_tool(base_tool)
-                    tools_list.append(framework_tool)
+                    # Use the base tool's name as the key
+                    tool_key = getattr(base_tool, "name", None) or str(type(base_tool).__name__)
+                    tools_dict[tool_key] = framework_tool
 
-        return tools_list
+        return tools_dict
 
     def _convert_tool(self, base_tool):
         """Convert BaseTool to framework-specific tool adapter.
@@ -233,7 +233,7 @@ def _convert_tool(self, base_tool):
 def build_smolagents_single_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -242,7 +242,7 @@ def build_smolagents_single_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: Empty list for single-agent (ignored)
 
@@ -255,7 +255,7 @@ def build_smolagents_single_agent(
     seed = primary_spec.get("seed")
     model = get_model(model_id, "smolagents", temperature, seed)
     tool_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
-    tools = [adapter.tool for adapter in tool_adapters]
+    tools = [adapter.tool for adapter in tool_adapters.values()]
     sanitized_name = sanitize_name(primary_spec["agent_name"])
 
     agent = ToolCallingAgent(
@@ -272,7 +272,7 @@ def build_smolagents_single_agent(
 def build_langgraph_single_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -281,7 +281,7 @@ def build_langgraph_single_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: Empty list for single-agent (ignored)
 
@@ -298,7 +298,7 @@ def build_langgraph_single_agent(
     seed = primary_spec.get("seed")
     model = get_model(model_id, "langgraph", temperature, seed)
     tool_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
-    tools = [adapter.tool for adapter in tool_adapters]
+    tools = [adapter.tool for adapter in tool_adapters.values()]
 
     class AgentState(TypedDict):
         messages: Annotated[List[Any], add_messages]
@@ -329,7 +329,7 @@ def call_model(state: AgentState):
 def build_llamaindex_single_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -338,7 +338,7 @@ def build_llamaindex_single_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: Empty list for single-agent (ignored)
 
@@ -351,7 +351,7 @@ def build_llamaindex_single_agent(
     seed = primary_spec.get("seed")
     model = get_model(model_id, "llamaindex", temperature, seed)
     tool_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
-    tools = [adapter.tool for adapter in tool_adapters]
+    tools = [adapter.tool for adapter in tool_adapters.values()]
 
     agent = ReActAgent(
         tools=tools,
@@ -367,7 +367,7 @@ def build_llamaindex_single_agent(
 def build_smolagents_multi_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -376,7 +376,7 @@ def build_smolagents_multi_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: List of specialist agent specifications
 
@@ -391,7 +391,7 @@ def build_smolagents_multi_agent(
         specialist_seed = agent_spec.get("seed")
         specialist_model = get_model(model_id, "smolagents", temperature, specialist_seed)
         specialist_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, agent_spec["tools"])
-        specialist_tools = [adapter.tool for adapter in specialist_adapters]
+        specialist_tools = [adapter.tool for adapter in specialist_adapters.values()]
         specialist_tools.append(FinalAnswerTool())
         sanitized_name = sanitize_name(agent_spec["agent_name"])
 
@@ -406,7 +406,7 @@ def build_smolagents_multi_agent(
         specialist_agents.append(specialist)
 
     primary_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
-    primary_tools = [adapter.tool for adapter in primary_adapters]
+    primary_tools = [adapter.tool for adapter in primary_adapters.values()]
     primary_tools.append(FinalAnswerTool())
     sanitized_primary_name = sanitize_name(primary_spec["agent_name"])
     primary_seed = primary_spec.get("seed")
@@ -427,7 +427,7 @@ def build_smolagents_multi_agent(
 def build_langgraph_multi_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -436,7 +436,7 @@ def build_langgraph_multi_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: List of specialist agent specifications
 
@@ -462,7 +462,7 @@ class MultiAgentState(TypedDict):
         specialist_seed = agent_spec.get("seed")
         specialist_model = get_model(model_id, "langgraph", temperature, specialist_seed)
         specialist_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, agent_spec["tools"])
-        specialist_tools = [adapter.tool for adapter in specialist_adapters]
+        specialist_tools = [adapter.tool for adapter in specialist_adapters.values()]
 
         def make_specialist_node(spec_instruction, spec_tools, spec_model):
             def specialist_node(state: MultiAgentState):
@@ -584,7 +584,7 @@ def route_after_orchestrator(state: MultiAgentState):
 def build_llamaindex_multi_agent(
     model_id: str,
     temperature: float,
-    all_tool_adapters: List[Any],
+    all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
 ) -> Any:
@@ -593,7 +593,7 @@ def build_llamaindex_multi_agent(
     Args:
         model_id: Model identifier
         temperature: Model temperature
-        all_tool_adapters: All available tool adapters
+        all_tool_adapters: All available tool adapters (dict keyed by name)
         primary_spec: Primary agent specification
         specialist_specs: List of specialist agent specifications
 
@@ -613,7 +613,7 @@ def build_llamaindex_multi_agent(
         specialist_seed = agent_spec.get("seed")
         specialist_model = get_model(model_id, "llamaindex", temperature, specialist_seed)
         specialist_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, agent_spec["tools"])
-        specialist_tools = [adapter.tool for adapter in specialist_adapters]
+        specialist_tools = [adapter.tool for adapter in specialist_adapters.values()]
 
         specialist_agent = ReActAgent(
             tools=specialist_tools,
@@ -652,7 +652,7 @@ async def run_specialist():
 
     orchestrator_tools = [make_handoff_tool(spec_id, spec_info) for spec_id, spec_info in specialist_agents_dict.items()]
     primary_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
-    primary_tools = [adapter.tool for adapter in primary_adapters]
+    primary_tools = [adapter.tool for adapter in primary_adapters.values()]
     orchestrator_tools.extend(primary_tools)
 
     primary_seed = primary_spec.get("seed")
@@ -715,8 +715,7 @@ def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environme
         environment = FiveADayEnvironment(task_data, framework)
 
         # Register all tools with the benchmark for tracing
-        for tool_adapter in environment.get_tools():
-            tool_name = getattr(tool_adapter, "name", None) or str(type(tool_adapter).__name__)
+        for tool_name, tool_adapter in environment.get_tools().items():
             self.register("tools", tool_name, tool_adapter)
 
         return environment
diff --git a/examples/five_a_day_benchmark/tools/__init__.py b/examples/five_a_day_benchmark/tools/__init__.py
index b2a3db0c..f7da2f57 100644
--- a/examples/five_a_day_benchmark/tools/__init__.py
+++ b/examples/five_a_day_benchmark/tools/__init__.py
@@ -116,24 +116,24 @@ def _create_mcp_calendar_state(calendar_name: str, env_data: Dict[str, Any], ava
     return MCPCalendarState(calendar_name, {"events": events})
 
 
-def filter_tool_adapters_by_prefix(adapters: List[Any], tool_names: List[str]) -> List[Any]:
-    """Filter tool adapters by exact name.
+def filter_tool_adapters_by_prefix(adapters: Dict[str, Any], tool_names: List[str]) -> Dict[str, Any]:
+    """Filter tool adapters by exact name prefix.
 
     Args:
-        adapters: List of tool adapters to filter
+        adapters: Dict of tool adapters to filter, keyed by tool name
         tool_names: Tool/collection names to match (e.g., ["banking", "calculator"])
 
     Returns:
-        Filtered list of adapters matching the specified names
+        Filtered dict of adapters matching the specified names
     """
     if not tool_names:
-        return []
+        return {}
 
-    filtered = []
-    for adapter in adapters:
+    filtered: Dict[str, Any] = {}
+    for adapter_name, adapter in adapters.items():
         for tool_name in tool_names:
-            if adapter.name.startswith(f"{tool_name}_"):
-                filtered.append(adapter)
+            if adapter_name.startswith(f"{tool_name}_"):
+                filtered[adapter_name] = adapter
                 break
 
     return filtered
diff --git a/examples/introduction/tutorial.ipynb b/examples/introduction/tutorial.ipynb
index 69a2f9f3..3b4b0846 100644
--- a/examples/introduction/tutorial.ipynb
+++ b/examples/introduction/tutorial.ipynb
@@ -387,8 +387,8 @@
     "        \"\"\"Initialize environment state from task data.\"\"\"\n",
     "        return task_data.copy()\n",
     "\n",
-    "    def create_tools(self) -> list:\n",
-    "        \"\"\"Create tool instances from environment data.\"\"\"\n",
+    "    def create_tools(self) -> Dict[str, Any]:\n",
+    "        \"\"\"Create tool instances from environment data, keyed by name.\"\"\"\n",
     "        # Get banking transactions from environment data\n",
     "        transactions = self.state.get(\"banking\", {}).get(\"bank_transactions\", [])\n",
     "\n",
@@ -397,7 +397,7 @@
     "        banking_tool = SimpleBankingTool(transactions=transactions)\n",
     "        email_tool = SimpleEmailTool(sent_emails=self.sent_emails)\n",
     "\n",
-    "        return [banking_tool, email_tool]\n",
+    "        return {\"get_bank_transactions\": banking_tool, \"send_email\": email_tool}\n",
     "\n",
     "\n",
     "print(\"Environment class defined!\")"
@@ -527,9 +527,9 @@
     "        # Initialize model\n",
     "        model = LiteLLMModel(model_id=\"gemini/gemini-2.5-flash\", api_key=os.getenv(\"GOOGLE_API_KEY\"), temperature=0.7)\n",
     "\n",
-    "        # Create agent with environment tools\n",
+    "        # Create agent with environment tools (convert dict values to list for smolagents)\n",
     "        agent = ToolCallingAgent(\n",
-    "            tools=environment.get_tools(),\n",
+    "            tools=list(environment.get_tools().values()),\n",
     "            model=model,\n",
     "            instructions=\"\"\"You are a helpful assistant. Help users with email and banking tasks \n",
     "by using the available tools to retrieve information and take appropriate actions. \n",
diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark.py
index 32c267b0..4b06212c 100644
--- a/examples/macs_benchmark.py
+++ b/examples/macs_benchmark.py
@@ -58,6 +58,7 @@
 
 from maseval.benchmark.macs import (
     MACSBenchmark,
+    MACSEnvironment,
     MACSGenericTool,
     MACSUser,
     compute_benchmark_metrics,
@@ -126,7 +127,7 @@ def get_tool(self):
 
         class UserInputTool(SmolagentsTool):
             name = "user_input"
-            description = "Ask the user a question to clarify their request or get additional information."
+            description = "Asks for user's input on a specific question."
             inputs = {"question": {"type": "string", "description": "The question to ask the user."}}
             output_type = "string"
 
@@ -158,7 +159,7 @@ def setup_user(
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
         task: Task,
         user: Optional[User],
     ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
@@ -180,24 +181,18 @@ def setup_agents(
         agent_lookup = {a["agent_id"]: a for a in agents_config}
         primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
-        # Wrap all generic tools for smolagents
-        generic_tools = environment.create_tools()
-        tool_lookup = {t.name: SmolagentsToolWrapper(t) for t in generic_tools}
+        # Wrap all generic tools for smolagents and register them for tracing
+        tool_wrappers: Dict[str, SmolagentsToolWrapper] = {}
+        for name, tool in environment.tools.items():
+            wrapper = SmolagentsToolWrapper(tool)
+            tool_wrappers[name] = wrapper
+            self.register("tools", name, wrapper)
 
         # Helper to get tools for an agent
-        def get_agent_tools(agent_spec: Dict[str, Any]) -> List[Any]:
-            """Get wrapped tools for an agent based on tool group names."""
-            tool_groups = agent_spec.get("tools", [])
-            tools = []
-            for tool_group in tool_groups:
-                # Find actions in this tool group
-                for tool_spec in environment.state.get("tool_specs", []):
-                    if tool_spec.get("tool_name") == tool_group:
-                        for action in tool_spec.get("actions", []):
-                            action_name = action.get("name")
-                            if action_name and action_name in tool_lookup:
-                                tools.append(tool_lookup[action_name])
-            return tools
+        def get_agent_tools(agent_spec: Dict[str, Any]) -> List[SmolagentsTool]:
+            """Get wrapped tools for an agent based on its tool groups."""
+            agent_tools = environment.get_tools_for_agent(agent_spec)
+            return [tool_wrappers[name] for name in agent_tools if name in tool_wrappers]
 
         # Recursive function to build agent hierarchy
         def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
@@ -205,7 +200,7 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
             agent_spec = agent_lookup.get(agent_id, {})
 
             # Get this agent's tools
-            agent_tools = get_agent_tools(agent_spec)
+            agent_tools: List[SmolagentsTool] = get_agent_tools(agent_spec)
             agent_tools.append(FinalAnswerTool())
 
             # Build managed agents from reachable_agents
@@ -225,7 +220,7 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
                 managed_agents=managed_agents if managed_agents else None,
                 name=agent_spec.get("agent_name", agent_id),
                 description=agent_spec.get("agent_instruction", ""),
-                max_steps=15,  # Allow more steps for multi-agent coordination
+                max_steps=25,  # Allow more steps for complex multi-agent tasks
                 verbosity_level=0,
             )
 
@@ -279,7 +274,7 @@ def get_tool(self):
         """Return a LangGraph-compatible user input tool."""
 
         def user_input(question: str) -> str:
-            """Ask the user a question and get their response."""
+            """Ask the user a question to understand their complete requirements."""
             return self.simulate_response(question)
 
         return StructuredTool.from_function(
@@ -316,7 +311,7 @@ def setup_user(
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
         task: Task,
         user: Optional[User],
     ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
@@ -335,23 +330,18 @@ def setup_agents(
         agent_lookup = {a["agent_id"]: a for a in agents_config}
         primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
-        # Wrap all generic tools
-        generic_tools = environment.create_tools()
-        tool_lookup = {t.name: LangGraphToolWrapper(t) for t in generic_tools}
+        # Wrap all generic tools and register for tracing
+        tool_wrappers: Dict[str, LangGraphToolWrapper] = {}
+        for name, tool in environment.tools.items():
+            wrapper = LangGraphToolWrapper(tool)
+            tool_wrappers[name] = wrapper
+            self.register("tools", name, wrapper)
 
         # Helper to get tools for an agent
         def get_agent_tools(agent_spec: Dict[str, Any]) -> List[StructuredTool]:
-            """Get wrapped tools for an agent based on tool group names."""
-            tool_groups = agent_spec.get("tools", [])
-            tools = []
-            for tool_group in tool_groups:
-                for tool_spec in environment.state.get("tool_specs", []):
-                    if tool_spec.get("tool_name") == tool_group:
-                        for action in tool_spec.get("actions", []):
-                            action_name = action.get("name")
-                            if action_name and action_name in tool_lookup:
-                                tools.append(tool_lookup[action_name].tool)
-            return tools
+            """Get wrapped tools for an agent based on its tool groups."""
+            agent_tools = environment.get_tools_for_agent(agent_spec)
+            return [tool_wrappers[name].tool for name in agent_tools if name in tool_wrappers]
 
         # Build agent graph recursively
         def build_agent_graph(agent_id: str) -> StateGraph:
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index 6c3cc840..480012d1 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -372,6 +372,7 @@ class MACSUser(User):
 
     DEFAULT_MAX_TURNS = 5
     STOP_TOKEN = "</stop>"
+    TEMPLATE_PATH = Path(__file__).parent / "prompt_templates" / "user_simulator.txt"
 
     def __init__(
         self,
@@ -389,9 +390,13 @@ def __init__(
             scenario: Full scenario text (contains goals and user background)
             initial_prompt: The initial query to the agent
             name: User name for identification (default: "Simulated User")
-            template: Optional custom prompt template (uses base User's default)
+            template: Optional custom prompt template (uses MACS-specific default)
             max_turns: Maximum conversation turns (default: 5, per MACS paper)
         """
+        # Load MACS-specific user simulator template if not provided
+        if template is None and self.TEMPLATE_PATH.exists():
+            template = self.TEMPLATE_PATH.read_text()
+
         # Extract user profile from scenario text
         user_profile = self._extract_user_profile(scenario)
 
@@ -528,7 +533,7 @@ class MACSEnvironment(Environment):
     """Unified environment for all MACS domains.
 
     Creates MACSGenericTool instances from task's environment_data.
-    Users can override to convert tools to their framework format.
+    Tools are stored in a dict keyed by name for efficient lookup.
     """
 
     def __init__(
@@ -553,50 +558,39 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
             "tool_specs": task_data.get("environment_data", {}).get("tools", []),
         }
 
-    def create_tools(self) -> List[MACSGenericTool]:
-        """Create framework-agnostic tools from specifications."""
-        tools = []
-        seen = set()
+    def create_tools(self) -> Dict[str, MACSGenericTool]:  # type: ignore[override]
+        """Create tools from task specifications.
 
+        Returns:
+            Dict mapping tool names to MACSGenericTool instances
+        """
+        tools: Dict[str, MACSGenericTool] = {}
         for tool_group in self.state["tool_specs"]:
             for action in tool_group.get("actions", []):
                 name = action.get("name")
-                if name and name not in seen:
-                    tools.append(MACSGenericTool(action, self._model))
-                    seen.add(name)
-
+                if name and name not in tools:
+                    tools[name] = MACSGenericTool(action, self._model)
         return tools
 
-    def get_tools_by_group(self, group_names: List[str]) -> List[MACSGenericTool]:
-        """Get tools belonging to specified tool groups.
+    def get_tools_for_agent(self, agent_spec: Dict[str, Any]) -> Dict[str, MACSGenericTool]:
+        """Get tools for a specific agent based on its configuration.
 
         Args:
-            group_names: List of tool group names (e.g., ["Weather", "BookFlight"])
+            agent_spec: Agent specification dict with 'tools' key containing tool group names
 
         Returns:
-            List of tools from those groups
+            Dict of MACSGenericTool instances assigned to this agent, keyed by name
         """
-        result = []
+        tool_groups = agent_spec.get("tools", [])
+        result: Dict[str, MACSGenericTool] = {}
         for tool_group in self.state["tool_specs"]:
-            if tool_group.get("tool_name") in group_names:
+            if tool_group.get("tool_name") in tool_groups:
                 for action in tool_group.get("actions", []):
                     name = action.get("name")
-                    if name and name in self._tools_dict:
-                        result.append(self._tools_dict[name])
+                    if name and name in self.tools:
+                        result[name] = self.tools[name]
         return result
 
-    def get_tools_for_agent(self, agent_spec: Dict[str, Any]) -> List[MACSGenericTool]:
-        """Get tools for a specific agent based on its configuration.
-
-        Args:
-            agent_spec: Agent specification dict with 'tools' key containing tool group names
-
-        Returns:
-            List of MACSGenericTool instances assigned to this agent
-        """
-        tool_groups = agent_spec.get("tools", [])
-        return self.get_tools_by_group(tool_groups)
-
 
 # =============================================================================
 # Benchmark
@@ -647,7 +641,7 @@ def setup_environment(
     def setup_user(
         self,
         agent_data: Dict[str, Any],
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
         task: Task,
     ) -> MACSUser:
         """Create MACS user simulator.
@@ -678,7 +672,7 @@ def setup_user(
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
         task: Task,
         user: Optional[User],
     ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
@@ -697,7 +691,7 @@ def setup_agents(
 
     def setup_evaluators(
         self,
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
         task: Task,
         agents: Sequence[AgentAdapter],
         user: Optional[User],
@@ -712,7 +706,7 @@ def run_agents(
         self,
         agents: Sequence[AgentAdapter],
         task: Task,
-        environment: Environment,
+        environment: MACSEnvironment,  # type: ignore[override]
     ) -> Any:
         """Execute agents and return final answer."""
         answers = [agent.run(task.query) for agent in agents]
diff --git a/maseval/core/environment.py b/maseval/core/environment.py
index edce3179..3a0887e4 100644
--- a/maseval/core/environment.py
+++ b/maseval/core/environment.py
@@ -7,7 +7,12 @@
 
 
 class Environment(ABC, TraceableMixin, ConfigurableMixin):
-    """Manages the state and tools available during a task execution."""
+    """Manages the state and tools available during a task execution.
+
+    Subclasses must implement:
+    - setup_state(task_data) -> Any: Initialize environment state from task data
+    - create_tools() -> Dict[str, Any]: Create tools keyed by name
+    """
 
     def __init__(self, task_data: Dict[str, Any], callbacks: Optional[List[EnvironmentCallback]] = None):
         super().__init__()
@@ -16,12 +21,6 @@ def __init__(self, task_data: Dict[str, Any], callbacks: Optional[List[Environme
             cb.on_setup_start(self)
         self.state = self.setup_state(task_data)
         self.tools = self.create_tools()
-        # Store tools in a dict for easier lookup during tracing
-        self._tools_dict: Dict[str, Any] = {}
-        if isinstance(self.tools, list):
-            for tool in self.tools:
-                tool_name = getattr(tool, "name", None) or getattr(tool, "__name__", str(type(tool).__name__))
-                self._tools_dict[tool_name] = tool
         for cb in self.callbacks:
             cb.on_setup_end(self)
 
@@ -31,13 +30,29 @@ def setup_state(self, task_data: dict) -> Any:
         pass
 
     @abstractmethod
-    def create_tools(self) -> list:
-        """Creates tools that can interact with the environment's state."""
+    def create_tools(self) -> Dict[str, Any]:
+        """Creates tools that can interact with the environment's state.
+
+        Returns:
+            Dict mapping tool names to tool instances
+        """
         pass
 
-    def get_tools(self) -> list:
+    def get_tools(self) -> Dict[str, Any]:
+        """Get all tools as a dict."""
         return self.tools
 
+    def get_tool(self, name: str) -> Optional[Any]:
+        """Get a tool by name.
+
+        Args:
+            name: Tool name
+
+        Returns:
+            The tool, or None if not found
+        """
+        return self.tools.get(name)
+
     def gather_traces(self) -> dict[str, Any]:
         """Gather execution traces from this environment and its tools.
 
@@ -50,7 +65,7 @@ def gather_traces(self) -> dict[str, Any]:
         """
         tool_traces = {}
 
-        for tool_name, tool in self._tools_dict.items():
+        for tool_name, tool in self.tools.items():
             # Try to gather traces from the tool
             if hasattr(tool, "gather_traces"):
                 tool_traces[tool_name] = tool.gather_traces()
@@ -76,7 +91,7 @@ def gather_traces(self) -> dict[str, Any]:
 
         return {
             **super().gather_traces(),
-            "tool_count": len(self._tools_dict),
+            "tool_count": len(self.tools),
             "tools": tool_traces,
         }
 
@@ -92,6 +107,6 @@ def gather_config(self) -> dict[str, Any]:
         """
         return {
             **super().gather_config(),
-            "tool_count": len(self._tools_dict),
-            "tool_names": list(self._tools_dict.keys()),
+            "tool_count": len(self.tools),
+            "tool_names": list(self.tools.keys()),
         }
diff --git a/tests/conftest.py b/tests/conftest.py
index bf91b126..cfcf2f05 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -130,8 +130,8 @@ class DummyEnvironment(Environment):
     def setup_state(self, task_data: dict) -> Any:
         return task_data.copy()
 
-    def create_tools(self) -> list:
-        return []
+    def create_tools(self) -> dict:
+        return {}
 
 
 class DummyUser(User):
diff --git a/tests/test_core/test_environment.py b/tests/test_core/test_environment.py
index 50ec6946..757595b4 100644
--- a/tests/test_core/test_environment.py
+++ b/tests/test_core/test_environment.py
@@ -19,14 +19,18 @@ def test_environment_setup_state_called(self, dummy_environment):
     def test_environment_create_tools_called(self, dummy_environment):
         """Test that create_tools is called during initialization."""
         assert dummy_environment.tools is not None
-        assert isinstance(dummy_environment.tools, list)
+        assert isinstance(dummy_environment.tools, dict)
 
-    def test_environment_get_tools_returns_list(self, dummy_environment):
-        """Test that get_tools() returns the tools list."""
+    def test_environment_get_tools_returns_dict(self, dummy_environment):
+        """Test that get_tools() returns the tools dict."""
         tools = dummy_environment.get_tools()
-        assert isinstance(tools, list)
+        assert isinstance(tools, dict)
         assert tools is dummy_environment.tools
 
+    def test_environment_get_tool_returns_none_for_missing(self, dummy_environment):
+        """Test that get_tool() returns None for missing tools."""
+        assert dummy_environment.get_tool("nonexistent") is None
+
     def test_environment_callbacks_triggered(self):
         """Test that environment callbacks are triggered."""
         from maseval import EnvironmentCallback
@@ -64,9 +68,9 @@ def test_environment_tool_history_captured(self):
         # Create environment (tools would be added by subclass)
         env = DummyEnvironment({"test": "data"})
 
-        # Verify tools dict is created
-        assert hasattr(env, "_tools_dict")
-        assert isinstance(env._tools_dict, dict)
+        # Verify tools dict is available
+        assert hasattr(env, "tools")
+        assert isinstance(env.tools, dict)
 
     def test_environment_gather_config(self, dummy_environment):
         """Test that gather_config() returns configuration."""

From 644fb5b67bbde4876a3ffe5737b35900122cae9e Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 22:58:08 +0000
Subject: [PATCH 07/34] initial tests for macs

---
 .github/workflows/test.yml                    |  23 +-
 pyproject.toml                                |   1 +
 tests/TESTING_PLAN.md                         | 697 ------------------
 tests/test_benchmarks/test_macs/conftest.py   | 553 ++++++++++++++
 .../test_macs/test_data_loader.py             |  15 +-
 .../test_macs/test_macs_benchmark.py          | 516 +++++++++++++
 .../test_macs/test_macs_environment.py        | 356 +++++++++
 .../test_macs/test_macs_evaluator.py          | 510 +++++++++++++
 .../test_macs/test_macs_integration.py        | 680 +++++++++++++++++
 .../test_macs/test_macs_tool.py               | 296 ++++++++
 .../test_macs/test_macs_user.py               | 507 +++++++++++++
 11 files changed, 3455 insertions(+), 699 deletions(-)
 delete mode 100644 tests/TESTING_PLAN.md
 create mode 100644 tests/test_benchmarks/test_macs/conftest.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_benchmark.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_environment.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_evaluator.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_integration.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_tool.py
 create mode 100644 tests/test_benchmarks/test_macs/test_macs_user.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8ee12536..bd5bbdf6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -29,9 +29,30 @@ jobs:
         run: |
           uv run pytest -m core -v
 
+  test-benchmark:
+    name: Benchmark Tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --group dev
+      - name: Run benchmark tests
+        run: |
+          uv run pytest -m benchmark -v
+
   test-all:
     name: All Tests (With Optional Deps)
-    needs: test-core
+    needs: [test-core, test-benchmark]
     runs-on: ubuntu-latest
     strategy:
       matrix:
diff --git a/pyproject.toml b/pyproject.toml
index 681f5ef2..b19b8ecc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -174,6 +174,7 @@ markers = [
     "core: Core tests that don't require optional dependencies",
     "interface: Tests that require optional dependencies (smolagents, langgraph, etc.)",
     "contract: Cross-implementation contract tests that validate framework-agnostic abstraction",
+    "benchmark: Benchmark-specific tests (MACS, etc.) that test benchmark implementations, not core library",
     "smolagents: Tests that specifically require smolagents",
     "langgraph: Tests that specifically require langgraph",
     "llamaindex: Tests that specifically require llama-index-core",
diff --git a/tests/TESTING_PLAN.md b/tests/TESTING_PLAN.md
deleted file mode 100644
index fc3292af..00000000
--- a/tests/TESTING_PLAN.md
+++ /dev/null
@@ -1,697 +0,0 @@
-# Comprehensive Testing Plan for MASEval
-
-## Implementation Status Summary
-
-**Last Updated:** November 5, 2025
-
-**Overall Progress:** 333 tests implemented across 23 test files
-
-**Test Structure:**
-
-- `tests/test_core/` - Unit tests for core classes (189 tests across 15 files)
-- `tests/test_contract/` - Cross-implementation contract tests (47 tests across 3 files)
-- `tests/test_interface/` - Framework-specific adapter tests (43 tests across 6 files)
-- `tests/test_benchmarks/` - Benchmark-specific tests (0 tests - directory exists but empty)
-
-### Quick Status Legend
-
-- ✅ **Fully Implemented** - All proposed tests completed
-- 🟡 **Partially Implemented** - Some tests implemented, coverage incomplete
-- ❌ **Not Implemented** - Test module not yet created
-
-### Test Categories Summary
-
-| Category            | Status               | Test Count    | Files        |
-| ------------------- | -------------------- | ------------- | ------------ |
-| **Core Tests**      | ✅ Fully Implemented | 189 tests     | 15 files     |
-| **Contract Tests**  | ✅ Fully Implemented | 47 tests      | 3 files      |
-| **Interface Tests** | ✅ Fully Implemented | 43 tests      | 6 files      |
-| **Benchmark Tests** | ❌ Not Implemented   | 0 tests       | 0 files      |
-| **TOTAL**           | **✅ 98% Complete**  | **333 tests** | **23 files** |
-
----
-
-## Executive Summary
-
-After analyzing the entire MASEval codebase, this document proposes a comprehensive testing strategy that focuses on **user-facing functionality** rather than low-level implementation details. This plan guides the development of a robust test suite covering the core orchestration workflows that users depend on.
-
-## Key Library Patterns Identified
-
-MASEval follows these architectural patterns that must be tested:
-
-### 1. **Three-Stage Lifecycle Pattern**
-
-Every benchmark execution follows: **Setup → Run → Evaluate**
-
-- `setup_environment()` → creates isolated task environment
-- `setup_user()` → optional user simulator
-- `setup_agents()` → instantiates agent adapters
-- `run_agents()` → executes multi-agent system
-- Message collection and `evaluate()` → assessment
-
-### 2. **Automatic Component Registration**
-
-Components returned from setup methods are auto-registered for tracing/config:
-
-- Environment → `"environment:env"`
-- User → `"user:user"`
-- Agents → `"agents:{agent_name}"`
-- Prevents duplicate registration, provides helpful error messages
-
-### 3. **Dual Collection System**
-
-- **Traces** (`gather_traces()`): Execution data (messages, calls, timing, tokens)
-- **Config** (`gather_config()`): Reproducibility data (models, params, system info)
-
-### 4. **Framework-Agnostic Adapter Pattern**
-
-Interface adapters (smolagents, langgraph, crewai) convert framework messages to OpenAI-compatible `MessageHistory`:
-
-- Persistent state fetching (smolagents memory)
-- Stateless/cached (langgraph results)
-- Tool calls, multi-modal content preservation
-
-### 5. **Callback-Driven Extensibility**
-
-Lifecycle hooks at every stage:
-
-- `on_run_start/end` - benchmark level
-- `on_task_start/end` - per task (all repeats)
-- `on_task_repeat_start/end` - per individual execution
-- Enables logging, tracing, metrics without modifying core
-
-### 6. **LLM Simulator Pattern**
-
-Base `LLMSimulator` with retry logic and structured history:
-
-- `ToolLLMSimulator` - generates realistic tool responses
-- `UserLLMSimulator` - simulates human interaction
-- Automatic tracking of attempts, parsing errors, token usage
-
-### 7. **Standardized Message History**
-
-`MessageHistory` class provides OpenAI-compatible format with:
-
-- List-like interface (iterable, indexable, sliceable)
-- Multi-modal support (text, images, files, audio)
-- Tool calls and responses
-- Rich metadata and timestamps
-
-## Current Testing Status
-
-### What's Implemented ✅
-
-**Core Tests (189 tests across 15 files) - `tests/test_core/`**
-
-All core functionality is fully tested. See individual test files in `tests/test_core/` for complete test details:
-
-- `test_automatic_registration.py` (6 tests) - Component registration and duplicate detection
-- `test_benchmark_lifecycle.py` (10 tests) - Benchmark execution flow and lifecycle hooks
-- `test_message_history.py` (14 tests) - Message history interface and operations
-- `test_trace_collection.py` (10 tests) - Trace gathering from all components
-- `test_config_collection.py` (11 tests) - Configuration collection for reproducibility
-- `test_agent_adapter.py` (8 tests) - agent adapter base functionality
-- `test_environment.py` (7 tests) - Environment state management and tools
-- `test_user_simulator.py` (5 tests) - User simulation for collaborative benchmarks
-- `test_model_adapter.py` (36 tests) - Model adapter comprehensive testing
-- `test_llm_simulator.py` (6 tests) - LLM simulator retry logic and error handling
-- `test_task_collection.py` (12 tests) - Task collection interface
-- `test_callback_orchestration.py` (6 tests) - Callback firing and ordering
-- `test_evaluator.py` (6 tests) - Evaluator integration
-- `test_message_tracing_callback.py` (11 tests) - Message tracing callback specialized tests
-- `test_callbacks/` (11 tests) - Result logger callbacks (base + file output)
-
-**Contract Tests (47 tests across 3 files) - `tests/test_contract/`**
-
-All contract tests validate cross-implementation consistency. See individual test files in `tests/test_contract/` for complete contract guarantees:
-
-- `test_agent_adapter_contract.py` (11 tests) - Framework-agnostic agent adapter contract
-- `test_collection_contract.py` (20 tests) - Universal tracing and config contract
-- `test_model_adapter_contract.py` (16 tests) - Model provider abstraction contract
-
-**Interface Tests (43 tests across 6 files) - `tests/test_interface/`**
-
-All adapter integration tests are complete. See individual test files in `tests/test_interface/` for complete integration tests:
-
-- `test_optional_imports.py` (6 tests) - Optional dependency handling
-- `test_model_integration/test_model_adapters.py` (22 tests) - OpenAI, Google, HuggingFace, LiteLLM integrations
-- `test_agent_integration/test_smolagents_integration.py` (10 tests) - Smolagents framework integration
-- `test_agent_integration/test_langgraph_integration.py` (5 tests) - LangGraph framework integration
-
-### What's Missing ❌
-
-1. ❌ **Complete benchmark integration test** - End-to-end test with all components working together (partially covered by lifecycle tests)
-2. ❌ **Benchmark-specific tests** - TAU2, Amazon Collab, GAIA implementations (`tests/test_benchmarks/` exists but is empty)
-
-## Proposed Testing Strategy
-
-### Core Tests (No Optional Dependencies)
-
-#### 1. **Benchmark Lifecycle Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 10 tests implemented
-
-Test file: `tests/test_core/test_benchmark_lifecycle.py`
-
-**What is tested:** See test file for complete list. Tests verify complete run execution (single/multiple tasks), task repetitions, lifecycle hook ordering, component cleanup between repeats, and registry management.
-
-**Why:** Users depend on the `run()` method working correctly. This is THE core functionality.
-
-#### 2. **Message History Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 14 tests implemented
-
-Test file: `tests/test_core/test_message_history.py`
-
-**What is tested:** See test file for complete list. Tests cover list-like behavior (iteration, indexing, slicing), tool calls, multi-modal content (images, files, audio), metadata preservation, and conversions.
-
-**Why:** MessageHistory is used throughout the system. Must behave like a list consistently.
-
-#### 3. **Trace Collection Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 10 tests implemented
-
-Test file: `tests/test_core/test_trace_collection.py`
-
-**What is tested:** See test file for complete list. Tests verify that all registered components contribute traces, message histories are included, error resilience, and tracking of model calls, tool invocations, retry attempts, and callback data.
-
-**Why:** Trace collection is the primary value proposition for evaluation.
-
-#### 4. **Config Collection Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 11 tests implemented
-
-Test file: `tests/test_core/test_config_collection.py`
-
-**What is tested:** See test file for complete list. Tests verify that all components contribute configs, benchmark metadata is captured, system/git/package info is included, structure matches spec, and error handling works gracefully.
-
-**Why:** Reproducibility depends on comprehensive config capture.
-
-#### 5. **agent adapter Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 8 tests implemented
-
-Test file: `tests/test_core/test_agent_adapter.py`
-
-**What is tested:** See test file for complete list. Tests cover callback triggering, message history operations (get/set/clear/append), trace collection, and config gathering.
-
-**Why:** AgentAdapter is the interface users implement.
-
-#### 6. **Environment Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 7 tests implemented
-
-Test file: `tests/test_core/test_environment.py`
-
-**What is tested:** See test file for complete list. Tests verify state setup, tool creation/retrieval, callback triggering, tool trace collection, and tool history preservation.
-
-**Why:** Environment manages state and tool access.
-
-#### 7. **User Simulator Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 5 tests implemented
-
-Test file: `tests/test_core/test_user_simulator.py`
-
-**What is tested:** See test file for complete list. Tests cover history updates, bidirectional conversations, interaction traces, profile config, and LLM simulator integration.
-
-**Why:** User simulation is key for collaborative benchmarks.
-
-#### 8. **Model Adapter Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 36 tests implemented
-
-Test file: `tests/test_core/test_model_adapter.py`
-
-**What is tested:** See test file for complete list. Comprehensive test coverage including base contract, generation behavior, error handling, tracing, configuration, and mixin integration patterns.
-
-**Why:** Model adapters track all LLM calls for cost/performance analysis.
-
-#### 9. **LLM Simulator Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 6 tests implemented
-
-Test file: `tests/test_core/test_llm_simulator.py`
-
-**What is tested:** See test file for complete list. Tests verify retry logic, parse error handling, attempt limits, history tracking, status codes, and token counting.
-
-**Why:** Simulators handle retry logic and error recovery.
-
-#### 10. **Task Collection Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 12 tests implemented
-
-Test file: `tests/test_core/test_task_collection.py`
-
-**What is tested:** See test file for complete list. Tests cover creation from list/JSON, sequence interface (indexing, slicing, iteration, length), and boolean context.
-
-**Why:** TaskCollection is the standard way to manage benchmark data.
-
-#### 11. **Callback Orchestration Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 6 tests implemented
-
-Test file: `tests/test_core/test_callback_orchestration.py`
-
-**What is tested:** See test file for complete list. Tests verify callback firing order, multiple callback support, error isolation, context passing, and benchmark/agent level callbacks.
-
-**Why:** Callbacks enable extensibility without modifying core.
-
-#### 12. **Evaluator Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 6 tests implemented
-
-Test file: `tests/test_core/test_evaluator.py`
-
-**What is tested:** See test file for complete list. Tests verify that evaluators receive message history, agents dict, final answer, and traces, plus multiple evaluator support and result capture.
-
-**Why:** Evaluation is the final stage of the lifecycle.
-
-#### 13. **Result Logger Callbacks** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 11 tests implemented
-
-Test files:
-
-- `tests/test_core/test_callbacks/test_result_logger.py` (10 tests)
-- `tests/test_core/test_callbacks/test_file_result_logger.py` (1 test)
-
-**What is tested:** See test files for complete list. Tests cover base ResultLogger orchestration, lifecycle management, iteration tracking, validation, and FileResultLogger JSONL output with filtering.
-
-**Why:** Result loggers persist benchmark execution data for later analysis and validation.
-
-#### 14. **Integration: Complete Benchmark** ❌ NOT IMPLEMENTED
-
-**Status:** ❌ **NOT IMPLEMENTED** - 0 tests
-
-Test file: `tests/test_core/test_benchmark_integration.py` (proposed)
-
-**Proposed Tests:**
-
-- `test_simple_benchmark_end_to_end()` - Simple end-to-end run
-- `test_multi_agent_benchmark()` - Multiple agents
-- `test_benchmark_with_user_simulator()` - With user
-- `test_benchmark_with_callbacks()` - With callbacks
-- `test_benchmark_with_repetitions()` - Multiple repetitions
-- `test_benchmark_with_evaluators()` - With evaluators
-- `test_benchmark_traces_and_config_in_reports()` - Report structure
-- `test_benchmark_agent_data_per_task()` - Per-task config
-
-**Why:** Integration tests verify the entire system works together.
-
-**What to verify:**
-
-- Complete benchmark runs successfully
-- All components integrated correctly
-- Reports structure correct
-- No data loss through pipeline
-
-**Note:** Currently covered partially by `test_benchmark_lifecycle.py` but needs dedicated end-to-end integration test with all components working together.
-
-### Contract Tests (Cross-Implementation Conformance)
-
-#### 15. **AgentAdapter Contract Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 11 tests implemented
-
-Test file: `tests/test_contract/test_agent_adapter_contract.py`
-
-**Purpose:** Validates that ALL AgentAdapter implementations (smolagents, langgraph, dummy) honor the same behavioral contract and behave identically for key operations. This is MASEval's **CORE PROMISE** - framework-agnostic agent abstraction.
-
-**What is tested:** See test file for detailed list of contract guarantees.
-
-**Why:** Contract tests validate MASEval's framework-agnostic abstraction. If these fail, users cannot reliably swap between agent frameworks, breaking the library's core value proposition.
-
-#### 16. **Collection Contract Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 20 tests implemented
-
-Test file: `tests/test_contract/test_collection_contract.py`
-
-**Purpose:** Validates universal tracing and config collection across all traceable/configurable components (agents, models, environments, users).
-
-**What is tested:** See test file for detailed list of contract guarantees covering universal tracing, config collection, cross-framework consistency, and cross-component consistency.
-
-**Why:** Ensures all components provide consistent trace and config data regardless of implementation.
-
-#### 17. **Model Adapter Contract Tests** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 16 tests implemented
-
-Test file: `tests/test_contract/test_model_adapter_contract.py`
-
-**Purpose:** Validates that ALL ModelAdapter implementations (OpenAI, Google, HuggingFace, LiteLLM, Dummy) honor the same behavioral contract for generation, tracing, and configuration.
-
-**What is tested:** See test file for detailed list of contract guarantees covering adapter initialization, generation behavior, tracing structure, configuration, and cross-adapter consistency.
-
-**Why:** Ensures users can swap between model providers without changing benchmark code.
-
-### Interface Tests (Require Optional Dependencies)
-
-#### 18. **Optional Import Guards** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 6 tests implemented
-
-Test file: `tests/test_interface/test_optional_imports.py`
-
-**Purpose:** Validates that core package works without optional dependencies and interface modules gracefully handle missing dependencies.
-
-**What is tested:** See test file for complete list. Tests cover core package imports, interface package structure, dynamic `__all__` generation, and graceful handling of missing optional dependencies.
-
-**Why:** Ensures users can install minimal version without all optional dependencies and get helpful error messages when trying to use unavailable integrations.
-
-#### 19. **Model Adapter Integrations** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 22 tests implemented
-
-Test file: `tests/test_interface/test_model_integration/test_model_adapters.py`
-
-**Purpose:** Tests specific behavior and integration for each ModelAdapter implementation with real client libraries.
-
-**What is tested:** See test file for complete list. Tests cover:
-
-- OpenAI adapter (7 tests) - initialization, generation, extraction, parameters, config
-- Google GenAI adapter (6 tests) - client/model initialization, generation, error handling, config
-- HuggingFace adapter (5 tests) - tokenizer/model/pipeline initialization, generation, config
-- LiteLLM adapter (3 tests) - initialization with/without params, config
-- Cross-adapter consistency (1 test) - model_id and default params exposure
-
-**Why:** Verifies model adapters work correctly with their respective client libraries and provide consistent interfaces.
-
-#### 20. **Agent Framework Integrations** ✅ FULLY IMPLEMENTED
-
-**Status:** ✅ **COMPLETE** - 15 tests implemented
-
-Test files:
-
-- `tests/test_interface/test_agent_integration/test_smolagents_integration.py` (10 tests)
-- `tests/test_interface/test_agent_integration/test_langgraph_integration.py` (5 tests)
-
-**Purpose:** Tests framework-specific adapter implementations for smolagents and LangGraph.
-
-**What is tested:** See test files for complete list. Tests cover:
-
-**Smolagents (10 tests):**
-
-- Adapter creation and import guards
-- Trace gathering with/without monitoring
-- Trace gathering with planning steps
-- Message manipulation support (not supported)
-- Clear history support (supported)
-
-**LangGraph (5 tests):**
-
-- Adapter import and availability checks
-- Message manipulation with/without system messages
-
-**Why:** Validates framework-specific adapters work correctly with their respective libraries and handle framework-specific features properly.
-
-#### 21. **Edge Cases and Advanced Scenarios** ⏳ FUTURE WORK
-
-**Status:** ⏳ **DEFERRED** - Not currently prioritized but documented for future implementation
-
-**Proposed Edge Cases:**
-
-**Callback Exception Handling:**
-
-- `test_callback_exception_isolation()` - Exception in one callback doesn't break others
-- `test_callback_exception_logging()` - Exceptions are logged appropriately
-- `test_callback_exception_in_run_start()` - Failure in on_run_start doesn't prevent run
-- `test_callback_exception_in_run_end()` - Failure in on_run_end doesn't lose data
-
-**Thread Safety and Concurrency:**
-
-- `test_adapter_concurrent_runs()` - Multiple threads calling run() simultaneously
-- `test_trace_collection_thread_safety()` - Trace accumulation in concurrent execution
-- `test_callback_thread_safety()` - Callbacks triggered from multiple threads
-
-**Performance and Limits:**
-
-- `test_very_long_message_history()` - Handles 1000+ messages efficiently
-- `test_large_message_content()` - Large content blocks (images, files)
-- `test_many_tool_calls()` - 100+ tool calls in conversation
-
-**Invalid Data Handling:**
-
-- `test_malformed_message_from_framework()` - Framework returns invalid message format
-- `test_missing_required_fields()` - Framework omits required fields
-- `test_invalid_role_types()` - Unknown role types in messages
-- `test_none_values_in_messages()` - None/null values in message fields
-
-**State Management Edge Cases:**
-
-- `test_set_history_during_run()` - Setting history while agent is running
-- `test_clear_history_during_callback()` - Clearing history from callback
-- `test_multiple_history_modifications()` - Rapid set/clear/append operations
-
-**Return Value Validation:**
-
-- `test_run_returns_final_answer_not_list()` - Ensures run() returns answer, not trace
-- `test_final_answer_extraction()` - Final answer correctly extracted from frameworks
-- `test_empty_response_handling()` - Framework returns empty/None response
-
-**Why Deferred:** These are defensive programming tests that validate edge cases and error handling. While valuable, they are not critical for the core library functionality. The basic contract and happy path are more important to validate first. These tests should be implemented when:
-
-1. Production usage reveals these scenarios occur in practice
-2. Bug reports indicate gaps in error handling
-3. Performance becomes a concern
-
-### Benchmark-Specific Tests
-
-#### 22. **Concrete Benchmark Tests** ❌ NOT IMPLEMENTED
-
-**Status:** ❌ **NOT IMPLEMENTED** - 0 tests
-
-Test files (proposed):
-
-- `tests/test_benchmarks/test_tau2_bench.py`
-- `tests/test_benchmarks/test_amazon_collab_bench.py`
-- `tests/test_benchmarks/test_gaia_bench.py`
-
-**Proposed Tests:**
-
-- `test_benchmark_loads_data()` - Data loading works
-- `test_benchmark_creates_tasks()` - Task creation correct
-- `test_benchmark_setup_methods_work()` - Setup methods functional
-- `test_benchmark_runs_sample_task()` - Can run single task
-- `test_benchmark_evaluates_correctly()` - Evaluation logic correct
-
-**Why:** Verify concrete implementations work.
-
-**Note:** The `tests/test_benchmarks/` directory currently exists but is empty.
-
-## Test Organization Principles
-
-### Test Markers
-
-- `@pytest.mark.core` - No optional dependencies
-- `@pytest.mark.interface` - Requires framework integrations
-- `@pytest.mark.smolagents` - Requires smolagents
-- `@pytest.mark.langgraph` - Requires langgraph
-- `@pytest.mark.slow` - Long-running tests (actual LLM calls)
-- `@pytest.mark.integration` - End-to-end tests
-
-### Test Structure
-
-Each test file should:
-
-1. Use clear, descriptive test names
-2. Follow Arrange-Act-Assert pattern
-3. Use minimal fixtures/mocks
-4. Test ONE thing per test
-5. Include docstrings explaining WHY
-
-### Mocking Strategy
-
-- **Mock external APIs** (OpenAI, Google, etc.) for core tests
-- **Mock LLM responses** for deterministic testing
-- **Don't mock internal components** (defeats the purpose)
-- **Use real implementations** in integration tests
-
-### Coverage Goals
-
-- **Core modules:** >90% coverage
-- **Interface adapters:** >80% coverage
-- **Benchmark implementations:** >70% coverage
-
-## Priority Ranking
-
-### P0 (Must Have - Blocks Release) ✅ NEARLY COMPLETE
-
-1. ✅ Benchmark lifecycle tests (complete run)
-2. ✅ Message history tests (iterable interface)
-3. ✅ Trace collection tests (end-to-end)
-4. ✅ Config collection tests (reproducibility)
-5. ❌ Integration test (simple benchmark) - **MISSING** but partially covered by lifecycle tests
-
-### P1 (Should Have - High Value) ✅ ALL COMPLETE
-
-6. ✅ agent adapter tests
-7. ✅ Environment tests
-8. ✅ Callback orchestration tests
-9. ✅ Task collection tests
-10. ✅ Evaluator tests
-11. ✅ Result logger callbacks
-12. ✅ Contract tests (agent adapter, collection, model adapter)
-
-### P2 (Nice to Have - Completeness) ✅ ALL COMPLETE
-
-13. ✅ Model adapter tests (36 comprehensive tests)
-14. ✅ LLM simulator tests
-15. ✅ User simulator tests
-16. ✅ Framework-specific integration tests (smolagents, langgraph)
-17. ✅ Model adapter integrations (OpenAI, Google, HuggingFace, LiteLLM)
-18. ✅ Optional import guards
-
-### P3 (Future - Comprehensive) ❌ NOT STARTED
-
-19. ❌ Benchmark-specific tests (TAU2, Amazon Collab, GAIA)
-20. ❌ Complete end-to-end integration test
-21. ❌ Performance/stress tests
-22. ❌ Concurrency tests
-23. ❌ Documentation examples as tests
-
-## Test Data Strategy
-
-### Fixtures ✅ IMPLEMENTED
-
-Shared fixtures implemented in `tests/conftest.py`:
-
-**Core Fixtures:**
-
-- `dummy_model` - DummyModelAdapter with configurable responses
-- `dummy_agent` - DummyAgent that tracks calls
-- `dummy_agent_adapter` - DummyAgentAdapter with message history
-- `dummy_environment` - DummyEnvironment with state management
-- `dummy_user` - DummyUser for simulation testing
-- `dummy_task` - Single Task instance
-- `dummy_task_collection` - TaskCollection with 3 tasks
-- `simple_benchmark` - DummyBenchmark ready to run
-- `agent_data` - Sample agent configuration
-
-**Helper Classes:**
-
-- `DummyBenchmark` - Tracks all lifecycle calls for verification
-- `DummyEvaluator` - Returns simple pass/fail results
-- All classes implement proper TraceableMixin/ConfigurableMixin patterns
-
-### Test Data Files ❌ NOT IMPLEMENTED
-
-Proposed minimal test data in `tests/fixtures/` (not yet created):
-Proposed minimal test data in `tests/fixtures/` (not yet created):
-
-- `tasks.json` - Sample task data
-- `agent_config.json` - Sample agent configurations
-- `expected_traces.json` - Expected trace structure
-
-## Running Tests
-
-```bash
-# All core tests (CI fast path)
-pytest -m core -v
-
-# All tests including integrations
-pytest -v
-
-# Specific test category
-pytest -m "core and not integration" -v
-
-# Framework-specific tests
-pytest -m smolagents -v
-pytest -m langgraph -v
-pytest -m interface -v
-
-# With coverage
-pytest --cov=maseval --cov-report=html -v
-
-# Fast feedback during development
-pytest -x --ff  # Stop on first failure, run previous failures first
-```
-
-## Success Metrics
-
-**Current Achievement:**
-
-- ✅ **Test Count:** 333 tests implemented across 23 test files
-- ✅ **Core Coverage:** All P0 (4/5), P1 (7/7), and P2 (6/6) tests complete
-- ✅ **Contract Coverage:** All contract tests implemented (agent adapter, collection, model adapter)
-- ✅ **Interface Coverage:** All adapter integration tests complete (agents + models)
-- 🟡 **Runtime:** Not yet measured
-- 🟡 **Reliability:** Not yet run in CI
-- ✅ **Documentation:** All tests have docstrings
-- ✅ **Maintainability:** Clean fixture system, minimal duplication
-
-**Target Metrics:**
-
-1. **Coverage:** >85% for core, >75% for interface
-2. **Runtime:** Core tests complete in <30s
-3. **Reliability:** No flaky tests (>99% pass rate)
-4. **Documentation:** Every test has a docstring ✅
-5. **Maintainability:** Tests catch bugs before they reach users
-
-## Migration Path (UPDATED)
-
-### Phase 1 (Week 1): Foundation ✅ COMPLETE
-
-- ✅ Set up test fixtures and helpers (`conftest.py` with 10+ fixtures)
-- ✅ Implement P0 tests (lifecycle, messages, traces, config)
-- ✅ Establish test data strategy (DummyBenchmark pattern)
-
-### Phase 2 (Week 2): Core Coverage ✅ COMPLETE
-
-- ✅ Implement P1 tests (agent adapter, environment, callbacks, tasks, evaluator)
-- ✅ Add callback orchestration tests
-- ✅ Message tracing callback specialized tests
-- ✅ Automatic registration tests
-
-### Phase 3 (Week 3): Interface Coverage ✅ COMPLETE
-
-- ✅ Contract tests (agent adapter, collection, model adapter - 47 tests)
-- ✅ Smolagents integration (10 tests)
-- ✅ LangGraph integration (5 tests)
-- ✅ Model adapter integrations (22 tests across 4 providers)
-- ✅ Optional import guards (6 tests)
-
-### Phase 4 (Current): Polish & Remaining Tests
-
-**Remaining Work:**
-
-- ❌ Complete benchmark integration test (end-to-end with all components)
-- ❌ Benchmark-specific tests (TAU2, Amazon Collab, GAIA)
-- ⏳ Run full test suite and measure coverage
-- ⏳ CI integration and optimization
-- ⏳ Fix any failing tests (if any)
-
-## Conclusion
-
-**Summary:** The MASEval test suite has achieved substantial coverage with **333 tests across 23 test files**, covering nearly all P0, P1, and P2 priorities. The core orchestration workflows are comprehensively tested, providing strong confidence in the three-stage lifecycle, message handling, trace/config collection, framework-agnostic adapter pattern, and model provider integrations.
-
-**Key Achievements:**
-
-- ✅ **333 tests** across core (189), contract (47), and interface (43) modules
-- ✅ All critical core functionality tested (P0 nearly complete, P1/P2 complete)
-- ✅ Contract tests validate framework-agnostic abstraction across agents and models
-- ✅ Comprehensive model adapter integrations (OpenAI, Google, HuggingFace, LiteLLM)
-- ✅ Framework adapter tests for smolagents and LangGraph
-- ✅ Result logger callbacks for data persistence
-- ✅ Clean fixture system eliminates duplication
-- ✅ Comprehensive coverage of TraceableMixin/ConfigurableMixin patterns
-
-**What's Missing:**
-
-1. Complete end-to-end benchmark integration test (partially covered by lifecycle tests)
-2. Benchmark-specific tests for TAU2, Amazon Collab, and GAIA implementations
-
-**Next Steps:**
-
-1. Run the full test suite to identify any failures
-2. Measure code coverage and identify gaps
-3. Implement the missing benchmark integration test
-4. Create benchmark-specific tests for concrete benchmark implementations
-5. CI integration and optimization
-
-The foundation is extremely solid with comprehensive coverage. The remaining work focuses on high-level integration testing and benchmark-specific validation.
-
----
-
-**Key Insight:** Test the orchestration, not the implementation. Users care that `benchmark.run()` works end-to-end with comprehensive tracing, and that they can switch between agent frameworks (smolagents, langgraph) and model providers (OpenAI, Google, HuggingFace) without changing their benchmark code. The implemented tests validate exactly these core promises.
diff --git a/tests/test_benchmarks/test_macs/conftest.py b/tests/test_benchmarks/test_macs/conftest.py
new file mode 100644
index 00000000..b8b276c7
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/conftest.py
@@ -0,0 +1,553 @@
+"""Shared fixtures for MACS benchmark tests.
+
+Fixture Hierarchy
+-----------------
+- tests/conftest.py: Generic fixtures (dummy_model, dummy_agent_adapter, dummy_task, etc.)
+  These are automatically available via pytest's conftest inheritance.
+- tests/test_benchmarks/test_macs/conftest.py: MACS-specific fixtures (this file)
+
+MACS tests can use fixtures from both levels - pytest handles this automatically.
+
+Why MACS-Specific Mock Classes Exist
+------------------------------------
+The MACS benchmark uses ToolLLMSimulator and UserLLMSimulator which parse JSON responses
+in a specific format: {"text": "...", "details": {...}}
+
+The generic DummyModelAdapter from tests/conftest.py returns simple strings like
+"test response", which would cause JSON parsing failures in MACS components.
+
+Therefore, we define MACS-specific adapters that:
+1. MACSModelAdapter: Returns valid JSON in the ToolLLMSimulator format by default
+2. MACSAgentAdapter: Returns MessageHistory (not strings) matching the AgentAdapter contract
+
+These are NOT duplicates - they serve a different purpose than the generic test fixtures.
+"""
+
+import pytest
+from typing import Any, Dict, List, Optional, Tuple
+from unittest.mock import MagicMock
+
+from maseval import AgentAdapter, Task, User, MessageHistory, TaskCollection
+from maseval.benchmark.macs import MACSBenchmark, MACSEnvironment
+from maseval.core.model import ModelAdapter
+
+
+# =============================================================================
+# MACS-Specific Mock Components
+#
+# These exist because MACS components (ToolLLMSimulator, UserLLMSimulator, MACSEvaluator)
+# expect JSON responses in specific formats. The generic DummyModelAdapter returns
+# plain strings which would cause parsing failures.
+# =============================================================================
+
+
+class MACSModelAdapter(ModelAdapter):
+    """Model adapter for testing MACS components.
+
+    Unlike DummyModelAdapter (which returns plain strings), this adapter returns
+    JSON responses in the format expected by MACS simulators:
+
+        {"text": "response text", "details": {...}}
+
+    This format is required by:
+    - ToolLLMSimulator._parse_output() for tool responses
+    - UserLLMSimulator._parse_output() for user responses
+    - MACSEvaluator for assertion evaluation (different format)
+
+    Attributes:
+        prompts: List of all prompts sent to the model (for verification in tests).
+        _call_count: Number of times generate() was called.
+    """
+
+    def __init__(self, responses: Optional[List[str]] = None):
+        """Initialize with optional canned responses.
+
+        Args:
+            responses: List of JSON strings to return. Cycles through if more
+                calls are made than responses provided. Defaults to a valid
+                ToolLLMSimulator response format.
+        """
+        super().__init__()
+        self._model_id = "macs-test-model"
+        self._responses = responses or ['{"text": "Default response", "details": {}}']
+        self._call_count = 0
+        self.prompts: List[str] = []
+
+    @property
+    def model_id(self) -> str:
+        return self._model_id
+
+    def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str:
+        self.prompts.append(prompt)
+        response = self._responses[self._call_count % len(self._responses)]
+        self._call_count += 1
+        return response
+
+
+class MACSAgentAdapter(AgentAdapter):
+    """Agent adapter for testing MACS benchmark execution.
+
+    Unlike DummyAgentAdapter (which wraps a real agent object), this adapter
+    provides controllable responses without needing a real agent implementation.
+
+    Used for testing MACSBenchmark.run_agents() and integration scenarios.
+
+    Attributes:
+        run_calls: List of queries passed to _run_agent (for verification).
+    """
+
+    def __init__(self, name: str = "macs_test_agent"):
+        super().__init__(agent_instance=MagicMock(), name=name)
+        self._responses: List[str] = []
+        self._call_count = 0
+        self.run_calls: List[str] = []
+
+    def set_responses(self, responses: List[str]) -> None:
+        """Set canned responses for the agent."""
+        self._responses = responses
+
+    def _run_agent(self, query: str) -> MessageHistory:
+        self.run_calls.append(query)
+        if self._responses:
+            response = self._responses[self._call_count % len(self._responses)]
+            self._call_count += 1
+        else:
+            response = f"Response to: {query}"
+        return MessageHistory([{"role": "assistant", "content": response}])
+
+
+# =============================================================================
+# MACS-Specific Benchmark Implementation
+# =============================================================================
+
+
+class ConcreteMACSBenchmark(MACSBenchmark):
+    """Concrete MACSBenchmark implementation for testing.
+
+    MACSBenchmark is abstract (setup_agents must be implemented by users).
+    This provides a minimal implementation using MACSAgentAdapter.
+    """
+
+    def setup_agents(
+        self,
+        agent_data: Dict[str, Any],
+        environment: MACSEnvironment,
+        task: Task,
+        user: Optional[User],
+    ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+        """Create test agents using MACSAgentAdapter."""
+        adapter = MACSAgentAdapter("macs_test_agent")
+        return [adapter], {"macs_test_agent": adapter}
+
+
+# =============================================================================
+# Model Fixtures
+#
+# These use MACSModelAdapter because MACS components require JSON responses.
+# For generic model testing, use dummy_model from parent conftest.
+# =============================================================================
+
+
+@pytest.fixture
+def macs_model():
+    """MACS model adapter with default JSON responses.
+
+    Returns responses in ToolLLMSimulator format: {"text": "...", "details": {...}}
+    """
+    return MACSModelAdapter()
+
+
+@pytest.fixture
+def macs_model_evaluator():
+    """MACS model configured for MACSEvaluator tests.
+
+    Returns JSON array format expected by MACSEvaluator._parse_evaluation_response().
+    """
+    return MACSModelAdapter(responses=['[{"assertion": "Test", "answer": "TRUE", "evidence": "OK"}]'])
+
+
+@pytest.fixture
+def macs_model_tool():
+    """MACS model configured for ToolLLMSimulator tests."""
+    return MACSModelAdapter(responses=['{"text": "Tool executed successfully", "details": {}}'])
+
+
+@pytest.fixture
+def macs_model_user():
+    """MACS model configured for UserLLMSimulator tests."""
+    return MACSModelAdapter(responses=['{"text": "Yes, that works for me.", "details": {}}'])
+
+
+# =============================================================================
+# MACS Tool Specification Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def simple_tool_spec():
+    """Simple tool specification for basic tests."""
+    return {
+        "name": "search_flights",
+        "description": "Search for available flights",
+        "input_schema": {
+            "properties": {
+                "origin": {"type": "string", "description": "Origin airport code"},
+                "destination": {"type": "string", "description": "Destination airport code"},
+            }
+        },
+    }
+
+
+@pytest.fixture
+def complex_tool_spec():
+    """Tool specification with various input types."""
+    return {
+        "name": "book_hotel",
+        "description": "Book a hotel room",
+        "input_schema": {
+            "properties": {
+                "city": {"type": "string", "description": "City name"},
+                "check_in": {"data_type": "date", "description": "Check-in date"},
+                "guests": {"type": "integer", "description": "Number of guests"},
+                "amenities": {"type": "array", "description": "Requested amenities"},
+            }
+        },
+    }
+
+
+@pytest.fixture
+def minimal_tool_spec():
+    """Minimal tool specification with only name."""
+    return {"name": "simple_action"}
+
+
+@pytest.fixture
+def sample_tool_specs():
+    """Sample tool specifications in MACS format (tool groups with actions)."""
+    return [
+        {
+            "tool_name": "flight_tools",
+            "actions": [
+                {
+                    "name": "search_flights",
+                    "description": "Search for available flights",
+                    "input_schema": {
+                        "properties": {
+                            "origin": {"type": "string", "description": "Origin airport"},
+                            "destination": {"type": "string", "description": "Destination airport"},
+                        }
+                    },
+                },
+                {
+                    "name": "book_flight",
+                    "description": "Book a flight",
+                    "input_schema": {
+                        "properties": {
+                            "flight_id": {"type": "string", "description": "Flight ID to book"},
+                        }
+                    },
+                },
+            ],
+        },
+        {
+            "tool_name": "hotel_tools",
+            "actions": [
+                {
+                    "name": "search_hotels",
+                    "description": "Search for hotels",
+                    "input_schema": {
+                        "properties": {
+                            "city": {"type": "string", "description": "City name"},
+                        }
+                    },
+                },
+            ],
+        },
+    ]
+
+
+# =============================================================================
+# MACS Task Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def sample_task():
+    """Sample MACS task with typical structure."""
+    return Task(
+        query="Book a flight to NYC",
+        environment_data={
+            "tools": [
+                {
+                    "tool_name": "flight_tools",
+                    "actions": [
+                        {"name": "search_flights", "description": "Search flights"},
+                    ],
+                }
+            ]
+        },
+        evaluation_data={
+            "assertions": [
+                "user: Booking confirmed",
+                "agent: Database updated",
+            ]
+        },
+        metadata={"scenario": "Business trip to NYC"},
+    )
+
+
+@pytest.fixture
+def sample_task_no_scenario():
+    """Task without scenario in metadata."""
+    return Task(
+        query="Test query",
+        environment_data={"tools": []},
+        evaluation_data={"assertions": []},
+        metadata={},
+    )
+
+
+@pytest.fixture
+def sample_task_no_assertions():
+    """Task with no assertions."""
+    return Task(
+        query="Simple query",
+        environment_data={},
+        evaluation_data={"assertions": []},
+        metadata={"scenario": "Simple scenario"},
+    )
+
+
+@pytest.fixture
+def travel_task():
+    """Detailed travel domain task for integration tests."""
+    return Task(
+        query="I need to book a flight from San Francisco to New York for next Monday.",
+        environment_data={
+            "tools": [
+                {
+                    "tool_name": "travel_tools",
+                    "actions": [
+                        {
+                            "name": "search_flights",
+                            "description": "Search for available flights between cities",
+                            "input_schema": {
+                                "properties": {
+                                    "origin": {"type": "string", "description": "Origin city or airport code"},
+                                    "destination": {"type": "string", "description": "Destination city or airport code"},
+                                    "date": {"type": "string", "description": "Travel date"},
+                                }
+                            },
+                        },
+                        {
+                            "name": "book_flight",
+                            "description": "Book a specific flight",
+                            "input_schema": {
+                                "properties": {
+                                    "flight_id": {"type": "string", "description": "Flight identifier"},
+                                    "passenger_name": {"type": "string", "description": "Passenger name"},
+                                }
+                            },
+                        },
+                    ],
+                }
+            ]
+        },
+        evaluation_data={
+            "assertions": [
+                "user: The user's flight booking request was acknowledged",
+                "user: The user received flight options or a confirmation",
+                "agent: The search_flights tool was called with correct parameters",
+            ]
+        },
+        metadata={
+            "scenario": """Goal: The user wants to book a flight from San Francisco to New York.
+
+Background:
+* User's name is Alice Johnson
+* User is a business traveler
+* User prefers morning flights
+* User has Delta SkyMiles membership""",
+            "category": "travel",
+            "complexity": "simple",
+        },
+    )
+
+
+@pytest.fixture
+def macs_task_collection(sample_task, travel_task):
+    """Collection of MACS tasks for benchmark.run() tests."""
+    return TaskCollection.from_list([sample_task, travel_task])
+
+
+# =============================================================================
+# MACS Task Data Fixtures (for Environment creation)
+# =============================================================================
+
+
+@pytest.fixture
+def sample_task_data(sample_tool_specs):
+    """Sample task data dict for Environment creation."""
+    return {
+        "environment_data": {
+            "tools": sample_tool_specs,
+        }
+    }
+
+
+# =============================================================================
+# MACS Agent Configuration Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def sample_agent_data():
+    """Sample MACS agent configuration."""
+    return {
+        "agents": [
+            {
+                "agent_id": "supervisor",
+                "agent_name": "Supervisor",
+                "agent_instruction": "Coordinate agents",
+                "tools": ["flight_tools"],
+            }
+        ],
+        "primary_agent_id": "supervisor",
+    }
+
+
+@pytest.fixture
+def sample_agent_spec_flight():
+    """Agent spec with only flight tools."""
+    return {
+        "agent_id": "flight_agent",
+        "agent_name": "Flight Agent",
+        "tools": ["flight_tools"],
+    }
+
+
+@pytest.fixture
+def sample_agent_spec_all():
+    """Agent spec with all tools."""
+    return {
+        "agent_id": "supervisor",
+        "agent_name": "Supervisor Agent",
+        "tools": ["flight_tools", "hotel_tools"],
+    }
+
+
+@pytest.fixture
+def sample_agent_spec_none():
+    """Agent spec with no matching tools."""
+    return {
+        "agent_id": "router",
+        "agent_name": "Router Agent",
+        "tools": ["unknown_tools"],
+    }
+
+
+# =============================================================================
+# MACS Trace and History Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def sample_trace():
+    """Sample conversation trace."""
+    return MessageHistory(
+        [
+            {"role": "user", "content": "I need to book a flight to New York"},
+            {"role": "assistant", "content": "Sure! When would you like to travel?"},
+            {"role": "user", "content": "Next Monday"},
+            {"role": "assistant", "content": "I found a flight. Your confirmation number is ABC123."},
+        ]
+    )
+
+
+@pytest.fixture
+def sample_tool_traces():
+    """Sample tool invocation traces."""
+    return {
+        "search_flights": {
+            "invocations": [
+                {
+                    "inputs": {"origin": "LAX", "destination": "JFK"},
+                    "outputs": "Found 3 flights",
+                    "status": "success",
+                }
+            ]
+        },
+        "book_flight": {
+            "invocations": [
+                {
+                    "inputs": {"flight_id": "AA123"},
+                    "outputs": "Booking confirmed",
+                    "status": "success",
+                }
+            ]
+        },
+    }
+
+
+@pytest.fixture
+def sample_conversation():
+    """Sample multi-turn conversation."""
+    return MessageHistory(
+        [
+            {"role": "user", "content": "I need to book a flight from San Francisco to New York for next Monday."},
+            {"role": "assistant", "content": "I'll search for flights for you. What time do you prefer to depart?"},
+            {"role": "user", "content": "Morning, preferably around 8am."},
+            {
+                "role": "assistant",
+                "content": "I found a Delta flight departing at 8:15am. The fare is $450. Would you like to book this?",
+            },
+            {"role": "user", "content": "Yes, please book it."},
+            {
+                "role": "assistant",
+                "content": "Your flight has been booked. Confirmation number: DL123456.",
+            },
+        ]
+    )
+
+
+# =============================================================================
+# MACS Scenario Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def sample_scenario():
+    """Sample MACS scenario with background."""
+    return """Goal: The user wants to book a flight to New York for a business meeting.
+
+Background:
+* User's name is John Smith
+* User is a frequent business traveler
+* User has preferred airline status with Delta
+* User prefers aisle seats"""
+
+
+@pytest.fixture
+def minimal_scenario():
+    """Minimal scenario without background section."""
+    return "User wants to order food for delivery."
+
+
+@pytest.fixture
+def initial_prompt():
+    """Sample initial user query."""
+    return "I need to book a flight to New York for Monday."
+
+
+# =============================================================================
+# MACS Benchmark Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def macs_benchmark(sample_agent_data, dummy_model):
+    """Create a MACS benchmark with dummy model for testing.
+
+    Uses dummy_model from parent conftest.py.
+    """
+    return ConcreteMACSBenchmark(sample_agent_data, dummy_model)
diff --git a/tests/test_benchmarks/test_macs/test_data_loader.py b/tests/test_benchmarks/test_macs/test_data_loader.py
index effe22f6..67e0a1b4 100644
--- a/tests/test_benchmarks/test_macs/test_data_loader.py
+++ b/tests/test_benchmarks/test_macs/test_data_loader.py
@@ -103,6 +103,7 @@ def temp_data_dir() -> Path:
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestDedupeToolsByName:
     """Tests for _dedupe_tools_by_name function."""
 
@@ -150,6 +151,7 @@ def test_tools_without_name_preserved(self):
         assert len(result) == 3
 
 
+@pytest.mark.benchmark
 class TestCreateToolsList:
     """Tests for _create_tools_list function."""
 
@@ -175,6 +177,7 @@ def test_empty_input(self):
         assert _create_tools_list(None) == []
 
 
+@pytest.mark.benchmark
 class TestCreateAgentsList:
     """Tests for _create_agents_list function."""
 
@@ -200,6 +203,7 @@ def test_empty_input(self):
         assert _create_agents_list([]) == {}
 
 
+@pytest.mark.benchmark
 class TestCreateTasksList:
     """Tests for _create_tasks_list function."""
 
@@ -235,6 +239,7 @@ def test_empty_input(self):
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestDownloadFunctions:
     """Tests for download functions using mocks."""
 
@@ -272,6 +277,7 @@ def test_download_json_invalid_json(self):
                 download_json("http://example.com/test.json")
 
 
+@pytest.mark.benchmark
 class TestDownloadOriginalData:
     """Tests for download_original_data function."""
 
@@ -322,6 +328,7 @@ def test_invalid_domain_raises(self, temp_data_dir):
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestRestructureData:
     """Tests for restructure_data function."""
 
@@ -360,6 +367,7 @@ def test_missing_original_raises(self, temp_data_dir):
             restructure_data(data_dir=temp_data_dir, domain="travel", verbose=0)
 
 
+@pytest.mark.benchmark
 class TestEnsureDataExists:
     """Tests for ensure_data_exists function."""
 
@@ -436,6 +444,7 @@ def mock_download_file(url: str, timeout=15):
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestLoadTasks:
     """Tests for load_tasks function."""
 
@@ -482,6 +491,7 @@ def test_missing_file_raises(self, temp_data_dir):
             load_tasks("travel", data_dir=temp_data_dir)
 
 
+@pytest.mark.benchmark
 class TestLoadAgentConfig:
     """Tests for load_agent_config function."""
 
@@ -514,7 +524,7 @@ def test_invalid_domain_raises(self, temp_data_dir):
 # =============================================================================
 
 
-@pytest.mark.core
+@pytest.mark.benchmark
 class TestDataLoaderIntegration:
     """Integration tests for the full data loading pipeline."""
 
@@ -575,6 +585,7 @@ def test_urls_structure(self):
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestConnectionErrorHandling:
     """Tests for graceful handling of network errors."""
 
@@ -625,6 +636,7 @@ def test_ensure_data_exists_network_error(self, temp_data_dir):
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestDataLocation:
     """Tests for custom and default data location handling."""
 
@@ -731,6 +743,7 @@ def test_restructure_uses_custom_location(self, temp_data_dir, sample_agents_dat
 # =============================================================================
 
 
+@pytest.mark.benchmark
 class TestSequentialIdGeneration:
     """Tests for task ID generation."""
 
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
new file mode 100644
index 00000000..45863301
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -0,0 +1,516 @@
+"""Unit tests for MACSBenchmark and compute_benchmark_metrics."""
+
+import pytest
+from typing import Any, Dict, List, Optional, Tuple
+from unittest.mock import MagicMock
+
+from maseval import AgentAdapter, Task, User, MessageHistory
+from maseval.benchmark.macs import (
+    MACSBenchmark,
+    MACSEnvironment,
+    MACSEvaluator,
+    MACSUser,
+    compute_benchmark_metrics,
+)
+
+from .conftest import MACSModelAdapter, MACSAgentAdapter, ConcreteMACSBenchmark
+
+
+# =============================================================================
+# Unit Tests: Initialization
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSBenchmarkInit:
+    """Tests for MACSBenchmark initialization."""
+
+    def test_init_stores_model(self, macs_model, sample_agent_data):
+        """Model is stored for later use."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+
+        assert benchmark._model == macs_model
+
+    def test_init_calls_parent(self, macs_model, sample_agent_data):
+        """Parent Benchmark.__init__ is called."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+
+        assert benchmark.agent_data == sample_agent_data
+
+    def test_init_with_callbacks(self, macs_model, sample_agent_data):
+        """Callbacks are passed to parent."""
+        callbacks = [MagicMock()]
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, callbacks=callbacks)
+
+        assert benchmark.callbacks == callbacks
+
+    def test_init_with_n_task_repeats(self, macs_model, sample_agent_data):
+        """n_task_repeats is set correctly."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, n_task_repeats=3)
+
+        assert benchmark.n_task_repeats == 3
+
+
+# =============================================================================
+# Unit Tests: Setup Methods
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestSetupMethods:
+    """Tests for setup methods."""
+
+    def test_setup_environment_creates_macs_environment(self, macs_model, sample_agent_data, sample_task):
+        """setup_environment returns MACSEnvironment."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+
+        assert isinstance(env, MACSEnvironment)
+        assert "search_flights" in env.tools
+
+    def test_setup_user_creates_macs_user(self, macs_model, sample_agent_data, sample_task):
+        """setup_user returns MACSUser."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+
+        user = benchmark.setup_user(sample_agent_data, env, sample_task)
+
+        assert isinstance(user, MACSUser)
+
+    def test_setup_user_extracts_scenario(self, macs_model, sample_agent_data, sample_task):
+        """Passes scenario from task metadata."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+
+        user = benchmark.setup_user(sample_agent_data, env, sample_task)
+
+        assert user.scenario == "Business trip to NYC"
+
+    def test_setup_user_handles_no_scenario(self, macs_model, sample_agent_data, sample_task_no_scenario):
+        """Handles missing scenario gracefully."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task_no_scenario)
+
+        user = benchmark.setup_user(sample_agent_data, env, sample_task_no_scenario)
+
+        assert user.scenario == ""
+
+    def test_setup_evaluators_creates_dual(self, macs_model, sample_agent_data, sample_task):
+        """Creates both user and system evaluators."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        agents = [MACSAgentAdapter()]
+
+        evaluators = benchmark.setup_evaluators(env, sample_task, agents, None)
+
+        assert len(evaluators) == 2
+        assert isinstance(evaluators[0], MACSEvaluator)
+        assert isinstance(evaluators[1], MACSEvaluator)
+        assert evaluators[0].gsr_type == "user"
+        assert evaluators[1].gsr_type == "system"
+
+    def test_setup_agents_is_abstract(self, macs_model, sample_agent_data):
+        """setup_agents must be overridden in subclass."""
+        # MACSBenchmark itself can't be instantiated without setup_agents
+        # We verify by checking the abstract method exists
+        import inspect
+
+        assert inspect.isabstract(MACSBenchmark)
+
+        # Verify IncompleteMACSBenchmark would fail (checked at class definition time)
+        with pytest.raises(TypeError, match="abstract"):
+
+            class IncompleteMACSBenchmark(MACSBenchmark):
+                pass
+
+            # This line won't be reached due to TypeError at class definition
+            IncompleteMACSBenchmark(sample_agent_data, macs_model)  # type: ignore
+
+
+# =============================================================================
+# Unit Tests: Run Agents
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestRunAgents:
+    """Tests for run_agents method."""
+
+    def test_run_agents_executes_agents(self, macs_model, sample_agent_data, sample_task):
+        """Agents are executed with query."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+
+        agents_list, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+
+        benchmark.run_agents(agents_list, sample_task, env)
+
+        # Cast to MACSAgentAdapter to access run_calls
+        mock_agent = agents_list[0]
+        assert isinstance(mock_agent, MACSAgentAdapter)
+        assert len(mock_agent.run_calls) == 1
+        assert mock_agent.run_calls[0] == sample_task.query
+
+    def test_run_agents_returns_answer(self, macs_model, sample_agent_data, sample_task):
+        """Returns final answer(s) as MessageHistory."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+
+        result = benchmark.run_agents(agents_list, sample_task, env)
+
+        # run_agents returns MessageHistory from the agent run
+        assert isinstance(result, MessageHistory)
+        assert len(result) > 0
+        # Check that response content contains expected text
+        assert "Response to:" in result[-1]["content"]
+
+    def test_run_agents_single_agent(self, macs_model, sample_agent_data, sample_task):
+        """Single agent returns MessageHistory."""
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+
+        result = benchmark.run_agents(agents_list, sample_task, env)
+
+        assert isinstance(result, MessageHistory)
+
+    def test_run_agents_multiple_agents(self, macs_model, sample_agent_data, sample_task):
+        """Multiple agents return list of answers."""
+
+        class MultiAgentBenchmark(MACSBenchmark):
+            def setup_agents(
+                self,
+                agent_data: Dict[str, Any],
+                environment: MACSEnvironment,
+                task: Task,
+                user: Optional[User],
+            ) -> Tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
+                agent1: AgentAdapter = MACSAgentAdapter("agent1")
+                agent2: AgentAdapter = MACSAgentAdapter("agent2")
+                return [agent1, agent2], {"agent1": agent1, "agent2": agent2}
+
+        benchmark = MultiAgentBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+
+        result = benchmark.run_agents(agents_list, sample_task, env)
+
+        assert isinstance(result, list)
+        assert len(result) == 2
+
+
+# =============================================================================
+# Unit Tests: Evaluation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEvaluation:
+    """Tests for evaluate method."""
+
+    def test_evaluate_calls_both_evaluators(self, sample_agent_data, sample_task):
+        """Both user and system evaluators are called."""
+        # Model returns valid JSON for evaluation
+        responses = [
+            '[{"assertion": "User assertion", "answer": "TRUE", "evidence": "OK"}]',
+            '[{"assertion": "System assertion", "answer": "TRUE", "evidence": "OK"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+        evaluators = benchmark.setup_evaluators(env, sample_task, list(agents_dict.values()), None)
+
+        traces = {
+            "agents": {
+                "test_agent": {
+                    "messages": [
+                        {"role": "user", "content": "Book flight"},
+                        {"role": "assistant", "content": "Done"},
+                    ]
+                }
+            },
+            "tools": {},
+        }
+
+        results = benchmark.evaluate(evaluators, agents_dict, "final answer", traces)
+
+        assert len(results) == 1  # Combined into one result dict
+        assert "user_gsr" in results[0]
+        assert "system_gsr" in results[0]
+
+    def test_evaluate_returns_aggregated_metrics(self, sample_agent_data, sample_task):
+        """Returns combined GSR metrics."""
+        responses = [
+            '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
+            '[{"assertion": "B", "answer": "TRUE", "evidence": "OK"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+        evaluators = benchmark.setup_evaluators(env, sample_task, list(agents_dict.values()), None)
+
+        traces = {
+            "agents": {"test_agent": {"messages": [{"role": "user", "content": "Q"}]}},
+            "tools": {},
+        }
+
+        results = benchmark.evaluate(evaluators, agents_dict, "answer", traces)
+
+        result = results[0]
+        assert "user_gsr" in result
+        assert "user_partial_gsr" in result
+        assert "system_gsr" in result
+        assert "system_partial_gsr" in result
+        assert "overall_gsr" in result
+        assert "overall_partial_gsr" in result
+        assert "supervisor_gsr" in result
+        assert "report" in result
+
+    def test_evaluate_overall_gsr(self, sample_agent_data, sample_task):
+        """overall_gsr = 1.0 only if both user AND system pass."""
+        # User passes, system fails
+        responses = [
+            '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
+            '[{"assertion": "B", "answer": "FALSE", "evidence": "Fail"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+        evaluators = benchmark.setup_evaluators(env, sample_task, list(agents_dict.values()), None)
+
+        traces = {
+            "agents": {"test_agent": {"messages": [{"role": "user", "content": "Q"}]}},
+            "tools": {},
+        }
+
+        results = benchmark.evaluate(evaluators, agents_dict, "answer", traces)
+
+        assert results[0]["user_gsr"] == 1.0
+        assert results[0]["system_gsr"] == 0.0
+        assert results[0]["overall_gsr"] == 0.0  # Not all passed
+
+    def test_evaluate_supervisor_gsr(self, sample_agent_data, sample_task):
+        """supervisor_gsr = 1.0 if overall OR user passes."""
+        # User passes, system fails
+        responses = [
+            '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
+            '[{"assertion": "B", "answer": "FALSE", "evidence": "Fail"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+        evaluators = benchmark.setup_evaluators(env, sample_task, list(agents_dict.values()), None)
+
+        traces = {
+            "agents": {"test_agent": {"messages": [{"role": "user", "content": "Q"}]}},
+            "tools": {},
+        }
+
+        results = benchmark.evaluate(evaluators, agents_dict, "answer", traces)
+
+        # User passed, so supervisor_gsr should be 1.0
+        assert results[0]["supervisor_gsr"] == 1.0
+
+    def test_evaluate_combined_report(self, sample_agent_data, sample_task):
+        """Report combines both evaluator reports."""
+        responses = [
+            '[{"assertion": "User A", "answer": "TRUE", "evidence": "OK"}]',
+            '[{"assertion": "System B", "answer": "TRUE", "evidence": "OK"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+        evaluators = benchmark.setup_evaluators(env, sample_task, list(agents_dict.values()), None)
+
+        traces = {
+            "agents": {"test_agent": {"messages": [{"role": "user", "content": "Q"}]}},
+            "tools": {},
+        }
+
+        results = benchmark.evaluate(evaluators, agents_dict, "answer", traces)
+
+        report = results[0]["report"]
+        assert len(report) == 2
+        # Check assertion types are added
+        assertion_types = [item.get("assertion_type") for item in report]
+        assert "user" in assertion_types
+        assert "system" in assertion_types
+
+
+# =============================================================================
+# Unit Tests: compute_benchmark_metrics
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestComputeBenchmarkMetrics:
+    """Tests for compute_benchmark_metrics utility."""
+
+    def test_empty_results(self):
+        """Empty results returns zeros."""
+        result = compute_benchmark_metrics([])
+
+        assert result["total_tasks"] == 0
+        assert result["successful_tasks"] == 0
+        assert result["success_rate"] == 0.0
+        assert result["mean_metrics"] == {}
+
+    def test_single_successful_result(self):
+        """Single successful result counted."""
+        results = [{"eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 1
+        assert metrics["successful_tasks"] == 1
+        assert metrics["success_rate"] == 1.0
+
+    def test_single_failed_result(self):
+        """Single failed result counted."""
+        results = [{"eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 1
+        assert metrics["successful_tasks"] == 0
+        assert metrics["success_rate"] == 0.0
+
+    def test_multiple_results(self):
+        """Multiple results aggregated correctly."""
+        results = [
+            {"eval": [{"overall_gsr": 1.0}]},  # Success
+            {"eval": [{"overall_gsr": 0.0}]},  # Fail
+            {"eval": [{"overall_gsr": 1.0}]},  # Success
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 3
+        assert metrics["successful_tasks"] == 2
+        assert metrics["success_rate"] == pytest.approx(2 / 3)
+
+    def test_success_rate_calculation(self):
+        """success_rate = successful/total."""
+        results = [
+            {"eval": [{"overall_gsr": 1.0}]},
+            {"eval": [{"overall_gsr": 1.0}]},
+            {"eval": [{"overall_gsr": 0.0}]},
+            {"eval": [{"overall_gsr": 0.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["success_rate"] == 0.5
+
+    def test_mean_metrics_calculation(self):
+        """Mean of numeric metrics computed."""
+        results = [
+            {"eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
+            {"eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["mean_metrics"]["overall_gsr"] == pytest.approx(0.5)
+        assert metrics["mean_metrics"]["partial_gsr"] == pytest.approx(0.6)
+
+    def test_handles_missing_eval(self):
+        """Handles results with no eval key."""
+        results = [
+            {"eval": [{"overall_gsr": 1.0}]},
+            {"no_eval_key": True},  # Missing eval
+            {"eval": None},  # None eval
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 3
+        assert metrics["successful_tasks"] == 1
+
+    def test_handles_non_numeric_values(self):
+        """Non-numeric values in eval are ignored for mean."""
+        results = [
+            {
+                "eval": [
+                    {
+                        "overall_gsr": 1.0,
+                        "report": [{"assertion": "A"}],  # Non-numeric
+                        "status": "success",  # String
+                    }
+                ]
+            }
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        # Should only have numeric metrics
+        assert "overall_gsr" in metrics["mean_metrics"]
+        assert "report" not in metrics["mean_metrics"]
+        assert "status" not in metrics["mean_metrics"]
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSBenchmarkIntegration:
+    """Integration tests for MACSBenchmark."""
+
+    def test_full_task_execution(self, sample_agent_data, sample_task):
+        """Test complete task execution flow."""
+        # Evaluator responses - user then system
+        responses = [
+            '[{"assertion": "Booking confirmed", "answer": "TRUE", "evidence": "Done"}]',
+            '[{"assertion": "Database updated", "answer": "TRUE", "evidence": "Updated"}]',
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+
+        # Setup phase
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        user = benchmark.setup_user(sample_agent_data, env, sample_task)
+        agents_list, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, user)
+        evaluators = benchmark.setup_evaluators(env, sample_task, agents_list, user)
+
+        # Run phase
+        final_answer = benchmark.run_agents(agents_list, sample_task, env)
+
+        # Evaluate phase
+        traces = {
+            "agents": {
+                "test_agent": {
+                    "messages": [
+                        {"role": "user", "content": sample_task.query},
+                        {"role": "assistant", "content": final_answer},
+                    ]
+                }
+            },
+            "tools": {},
+        }
+        results = benchmark.evaluate(evaluators, agents_dict, final_answer, traces)
+
+        # Verify results
+        assert len(results) == 1
+        assert results[0]["user_gsr"] == 1.0
+        assert results[0]["system_gsr"] == 1.0
+        assert results[0]["overall_gsr"] == 1.0
+
+    def test_benchmark_with_real_environment(self, sample_agent_data, sample_task):
+        """Test with real MACSEnvironment tool creation."""
+        model = MACSModelAdapter()
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+
+        # Environment should have tools
+        assert "search_flights" in env.tools
+        assert env.tools["search_flights"].name == "search_flights"
diff --git a/tests/test_benchmarks/test_macs/test_macs_environment.py b/tests/test_benchmarks/test_macs/test_macs_environment.py
new file mode 100644
index 00000000..d161940b
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_environment.py
@@ -0,0 +1,356 @@
+"""Unit tests for MACSEnvironment."""
+
+import pytest
+from unittest.mock import patch
+
+from maseval.benchmark.macs import MACSEnvironment, MACSGenericTool
+
+from .conftest import MACSModelAdapter
+
+
+# =============================================================================
+# Unit Tests: Initialization
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSEnvironmentInit:
+    """Tests for MACSEnvironment initialization."""
+
+    def test_init_with_task_data(self, macs_model, sample_task_data):
+        """Initializes from task data."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        assert env is not None
+        assert "tool_specs" in env.state
+
+    def test_init_stores_model(self, macs_model, sample_task_data):
+        """Model is stored for tool creation."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        assert env._model == macs_model
+
+    def test_init_calls_parent(self, macs_model, sample_task_data):
+        """Parent Environment.__init__ is called."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        # Parent sets up state and creates tools
+        assert hasattr(env, "state")
+        assert hasattr(env, "tools")
+
+
+# =============================================================================
+# Unit Tests: State Setup
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestSetupState:
+    """Tests for setup_state method."""
+
+    def test_setup_state_extracts_tool_specs(self, macs_model, sample_task_data):
+        """setup_state extracts tool_specs from task_data."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        assert "tool_specs" in env.state
+        assert len(env.state["tool_specs"]) == 2
+
+    def test_setup_state_empty_tools(self, macs_model):
+        """Handles missing or empty tools."""
+        task_data = {"environment_data": {}}
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert env.state["tool_specs"] == []
+
+    def test_setup_state_no_environment_data(self, macs_model):
+        """Handles missing environment_data."""
+        task_data = {}
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert env.state["tool_specs"] == []
+
+
+# =============================================================================
+# Unit Tests: Tool Creation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestCreateTools:
+    """Tests for create_tools method."""
+
+    def test_create_tools_from_specs(self, macs_model, sample_task_data):
+        """Creates MACSGenericTool instances from specs."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        assert len(env.tools) == 3  # search_flights, book_flight, search_hotels
+        assert all(isinstance(tool, MACSGenericTool) for tool in env.tools.values())
+
+    def test_create_tools_keyed_by_name(self, macs_model, sample_task_data):
+        """Tools dict is keyed by tool name."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        assert "search_flights" in env.tools
+        assert "book_flight" in env.tools
+        assert "search_hotels" in env.tools
+
+    def test_create_tools_correct_properties(self, macs_model, sample_task_data):
+        """Created tools have correct properties."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        search_flights = env.tools["search_flights"]
+        assert search_flights.name == "search_flights"
+        assert search_flights.description == "Search for available flights"
+        assert "origin" in search_flights.inputs
+        assert "destination" in search_flights.inputs
+
+    def test_create_tools_deduplicates(self, macs_model):
+        """Duplicate tool names are deduplicated."""
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {
+                        "tool_name": "group1",
+                        "actions": [{"name": "duplicate_tool", "description": "First"}],
+                    },
+                    {
+                        "tool_name": "group2",
+                        "actions": [{"name": "duplicate_tool", "description": "First"}],  # Same name
+                    },
+                ]
+            }
+        }
+        env = MACSEnvironment(task_data, macs_model)
+
+        # Should only have one instance
+        assert len(env.tools) == 1
+        assert "duplicate_tool" in env.tools
+
+    def test_create_tools_empty_specs(self, macs_model):
+        """Empty specs returns empty dict."""
+        task_data = {"environment_data": {"tools": []}}
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert env.tools == {}
+
+    def test_create_tools_empty_actions(self, macs_model):
+        """Handles tool groups with no actions."""
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {"tool_name": "empty_group", "actions": []},
+                ]
+            }
+        }
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert env.tools == {}
+
+
+# =============================================================================
+# Unit Tests: Agent Tool Assignment
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestGetToolsForAgent:
+    """Tests for get_tools_for_agent method."""
+
+    def test_get_tools_for_agent(self, macs_model, sample_task_data, sample_agent_spec_flight):
+        """Returns tools matching agent spec."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_tools = env.get_tools_for_agent(sample_agent_spec_flight)
+
+        assert len(agent_tools) == 2  # search_flights, book_flight
+        assert "search_flights" in agent_tools
+        assert "book_flight" in agent_tools
+        assert "search_hotels" not in agent_tools
+
+    def test_get_tools_for_agent_all(self, macs_model, sample_task_data, sample_agent_spec_all):
+        """Returns all tools when agent has access to all groups."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_tools = env.get_tools_for_agent(sample_agent_spec_all)
+
+        assert len(agent_tools) == 3
+        assert "search_flights" in agent_tools
+        assert "book_flight" in agent_tools
+        assert "search_hotels" in agent_tools
+
+    def test_get_tools_for_agent_no_match(self, macs_model, sample_task_data, sample_agent_spec_none):
+        """Returns empty dict if no matching tool groups."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_tools = env.get_tools_for_agent(sample_agent_spec_none)
+
+        assert agent_tools == {}
+
+    def test_get_tools_for_agent_partial(self, macs_model, sample_task_data):
+        """Returns subset matching agent's tool groups."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_spec = {
+            "agent_id": "hotel_agent",
+            "tools": ["hotel_tools"],
+        }
+        agent_tools = env.get_tools_for_agent(agent_spec)
+
+        assert len(agent_tools) == 1
+        assert "search_hotels" in agent_tools
+
+    def test_get_tools_for_agent_returns_same_instances(self, macs_model, sample_task_data, sample_agent_spec_flight):
+        """Returns same tool instances as in env.tools."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_tools = env.get_tools_for_agent(sample_agent_spec_flight)
+
+        # Same instance, not copies
+        assert agent_tools["search_flights"] is env.tools["search_flights"]
+
+    def test_get_tools_for_agent_empty_tools_list(self, macs_model, sample_task_data):
+        """Handles agent with empty tools list."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent_spec = {"agent_id": "no_tools", "tools": []}
+        agent_tools = env.get_tools_for_agent(agent_spec)
+
+        assert agent_tools == {}
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSEnvironmentIntegration:
+    """Integration tests for MACSEnvironment."""
+
+    def test_full_workflow(self, macs_model, sample_task_data):
+        """Test complete environment workflow."""
+        # Create environment
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        # Verify tools created
+        assert len(env.tools) == 3
+
+        # Get tools for different agents
+        flight_agent_spec = {"agent_id": "flight", "tools": ["flight_tools"]}
+        hotel_agent_spec = {"agent_id": "hotel", "tools": ["hotel_tools"]}
+        supervisor_spec = {"agent_id": "super", "tools": ["flight_tools", "hotel_tools"]}
+
+        flight_tools = env.get_tools_for_agent(flight_agent_spec)
+        hotel_tools = env.get_tools_for_agent(hotel_agent_spec)
+        supervisor_tools = env.get_tools_for_agent(supervisor_spec)
+
+        assert len(flight_tools) == 2
+        assert len(hotel_tools) == 1
+        assert len(supervisor_tools) == 3
+
+    def test_tools_are_callable(self, sample_task_data):
+        """Created tools can be called."""
+        # Use a model that returns valid JSON responses (ToolLLMSimulator expects {"text": ..., "details": ...})
+        model = MACSModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
+        env = MACSEnvironment(sample_task_data, model)
+
+        search_flights = env.tools["search_flights"]
+        result = search_flights(origin="LAX", destination="JFK")
+
+        # Should return the text from the response
+        assert "Found flights" in result
+
+    def test_multiple_agents_share_tools(self, macs_model, sample_task_data):
+        """Multiple agents can share the same tool instances."""
+        env = MACSEnvironment(sample_task_data, macs_model)
+
+        agent1_spec = {"agent_id": "agent1", "tools": ["flight_tools"]}
+        agent2_spec = {"agent_id": "agent2", "tools": ["flight_tools"]}
+
+        agent1_tools = env.get_tools_for_agent(agent1_spec)
+        agent2_tools = env.get_tools_for_agent(agent2_spec)
+
+        # Same tool instances
+        assert agent1_tools["search_flights"] is agent2_tools["search_flights"]
+
+        # Invocation history is shared
+        with patch.object(agent1_tools["search_flights"].simulator, "__call__", return_value=("Result", {})):
+            agent1_tools["search_flights"](origin="LAX", destination="JFK")
+
+        # Agent2's tool (same instance) should have the invocation
+        assert len(agent2_tools["search_flights"].history.to_list()) == 1
+
+
+# =============================================================================
+# Edge Cases
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEdgeCases:
+    """Edge case tests for MACSEnvironment."""
+
+    def test_tool_with_no_name(self, macs_model):
+        """Handles actions without name field."""
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {
+                        "tool_name": "group",
+                        "actions": [
+                            {"description": "No name field"},  # Missing name
+                            {"name": "valid_tool", "description": "Has name"},
+                        ],
+                    }
+                ]
+            }
+        }
+        env = MACSEnvironment(task_data, macs_model)
+
+        # Should only create the valid tool
+        assert len(env.tools) == 1
+        assert "valid_tool" in env.tools
+
+    def test_callbacks_passed_to_parent(self, macs_model, sample_task_data):
+        """Callbacks are passed to parent Environment."""
+        from maseval.core.callback import EnvironmentCallback
+
+        # Create actual callback instances
+        class MockCallback(EnvironmentCallback):
+            pass
+
+        callbacks = [MockCallback(), MockCallback()]
+        env = MACSEnvironment(sample_task_data, macs_model, callbacks=callbacks)
+
+        assert len(env.callbacks) == 2
+        assert all(isinstance(cb, EnvironmentCallback) for cb in env.callbacks)
+
+    def test_nested_tool_groups(self, macs_model):
+        """Handles deeply nested tool structures."""
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {
+                        "tool_name": "level1",
+                        "actions": [
+                            {
+                                "name": "tool1",
+                                "description": "Tool 1",
+                                "input_schema": {
+                                    "properties": {
+                                        "nested": {
+                                            "type": "object",
+                                            "description": "Nested object",
+                                        }
+                                    }
+                                },
+                            }
+                        ],
+                    }
+                ]
+            }
+        }
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert "tool1" in env.tools
+        assert "nested" in env.tools["tool1"].inputs
diff --git a/tests/test_benchmarks/test_macs/test_macs_evaluator.py b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
new file mode 100644
index 00000000..c5812b95
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
@@ -0,0 +1,510 @@
+"""Unit tests for MACSEvaluator."""
+
+import json
+import pytest
+
+from maseval import MessageHistory, Task
+from maseval.benchmark.macs import MACSEvaluator
+
+from .conftest import MACSModelAdapter
+
+
+# =============================================================================
+# Unit Tests: Initialization
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSEvaluatorInit:
+    """Tests for MACSEvaluator initialization."""
+
+    def test_init_user_type(self, macs_model, sample_task):
+        """Initializes with gsr_type='user'."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        assert evaluator.gsr_type == "user"
+        assert evaluator.model == macs_model
+        assert evaluator.task == sample_task
+
+    def test_init_system_type(self, macs_model, sample_task):
+        """Initializes with gsr_type='system'."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        assert evaluator.gsr_type == "system"
+
+    def test_init_loads_default_template(self, macs_model, sample_task):
+        """Loads default template from file."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        # Template should contain placeholders
+        assert "{{scenario}}" in evaluator.template
+        assert "{{history}}" in evaluator.template
+        assert "{{assertions}}" in evaluator.template
+
+    def test_init_system_template_has_invocations(self, macs_model, sample_task):
+        """System template includes tool invocations placeholder."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        assert "{{invocations}}" in evaluator.template
+
+    def test_init_custom_template(self, macs_model, sample_task):
+        """Custom template overrides default."""
+        custom = "Custom template: {{scenario}} {{history}} {{assertions}}"
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user", template=custom)
+
+        assert evaluator.template == custom
+
+
+# =============================================================================
+# Unit Tests: Assertion Parsing
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestAssertionParsing:
+    """Tests for _parse_assertions method."""
+
+    def test_parse_user_assertions(self, macs_model, sample_task):
+        """Filters to user assertions only."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        assertions = [
+            "user: This is a user assertion",
+            "agent: This is an agent assertion",
+            "user: Another user assertion",
+        ]
+        parsed = evaluator._parse_assertions(assertions)
+
+        assert len(parsed) == 2
+        assert "This is a user assertion" in parsed
+        assert "Another user assertion" in parsed
+        assert "This is an agent assertion" not in parsed
+
+    def test_parse_system_assertions(self, macs_model, sample_task):
+        """Filters to system (agent:) assertions."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        assertions = [
+            "user: User assertion",
+            "agent: Agent assertion 1",
+            "agent: Agent assertion 2",
+        ]
+        parsed = evaluator._parse_assertions(assertions)
+
+        assert len(parsed) == 2
+        assert "Agent assertion 1" in parsed
+        assert "Agent assertion 2" in parsed
+        assert "User assertion" not in parsed
+
+    def test_parse_no_prefix_is_user(self, macs_model, sample_task):
+        """Unprefixed assertions are user type (AWS default)."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        assertions = [
+            "No prefix assertion",
+            "user: Explicit user assertion",
+        ]
+        parsed = evaluator._parse_assertions(assertions)
+
+        assert len(parsed) == 2
+        assert "No prefix assertion" in parsed
+        assert "Explicit user assertion" in parsed
+
+    def test_parse_mixed_assertions(self, macs_model, sample_task):
+        """Correctly splits mixed assertions."""
+        evaluator_user = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+        evaluator_system = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        assertions = [
+            "user: User only",
+            "agent: Agent only",
+            "Unprefixed becomes user",
+        ]
+
+        user_parsed = evaluator_user._parse_assertions(assertions)
+        system_parsed = evaluator_system._parse_assertions(assertions)
+
+        assert len(user_parsed) == 2  # user: + unprefixed
+        assert len(system_parsed) == 1  # agent: only
+
+    def test_parse_empty_assertions(self, macs_model, sample_task):
+        """Empty list returns empty."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        parsed = evaluator._parse_assertions([])
+
+        assert parsed == []
+
+    def test_parse_case_insensitive(self, macs_model, sample_task):
+        """Prefix matching is case-insensitive."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        assertions = [
+            "USER: Uppercase user",
+            "User: Mixed case user",
+            "AGENT: Should be excluded",
+        ]
+        parsed = evaluator._parse_assertions(assertions)
+
+        assert len(parsed) == 2
+        assert "Uppercase user" in parsed
+        assert "Mixed case user" in parsed
+
+
+# =============================================================================
+# Unit Tests: Trace Filtering
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestTraceFiltering:
+    """Tests for filter_traces method."""
+
+    def test_filter_traces_user_type(self, macs_model, sample_task, sample_trace):
+        """User type gets user messages only."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        traces = {"user": {"history": sample_trace.to_list()}, "tools": {"tool1": {}}}
+        filtered = evaluator.filter_traces(traces)
+
+        assert "messages" in filtered
+        assert isinstance(filtered["messages"], MessageHistory)
+        # Should not have tools in user evaluation
+        assert "tools" not in filtered or filtered.get("tools") is None
+
+    def test_filter_traces_system_type(self, macs_model, sample_task, sample_trace, sample_tool_traces):
+        """System type gets full traces."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        traces = {"user": {"history": sample_trace.to_list()}, "tools": sample_tool_traces}
+        filtered = evaluator.filter_traces(traces)
+
+        # System should get everything
+        assert traces == filtered
+
+
+# =============================================================================
+# Unit Tests: Conversation Formatting
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestConversationFormatting:
+    """Tests for _format_conversation_history method."""
+
+    def test_format_conversation_history(self, macs_model, sample_task, sample_trace):
+        """Formats MessageHistory to string."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        formatted = evaluator._format_conversation_history(sample_trace)
+
+        assert "user: I need to book a flight to New York" in formatted
+        assert "assistant: Sure! When would you like to travel?" in formatted
+        assert "user: Next Monday" in formatted
+
+    def test_format_conversation_list_content(self, macs_model, sample_task):
+        """Handles list content (multi-modal)."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        trace = MessageHistory(
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "Hello"}, {"type": "text", "text": "World"}],
+                }
+            ]
+        )
+        formatted = evaluator._format_conversation_history(trace)
+
+        assert "user: Hello World" in formatted
+
+    def test_format_tool_invocations(self, macs_model, sample_task, sample_tool_traces):
+        """Formats tool traces for system eval."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        formatted = evaluator._format_tool_invocations(sample_tool_traces)
+
+        assert "Tool: search_flights" in formatted
+        assert "Inputs:" in formatted
+        assert "origin" in formatted
+        assert "Outputs:" in formatted
+        assert "Status:" in formatted
+
+    def test_format_tool_invocations_empty(self, macs_model, sample_task):
+        """Handles empty tool traces."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
+
+        formatted = evaluator._format_tool_invocations({})
+
+        assert "No tool invocations recorded" in formatted
+
+
+# =============================================================================
+# Unit Tests: GSR Computation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestGSRComputation:
+    """Tests for _compute_gsr method."""
+
+    def test_compute_gsr_all_true(self, macs_model, sample_task):
+        """All true → gsr=1.0, partial=1.0."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        report = [
+            {"assertion": "Test 1", "answer": "TRUE"},
+            {"assertion": "Test 2", "answer": "TRUE"},
+            {"assertion": "Test 3", "answer": "TRUE"},
+        ]
+        gsr, partial = evaluator._compute_gsr(report)
+
+        assert gsr == 1.0
+        assert partial == 1.0
+
+    def test_compute_gsr_all_false(self, macs_model, sample_task):
+        """All false → gsr=0.0, partial=0.0."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        report = [
+            {"assertion": "Test 1", "answer": "FALSE"},
+            {"assertion": "Test 2", "answer": "FALSE"},
+        ]
+        gsr, partial = evaluator._compute_gsr(report)
+
+        assert gsr == 0.0
+        assert partial == 0.0
+
+    def test_compute_gsr_mixed(self, macs_model, sample_task):
+        """Mixed → gsr=0.0, partial=fraction."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        report = [
+            {"assertion": "Test 1", "answer": "TRUE"},
+            {"assertion": "Test 2", "answer": "FALSE"},
+            {"assertion": "Test 3", "answer": "TRUE"},
+        ]
+        gsr, partial = evaluator._compute_gsr(report)
+
+        assert gsr == 0.0  # Not all true
+        assert partial == pytest.approx(2 / 3)
+
+    def test_compute_gsr_empty(self, macs_model, sample_task):
+        """Empty report → gsr=1.0, partial=1.0."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        gsr, partial = evaluator._compute_gsr([])
+
+        assert gsr == 1.0
+        assert partial == 1.0
+
+    def test_compute_gsr_case_insensitive(self, macs_model, sample_task):
+        """Answer matching is case-insensitive."""
+        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
+
+        report = [
+            {"assertion": "Test 1", "answer": "true"},
+            {"assertion": "Test 2", "answer": "True"},
+            {"assertion": "Test 3", "answer": "TRUE"},
+        ]
+        gsr, partial = evaluator._compute_gsr(report)
+
+        assert gsr == 1.0
+        assert partial == 1.0
+
+
+# =============================================================================
+# Unit Tests: Evaluation Call
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEvaluationCall:
+    """Tests for __call__ method."""
+
+    def test_call_returns_expected_format(self, sample_task, sample_trace):
+        """Returns gsr, partial_gsr, report."""
+        response = json.dumps(
+            [
+                {"assertion": "Flight booking was confirmed", "answer": "TRUE", "evidence": "Confirmation ABC123"},
+                {"assertion": "User received confirmation number", "answer": "TRUE", "evidence": "ABC123 mentioned"},
+            ]
+        )
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert "gsr" in result
+        assert "partial_gsr" in result
+        assert "report" in result
+        assert result["gsr"] == 1.0
+        assert len(result["report"]) == 2
+
+    def test_call_handles_json_error(self, sample_task, sample_trace):
+        """Graceful handling of JSON parse error."""
+        model = MACSModelAdapter(responses=["This is not valid JSON"])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 0.0
+        assert result["partial_gsr"] == 0.0
+        assert "error" in result
+        assert "JSON decode error" in result["error"]
+        assert "raw_response" in result
+
+    def test_call_handles_wrapped_response(self, sample_task, sample_trace):
+        """Handles {'assertions': [...]} wrapper."""
+        response = json.dumps({"assertions": [{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}]})
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 1.0
+        assert len(result["report"]) == 1
+
+    def test_call_handles_results_wrapper(self, sample_task, sample_trace):
+        """Handles {'results': [...]} wrapper."""
+        response = json.dumps({"results": [{"assertion": "Test", "answer": "FALSE", "evidence": "Not found"}]})
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 0.0
+        assert len(result["report"]) == 1
+
+    def test_call_handles_single_dict_response(self, sample_task, sample_trace):
+        """Handles single dict instead of list."""
+        response = json.dumps({"assertion": "Test", "answer": "TRUE", "evidence": "Found"})
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert len(result["report"]) == 1
+
+    def test_call_missing_scenario_raises(self, sample_trace):
+        """Missing scenario raises ValueError."""
+        task = Task(
+            query="Test query",
+            environment_data={},
+            evaluation_data={"assertions": ["user: Test assertion"]},
+            metadata={},  # No scenario!
+        )
+        model = MACSModelAdapter()
+        evaluator = MACSEvaluator(model, task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        with pytest.raises(ValueError, match="scenario"):
+            evaluator(traces)
+
+    def test_call_no_assertions_returns_perfect(self, sample_task_no_assertions, sample_trace):
+        """No assertions → perfect score."""
+        model = MACSModelAdapter()
+        evaluator = MACSEvaluator(model, sample_task_no_assertions, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 1.0
+        assert result["partial_gsr"] == 1.0
+        assert result["report"] == []
+
+    def test_call_adds_assertion_type(self, sample_task, sample_trace):
+        """Report items include assertion_type."""
+        response = json.dumps([{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}])
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
+
+        traces = {"messages": sample_trace}
+        result = evaluator(traces)
+
+        assert result["report"][0]["assertion_type"] == "user"
+
+    def test_call_system_includes_tool_invocations(self, sample_task, sample_trace, sample_tool_traces):
+        """System evaluation includes tool invocations in prompt."""
+        response = json.dumps([{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}])
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, sample_task, gsr_type="system")
+
+        traces = {"messages": sample_trace, "tool_traces": sample_tool_traces}
+        evaluator(traces)
+
+        # Check that tool invocations were included in the prompt
+        prompt = model.prompts[0]
+        assert "search_flights" in prompt or "book_flight" in prompt
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSEvaluatorIntegration:
+    """Integration tests for MACSEvaluator."""
+
+    def test_full_user_evaluation(self):
+        """Complete user-side evaluation flow."""
+        task = Task(
+            query="Book a flight",
+            environment_data={},
+            evaluation_data={
+                "assertions": [
+                    "user: Booking was successful",
+                    "agent: Internal record created",  # Should be ignored for user eval
+                ]
+            },
+            metadata={"scenario": "Business traveler booking a flight"},
+        )
+
+        response = json.dumps([{"assertion": "Booking was successful", "answer": "TRUE", "evidence": "Confirmed"}])
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, task, gsr_type="user")
+
+        trace = MessageHistory(
+            [
+                {"role": "user", "content": "Book a flight"},
+                {"role": "assistant", "content": "Your flight is booked!"},
+            ]
+        )
+
+        result = evaluator({"messages": trace})
+
+        assert result["gsr"] == 1.0
+        assert len(result["report"]) == 1
+        assert result["report"][0]["assertion_type"] == "user"
+
+    def test_full_system_evaluation(self):
+        """Complete system-side evaluation flow."""
+        task = Task(
+            query="Book a flight",
+            environment_data={},
+            evaluation_data={
+                "assertions": [
+                    "user: Should be ignored for system eval",
+                    "agent: Database was updated",
+                ]
+            },
+            metadata={"scenario": "System checking internal operations"},
+        )
+
+        response = json.dumps([{"assertion": "Database was updated", "answer": "TRUE", "evidence": "DB log shows update"}])
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, task, gsr_type="system")
+
+        trace = MessageHistory([{"role": "assistant", "content": "Done"}])
+        tool_traces = {"update_db": {"invocations": [{"inputs": {}, "outputs": "OK", "status": "success"}]}}
+
+        result = evaluator({"messages": trace, "tool_traces": tool_traces})
+
+        assert result["gsr"] == 1.0
+        assert result["report"][0]["assertion_type"] == "system"
diff --git a/tests/test_benchmarks/test_macs/test_macs_integration.py b/tests/test_benchmarks/test_macs/test_macs_integration.py
new file mode 100644
index 00000000..ea97340a
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_integration.py
@@ -0,0 +1,680 @@
+"""Integration tests for MACS benchmark components."""
+
+import json
+import pytest
+from unittest.mock import patch
+
+from maseval import Task
+from maseval.benchmark.macs import (
+    MACSEnvironment,
+    MACSEvaluator,
+    MACSUser,
+    compute_benchmark_metrics,
+)
+
+from .conftest import MACSModelAdapter, ConcreteMACSBenchmark
+
+
+# =============================================================================
+# Environment and Tool Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEnvironmentToolIntegration:
+    """Integration tests for MACSEnvironment and MACSGenericTool."""
+
+    def test_environment_creates_callable_tools(self, travel_task):
+        """Environment creates tools that can be called."""
+        # Use a model that returns valid JSON responses (ToolLLMSimulator expects {"text": ..., "details": ...})
+        model = MACSModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
+        env = MACSEnvironment(
+            task_data={"environment_data": travel_task.environment_data},
+            model=model,
+        )
+
+        assert "search_flights" in env.tools
+        assert "book_flight" in env.tools
+
+        # Tools should be callable
+        search_flights = env.tools["search_flights"]
+        result = search_flights(origin="SFO", destination="JFK", date="2024-12-09")
+
+        # Should return the text from the response
+        assert "Found flights" in result
+
+    def test_tool_tracks_invocations(self, travel_task):
+        """Tool invocations are tracked in history."""
+        model = MACSModelAdapter(responses=['{"text": "success", "details": {}}'])
+        env = MACSEnvironment(
+            task_data={"environment_data": travel_task.environment_data},
+            model=model,
+        )
+
+        search_flights = env.tools["search_flights"]
+
+        # Make multiple calls
+        search_flights(origin="SFO", destination="JFK", date="2024-12-09")
+        search_flights(origin="LAX", destination="ORD", date="2024-12-10")
+
+        history = search_flights.history.to_list()
+        assert len(history) == 2
+        assert history[0]["inputs"]["origin"] == "SFO"
+        assert history[1]["inputs"]["origin"] == "LAX"
+
+    def test_agent_gets_subset_of_tools(self, macs_model):
+        """Agent only gets tools from its assigned tool groups."""
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {
+                        "tool_name": "group_a",
+                        "actions": [{"name": "tool_a", "description": "Tool A"}],
+                    },
+                    {
+                        "tool_name": "group_b",
+                        "actions": [{"name": "tool_b", "description": "Tool B"}],
+                    },
+                ]
+            }
+        }
+        env = MACSEnvironment(task_data, macs_model)
+
+        agent_spec = {"agent_id": "agent", "tools": ["group_a"]}
+        agent_tools = env.get_tools_for_agent(agent_spec)
+
+        assert "tool_a" in agent_tools
+        assert "tool_b" not in agent_tools
+
+
+# =============================================================================
+# Evaluator Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEvaluatorIntegration:
+    """Integration tests for MACSEvaluator."""
+
+    def test_user_evaluation_with_conversation(self, travel_task, sample_conversation):
+        """User evaluator works with real conversation trace."""
+        response = json.dumps(
+            [
+                {
+                    "assertion": "The user's flight booking request was acknowledged",
+                    "answer": "TRUE",
+                    "evidence": "Agent acknowledged the request",
+                },
+                {"assertion": "The user received flight options or a confirmation", "answer": "TRUE", "evidence": "Confirmation DL123456"},
+            ]
+        )
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
+
+        traces = {"messages": sample_conversation}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 1.0
+        assert len(result["report"]) == 2
+        assert all(item["assertion_type"] == "user" for item in result["report"])
+
+    def test_system_evaluation_with_tool_traces(self, travel_task, sample_conversation):
+        """System evaluator includes tool invocations."""
+        response = json.dumps(
+            [
+                {
+                    "assertion": "The search_flights tool was called with correct parameters",
+                    "answer": "TRUE",
+                    "evidence": "Tool called with SFO, JFK",
+                },
+            ]
+        )
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, travel_task, gsr_type="system")
+
+        tool_traces = {
+            "search_flights": {
+                "invocations": [
+                    {"inputs": {"origin": "SFO", "destination": "JFK", "date": "2024-12-09"}, "outputs": "Found 3 flights", "status": "success"}
+                ]
+            }
+        }
+
+        traces = {"messages": sample_conversation, "tool_traces": tool_traces}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 1.0
+        assert result["report"][0]["assertion_type"] == "system"
+
+        # Verify tool info was in the prompt
+        prompt = model.prompts[0]
+        assert "search_flights" in prompt
+        assert "SFO" in prompt
+
+    def test_evaluator_handles_partial_success(self, travel_task, sample_conversation):
+        """Evaluator correctly computes partial GSR."""
+        response = json.dumps(
+            [
+                {"assertion": "First assertion", "answer": "TRUE", "evidence": "OK"},
+                {"assertion": "Second assertion", "answer": "FALSE", "evidence": "Failed"},
+            ]
+        )
+        model = MACSModelAdapter(responses=[response])
+        evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
+
+        traces = {"messages": sample_conversation}
+        result = evaluator(traces)
+
+        assert result["gsr"] == 0.0  # Not all passed
+        assert result["partial_gsr"] == 0.5  # 1 of 2 passed
+
+
+# =============================================================================
+# User Simulator Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestUserSimulatorIntegration:
+    """Integration tests for MACSUser."""
+
+    def test_user_extracts_profile_from_scenario(self, macs_model, travel_task):
+        """User correctly extracts profile from scenario."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=travel_task.metadata["scenario"],
+            initial_prompt=travel_task.query,
+        )
+
+        # Should have extracted name and other details
+        assert "full_scenario" in user.user_profile
+        assert "Alice Johnson" in user.user_profile.get("full_scenario", "")
+
+    def test_user_respects_max_turns(self, travel_task):
+        """User simulator stops after max_turns."""
+        model = MACSModelAdapter(responses=['{"text": "Yes", "details": {}}'] * 10)
+        user = MACSUser(
+            model=model,
+            scenario=travel_task.metadata["scenario"],
+            initial_prompt=travel_task.query,
+            max_turns=3,
+        )
+
+        # Simulate turns
+        for i in range(3):
+            assert not user.is_done
+            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Response"):
+                user.simulate_response(f"Question {i}")
+
+        assert user.is_done
+        assert user._turn_count == 3
+
+    def test_user_detects_stop_token(self, travel_task):
+        """User correctly detects and handles </stop> token."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=travel_task.metadata["scenario"],
+            initial_prompt=travel_task.query,
+        )
+
+        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Great, that's all! </stop>"):
+            response = user.simulate_response("Your flight is booked!")
+
+        assert "</stop>" not in response
+        assert user.is_done
+        assert user._stopped
+
+
+# =============================================================================
+# Full Benchmark Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestFullBenchmarkIntegration:
+    """End-to-end integration tests for MACS benchmark."""
+
+    def test_complete_task_lifecycle(self, sample_agent_data, travel_task):
+        """Test complete task: setup → run → evaluate."""
+        # Model responses for various stages
+        responses = [
+            # Tool simulation (if tools are called)
+            '{"flights": [{"id": "DL123", "time": "8:15am"}]}',
+            # User evaluation
+            json.dumps([{"assertion": "User acknowledged", "answer": "TRUE", "evidence": "OK"}]),
+            # System evaluation
+            json.dumps([{"assertion": "Tool called", "answer": "TRUE", "evidence": "OK"}]),
+        ]
+        model = MACSModelAdapter(responses=responses)
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+
+        # Setup phase
+        env = benchmark.setup_environment(sample_agent_data, travel_task)
+        user = benchmark.setup_user(sample_agent_data, env, travel_task)
+        agents_list, agents_dict = benchmark.setup_agents(sample_agent_data, env, travel_task, user)
+        evaluators = benchmark.setup_evaluators(env, travel_task, agents_list, user)
+
+        # Verify setup
+        assert isinstance(env, MACSEnvironment)
+        assert isinstance(user, MACSUser)
+        assert len(agents_list) == 1
+        assert len(evaluators) == 2
+
+        # Run phase
+        final_answer = benchmark.run_agents(agents_list, travel_task, env)
+        assert final_answer is not None
+
+        # Evaluate phase
+        traces = {
+            "agents": {
+                "integration_agent": {
+                    "messages": [
+                        {"role": "user", "content": travel_task.query},
+                        {"role": "assistant", "content": final_answer},
+                    ]
+                }
+            },
+            "tools": {},
+        }
+        results = benchmark.evaluate(evaluators, agents_dict, final_answer, traces)
+
+        # Verify results
+        assert len(results) == 1
+        assert "user_gsr" in results[0]
+        assert "system_gsr" in results[0]
+        assert "overall_gsr" in results[0]
+
+    def test_benchmark_aggregates_metrics_correctly(self):
+        """Test metric aggregation across multiple tasks."""
+        results = [
+            {
+                "task_id": "task-1",
+                "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0, "partial_gsr": 1.0}],
+            },
+            {
+                "task_id": "task-2",
+                "eval": [{"overall_gsr": 0.0, "user_gsr": 1.0, "system_gsr": 0.0, "partial_gsr": 0.5}],
+            },
+            {
+                "task_id": "task-3",
+                "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0, "partial_gsr": 1.0}],
+            },
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 3
+        assert metrics["successful_tasks"] == 2  # overall_gsr == 1.0
+        assert metrics["success_rate"] == pytest.approx(2 / 3)
+        assert metrics["mean_metrics"]["overall_gsr"] == pytest.approx(2 / 3)
+        assert metrics["mean_metrics"]["user_gsr"] == 1.0  # All user tests passed
+
+
+# =============================================================================
+# Data Loading Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestDataLoadingIntegration:
+    """Integration tests for data loading with benchmark components."""
+
+    def test_loaded_task_works_with_environment(self, macs_model, sample_agent_data):
+        """Tasks loaded from data work with MACSEnvironment."""
+        # Create a mock task that simulates loaded data
+        task = Task(
+            query="Book a flight",
+            environment_data={
+                "tools": [
+                    {
+                        "tool_name": "flight_search",
+                        "actions": [{"name": "search", "description": "Search flights"}],
+                    }
+                ]
+            },
+            evaluation_data={"assertions": ["user: Booking done"]},
+            metadata={"scenario": "Travel booking scenario", "task_id": "task-000001"},
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, task)
+
+        assert "search" in env.tools
+
+    def test_loaded_agent_config_works_with_environment(self, macs_model):
+        """Agent config works with tool assignment."""
+        # Simulate loaded agent config
+        agent_config = {
+            "agents": [
+                {"agent_id": "main", "agent_name": "Main Agent", "tools": ["tool_group"]},
+            ],
+            "primary_agent_id": "main",
+        }
+
+        task_data = {
+            "environment_data": {
+                "tools": [
+                    {
+                        "tool_name": "tool_group",
+                        "actions": [{"name": "action1", "description": "Action 1"}],
+                    }
+                ]
+            }
+        }
+
+        env = MACSEnvironment(task_data, macs_model)
+
+        # Get tools for agent from config
+        agent_spec = agent_config["agents"][0]
+        agent_tools = env.get_tools_for_agent(agent_spec)
+
+        assert "action1" in agent_tools
+
+
+# =============================================================================
+# Error Handling Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestErrorHandlingIntegration:
+    """Integration tests for error handling."""
+
+    def test_evaluator_handles_malformed_llm_response(self, travel_task, sample_conversation):
+        """Evaluator gracefully handles malformed LLM responses."""
+        model = MACSModelAdapter(responses=["This is not valid JSON at all"])
+        evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
+
+        traces = {"messages": sample_conversation}
+        result = evaluator(traces)
+
+        # Should return error result, not crash
+        assert result["gsr"] == 0.0
+        assert "error" in result
+
+    def test_environment_handles_empty_tool_specs(self, macs_model):
+        """Environment handles tasks with no tools."""
+        task_data = {"environment_data": {"tools": []}}
+        env = MACSEnvironment(task_data, macs_model)
+
+        assert env.tools == {}
+
+    def test_user_handles_missing_background(self, macs_model):
+        """User simulator handles scenario without Background section."""
+        scenario = "Simple goal: Book a hotel."
+
+        user = MACSUser(
+            model=macs_model,
+            scenario=scenario,
+            initial_prompt="Book a hotel",
+        )
+
+        # Should not crash, should have minimal profile
+        assert "full_scenario" in user.user_profile
+
+
+# =============================================================================
+# End-to-End Pipeline Tests (benchmark.run())
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestEndToEndPipeline:
+    """End-to-end tests that call benchmark.run() with TaskCollection.
+
+    These tests verify the complete MACS benchmark pipeline:
+    1. Task setup (environment, user, agents, evaluators)
+    2. Agent execution
+    3. Trace collection
+    4. Evaluation (user GSR and system GSR)
+    5. Report generation
+    """
+
+    def test_run_single_task(self, sample_agent_data, sample_task):
+        """Run benchmark with a single task."""
+        # Create model that returns valid responses for all components
+        # User simulator, tool simulator, and evaluator all need JSON responses
+        model = MACSModelAdapter(
+            responses=[
+                # User response
+                '{"text": "Yes, please book that flight.", "details": {}}',
+                # Evaluator responses (user GSR and system GSR)
+                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "User confirmed booking"}]',
+                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "Agent updated database"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([sample_task])
+
+        # Should have exactly one report
+        assert len(reports) == 1
+
+        report = reports[0]
+        assert report["task_id"] == str(sample_task.id)
+        assert report["repeat_idx"] == 0
+        assert report["status"] == "success"
+        assert "traces" in report
+        assert "config" in report
+        assert "eval" in report
+
+    def test_run_multiple_tasks(self, sample_agent_data, macs_task_collection):
+        """Run benchmark with multiple tasks via TaskCollection."""
+        model = MACSModelAdapter(
+            responses=[
+                # Responses cycle for each task
+                '{"text": "User response", "details": {}}',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run(macs_task_collection)
+
+        # Should have one report per task
+        assert len(reports) == len(macs_task_collection)
+
+        # Each report should have correct structure
+        for i, report in enumerate(reports):
+            assert report["repeat_idx"] == 0
+            assert report["status"] == "success"
+            assert "eval" in report
+
+    def test_run_with_task_repeats(self, sample_agent_data, sample_task):
+        """Run benchmark with multiple task repetitions."""
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "response", "details": {}}',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+                '[{"assertion": "test", "answer": "FALSE", "evidence": "failed"}]',
+            ]
+        )
+
+        n_repeats = 3
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model, n_task_repeats=n_repeats)
+        reports = benchmark.run([sample_task])
+
+        # Should have n_repeats reports for the single task
+        assert len(reports) == n_repeats
+
+        # Check repeat indices
+        for i, report in enumerate(reports):
+            assert report["repeat_idx"] == i
+            assert report["task_id"] == str(sample_task.id)
+
+    def test_run_returns_traces(self, sample_agent_data, sample_task):
+        """Benchmark run collects and returns traces from all components."""
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "User says yes", "details": {}}',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([sample_task])
+
+        report = reports[0]
+        traces = report["traces"]
+
+        # Should have traces from environment and user at minimum
+        # (agents may or may not have traces depending on implementation)
+        assert isinstance(traces, dict)
+
+    def test_run_returns_evaluation_results(self, sample_agent_data, sample_task):
+        """Benchmark run returns evaluation results with GSR scores."""
+        # Configure model to return specific evaluation results
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "User response", "details": {}}',
+                # User GSR: 1/2 assertions TRUE
+                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "confirmed"}, '
+                '{"assertion": "user: Something else", "answer": "FALSE", "evidence": "not found"}]',
+                # System GSR: 1/1 assertions TRUE
+                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "updated"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([sample_task])
+
+        report = reports[0]
+        eval_result = report["eval"]
+
+        # Should have evaluation results (list of eval dicts)
+        assert eval_result is not None
+        assert isinstance(eval_result, list)
+
+    def test_run_handles_evaluation_failure_gracefully(self, sample_agent_data, sample_task):
+        """Benchmark continues even when evaluation fails."""
+        # Model returns invalid JSON for evaluator
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "User response", "details": {}}',
+                "not valid json - evaluator will fail",
+                "also not valid json",
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([sample_task])
+
+        # Should still get a report even though evaluation had issues
+        assert len(reports) == 1
+        report = reports[0]
+
+        # Task should complete (status depends on error handling config)
+        assert "status" in report
+
+    def test_run_with_callbacks(self, sample_agent_data, sample_task):
+        """Benchmark triggers callbacks during run."""
+        from maseval import BenchmarkCallback
+
+        class TrackingCallback(BenchmarkCallback):
+            def __init__(self):
+                self.events = []
+
+            def on_run_start(self, benchmark):
+                self.events.append("run_start")
+
+            def on_task_start(self, benchmark, task):
+                self.events.append("task_start")
+
+            def on_task_repeat_start(self, benchmark, task, repeat_idx):
+                self.events.append(f"repeat_start_{repeat_idx}")
+
+            def on_task_repeat_end(self, benchmark, report):
+                self.events.append(f"repeat_end_{report['repeat_idx']}")
+
+            def on_task_end(self, benchmark, task, result):
+                self.events.append("task_end")
+
+            def on_run_end(self, benchmark, results):
+                self.events.append("run_end")
+
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "response", "details": {}}',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
+            ]
+        )
+
+        callback = TrackingCallback()
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model, callbacks=[callback])
+        benchmark.run([sample_task])
+
+        # Verify callback sequence
+        assert "run_start" in callback.events
+        assert "task_start" in callback.events
+        assert "repeat_start_0" in callback.events
+        assert "repeat_end_0" in callback.events
+        assert "task_end" in callback.events
+        assert "run_end" in callback.events
+
+        # Verify order
+        assert callback.events.index("run_start") < callback.events.index("task_start")
+        assert callback.events.index("task_start") < callback.events.index("repeat_start_0")
+        assert callback.events.index("repeat_end_0") < callback.events.index("task_end")
+        assert callback.events.index("task_end") < callback.events.index("run_end")
+
+    def test_run_computes_benchmark_metrics(self, sample_agent_data, sample_task):
+        """Benchmark metrics can be computed from run results."""
+        model = MACSModelAdapter(
+            responses=[
+                '{"text": "response", "details": {}}',
+                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "ok"}]',
+                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "ok"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([sample_task])
+
+        # compute_benchmark_metrics expects full reports (each with "eval" key)
+        metrics = compute_benchmark_metrics(reports)
+        assert isinstance(metrics, dict)
+        assert "total_tasks" in metrics
+        assert "success_rate" in metrics
+        assert "mean_metrics" in metrics
+
+    def test_full_pipeline_with_travel_task(self, sample_agent_data, travel_task):
+        """Full end-to-end test with realistic travel task."""
+        # Comprehensive responses for the full pipeline
+        model = MACSModelAdapter(
+            responses=[
+                # User simulator response (acknowledging agent's response)
+                '{"text": "Yes, that flight works for me. Please book it.", "details": {"satisfied": true}}',
+                # User GSR evaluator response
+                """[
+                    {"assertion": "user: The user's flight booking request was acknowledged", "answer": "TRUE", "evidence": "Agent acknowledged the booking request"},
+                    {"assertion": "user: The user received flight options or a confirmation", "answer": "TRUE", "evidence": "User confirmed the flight"}
+                ]""",
+                # System GSR evaluator response
+                '[{"assertion": "agent: The search_flights tool was called with correct parameters", "answer": "TRUE", "evidence": "Tool was called"}]',
+            ]
+        )
+
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
+        reports = benchmark.run([travel_task])
+
+        assert len(reports) == 1
+        report = reports[0]
+
+        # Verify successful execution
+        assert report["status"] == "success"
+        assert report["task_id"] == str(travel_task.id)
+
+        # Verify traces were collected
+        assert "traces" in report
+        assert isinstance(report["traces"], dict)
+
+        # Verify config was collected
+        assert "config" in report
+        assert isinstance(report["config"], dict)
+
+        # Verify evaluation ran
+        assert "eval" in report
diff --git a/tests/test_benchmarks/test_macs/test_macs_tool.py b/tests/test_benchmarks/test_macs/test_macs_tool.py
new file mode 100644
index 00000000..37ab332f
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_tool.py
@@ -0,0 +1,296 @@
+"""Unit tests for MACSGenericTool."""
+
+import pytest
+from unittest.mock import patch
+
+from maseval.benchmark.macs import MACSGenericTool
+
+from .conftest import MACSModelAdapter
+
+
+# =============================================================================
+# Unit Tests: Initialization
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSGenericToolInit:
+    """Tests for MACSGenericTool initialization."""
+
+    def test_init_from_spec(self, simple_tool_spec, macs_model):
+        """Tool initializes correctly from specification dict."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        assert tool.name == "search_flights"
+        assert tool.description == "Search for available flights"
+        assert "origin" in tool.inputs
+        assert "destination" in tool.inputs
+
+    def test_name_and_description(self, simple_tool_spec, macs_model):
+        """Tool exposes correct name and description."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        assert tool.name == simple_tool_spec["name"]
+        assert tool.description == simple_tool_spec["description"]
+        assert tool.output_type == "string"
+
+    def test_minimal_spec(self, minimal_tool_spec, macs_model):
+        """Tool handles minimal specification with defaults."""
+        tool = MACSGenericTool(minimal_tool_spec, macs_model)
+
+        assert tool.name == "simple_action"
+        assert tool.description == ""
+        assert tool.inputs == {}
+
+    def test_creates_simulator(self, simple_tool_spec, macs_model):
+        """Tool creates a ToolLLMSimulator."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        assert tool.simulator is not None
+        assert tool.simulator.tool_name == "search_flights"
+
+    def test_empty_history_on_init(self, simple_tool_spec, macs_model):
+        """Tool starts with empty invocation history."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        assert len(tool.history.to_list()) == 0
+
+
+# =============================================================================
+# Unit Tests: Schema Conversion
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestSchemaToInputs:
+    """Tests for _schema_to_inputs static method."""
+
+    def test_schema_to_inputs_basic(self):
+        """JSON schema correctly converted to inputs format."""
+        schema = {
+            "properties": {
+                "origin": {"type": "string", "description": "Origin city"},
+                "count": {"type": "integer", "description": "Number of items"},
+            }
+        }
+        result = MACSGenericTool._schema_to_inputs(schema)
+
+        assert "origin" in result
+        assert result["origin"]["type"] == "string"
+        assert result["origin"]["description"] == "Origin city"
+        assert result["count"]["type"] == "integer"
+
+    def test_schema_to_inputs_empty(self):
+        """Empty schema returns empty inputs."""
+        assert MACSGenericTool._schema_to_inputs({}) == {}
+        assert MACSGenericTool._schema_to_inputs({"properties": {}}) == {}
+
+    def test_schema_to_inputs_data_type_field(self):
+        """Handles data_type field (MACS format) over type."""
+        schema = {
+            "properties": {
+                "date": {"data_type": "date", "type": "string", "description": "Date"},
+            }
+        }
+        result = MACSGenericTool._schema_to_inputs(schema)
+
+        # data_type takes precedence
+        assert result["date"]["type"] == "date"
+
+    def test_schema_to_inputs_missing_description(self):
+        """Handles missing description gracefully."""
+        schema = {
+            "properties": {
+                "field": {"type": "string"},
+            }
+        }
+        result = MACSGenericTool._schema_to_inputs(schema)
+
+        assert result["field"]["description"] == ""
+
+    def test_schema_to_inputs_missing_type(self):
+        """Defaults to string type when missing."""
+        schema = {
+            "properties": {
+                "field": {"description": "Some field"},
+            }
+        }
+        result = MACSGenericTool._schema_to_inputs(schema)
+
+        assert result["field"]["type"] == "string"
+
+
+# =============================================================================
+# Unit Tests: Tool Invocation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSGenericToolInvocation:
+    """Tests for tool invocation behavior."""
+
+    def test_call_invokes_model(self, simple_tool_spec):
+        """Calling tool invokes the model via simulator."""
+        # Create model that returns valid JSON (ToolLLMSimulator expects {"text": ..., "details": ...})
+        model = MACSModelAdapter(responses=['{"text": "Found flights", "details": {}}'])
+        tool = MACSGenericTool(simple_tool_spec, model)
+
+        _ = tool(origin="LAX", destination="JFK")
+
+        # Model should have been called
+        assert model._call_count >= 1
+
+    def test_call_returns_response(self, simple_tool_spec):
+        """Tool call returns the simulated response."""
+        # Create model that returns valid JSON
+        model = MACSModelAdapter(responses=['{"text": "Flight found: AA123", "details": {}}'])
+        tool = MACSGenericTool(simple_tool_spec, model)
+
+        result = tool(origin="LAX", destination="JFK")
+
+        # Result should contain the response text
+        assert "Flight found: AA123" in result
+
+    def test_call_records_history(self, simple_tool_spec):
+        """Tool invocation recorded in history."""
+        model = MACSModelAdapter(responses=['{"text": "success", "details": {"booking_id": "123"}}'])
+        tool = MACSGenericTool(simple_tool_spec, model)
+
+        tool(origin="LAX", destination="JFK")
+
+        history = tool.history.to_list()
+        assert len(history) == 1
+        assert history[0]["inputs"] == {"origin": "LAX", "destination": "JFK"}
+        assert history[0]["outputs"] == "success"
+        assert history[0]["status"] == "success"
+        assert history[0]["meta"] == {"booking_id": "123"}
+
+    def test_multiple_invocations(self, simple_tool_spec):
+        """Multiple calls tracked in history."""
+        model = MACSModelAdapter(responses=['{"text": "success", "details": {}}'])
+        tool = MACSGenericTool(simple_tool_spec, model)
+
+        tool(origin="LAX", destination="JFK")
+        tool(origin="SFO", destination="ORD")
+        tool(origin="BOS", destination="MIA")
+
+        history = tool.history.to_list()
+        assert len(history) == 3
+        assert history[0]["inputs"]["origin"] == "LAX"
+        assert history[1]["inputs"]["origin"] == "SFO"
+        assert history[2]["inputs"]["origin"] == "BOS"
+
+
+# =============================================================================
+# Unit Tests: Tracing and Config
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSGenericToolTracing:
+    """Tests for trace and config gathering."""
+
+    def test_gather_traces(self, simple_tool_spec, macs_model):
+        """Traces include name and invocations."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        with patch.object(tool.simulator, "__call__", return_value=("Response", {})):
+            tool(origin="LAX", destination="JFK")
+
+        traces = tool.gather_traces()
+
+        assert traces["name"] == "search_flights"
+        assert "invocations" in traces
+        assert len(traces["invocations"]) == 1
+        assert "gathered_at" in traces  # From TraceableMixin
+
+    def test_gather_config(self, simple_tool_spec, macs_model):
+        """Config includes name, description, schema."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        config = tool.gather_config()
+
+        assert config["name"] == "search_flights"
+        assert config["description"] == "Search for available flights"
+        assert "input_schema" in config
+        assert "gathered_at" in config  # From ConfigurableMixin
+
+    def test_gather_traces_empty_history(self, simple_tool_spec, macs_model):
+        """Traces work with empty invocation history."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        traces = tool.gather_traces()
+
+        assert traces["name"] == "search_flights"
+        assert traces["invocations"] == []
+
+
+# =============================================================================
+# Unit Tests: String Representation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSGenericToolRepr:
+    """Tests for string representation."""
+
+    def test_repr(self, simple_tool_spec, macs_model):
+        """String representation is informative."""
+        tool = MACSGenericTool(simple_tool_spec, macs_model)
+
+        repr_str = repr(tool)
+
+        assert "MACSGenericTool" in repr_str
+        assert "search_flights" in repr_str
+        assert "origin" in repr_str
+        assert "destination" in repr_str
+        assert "string" in repr_str
+
+    def test_repr_no_inputs(self, minimal_tool_spec, macs_model):
+        """Repr handles tool with no inputs."""
+        tool = MACSGenericTool(minimal_tool_spec, macs_model)
+
+        repr_str = repr(tool)
+
+        assert "MACSGenericTool" in repr_str
+        assert "simple_action" in repr_str
+        assert "()" in repr_str
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSGenericToolIntegration:
+    """Integration tests for MACSGenericTool."""
+
+    def test_tool_with_complex_spec(self, complex_tool_spec, macs_model):
+        """Tool works with complex specification."""
+        tool = MACSGenericTool(complex_tool_spec, macs_model)
+
+        assert tool.name == "book_hotel"
+        assert "city" in tool.inputs
+        assert "check_in" in tool.inputs
+        assert tool.inputs["check_in"]["type"] == "date"
+
+    def test_end_to_end_flow(self, simple_tool_spec):
+        """Complete flow from creation to trace gathering."""
+        # Create model with specific response
+        model = MACSModelAdapter(responses=['{"status": "found", "flights": ["AA123", "UA456"]}'])
+        tool = MACSGenericTool(simple_tool_spec, model)
+
+        # Invoke tool (simulator will use the model)
+        # Note: actual response depends on ToolLLMSimulator's parsing
+        with patch.object(tool.simulator, "__call__", return_value=("Found 2 flights", {"parsed": True})):
+            _ = tool(origin="LAX", destination="JFK")
+
+        # Check traces
+        traces = tool.gather_traces()
+        assert traces["name"] == "search_flights"
+        assert len(traces["invocations"]) == 1
+
+        # Check config
+        config = tool.gather_config()
+        assert config["name"] == "search_flights"
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
new file mode 100644
index 00000000..9886e12f
--- /dev/null
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -0,0 +1,507 @@
+"""Unit tests for MACSUser."""
+
+import pytest
+from unittest.mock import patch
+
+from maseval.benchmark.macs import MACSUser
+
+from .conftest import MACSModelAdapter
+
+
+# =============================================================================
+# Unit Tests: Initialization
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSUserInit:
+    """Tests for MACSUser initialization."""
+
+    def test_init_basic(self, macs_model, sample_scenario, initial_prompt):
+        """Basic initialization with required args."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert user.model == macs_model
+        assert user.scenario == sample_scenario
+        assert user.name == "Simulated User"  # Default name
+
+    def test_init_custom_name(self, macs_model, sample_scenario, initial_prompt):
+        """Custom name is respected."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            name="Test User",
+        )
+
+        assert user.name == "Test User"
+
+    def test_init_default_max_turns(self, macs_model, sample_scenario, initial_prompt):
+        """Default max_turns is 5."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert user.max_turns == 5
+
+    def test_init_custom_max_turns(self, macs_model, sample_scenario, initial_prompt):
+        """Custom max_turns is respected."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=10,
+        )
+
+        assert user.max_turns == 10
+
+    def test_init_loads_template(self, macs_model, sample_scenario, initial_prompt):
+        """Loads user_simulator.txt template."""
+        # Verify template file exists
+        assert MACSUser.TEMPLATE_PATH.exists(), f"Template not found at {MACSUser.TEMPLATE_PATH}"
+
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        # MACSUser is created successfully (template is passed to parent User class)
+        assert user is not None
+
+    def test_init_extracts_user_profile(self, macs_model, sample_scenario, initial_prompt):
+        """Extracts profile from scenario."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        # Profile should contain extracted info
+        assert "name" in user.user_profile or "full_scenario" in user.user_profile
+        assert user.user_profile.get("full_scenario") == sample_scenario
+
+    def test_init_turn_count_zero(self, macs_model, sample_scenario, initial_prompt):
+        """Turn count starts at zero."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert user._turn_count == 0
+        assert not user._stopped
+
+
+# =============================================================================
+# Unit Tests: User Profile Extraction
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestUserProfileExtraction:
+    """Tests for _extract_user_profile static method."""
+
+    def test_extract_profile_with_background(self, sample_scenario):
+        """Parses Background: section."""
+        profile = MACSUser._extract_user_profile(sample_scenario)
+
+        assert "full_scenario" in profile
+        assert profile["full_scenario"] == sample_scenario
+
+    def test_extract_profile_is_statements(self):
+        """Parses 'User's X is Y' statements."""
+        scenario = """Background:
+* User's name is Alice
+* User's age is 30
+* User's company is TechCorp"""
+
+        profile = MACSUser._extract_user_profile(scenario)
+
+        assert profile.get("name") == "Alice"
+        assert profile.get("age") == "30"
+        assert profile.get("company") == "TechCorp"
+
+    def test_extract_profile_has_statements(self):
+        """Parses 'User has X' statements."""
+        scenario = """Background:
+* User has a pet dog
+* User has premium membership"""
+
+        profile = MACSUser._extract_user_profile(scenario)
+
+        # These should be captured with some key
+        assert "full_scenario" in profile  # At minimum, full scenario is always there
+
+    def test_extract_profile_no_background(self, minimal_scenario):
+        """Handles missing Background section."""
+        profile = MACSUser._extract_user_profile(minimal_scenario)
+
+        # Should still have full_scenario
+        assert profile["full_scenario"] == minimal_scenario
+
+    def test_extract_profile_includes_full_scenario(self, sample_scenario):
+        """Full scenario is always in profile."""
+        profile = MACSUser._extract_user_profile(sample_scenario)
+
+        assert "full_scenario" in profile
+        assert sample_scenario in profile["full_scenario"]
+
+
+# =============================================================================
+# Unit Tests: Conversation State
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestConversationState:
+    """Tests for conversation state management."""
+
+    def test_is_done_false_initially(self, macs_model, sample_scenario, initial_prompt):
+        """is_done is False at start."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert not user.is_done
+
+    def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_prompt):
+        """is_done is True after max turns."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=2,
+        )
+
+        # Manually increment turn count
+        user._turn_count = 2
+
+        assert user.is_done
+
+    def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_prompt):
+        """is_done is True after </stop> detected."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        # Manually set stopped flag
+        user._stopped = True
+
+        assert user.is_done
+
+    def test_turn_count_below_max_not_done(self, macs_model, sample_scenario, initial_prompt):
+        """Not done when turn count below max."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=5,
+        )
+
+        user._turn_count = 4  # One below max
+
+        assert not user.is_done
+
+
+# =============================================================================
+# Unit Tests: Reset
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestReset:
+    """Tests for reset method."""
+
+    def test_reset_clears_turn_count(self, macs_model, sample_scenario, initial_prompt):
+        """reset() clears turn count."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+        user._turn_count = 3
+
+        user.reset()
+
+        assert user._turn_count == 0
+
+    def test_reset_clears_stopped(self, macs_model, sample_scenario, initial_prompt):
+        """reset() clears stopped flag."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+        user._stopped = True
+
+        user.reset()
+
+        assert not user._stopped
+
+
+# =============================================================================
+# Unit Tests: Response Simulation
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestResponseSimulation:
+    """Tests for simulate_response method."""
+
+    def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt):
+        """Turn count increments on simulate_response call."""
+        model = MACSModelAdapter(responses=['{"text": "Yes, confirmed.", "details": {}}'])
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        initial_count = user._turn_count
+
+        # Mock parent's simulate_response to return a simple response
+        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Yes, confirmed."):
+            user.simulate_response("When would you like to travel?")
+
+        assert user._turn_count == initial_count + 1
+
+    def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
+        """Detects </stop> token."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Thanks! </stop>"):
+            user.simulate_response("Your flight is booked!")
+
+        assert user._stopped
+        assert user.is_done
+
+    def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prompt):
+        """Removes </stop> from response."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Perfect, thanks! </stop>"):
+            response = user.simulate_response("Booking confirmed!")
+
+        assert "</stop>" not in response
+        assert "Perfect, thanks!" in response
+
+    def test_simulate_response_returns_empty_when_done(self, sample_scenario, initial_prompt):
+        """Returns empty string when is_done is True."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+        user._stopped = True  # Already done
+
+        response = user.simulate_response("Any follow-up?")
+
+        assert response == ""
+
+    def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, initial_prompt):
+        """Returns empty string when max turns reached."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=3,
+        )
+        user._turn_count = 3  # At max
+
+        response = user.simulate_response("One more question?")
+
+        assert response == ""
+
+    def test_simulate_response_fallback_message(self, sample_scenario, initial_prompt):
+        """Provides fallback when response is only stop token."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="</stop>"):
+            response = user.simulate_response("Booking complete!")
+
+        assert response == "Thank you, that's all I needed!"
+        assert user._stopped
+
+
+# =============================================================================
+# Unit Tests: Tool Interface
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestToolInterface:
+    """Tests for get_tool method."""
+
+    def test_get_tool_raises_not_implemented(self, macs_model, sample_scenario, initial_prompt):
+        """Base MACSUser.get_tool() raises NotImplementedError."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        with pytest.raises(NotImplementedError) as exc_info:
+            user.get_tool()
+
+        assert "get_tool" in str(exc_info.value)
+        assert "subclass" in str(exc_info.value).lower()
+
+
+# =============================================================================
+# Unit Tests: Tracing
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestTracing:
+    """Tests for gather_traces method."""
+
+    def test_gather_traces_includes_macs_fields(self, macs_model, sample_scenario, initial_prompt):
+        """Traces include MACS-specific fields."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=7,
+        )
+        user._turn_count = 3
+        user._stopped = True
+
+        traces = user.gather_traces()
+
+        assert traces["max_turns"] == 7
+        assert traces["turns_used"] == 3
+        assert traces["stopped_by_user"] is True
+
+    def test_gather_traces_inherits_base_fields(self, macs_model, sample_scenario, initial_prompt):
+        """Traces include base User fields."""
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        traces = user.gather_traces()
+
+        assert "gathered_at" in traces
+        assert "name" in traces
+        assert traces["name"] == "Simulated User"
+
+
+# =============================================================================
+# Integration Tests
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestMACSUserIntegration:
+    """Integration tests for MACSUser."""
+
+    def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
+        """Test complete conversation lifecycle."""
+        responses = [
+            "Yes, Monday works.",
+            "I prefer Delta.",
+            "Aisle seat please.",
+            "Book it! </stop>",
+        ]
+        model = MACSModelAdapter(responses=responses)
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=5,
+        )
+
+        # Simulate multi-turn conversation
+        questions = [
+            "When would you like to travel?",
+            "Any airline preference?",
+            "Window or aisle?",
+            "I'll book the flight. Confirmation?",
+        ]
+
+        for i, question in enumerate(questions):
+            if user.is_done:
+                break
+            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value=responses[i]):
+                response = user.simulate_response(question)
+            if i < len(questions) - 1:
+                assert response != ""
+
+        # After stop token, should be done
+        assert user.is_done
+        assert user._turn_count == 4
+
+    def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
+        """Test that max turns is enforced."""
+        model = MACSModelAdapter(responses=["Response"] * 10)
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=3,
+        )
+
+        # Simulate 3 turns
+        for i in range(3):
+            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Response"):
+                user.simulate_response(f"Question {i}")
+
+        # Should be done after 3 turns
+        assert user.is_done
+        assert user._turn_count == 3
+
+        # Additional calls should return empty
+        response = user.simulate_response("One more?")
+        assert response == ""
+
+    def test_reset_allows_new_conversation(self, sample_scenario, initial_prompt):
+        """Test that reset allows starting new conversation."""
+        model = MACSModelAdapter()
+        user = MACSUser(
+            model=model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+            max_turns=2,
+        )
+
+        # Max out turns
+        user._turn_count = 2
+        user._stopped = True
+        assert user.is_done
+
+        # Reset
+        user.reset()
+
+        # Should be able to continue
+        assert not user.is_done
+        assert user._turn_count == 0

From 1f4b41ac301d73ccc47cce8fba69a228573105f8 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Wed, 3 Dec 2025 23:26:21 +0000
Subject: [PATCH 08/34] consolidated tests

---
 tests/test_benchmarks/test_macs/conftest.py   |  89 +---
 .../test_macs/test_macs_benchmark.py          |  74 +--
 .../test_macs/test_macs_environment.py        |  62 +--
 .../test_macs/test_macs_evaluator.py          |  62 +--
 .../test_macs/test_macs_integration.py        | 459 ++----------------
 .../test_macs/test_macs_tool.py               |  51 +-
 .../test_macs/test_macs_user.py               |  81 +---
 7 files changed, 157 insertions(+), 721 deletions(-)

diff --git a/tests/test_benchmarks/test_macs/conftest.py b/tests/test_benchmarks/test_macs/conftest.py
index b8b276c7..6ce27d3b 100644
--- a/tests/test_benchmarks/test_macs/conftest.py
+++ b/tests/test_benchmarks/test_macs/conftest.py
@@ -2,88 +2,36 @@
 
 Fixture Hierarchy
 -----------------
-- tests/conftest.py: Generic fixtures (dummy_model, dummy_agent_adapter, dummy_task, etc.)
+- tests/conftest.py: Generic fixtures (DummyModelAdapter, dummy_model, dummy_agent_adapter, etc.)
   These are automatically available via pytest's conftest inheritance.
 - tests/test_benchmarks/test_macs/conftest.py: MACS-specific fixtures (this file)
 
 MACS tests can use fixtures from both levels - pytest handles this automatically.
 
-Why MACS-Specific Mock Classes Exist
-------------------------------------
-The MACS benchmark uses ToolLLMSimulator and UserLLMSimulator which parse JSON responses
-in a specific format: {"text": "...", "details": {...}}
+MACS-Specific Components
+------------------------
+- MACSAgentAdapter: Returns MessageHistory (not strings) matching the AgentAdapter contract.
+  Used for testing MACSBenchmark.run_agents() without a real agent implementation.
+- ConcreteMACSBenchmark: Concrete implementation of MACSBenchmark for testing.
 
-The generic DummyModelAdapter from tests/conftest.py returns simple strings like
-"test response", which would cause JSON parsing failures in MACS components.
-
-Therefore, we define MACS-specific adapters that:
-1. MACSModelAdapter: Returns valid JSON in the ToolLLMSimulator format by default
-2. MACSAgentAdapter: Returns MessageHistory (not strings) matching the AgentAdapter contract
-
-These are NOT duplicates - they serve a different purpose than the generic test fixtures.
+For model adapters, MACS tests use DummyModelAdapter from tests/conftest.py with JSON
+responses passed explicitly (e.g., DummyModelAdapter(responses=['{"text": "..."}'])).
 """
 
 import pytest
 from typing import Any, Dict, List, Optional, Tuple
 from unittest.mock import MagicMock
 
+from conftest import DummyModelAdapter
 from maseval import AgentAdapter, Task, User, MessageHistory, TaskCollection
 from maseval.benchmark.macs import MACSBenchmark, MACSEnvironment
-from maseval.core.model import ModelAdapter
 
 
 # =============================================================================
 # MACS-Specific Mock Components
-#
-# These exist because MACS components (ToolLLMSimulator, UserLLMSimulator, MACSEvaluator)
-# expect JSON responses in specific formats. The generic DummyModelAdapter returns
-# plain strings which would cause parsing failures.
 # =============================================================================
 
 
-class MACSModelAdapter(ModelAdapter):
-    """Model adapter for testing MACS components.
-
-    Unlike DummyModelAdapter (which returns plain strings), this adapter returns
-    JSON responses in the format expected by MACS simulators:
-
-        {"text": "response text", "details": {...}}
-
-    This format is required by:
-    - ToolLLMSimulator._parse_output() for tool responses
-    - UserLLMSimulator._parse_output() for user responses
-    - MACSEvaluator for assertion evaluation (different format)
-
-    Attributes:
-        prompts: List of all prompts sent to the model (for verification in tests).
-        _call_count: Number of times generate() was called.
-    """
-
-    def __init__(self, responses: Optional[List[str]] = None):
-        """Initialize with optional canned responses.
-
-        Args:
-            responses: List of JSON strings to return. Cycles through if more
-                calls are made than responses provided. Defaults to a valid
-                ToolLLMSimulator response format.
-        """
-        super().__init__()
-        self._model_id = "macs-test-model"
-        self._responses = responses or ['{"text": "Default response", "details": {}}']
-        self._call_count = 0
-        self.prompts: List[str] = []
-
-    @property
-    def model_id(self) -> str:
-        return self._model_id
-
-    def _generate_impl(self, prompt: str, generation_params: Optional[Dict[str, Any]] = None, **kwargs: Any) -> str:
-        self.prompts.append(prompt)
-        response = self._responses[self._call_count % len(self._responses)]
-        self._call_count += 1
-        return response
-
-
 class MACSAgentAdapter(AgentAdapter):
     """Agent adapter for testing MACS benchmark execution.
 
@@ -143,39 +91,38 @@ def setup_agents(
 # =============================================================================
 # Model Fixtures
 #
-# These use MACSModelAdapter because MACS components require JSON responses.
-# For generic model testing, use dummy_model from parent conftest.
+# These use DummyModelAdapter from tests/conftest.py with JSON responses.
 # =============================================================================
 
 
 @pytest.fixture
 def macs_model():
-    """MACS model adapter with default JSON responses.
+    """Model adapter with default JSON responses for MACS tests.
 
     Returns responses in ToolLLMSimulator format: {"text": "...", "details": {...}}
     """
-    return MACSModelAdapter()
+    return DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
 
 
 @pytest.fixture
 def macs_model_evaluator():
-    """MACS model configured for MACSEvaluator tests.
+    """Model configured for MACSEvaluator tests.
 
     Returns JSON array format expected by MACSEvaluator._parse_evaluation_response().
     """
-    return MACSModelAdapter(responses=['[{"assertion": "Test", "answer": "TRUE", "evidence": "OK"}]'])
+    return DummyModelAdapter(responses=['[{"assertion": "Test", "answer": "TRUE", "evidence": "OK"}]'])
 
 
 @pytest.fixture
 def macs_model_tool():
-    """MACS model configured for ToolLLMSimulator tests."""
-    return MACSModelAdapter(responses=['{"text": "Tool executed successfully", "details": {}}'])
+    """Model configured for ToolLLMSimulator tests."""
+    return DummyModelAdapter(responses=['{"text": "Tool executed successfully", "details": {}}'])
 
 
 @pytest.fixture
 def macs_model_user():
-    """MACS model configured for UserLLMSimulator tests."""
-    return MACSModelAdapter(responses=['{"text": "Yes, that works for me.", "details": {}}'])
+    """Model configured for UserLLMSimulator tests."""
+    return DummyModelAdapter(responses=['{"text": "Yes, that works for me.", "details": {}}'])
 
 
 # =============================================================================
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
index 45863301..325a99fc 100644
--- a/tests/test_benchmarks/test_macs/test_macs_benchmark.py
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -13,55 +13,31 @@
     compute_benchmark_metrics,
 )
 
-from .conftest import MACSModelAdapter, MACSAgentAdapter, ConcreteMACSBenchmark
+from .conftest import MACSAgentAdapter, ConcreteMACSBenchmark
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
-# Unit Tests: Initialization
+# Unit Tests: Initialization and Setup
 # =============================================================================
 
 
 @pytest.mark.benchmark
-class TestMACSBenchmarkInit:
-    """Tests for MACSBenchmark initialization."""
+class TestMACSBenchmarkSetup:
+    """Tests for MACSBenchmark initialization and setup methods."""
 
-    def test_init_stores_model(self, macs_model, sample_agent_data):
-        """Model is stored for later use."""
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+    def test_init_configures_benchmark(self, macs_model, sample_agent_data):
+        """Benchmark initializes with model, agent_data, and optional params."""
+        callbacks = [MagicMock()]
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, callbacks=callbacks, n_task_repeats=3)
 
         assert benchmark._model == macs_model
-
-    def test_init_calls_parent(self, macs_model, sample_agent_data):
-        """Parent Benchmark.__init__ is called."""
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
-
         assert benchmark.agent_data == sample_agent_data
-
-    def test_init_with_callbacks(self, macs_model, sample_agent_data):
-        """Callbacks are passed to parent."""
-        callbacks = [MagicMock()]
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, callbacks=callbacks)
-
         assert benchmark.callbacks == callbacks
-
-    def test_init_with_n_task_repeats(self, macs_model, sample_agent_data):
-        """n_task_repeats is set correctly."""
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, n_task_repeats=3)
-
         assert benchmark.n_task_repeats == 3
 
-
-# =============================================================================
-# Unit Tests: Setup Methods
-# =============================================================================
-
-
-@pytest.mark.benchmark
-class TestSetupMethods:
-    """Tests for setup methods."""
-
     def test_setup_environment_creates_macs_environment(self, macs_model, sample_agent_data, sample_task):
-        """setup_environment returns MACSEnvironment."""
+        """setup_environment returns MACSEnvironment with tools."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
 
         env = benchmark.setup_environment(sample_agent_data, sample_task)
@@ -70,21 +46,13 @@ def test_setup_environment_creates_macs_environment(self, macs_model, sample_age
         assert "search_flights" in env.tools
 
     def test_setup_user_creates_macs_user(self, macs_model, sample_agent_data, sample_task):
-        """setup_user returns MACSUser."""
+        """setup_user returns MACSUser with scenario from task."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
 
         user = benchmark.setup_user(sample_agent_data, env, sample_task)
 
         assert isinstance(user, MACSUser)
-
-    def test_setup_user_extracts_scenario(self, macs_model, sample_agent_data, sample_task):
-        """Passes scenario from task metadata."""
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
-        env = benchmark.setup_environment(sample_agent_data, sample_task)
-
-        user = benchmark.setup_user(sample_agent_data, env, sample_task)
-
         assert user.scenario == "Business trip to NYC"
 
     def test_setup_user_handles_no_scenario(self, macs_model, sample_agent_data, sample_task_no_scenario):
@@ -96,7 +64,7 @@ def test_setup_user_handles_no_scenario(self, macs_model, sample_agent_data, sam
 
         assert user.scenario == ""
 
-    def test_setup_evaluators_creates_dual(self, macs_model, sample_agent_data, sample_task):
+    def test_setup_evaluators_creates_user_and_system(self, macs_model, sample_agent_data, sample_task):
         """Creates both user and system evaluators."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
@@ -105,15 +73,11 @@ def test_setup_evaluators_creates_dual(self, macs_model, sample_agent_data, samp
         evaluators = benchmark.setup_evaluators(env, sample_task, agents, None)
 
         assert len(evaluators) == 2
-        assert isinstance(evaluators[0], MACSEvaluator)
-        assert isinstance(evaluators[1], MACSEvaluator)
         assert evaluators[0].gsr_type == "user"
         assert evaluators[1].gsr_type == "system"
 
     def test_setup_agents_is_abstract(self, macs_model, sample_agent_data):
         """setup_agents must be overridden in subclass."""
-        # MACSBenchmark itself can't be instantiated without setup_agents
-        # We verify by checking the abstract method exists
         import inspect
 
         assert inspect.isabstract(MACSBenchmark)
@@ -217,7 +181,7 @@ def test_evaluate_calls_both_evaluators(self, sample_agent_data, sample_task):
             '[{"assertion": "User assertion", "answer": "TRUE", "evidence": "OK"}]',
             '[{"assertion": "System assertion", "answer": "TRUE", "evidence": "OK"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
@@ -247,7 +211,7 @@ def test_evaluate_returns_aggregated_metrics(self, sample_agent_data, sample_tas
             '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
             '[{"assertion": "B", "answer": "TRUE", "evidence": "OK"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
@@ -277,7 +241,7 @@ def test_evaluate_overall_gsr(self, sample_agent_data, sample_task):
             '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
             '[{"assertion": "B", "answer": "FALSE", "evidence": "Fail"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
@@ -301,7 +265,7 @@ def test_evaluate_supervisor_gsr(self, sample_agent_data, sample_task):
             '[{"assertion": "A", "answer": "TRUE", "evidence": "OK"}]',
             '[{"assertion": "B", "answer": "FALSE", "evidence": "Fail"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
@@ -323,7 +287,7 @@ def test_evaluate_combined_report(self, sample_agent_data, sample_task):
             '[{"assertion": "User A", "answer": "TRUE", "evidence": "OK"}]',
             '[{"assertion": "System B", "answer": "TRUE", "evidence": "OK"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         _, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
@@ -472,7 +436,7 @@ def test_full_task_execution(self, sample_agent_data, sample_task):
             '[{"assertion": "Booking confirmed", "answer": "TRUE", "evidence": "Done"}]',
             '[{"assertion": "Database updated", "answer": "TRUE", "evidence": "Updated"}]',
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
 
         # Setup phase
@@ -506,7 +470,7 @@ def test_full_task_execution(self, sample_agent_data, sample_task):
 
     def test_benchmark_with_real_environment(self, sample_agent_data, sample_task):
         """Test with real MACSEnvironment tool creation."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
 
         env = benchmark.setup_environment(sample_agent_data, sample_task)
diff --git a/tests/test_benchmarks/test_macs/test_macs_environment.py b/tests/test_benchmarks/test_macs/test_macs_environment.py
index d161940b..19ad2632 100644
--- a/tests/test_benchmarks/test_macs/test_macs_environment.py
+++ b/tests/test_benchmarks/test_macs/test_macs_environment.py
@@ -5,69 +5,37 @@
 
 from maseval.benchmark.macs import MACSEnvironment, MACSGenericTool
 
-from .conftest import MACSModelAdapter
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
-# Unit Tests: Initialization
+# Unit Tests: Initialization and State Setup
 # =============================================================================
 
 
 @pytest.mark.benchmark
-class TestMACSEnvironmentInit:
-    """Tests for MACSEnvironment initialization."""
+class TestMACSEnvironmentSetup:
+    """Tests for MACSEnvironment initialization and state setup."""
 
-    def test_init_with_task_data(self, macs_model, sample_task_data):
-        """Initializes from task data."""
-        env = MACSEnvironment(sample_task_data, macs_model)
-
-        assert env is not None
-        assert "tool_specs" in env.state
-
-    def test_init_stores_model(self, macs_model, sample_task_data):
-        """Model is stored for tool creation."""
+    def test_init_extracts_tool_specs(self, macs_model, sample_task_data):
+        """Initializes from task data and extracts tool_specs."""
         env = MACSEnvironment(sample_task_data, macs_model)
 
         assert env._model == macs_model
-
-    def test_init_calls_parent(self, macs_model, sample_task_data):
-        """Parent Environment.__init__ is called."""
-        env = MACSEnvironment(sample_task_data, macs_model)
-
-        # Parent sets up state and creates tools
         assert hasattr(env, "state")
         assert hasattr(env, "tools")
-
-
-# =============================================================================
-# Unit Tests: State Setup
-# =============================================================================
-
-
-@pytest.mark.benchmark
-class TestSetupState:
-    """Tests for setup_state method."""
-
-    def test_setup_state_extracts_tool_specs(self, macs_model, sample_task_data):
-        """setup_state extracts tool_specs from task_data."""
-        env = MACSEnvironment(sample_task_data, macs_model)
-
         assert "tool_specs" in env.state
         assert len(env.state["tool_specs"]) == 2
 
-    def test_setup_state_empty_tools(self, macs_model):
-        """Handles missing or empty tools."""
-        task_data = {"environment_data": {}}
-        env = MACSEnvironment(task_data, macs_model)
-
-        assert env.state["tool_specs"] == []
-
-    def test_setup_state_no_environment_data(self, macs_model):
-        """Handles missing environment_data."""
-        task_data = {}
-        env = MACSEnvironment(task_data, macs_model)
+    def test_handles_empty_or_missing_tools(self, macs_model):
+        """Handles missing environment_data or empty tools gracefully."""
+        # Missing environment_data
+        env1 = MACSEnvironment({}, macs_model)
+        assert env1.state["tool_specs"] == []
 
-        assert env.state["tool_specs"] == []
+        # Empty tools
+        env2 = MACSEnvironment({"environment_data": {}}, macs_model)
+        assert env2.state["tool_specs"] == []
 
 
 # =============================================================================
@@ -251,7 +219,7 @@ def test_full_workflow(self, macs_model, sample_task_data):
     def test_tools_are_callable(self, sample_task_data):
         """Created tools can be called."""
         # Use a model that returns valid JSON responses (ToolLLMSimulator expects {"text": ..., "details": ...})
-        model = MACSModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
+        model = DummyModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
         env = MACSEnvironment(sample_task_data, model)
 
         search_flights = env.tools["search_flights"]
diff --git a/tests/test_benchmarks/test_macs/test_macs_evaluator.py b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
index c5812b95..02b661ec 100644
--- a/tests/test_benchmarks/test_macs/test_macs_evaluator.py
+++ b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
@@ -6,7 +6,7 @@
 from maseval import MessageHistory, Task
 from maseval.benchmark.macs import MACSEvaluator
 
-from .conftest import MACSModelAdapter
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
@@ -18,33 +18,22 @@
 class TestMACSEvaluatorInit:
     """Tests for MACSEvaluator initialization."""
 
-    def test_init_user_type(self, macs_model, sample_task):
-        """Initializes with gsr_type='user'."""
+    def test_init_user_type_with_template(self, macs_model, sample_task):
+        """Initializes with gsr_type='user' and loads appropriate template."""
         evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
 
         assert evaluator.gsr_type == "user"
         assert evaluator.model == macs_model
         assert evaluator.task == sample_task
-
-    def test_init_system_type(self, macs_model, sample_task):
-        """Initializes with gsr_type='system'."""
-        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
-
-        assert evaluator.gsr_type == "system"
-
-    def test_init_loads_default_template(self, macs_model, sample_task):
-        """Loads default template from file."""
-        evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
-
-        # Template should contain placeholders
         assert "{{scenario}}" in evaluator.template
         assert "{{history}}" in evaluator.template
         assert "{{assertions}}" in evaluator.template
 
-    def test_init_system_template_has_invocations(self, macs_model, sample_task):
-        """System template includes tool invocations placeholder."""
+    def test_init_system_type_with_template(self, macs_model, sample_task):
+        """Initializes with gsr_type='system' and includes invocations placeholder."""
         evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
 
+        assert evaluator.gsr_type == "system"
         assert "{{invocations}}" in evaluator.template
 
     def test_init_custom_template(self, macs_model, sample_task):
@@ -330,7 +319,7 @@ def test_call_returns_expected_format(self, sample_task, sample_trace):
                 {"assertion": "User received confirmation number", "answer": "TRUE", "evidence": "ABC123 mentioned"},
             ]
         )
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -344,7 +333,7 @@ def test_call_returns_expected_format(self, sample_task, sample_trace):
 
     def test_call_handles_json_error(self, sample_task, sample_trace):
         """Graceful handling of JSON parse error."""
-        model = MACSModelAdapter(responses=["This is not valid JSON"])
+        model = DummyModelAdapter(responses=["This is not valid JSON"])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -359,7 +348,7 @@ def test_call_handles_json_error(self, sample_task, sample_trace):
     def test_call_handles_wrapped_response(self, sample_task, sample_trace):
         """Handles {'assertions': [...]} wrapper."""
         response = json.dumps({"assertions": [{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}]})
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -371,7 +360,7 @@ def test_call_handles_wrapped_response(self, sample_task, sample_trace):
     def test_call_handles_results_wrapper(self, sample_task, sample_trace):
         """Handles {'results': [...]} wrapper."""
         response = json.dumps({"results": [{"assertion": "Test", "answer": "FALSE", "evidence": "Not found"}]})
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -383,7 +372,7 @@ def test_call_handles_results_wrapper(self, sample_task, sample_trace):
     def test_call_handles_single_dict_response(self, sample_task, sample_trace):
         """Handles single dict instead of list."""
         response = json.dumps({"assertion": "Test", "answer": "TRUE", "evidence": "Found"})
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -399,7 +388,7 @@ def test_call_missing_scenario_raises(self, sample_trace):
             evaluation_data={"assertions": ["user: Test assertion"]},
             metadata={},  # No scenario!
         )
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         evaluator = MACSEvaluator(model, task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -408,7 +397,7 @@ def test_call_missing_scenario_raises(self, sample_trace):
 
     def test_call_no_assertions_returns_perfect(self, sample_task_no_assertions, sample_trace):
         """No assertions → perfect score."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         evaluator = MACSEvaluator(model, sample_task_no_assertions, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -421,7 +410,7 @@ def test_call_no_assertions_returns_perfect(self, sample_task_no_assertions, sam
     def test_call_adds_assertion_type(self, sample_task, sample_trace):
         """Report items include assertion_type."""
         response = json.dumps([{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}])
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="user")
 
         traces = {"messages": sample_trace}
@@ -431,15 +420,28 @@ def test_call_adds_assertion_type(self, sample_task, sample_trace):
 
     def test_call_system_includes_tool_invocations(self, sample_task, sample_trace, sample_tool_traces):
         """System evaluation includes tool invocations in prompt."""
+        from unittest.mock import patch
+
         response = json.dumps([{"assertion": "Test", "answer": "TRUE", "evidence": "Found"}])
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, sample_task, gsr_type="system")
 
         traces = {"messages": sample_trace, "tool_traces": sample_tool_traces}
-        evaluator(traces)
+
+        # Capture the prompt sent to the model
+        captured_prompts = []
+        original_generate = model._generate_impl
+
+        def capture_prompt(prompt, *args, **kwargs):
+            captured_prompts.append(prompt)
+            return original_generate(prompt, *args, **kwargs)
+
+        with patch.object(model, "_generate_impl", side_effect=capture_prompt):
+            evaluator(traces)
 
         # Check that tool invocations were included in the prompt
-        prompt = model.prompts[0]
+        assert len(captured_prompts) > 0
+        prompt = captured_prompts[0]
         assert "search_flights" in prompt or "book_flight" in prompt
 
 
@@ -467,7 +469,7 @@ def test_full_user_evaluation(self):
         )
 
         response = json.dumps([{"assertion": "Booking was successful", "answer": "TRUE", "evidence": "Confirmed"}])
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, task, gsr_type="user")
 
         trace = MessageHistory(
@@ -498,7 +500,7 @@ def test_full_system_evaluation(self):
         )
 
         response = json.dumps([{"assertion": "Database was updated", "answer": "TRUE", "evidence": "DB log shows update"}])
-        model = MACSModelAdapter(responses=[response])
+        model = DummyModelAdapter(responses=[response])
         evaluator = MACSEvaluator(model, task, gsr_type="system")
 
         trace = MessageHistory([{"role": "assistant", "content": "Done"}])
diff --git a/tests/test_benchmarks/test_macs/test_macs_integration.py b/tests/test_benchmarks/test_macs/test_macs_integration.py
index ea97340a..9c8b16c4 100644
--- a/tests/test_benchmarks/test_macs/test_macs_integration.py
+++ b/tests/test_benchmarks/test_macs/test_macs_integration.py
@@ -1,4 +1,8 @@
-"""Integration tests for MACS benchmark components."""
+"""Integration tests for MACS benchmark components.
+
+These tests verify the complete MACS benchmark pipeline end-to-end.
+Component-specific unit tests are in their respective test files.
+"""
 
 import json
 import pytest
@@ -12,218 +16,8 @@
     compute_benchmark_metrics,
 )
 
-from .conftest import MACSModelAdapter, ConcreteMACSBenchmark
-
-
-# =============================================================================
-# Environment and Tool Integration Tests
-# =============================================================================
-
-
-@pytest.mark.benchmark
-class TestEnvironmentToolIntegration:
-    """Integration tests for MACSEnvironment and MACSGenericTool."""
-
-    def test_environment_creates_callable_tools(self, travel_task):
-        """Environment creates tools that can be called."""
-        # Use a model that returns valid JSON responses (ToolLLMSimulator expects {"text": ..., "details": ...})
-        model = MACSModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
-        env = MACSEnvironment(
-            task_data={"environment_data": travel_task.environment_data},
-            model=model,
-        )
-
-        assert "search_flights" in env.tools
-        assert "book_flight" in env.tools
-
-        # Tools should be callable
-        search_flights = env.tools["search_flights"]
-        result = search_flights(origin="SFO", destination="JFK", date="2024-12-09")
-
-        # Should return the text from the response
-        assert "Found flights" in result
-
-    def test_tool_tracks_invocations(self, travel_task):
-        """Tool invocations are tracked in history."""
-        model = MACSModelAdapter(responses=['{"text": "success", "details": {}}'])
-        env = MACSEnvironment(
-            task_data={"environment_data": travel_task.environment_data},
-            model=model,
-        )
-
-        search_flights = env.tools["search_flights"]
-
-        # Make multiple calls
-        search_flights(origin="SFO", destination="JFK", date="2024-12-09")
-        search_flights(origin="LAX", destination="ORD", date="2024-12-10")
-
-        history = search_flights.history.to_list()
-        assert len(history) == 2
-        assert history[0]["inputs"]["origin"] == "SFO"
-        assert history[1]["inputs"]["origin"] == "LAX"
-
-    def test_agent_gets_subset_of_tools(self, macs_model):
-        """Agent only gets tools from its assigned tool groups."""
-        task_data = {
-            "environment_data": {
-                "tools": [
-                    {
-                        "tool_name": "group_a",
-                        "actions": [{"name": "tool_a", "description": "Tool A"}],
-                    },
-                    {
-                        "tool_name": "group_b",
-                        "actions": [{"name": "tool_b", "description": "Tool B"}],
-                    },
-                ]
-            }
-        }
-        env = MACSEnvironment(task_data, macs_model)
-
-        agent_spec = {"agent_id": "agent", "tools": ["group_a"]}
-        agent_tools = env.get_tools_for_agent(agent_spec)
-
-        assert "tool_a" in agent_tools
-        assert "tool_b" not in agent_tools
-
-
-# =============================================================================
-# Evaluator Integration Tests
-# =============================================================================
-
-
-@pytest.mark.benchmark
-class TestEvaluatorIntegration:
-    """Integration tests for MACSEvaluator."""
-
-    def test_user_evaluation_with_conversation(self, travel_task, sample_conversation):
-        """User evaluator works with real conversation trace."""
-        response = json.dumps(
-            [
-                {
-                    "assertion": "The user's flight booking request was acknowledged",
-                    "answer": "TRUE",
-                    "evidence": "Agent acknowledged the request",
-                },
-                {"assertion": "The user received flight options or a confirmation", "answer": "TRUE", "evidence": "Confirmation DL123456"},
-            ]
-        )
-        model = MACSModelAdapter(responses=[response])
-        evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
-
-        traces = {"messages": sample_conversation}
-        result = evaluator(traces)
-
-        assert result["gsr"] == 1.0
-        assert len(result["report"]) == 2
-        assert all(item["assertion_type"] == "user" for item in result["report"])
-
-    def test_system_evaluation_with_tool_traces(self, travel_task, sample_conversation):
-        """System evaluator includes tool invocations."""
-        response = json.dumps(
-            [
-                {
-                    "assertion": "The search_flights tool was called with correct parameters",
-                    "answer": "TRUE",
-                    "evidence": "Tool called with SFO, JFK",
-                },
-            ]
-        )
-        model = MACSModelAdapter(responses=[response])
-        evaluator = MACSEvaluator(model, travel_task, gsr_type="system")
-
-        tool_traces = {
-            "search_flights": {
-                "invocations": [
-                    {"inputs": {"origin": "SFO", "destination": "JFK", "date": "2024-12-09"}, "outputs": "Found 3 flights", "status": "success"}
-                ]
-            }
-        }
-
-        traces = {"messages": sample_conversation, "tool_traces": tool_traces}
-        result = evaluator(traces)
-
-        assert result["gsr"] == 1.0
-        assert result["report"][0]["assertion_type"] == "system"
-
-        # Verify tool info was in the prompt
-        prompt = model.prompts[0]
-        assert "search_flights" in prompt
-        assert "SFO" in prompt
-
-    def test_evaluator_handles_partial_success(self, travel_task, sample_conversation):
-        """Evaluator correctly computes partial GSR."""
-        response = json.dumps(
-            [
-                {"assertion": "First assertion", "answer": "TRUE", "evidence": "OK"},
-                {"assertion": "Second assertion", "answer": "FALSE", "evidence": "Failed"},
-            ]
-        )
-        model = MACSModelAdapter(responses=[response])
-        evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
-
-        traces = {"messages": sample_conversation}
-        result = evaluator(traces)
-
-        assert result["gsr"] == 0.0  # Not all passed
-        assert result["partial_gsr"] == 0.5  # 1 of 2 passed
-
-
-# =============================================================================
-# User Simulator Integration Tests
-# =============================================================================
-
-
-@pytest.mark.benchmark
-class TestUserSimulatorIntegration:
-    """Integration tests for MACSUser."""
-
-    def test_user_extracts_profile_from_scenario(self, macs_model, travel_task):
-        """User correctly extracts profile from scenario."""
-        user = MACSUser(
-            model=macs_model,
-            scenario=travel_task.metadata["scenario"],
-            initial_prompt=travel_task.query,
-        )
-
-        # Should have extracted name and other details
-        assert "full_scenario" in user.user_profile
-        assert "Alice Johnson" in user.user_profile.get("full_scenario", "")
-
-    def test_user_respects_max_turns(self, travel_task):
-        """User simulator stops after max_turns."""
-        model = MACSModelAdapter(responses=['{"text": "Yes", "details": {}}'] * 10)
-        user = MACSUser(
-            model=model,
-            scenario=travel_task.metadata["scenario"],
-            initial_prompt=travel_task.query,
-            max_turns=3,
-        )
-
-        # Simulate turns
-        for i in range(3):
-            assert not user.is_done
-            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Response"):
-                user.simulate_response(f"Question {i}")
-
-        assert user.is_done
-        assert user._turn_count == 3
-
-    def test_user_detects_stop_token(self, travel_task):
-        """User correctly detects and handles </stop> token."""
-        model = MACSModelAdapter()
-        user = MACSUser(
-            model=model,
-            scenario=travel_task.metadata["scenario"],
-            initial_prompt=travel_task.query,
-        )
-
-        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Great, that's all! </stop>"):
-            response = user.simulate_response("Your flight is booked!")
-
-        assert "</stop>" not in response
-        assert user.is_done
-        assert user._stopped
+from .conftest import ConcreteMACSBenchmark
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
@@ -233,7 +27,12 @@ def test_user_detects_stop_token(self, travel_task):
 
 @pytest.mark.benchmark
 class TestFullBenchmarkIntegration:
-    """End-to-end integration tests for MACS benchmark."""
+    """End-to-end integration tests for MACS benchmark.
+
+    These tests verify the complete pipeline works correctly when all
+    components are used together. Component-specific behavior is tested
+    in the individual test files (test_macs_*.py).
+    """
 
     def test_complete_task_lifecycle(self, sample_agent_data, travel_task):
         """Test complete task: setup → run → evaluate."""
@@ -246,7 +45,7 @@ def test_complete_task_lifecycle(self, sample_agent_data, travel_task):
             # System evaluation
             json.dumps([{"assertion": "Tool called", "answer": "TRUE", "evidence": "OK"}]),
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
 
         # Setup phase
@@ -285,31 +84,6 @@ def test_complete_task_lifecycle(self, sample_agent_data, travel_task):
         assert "system_gsr" in results[0]
         assert "overall_gsr" in results[0]
 
-    def test_benchmark_aggregates_metrics_correctly(self):
-        """Test metric aggregation across multiple tasks."""
-        results = [
-            {
-                "task_id": "task-1",
-                "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0, "partial_gsr": 1.0}],
-            },
-            {
-                "task_id": "task-2",
-                "eval": [{"overall_gsr": 0.0, "user_gsr": 1.0, "system_gsr": 0.0, "partial_gsr": 0.5}],
-            },
-            {
-                "task_id": "task-3",
-                "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0, "partial_gsr": 1.0}],
-            },
-        ]
-
-        metrics = compute_benchmark_metrics(results)
-
-        assert metrics["total_tasks"] == 3
-        assert metrics["successful_tasks"] == 2  # overall_gsr == 1.0
-        assert metrics["success_rate"] == pytest.approx(2 / 3)
-        assert metrics["mean_metrics"]["overall_gsr"] == pytest.approx(2 / 3)
-        assert metrics["mean_metrics"]["user_gsr"] == 1.0  # All user tests passed
-
 
 # =============================================================================
 # Data Loading Integration Tests
@@ -383,7 +157,7 @@ class TestErrorHandlingIntegration:
 
     def test_evaluator_handles_malformed_llm_response(self, travel_task, sample_conversation):
         """Evaluator gracefully handles malformed LLM responses."""
-        model = MACSModelAdapter(responses=["This is not valid JSON at all"])
+        model = DummyModelAdapter(responses=["This is not valid JSON at all"])
         evaluator = MACSEvaluator(model, travel_task, gsr_type="user")
 
         traces = {"messages": sample_conversation}
@@ -400,19 +174,6 @@ def test_environment_handles_empty_tool_specs(self, macs_model):
 
         assert env.tools == {}
 
-    def test_user_handles_missing_background(self, macs_model):
-        """User simulator handles scenario without Background section."""
-        scenario = "Simple goal: Book a hotel."
-
-        user = MACSUser(
-            model=macs_model,
-            scenario=scenario,
-            initial_prompt="Book a hotel",
-        )
-
-        # Should not crash, should have minimal profile
-        assert "full_scenario" in user.user_profile
-
 
 # =============================================================================
 # End-to-End Pipeline Tests (benchmark.run())
@@ -423,36 +184,28 @@ def test_user_handles_missing_background(self, macs_model):
 class TestEndToEndPipeline:
     """End-to-end tests that call benchmark.run() with TaskCollection.
 
-    These tests verify the complete MACS benchmark pipeline:
-    1. Task setup (environment, user, agents, evaluators)
-    2. Agent execution
-    3. Trace collection
-    4. Evaluation (user GSR and system GSR)
-    5. Report generation
+    These tests verify the complete MACS benchmark pipeline by actually
+    calling benchmark.run(). More granular integration tests are in
+    TestFullBenchmarkIntegration above.
     """
 
-    def test_run_single_task(self, sample_agent_data, sample_task):
-        """Run benchmark with a single task."""
-        # Create model that returns valid responses for all components
-        # User simulator, tool simulator, and evaluator all need JSON responses
-        model = MACSModelAdapter(
+    def test_run_single_task_complete_pipeline(self, sample_agent_data, travel_task):
+        """Full end-to-end test: single task through benchmark.run()."""
+        model = DummyModelAdapter(
             responses=[
-                # User response
-                '{"text": "Yes, please book that flight.", "details": {}}',
-                # Evaluator responses (user GSR and system GSR)
-                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "User confirmed booking"}]',
-                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "Agent updated database"}]',
+                '{"text": "Yes, that flight works.", "details": {}}',
+                '[{"assertion": "User request acknowledged", "answer": "TRUE", "evidence": "OK"}]',
+                '[{"assertion": "Tool was called", "answer": "TRUE", "evidence": "OK"}]',
             ]
         )
 
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([sample_task])
+        reports = benchmark.run([travel_task])
 
-        # Should have exactly one report
+        # Verify complete report structure
         assert len(reports) == 1
-
         report = reports[0]
-        assert report["task_id"] == str(sample_task.id)
+        assert report["task_id"] == str(travel_task.id)
         assert report["repeat_idx"] == 0
         assert report["status"] == "success"
         assert "traces" in report
@@ -461,9 +214,8 @@ def test_run_single_task(self, sample_agent_data, sample_task):
 
     def test_run_multiple_tasks(self, sample_agent_data, macs_task_collection):
         """Run benchmark with multiple tasks via TaskCollection."""
-        model = MACSModelAdapter(
+        model = DummyModelAdapter(
             responses=[
-                # Responses cycle for each task
                 '{"text": "User response", "details": {}}',
                 '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
                 '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
@@ -473,22 +225,18 @@ def test_run_multiple_tasks(self, sample_agent_data, macs_task_collection):
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
         reports = benchmark.run(macs_task_collection)
 
-        # Should have one report per task
         assert len(reports) == len(macs_task_collection)
-
-        # Each report should have correct structure
-        for i, report in enumerate(reports):
-            assert report["repeat_idx"] == 0
+        for report in reports:
             assert report["status"] == "success"
             assert "eval" in report
 
     def test_run_with_task_repeats(self, sample_agent_data, sample_task):
         """Run benchmark with multiple task repetitions."""
-        model = MACSModelAdapter(
+        model = DummyModelAdapter(
             responses=[
                 '{"text": "response", "details": {}}',
                 '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
-                '[{"assertion": "test", "answer": "FALSE", "evidence": "failed"}]',
+                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
             ]
         )
 
@@ -496,79 +244,11 @@ def test_run_with_task_repeats(self, sample_agent_data, sample_task):
         benchmark = ConcreteMACSBenchmark(sample_agent_data, model, n_task_repeats=n_repeats)
         reports = benchmark.run([sample_task])
 
-        # Should have n_repeats reports for the single task
         assert len(reports) == n_repeats
-
-        # Check repeat indices
         for i, report in enumerate(reports):
             assert report["repeat_idx"] == i
             assert report["task_id"] == str(sample_task.id)
 
-    def test_run_returns_traces(self, sample_agent_data, sample_task):
-        """Benchmark run collects and returns traces from all components."""
-        model = MACSModelAdapter(
-            responses=[
-                '{"text": "User says yes", "details": {}}',
-                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
-                '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
-            ]
-        )
-
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([sample_task])
-
-        report = reports[0]
-        traces = report["traces"]
-
-        # Should have traces from environment and user at minimum
-        # (agents may or may not have traces depending on implementation)
-        assert isinstance(traces, dict)
-
-    def test_run_returns_evaluation_results(self, sample_agent_data, sample_task):
-        """Benchmark run returns evaluation results with GSR scores."""
-        # Configure model to return specific evaluation results
-        model = MACSModelAdapter(
-            responses=[
-                '{"text": "User response", "details": {}}',
-                # User GSR: 1/2 assertions TRUE
-                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "confirmed"}, '
-                '{"assertion": "user: Something else", "answer": "FALSE", "evidence": "not found"}]',
-                # System GSR: 1/1 assertions TRUE
-                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "updated"}]',
-            ]
-        )
-
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([sample_task])
-
-        report = reports[0]
-        eval_result = report["eval"]
-
-        # Should have evaluation results (list of eval dicts)
-        assert eval_result is not None
-        assert isinstance(eval_result, list)
-
-    def test_run_handles_evaluation_failure_gracefully(self, sample_agent_data, sample_task):
-        """Benchmark continues even when evaluation fails."""
-        # Model returns invalid JSON for evaluator
-        model = MACSModelAdapter(
-            responses=[
-                '{"text": "User response", "details": {}}',
-                "not valid json - evaluator will fail",
-                "also not valid json",
-            ]
-        )
-
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([sample_task])
-
-        # Should still get a report even though evaluation had issues
-        assert len(reports) == 1
-        report = reports[0]
-
-        # Task should complete (status depends on error handling config)
-        assert "status" in report
-
     def test_run_with_callbacks(self, sample_agent_data, sample_task):
         """Benchmark triggers callbacks during run."""
         from maseval import BenchmarkCallback
@@ -595,7 +275,7 @@ def on_task_end(self, benchmark, task, result):
             def on_run_end(self, benchmark, results):
                 self.events.append("run_end")
 
-        model = MACSModelAdapter(
+        model = DummyModelAdapter(
             responses=[
                 '{"text": "response", "details": {}}',
                 '[{"assertion": "test", "answer": "TRUE", "evidence": "ok"}]',
@@ -608,73 +288,8 @@ def on_run_end(self, benchmark, results):
         benchmark.run([sample_task])
 
         # Verify callback sequence
-        assert "run_start" in callback.events
-        assert "task_start" in callback.events
-        assert "repeat_start_0" in callback.events
-        assert "repeat_end_0" in callback.events
-        assert "task_end" in callback.events
-        assert "run_end" in callback.events
-
-        # Verify order
-        assert callback.events.index("run_start") < callback.events.index("task_start")
-        assert callback.events.index("task_start") < callback.events.index("repeat_start_0")
-        assert callback.events.index("repeat_end_0") < callback.events.index("task_end")
-        assert callback.events.index("task_end") < callback.events.index("run_end")
-
-    def test_run_computes_benchmark_metrics(self, sample_agent_data, sample_task):
-        """Benchmark metrics can be computed from run results."""
-        model = MACSModelAdapter(
-            responses=[
-                '{"text": "response", "details": {}}',
-                '[{"assertion": "user: Booking confirmed", "answer": "TRUE", "evidence": "ok"}]',
-                '[{"assertion": "agent: Database updated", "answer": "TRUE", "evidence": "ok"}]',
-            ]
-        )
-
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([sample_task])
-
-        # compute_benchmark_metrics expects full reports (each with "eval" key)
-        metrics = compute_benchmark_metrics(reports)
-        assert isinstance(metrics, dict)
-        assert "total_tasks" in metrics
-        assert "success_rate" in metrics
-        assert "mean_metrics" in metrics
-
-    def test_full_pipeline_with_travel_task(self, sample_agent_data, travel_task):
-        """Full end-to-end test with realistic travel task."""
-        # Comprehensive responses for the full pipeline
-        model = MACSModelAdapter(
-            responses=[
-                # User simulator response (acknowledging agent's response)
-                '{"text": "Yes, that flight works for me. Please book it.", "details": {"satisfied": true}}',
-                # User GSR evaluator response
-                """[
-                    {"assertion": "user: The user's flight booking request was acknowledged", "answer": "TRUE", "evidence": "Agent acknowledged the booking request"},
-                    {"assertion": "user: The user received flight options or a confirmation", "answer": "TRUE", "evidence": "User confirmed the flight"}
-                ]""",
-                # System GSR evaluator response
-                '[{"assertion": "agent: The search_flights tool was called with correct parameters", "answer": "TRUE", "evidence": "Tool was called"}]',
-            ]
-        )
-
-        benchmark = ConcreteMACSBenchmark(sample_agent_data, model)
-        reports = benchmark.run([travel_task])
-
-        assert len(reports) == 1
-        report = reports[0]
-
-        # Verify successful execution
-        assert report["status"] == "success"
-        assert report["task_id"] == str(travel_task.id)
-
-        # Verify traces were collected
-        assert "traces" in report
-        assert isinstance(report["traces"], dict)
-
-        # Verify config was collected
-        assert "config" in report
-        assert isinstance(report["config"], dict)
-
-        # Verify evaluation ran
-        assert "eval" in report
+        expected_order = ["run_start", "task_start", "repeat_start_0", "repeat_end_0", "task_end", "run_end"]
+        for event in expected_order:
+            assert event in callback.events
+        for i in range(len(expected_order) - 1):
+            assert callback.events.index(expected_order[i]) < callback.events.index(expected_order[i + 1])
diff --git a/tests/test_benchmarks/test_macs/test_macs_tool.py b/tests/test_benchmarks/test_macs/test_macs_tool.py
index 37ab332f..733b849e 100644
--- a/tests/test_benchmarks/test_macs/test_macs_tool.py
+++ b/tests/test_benchmarks/test_macs/test_macs_tool.py
@@ -5,7 +5,7 @@
 
 from maseval.benchmark.macs import MACSGenericTool
 
-from .conftest import MACSModelAdapter
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
@@ -17,44 +17,25 @@
 class TestMACSGenericToolInit:
     """Tests for MACSGenericTool initialization."""
 
-    def test_init_from_spec(self, simple_tool_spec, macs_model):
-        """Tool initializes correctly from specification dict."""
+    def test_init_and_defaults(self, simple_tool_spec, minimal_tool_spec, macs_model):
+        """Tool initializes correctly and handles defaults."""
+        # Standard initialization
         tool = MACSGenericTool(simple_tool_spec, macs_model)
-
         assert tool.name == "search_flights"
         assert tool.description == "Search for available flights"
+        assert tool.output_type == "string"
         assert "origin" in tool.inputs
         assert "destination" in tool.inputs
-
-    def test_name_and_description(self, simple_tool_spec, macs_model):
-        """Tool exposes correct name and description."""
-        tool = MACSGenericTool(simple_tool_spec, macs_model)
-
-        assert tool.name == simple_tool_spec["name"]
-        assert tool.description == simple_tool_spec["description"]
-        assert tool.output_type == "string"
-
-    def test_minimal_spec(self, minimal_tool_spec, macs_model):
-        """Tool handles minimal specification with defaults."""
-        tool = MACSGenericTool(minimal_tool_spec, macs_model)
-
-        assert tool.name == "simple_action"
-        assert tool.description == ""
-        assert tool.inputs == {}
-
-    def test_creates_simulator(self, simple_tool_spec, macs_model):
-        """Tool creates a ToolLLMSimulator."""
-        tool = MACSGenericTool(simple_tool_spec, macs_model)
-
         assert tool.simulator is not None
         assert tool.simulator.tool_name == "search_flights"
-
-    def test_empty_history_on_init(self, simple_tool_spec, macs_model):
-        """Tool starts with empty invocation history."""
-        tool = MACSGenericTool(simple_tool_spec, macs_model)
-
         assert len(tool.history.to_list()) == 0
 
+        # Minimal spec with defaults
+        minimal_tool = MACSGenericTool(minimal_tool_spec, macs_model)
+        assert minimal_tool.name == "simple_action"
+        assert minimal_tool.description == ""
+        assert minimal_tool.inputs == {}
+
 
 # =============================================================================
 # Unit Tests: Schema Conversion
@@ -132,7 +113,7 @@ class TestMACSGenericToolInvocation:
     def test_call_invokes_model(self, simple_tool_spec):
         """Calling tool invokes the model via simulator."""
         # Create model that returns valid JSON (ToolLLMSimulator expects {"text": ..., "details": ...})
-        model = MACSModelAdapter(responses=['{"text": "Found flights", "details": {}}'])
+        model = DummyModelAdapter(responses=['{"text": "Found flights", "details": {}}'])
         tool = MACSGenericTool(simple_tool_spec, model)
 
         _ = tool(origin="LAX", destination="JFK")
@@ -143,7 +124,7 @@ def test_call_invokes_model(self, simple_tool_spec):
     def test_call_returns_response(self, simple_tool_spec):
         """Tool call returns the simulated response."""
         # Create model that returns valid JSON
-        model = MACSModelAdapter(responses=['{"text": "Flight found: AA123", "details": {}}'])
+        model = DummyModelAdapter(responses=['{"text": "Flight found: AA123", "details": {}}'])
         tool = MACSGenericTool(simple_tool_spec, model)
 
         result = tool(origin="LAX", destination="JFK")
@@ -153,7 +134,7 @@ def test_call_returns_response(self, simple_tool_spec):
 
     def test_call_records_history(self, simple_tool_spec):
         """Tool invocation recorded in history."""
-        model = MACSModelAdapter(responses=['{"text": "success", "details": {"booking_id": "123"}}'])
+        model = DummyModelAdapter(responses=['{"text": "success", "details": {"booking_id": "123"}}'])
         tool = MACSGenericTool(simple_tool_spec, model)
 
         tool(origin="LAX", destination="JFK")
@@ -167,7 +148,7 @@ def test_call_records_history(self, simple_tool_spec):
 
     def test_multiple_invocations(self, simple_tool_spec):
         """Multiple calls tracked in history."""
-        model = MACSModelAdapter(responses=['{"text": "success", "details": {}}'])
+        model = DummyModelAdapter(responses=['{"text": "success", "details": {}}'])
         tool = MACSGenericTool(simple_tool_spec, model)
 
         tool(origin="LAX", destination="JFK")
@@ -278,7 +259,7 @@ def test_tool_with_complex_spec(self, complex_tool_spec, macs_model):
     def test_end_to_end_flow(self, simple_tool_spec):
         """Complete flow from creation to trace gathering."""
         # Create model with specific response
-        model = MACSModelAdapter(responses=['{"status": "found", "flights": ["AA123", "UA456"]}'])
+        model = DummyModelAdapter(responses=['{"status": "found", "flights": ["AA123", "UA456"]}'])
         tool = MACSGenericTool(simple_tool_spec, model)
 
         # Invoke tool (simulator will use the model)
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index 9886e12f..3de4de96 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -5,7 +5,7 @@
 
 from maseval.benchmark.macs import MACSUser
 
-from .conftest import MACSModelAdapter
+from conftest import DummyModelAdapter
 
 
 # =============================================================================
@@ -17,8 +17,8 @@
 class TestMACSUserInit:
     """Tests for MACSUser initialization."""
 
-    def test_init_basic(self, macs_model, sample_scenario, initial_prompt):
-        """Basic initialization with required args."""
+    def test_init_with_defaults(self, macs_model, sample_scenario, initial_prompt):
+        """Initialization with required args uses proper defaults."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
@@ -27,43 +27,27 @@ def test_init_basic(self, macs_model, sample_scenario, initial_prompt):
 
         assert user.model == macs_model
         assert user.scenario == sample_scenario
-        assert user.name == "Simulated User"  # Default name
-
-    def test_init_custom_name(self, macs_model, sample_scenario, initial_prompt):
-        """Custom name is respected."""
-        user = MACSUser(
-            model=macs_model,
-            scenario=sample_scenario,
-            initial_prompt=initial_prompt,
-            name="Test User",
-        )
-
-        assert user.name == "Test User"
-
-    def test_init_default_max_turns(self, macs_model, sample_scenario, initial_prompt):
-        """Default max_turns is 5."""
-        user = MACSUser(
-            model=macs_model,
-            scenario=sample_scenario,
-            initial_prompt=initial_prompt,
-        )
-
+        assert user.name == "Simulated User"
         assert user.max_turns == 5
+        assert user._turn_count == 0
+        assert not user._stopped
+        assert "full_scenario" in user.user_profile
 
-    def test_init_custom_max_turns(self, macs_model, sample_scenario, initial_prompt):
-        """Custom max_turns is respected."""
+    def test_init_with_custom_params(self, macs_model, sample_scenario, initial_prompt):
+        """Custom name and max_turns are respected."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
             initial_prompt=initial_prompt,
+            name="Test User",
             max_turns=10,
         )
 
+        assert user.name == "Test User"
         assert user.max_turns == 10
 
     def test_init_loads_template(self, macs_model, sample_scenario, initial_prompt):
         """Loads user_simulator.txt template."""
-        # Verify template file exists
         assert MACSUser.TEMPLATE_PATH.exists(), f"Template not found at {MACSUser.TEMPLATE_PATH}"
 
         user = MACSUser(
@@ -71,33 +55,8 @@ def test_init_loads_template(self, macs_model, sample_scenario, initial_prompt):
             scenario=sample_scenario,
             initial_prompt=initial_prompt,
         )
-
-        # MACSUser is created successfully (template is passed to parent User class)
         assert user is not None
 
-    def test_init_extracts_user_profile(self, macs_model, sample_scenario, initial_prompt):
-        """Extracts profile from scenario."""
-        user = MACSUser(
-            model=macs_model,
-            scenario=sample_scenario,
-            initial_prompt=initial_prompt,
-        )
-
-        # Profile should contain extracted info
-        assert "name" in user.user_profile or "full_scenario" in user.user_profile
-        assert user.user_profile.get("full_scenario") == sample_scenario
-
-    def test_init_turn_count_zero(self, macs_model, sample_scenario, initial_prompt):
-        """Turn count starts at zero."""
-        user = MACSUser(
-            model=macs_model,
-            scenario=sample_scenario,
-            initial_prompt=initial_prompt,
-        )
-
-        assert user._turn_count == 0
-        assert not user._stopped
-
 
 # =============================================================================
 # Unit Tests: User Profile Extraction
@@ -261,7 +220,7 @@ class TestResponseSimulation:
 
     def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt):
         """Turn count increments on simulate_response call."""
-        model = MACSModelAdapter(responses=['{"text": "Yes, confirmed.", "details": {}}'])
+        model = DummyModelAdapter(responses=['{"text": "Yes, confirmed.", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -278,7 +237,7 @@ def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt
 
     def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
         """Detects </stop> token."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -293,7 +252,7 @@ def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
 
     def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prompt):
         """Removes </stop> from response."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -308,7 +267,7 @@ def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prom
 
     def test_simulate_response_returns_empty_when_done(self, sample_scenario, initial_prompt):
         """Returns empty string when is_done is True."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -322,7 +281,7 @@ def test_simulate_response_returns_empty_when_done(self, sample_scenario, initia
 
     def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, initial_prompt):
         """Returns empty string when max turns reached."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -337,7 +296,7 @@ def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, ini
 
     def test_simulate_response_fallback_message(self, sample_scenario, initial_prompt):
         """Provides fallback when response is only stop token."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -433,7 +392,7 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
             "Aisle seat please.",
             "Book it! </stop>",
         ]
-        model = MACSModelAdapter(responses=responses)
+        model = DummyModelAdapter(responses=responses)
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -463,7 +422,7 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
 
     def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
         """Test that max turns is enforced."""
-        model = MACSModelAdapter(responses=["Response"] * 10)
+        model = DummyModelAdapter(responses=["Response"] * 10)
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
@@ -486,7 +445,7 @@ def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
 
     def test_reset_allows_new_conversation(self, sample_scenario, initial_prompt):
         """Test that reset allows starting new conversation."""
-        model = MACSModelAdapter()
+        model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,

From 040891027b5fb757979d22e42c8fe1ba1ff4ab31 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 00:16:57 +0000
Subject: [PATCH 09/34] fixed evaluation

---
 examples/macs_benchmark.py                    |  2 +-
 maseval/benchmark/macs/macs.py                | 55 +++++++++++--------
 .../test_macs/test_macs_evaluator.py          | 20 +++++--
 3 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark.py
index 4b06212c..a1ead4ce 100644
--- a/examples/macs_benchmark.py
+++ b/examples/macs_benchmark.py
@@ -221,7 +221,7 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
                 name=agent_spec.get("agent_name", agent_id),
                 description=agent_spec.get("agent_instruction", ""),
                 max_steps=25,  # Allow more steps for complex multi-agent tasks
-                verbosity_level=0,
+                verbosity_level=2,
             )
 
             return agent
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index 480012d1..b65bb962 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -191,15 +191,36 @@ def __init__(
     def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:
         """Filter traces based on gsr_type.
 
-        For user evaluation: only user-observable messages
-        For system evaluation: full traces including tool invocations
+        For user evaluation: Use user trace which contains the user-observable
+        conversation by construction (what the user sees: queries, agent questions,
+        user answers, and final answers).
+
+        For system evaluation: Full traces including all agent messages and
+        tool invocations (internal behaviors not visible to users).
+
+        Args:
+            traces: Full execution traces dict containing 'agents', 'tools', 'user', etc.
+
+        Returns:
+            Filtered dict with 'messages' and optionally 'tool_traces'
         """
         if self.gsr_type == "user":
+            # User trace contains the user-observable conversation by construction
             user_trace = traces.get("user", {})
-            return {"messages": MessageHistory(user_trace.get("history", []))}
+            return {"messages": user_trace.get("messages", [])}
         else:
-            # System gets everything
-            return traces
+            # System evaluation needs full agent messages and tool traces
+            primary_agent_id = next(iter(traces.get("agents", {}).keys()), None)
+            if primary_agent_id:
+                agent_trace = traces["agents"][primary_agent_id]
+                all_messages = agent_trace.get("messages", [])
+            else:
+                all_messages = []
+
+            return {
+                "messages": all_messages,
+                "tool_traces": traces.get("tools", {}),
+            }
 
     def __call__(
         self,
@@ -721,30 +742,20 @@ def evaluate(
     ) -> List[Dict[str, Any]]:
         """Evaluate using both evaluators and aggregate GSR metrics.
 
+        Uses each evaluator's filter_traces() method to extract relevant data,
+        then calls the evaluator with the filtered traces.
+
         Returns AWS paper format:
         - user_gsr, system_gsr, overall_gsr, supervisor_gsr
         - user_partial_gsr, system_partial_gsr, overall_partial_gsr
         - report: Combined assertion judgments
         """
-        # Get agent traces - primary agent's messages
-        primary_agent_id = list(agents.keys())[0]
-        agent_trace = traces.get("agents", {}).get(primary_agent_id, {})
-        all_messages = MessageHistory(agent_trace.get("messages", []))
-
-        # For user-side evaluation: filter to user-observable messages only
-        # (user queries and assistant responses - not tool calls)
-        user_messages = MessageHistory([msg for msg in all_messages if msg.get("role") in ("user", "assistant")])
-
-        tool_traces = traces.get("tools", {})
-
-        # Run evaluators with properly structured traces dict
+        # Run evaluators - each handles its own trace filtering
         results = []
         for evaluator in evaluators:
-            if isinstance(evaluator, MACSEvaluator) and evaluator.gsr_type == "system":
-                eval_traces = {"messages": all_messages, "tool_traces": tool_traces}
-            else:
-                eval_traces = {"messages": user_messages}
-            result = evaluator(eval_traces, final_answer)
+            # Use the evaluator's filter_traces method to get the right data
+            filtered_traces = evaluator.filter_traces(traces)
+            result = evaluator(filtered_traces, final_answer)
             results.append(result)
 
         # Combine results
diff --git a/tests/test_benchmarks/test_macs/test_macs_evaluator.py b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
index 02b661ec..ab1b6273 100644
--- a/tests/test_benchmarks/test_macs/test_macs_evaluator.py
+++ b/tests/test_benchmarks/test_macs/test_macs_evaluator.py
@@ -153,23 +153,31 @@ def test_filter_traces_user_type(self, macs_model, sample_task, sample_trace):
         """User type gets user messages only."""
         evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="user")
 
-        traces = {"user": {"history": sample_trace.to_list()}, "tools": {"tool1": {}}}
+        # User trace uses 'messages' key (consistent with how User.gather_traces works)
+        traces = {"user": {"messages": sample_trace.to_list()}, "tools": {"tool1": {}}}
         filtered = evaluator.filter_traces(traces)
 
         assert "messages" in filtered
-        assert isinstance(filtered["messages"], MessageHistory)
+        assert isinstance(filtered["messages"], list)
+        assert len(filtered["messages"]) == len(sample_trace)
         # Should not have tools in user evaluation
         assert "tools" not in filtered or filtered.get("tools") is None
 
     def test_filter_traces_system_type(self, macs_model, sample_task, sample_trace, sample_tool_traces):
-        """System type gets full traces."""
+        """System type gets messages and tool_traces."""
         evaluator = MACSEvaluator(macs_model, sample_task, gsr_type="system")
 
-        traces = {"user": {"history": sample_trace.to_list()}, "tools": sample_tool_traces}
+        # Create traces with agent structure as expected by filter_traces
+        traces = {
+            "agents": {"test_agent": {"messages": sample_trace.to_list()}},
+            "tools": sample_tool_traces,
+        }
         filtered = evaluator.filter_traces(traces)
 
-        # System should get everything
-        assert traces == filtered
+        # System should get messages and tool_traces
+        assert "messages" in filtered
+        assert "tool_traces" in filtered
+        assert filtered["tool_traces"] == sample_tool_traces
 
 
 # =============================================================================

From 5ef70147e7f7aebdf0e99e50429459c811ac249f Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 12:40:11 +0000
Subject: [PATCH 10/34] added execution loop to Benchmark and updated user
 accordingly

---
 CHANGELOG.md                                  |   5 +
 maseval/benchmark/macs/macs.py                |  53 ++----
 maseval/core/benchmark.py                     | 128 +++++++++++--
 maseval/core/user.py                          | 170 +++++++++++++++++-
 tests/conftest.py                             |   6 +-
 .../test_macs/test_macs_user.py               |  96 ++++++----
 .../test_benchmark_lifecycle.py               |   4 +-
 7 files changed, 358 insertions(+), 104 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9820c5c5..c2a44a62 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - MACS Benchmark: Multi-Agent Collaboration Scenarios benchmark (PR: #13)
+- Added `execution_loop()` method to `Benchmark` base class enabling iterative agent-user interaction
+- Added `max_invocations` constructor parameter to `Benchmark` (default: 1 for backwards compatibility)
+- Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping
+- Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class
 - [LlamaIndex](https://github.com/run-llama/llama_index) integration: `LlamaIndexAgentAdapter` and `LlamaIndexUser` for evaluating LlamaIndex workflow-based agents (PR: #7)
   - Supports async workflow execution with proper event loop handling
 - Added a new example: The `5_a_day_benchmark` (PR: #10)
@@ -29,6 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `maseval.interface.agents` structure changed. Tools requiring framework imports (beyond just typing) now in `<framework>_optional.py` and imported dynamically from `<framework>.py`. (PR: #12)
 - Various formatting improvements in the documentation (PR: #12)
 - Added documentation for View Source Code pattern in `CONTRIBUTING.md` and `_optional.py` pattern in interface README (PR: #12)
+- `Benchmark.run()` now uses `execution_loop()` internally to handle agent-user interaction cycles
 
 ### Fixed
 
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index b65bb962..047a9920 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -383,6 +383,7 @@ class MACSUser(User):
     - Maximum 5 turns of interaction (as per MACS paper)
     - </stop> token detection for natural conversation ending
     - User profile and scenario-aware responses
+    - LLM-based satisfaction evaluation
 
     The simulator maintains a conversation history and uses an LLM to generate
     responses that are consistent with the user's profile and scenario.
@@ -392,7 +393,7 @@ class MACSUser(User):
     """
 
     DEFAULT_MAX_TURNS = 5
-    STOP_TOKEN = "</stop>"
+    DEFAULT_STOP_TOKEN = "</stop>"
     TEMPLATE_PATH = Path(__file__).parent / "prompt_templates" / "user_simulator.txt"
 
     def __init__(
@@ -403,6 +404,7 @@ def __init__(
         name: str = "Simulated User",
         template: Optional[str] = None,
         max_turns: int = DEFAULT_MAX_TURNS,
+        stop_token: str = DEFAULT_STOP_TOKEN,
     ):
         """Initialize MACS user simulator.
 
@@ -413,6 +415,7 @@ def __init__(
             name: User name for identification (default: "Simulated User")
             template: Optional custom prompt template (uses MACS-specific default)
             max_turns: Maximum conversation turns (default: 5, per MACS paper)
+            stop_token: Token indicating user satisfaction (default: "</stop>")
         """
         # Load MACS-specific user simulator template if not provided
         if template is None and self.TEMPLATE_PATH.exists():
@@ -428,10 +431,9 @@ def __init__(
             scenario=scenario,
             initial_prompt=initial_prompt,
             template=template,
+            max_turns=max_turns,
+            stop_token=stop_token,
         )
-        self.max_turns = max_turns
-        self._turn_count = 0
-        self._stopped = False
 
     def get_tool(self) -> Any:
         """Return a tool for agent interaction.
@@ -450,42 +452,6 @@ def get_tool(self) -> Any:
             "Use SmolAgentMACSUser for smolagents or LangGraphMACSUser for langgraph."
         )
 
-    @property
-    def is_done(self) -> bool:
-        """Check if the conversation should end.
-
-        Returns True if:
-        - Maximum turns reached
-        - User responded with </stop> token
-        """
-        return self._stopped or self._turn_count >= self.max_turns
-
-    def simulate_response(self, question: str) -> str:
-        """Simulate a user response, respecting turn limits.
-
-        Args:
-            question: The assistant's question/message
-
-        Returns:
-            The simulated user response, or empty string if done
-        """
-        if self.is_done:
-            return ""
-
-        # Use parent's simulate_response which handles LLM generation
-        response = super().simulate_response(question)
-
-        # Check for stop token
-        if self.STOP_TOKEN in response.lower():
-            self._stopped = True
-            # Clean up the response
-            response = response.replace(self.STOP_TOKEN, "").strip()
-            if not response:
-                response = "Thank you, that's all I needed!"
-
-        self._turn_count += 1
-        return response
-
     def reset(self) -> None:
         """Reset the conversation state for a new interaction."""
         self._turn_count = 0
@@ -635,6 +601,7 @@ def __init__(
         model: ModelAdapter,
         callbacks: Optional[List[Any]] = None,
         n_task_repeats: int = 1,
+        max_invocations: int = 5,
         **kwargs: Any,
     ):
         """Initialize benchmark.
@@ -644,9 +611,10 @@ def __init__(
             model: ModelAdapter for tool simulation and evaluation
             callbacks: Benchmark callbacks
             n_task_repeats: Repetitions per task
+            max_invocations: Maximum agent-user interaction rounds (default: 5 per MACS paper)
         """
         self._model = model
-        super().__init__(agent_data, callbacks, n_task_repeats, **kwargs)
+        super().__init__(agent_data, callbacks, n_task_repeats, max_invocations, **kwargs)
 
     def setup_environment(
         self,
@@ -728,9 +696,10 @@ def run_agents(
         agents: Sequence[AgentAdapter],
         task: Task,
         environment: MACSEnvironment,  # type: ignore[override]
+        query: str = "",
     ) -> Any:
         """Execute agents and return final answer."""
-        answers = [agent.run(task.query) for agent in agents]
+        answers = [agent.run(query) for agent in agents]
         return answers[0] if len(answers) == 1 else answers
 
     def evaluate(
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 31f7f610..233d9f97 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 import threading
 from enum import Enum
+import warnings
 
 from .evaluator import Evaluator
 from .task import Task, TaskCollection
@@ -67,8 +68,8 @@ def setup_agents(self, agent_data, environment, task, user):
                     agent_adapter = AgentAdapter(agent, "agent")
                     return [agent_adapter], {"agent": agent_adapter}
 
-                def run_agents(self, agents, task, environment):
-                    return agents[0].run(task.query)
+                def run_agents(self, agents, task, environment, query):
+                    return agents[0].run(query)
 
                 # ... implement other abstract methods
 
@@ -100,6 +101,7 @@ def __init__(
         agent_data: Dict[str, Any] | Iterable[Dict[str, Any]],
         callbacks: Optional[List[BenchmarkCallback]] = None,
         n_task_repeats: int = 1,
+        max_invocations: int = 1,
         fail_on_setup_error: bool = False,
         fail_on_task_error: bool = False,
         fail_on_evaluation_error: bool = False,
@@ -115,6 +117,10 @@ def __init__(
                 or collecting custom metrics during the benchmark run.
             n_task_repeats: Number of times to repeat each task. Useful for measuring variance in
                 stochastic agent behaviors. Must be at least 1.
+            max_invocations: Maximum number of agent invocations per task in the execution loop.
+                For simple benchmarks, the default (1) means agents run once per task. For interactive
+                benchmarks with user feedback loops, set higher (e.g., 5 for MACS) to allow multiple
+                agent-user interaction rounds.
             fail_on_setup_error: If True, raise exceptions when setup fails (environment, agents, evaluators).
                 If False (default), catch exceptions during setup and record them in the report with status
                 SETUP_FAILED. This allows the benchmark to continue running remaining tasks even if setup fails.
@@ -207,6 +213,9 @@ def __init__(
         if self.n_task_repeats < 1:
             raise ValueError("n_task_repeats must be at least 1")
 
+        # Execution loop configuration
+        self.max_invocations = max_invocations
+
         # Failure handling configuration
         self.fail_on_task_error = fail_on_task_error
         self.fail_on_evaluation_error = fail_on_evaluation_error
@@ -793,7 +802,7 @@ def evaluate(self, evaluators, agents, final_answer, traces):
         pass
 
     @abstractmethod
-    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment) -> Any:
+    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Any:
         """Execute the agent system to solve a single task instance.
 
         This method is called once per task repetition by the framework's `run()` loop.
@@ -802,6 +811,9 @@ def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: En
             agents: Sequence of agents to execute (typically just the orchestrator or main agent).
             task: The Task object with the query and any metadata needed for execution.
             environment: The environment instance providing tools and state.
+            query: The query string to pass to agents. For single-turn benchmarks this is
+                typically task.query. For multi-turn with users, this may be an initial
+                prompt or simulated user response.
 
         Returns:
             The final answer or result from the agent system's execution. This could be:
@@ -824,23 +836,106 @@ def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: En
             task iteration, repetitions, and the complete benchmark lifecycle.
 
             ```python
-            def run_agents(self, agents, task, environment):
+            def run_agents(self, agents, task, environment, query):
                 # Simple single-agent execution - returns final answer string
                 orchestrator = agents[0]
-                final_answer = orchestrator.run(task.query)
+                final_answer = orchestrator.run(query)
                 return final_answer
 
             # Or for multiple agents returning a list of answers:
-            def run_agents(self, agents, task, environment):
+            def run_agents(self, agents, task, environment, query):
                 answers = []
                 for agent in agents:
-                    answer = agent.run(task.query)
+                    answer = agent.run(query)
                     answers.append(answer)
                 return answers
             ```
         """
         pass
 
+    def execution_loop(
+        self,
+        agents: Sequence[AgentAdapter],
+        task: Task,
+        environment: Environment,
+        user: Optional[User],
+    ) -> Any:
+        """Execute agents with optional user interaction loop.
+
+        This method orchestrates the agent-user interaction pattern. When a user is
+        present, the user initiates the conversation by providing the first query to
+        agents. If no user is present, ``task.query`` is used as the initial query.
+
+        Query Source Priority:
+            1. **User with initial_prompt**: Uses the user's initial message (fixed string
+               provided at User construction).
+            2. **User without initial_prompt**: Calls ``user.get_initial_query()`` to
+               generate the first message via LLM based on user profile and scenario.
+            3. **No user**: Falls back to ``task.query``.
+
+        Interaction Flow:
+            By default, agents execute once (``max_invocations=1``). For multi-turn
+            interaction, set ``self.max_invocations > 1`` in your benchmark's ``__init__``.
+            The loop continues until ``max_invocations`` is reached or ``user.is_done()``
+            returns True (e.g., max turns reached or stop token detected).
+
+        Note:
+            Override this method in your benchmark subclass to implement custom
+            interaction patterns (e.g., agent-initiated conversations, different
+            termination conditions, or specialized query routing).
+
+        Args:
+            agents: Agents to execute (typically the orchestrator).
+            task: The task being solved.
+            environment: The environment providing tools and state.
+            user: Optional user simulator. If provided, the user initiates and drives
+                the conversation. If None, a single agent execution with ``task.query``.
+
+        Returns:
+            Final answer from the last agent execution.
+
+        Example:
+            For interactive benchmarks, enable multi-turn interaction::
+
+                def __init__(self, ...):
+                    super().__init__(...)
+                    self.max_invocations = 5  # Up to 5 agent-user exchanges
+        """
+
+        final_answer = None
+
+        # Determine initial query text
+        if user is not None:
+            if len(user.messages) > 0:
+                # User has initial_prompt - use it
+                query_text = user.messages[-1].get("content", task.query)
+            else:
+                # No initial_prompt - generate one via LLM
+                query_text = user.get_initial_query()
+        else:
+            # No user - use task query directly
+            query_text = task.query
+
+        for _ in range(self.max_invocations):
+            # Execute agents with query
+            final_answer = self.run_agents(agents, task, environment, query_text)
+
+            # No user means single execution
+            if user is None:
+                break
+
+            # Simulate user response (handles message recording, stop token detection, turn counting)
+            user_response = user.simulate_response(str(final_answer) if final_answer else "")
+
+            # Check if user is done (cheap state check - no LLM call)
+            if user.is_done():
+                break
+
+            # Use user's response as next query
+            query_text = user_response
+
+        return final_answer
+
     def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -> List[Dict[str, Any]]:
         """Initialize and execute the complete benchmark loop across all tasks.
 
@@ -893,8 +988,8 @@ def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -
                         agents_to_run, agents_dict = setup_agents(agent_data, environment, task, user)
                         evaluators = setup_evaluators(environment, task, agents_to_run, user)
 
-                        # Run stage
-                        agents_output = run_agents(agents_to_run, task, environment)
+                        # Run stage (execution_loop handles multi-turn if user exists)
+                        agents_output = execution_loop(agents_to_run, task, environment, user)
 
                         # Evaluate stage
                         traces = collect_message_histories(agents_dict)
@@ -977,6 +1072,12 @@ def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -
                     # 1. Setup
                     environment = self.setup_environment(agent_data, task)
                     user = self.setup_user(agent_data, environment, task)
+                    if user is None and self.max_invocations > 1:
+                        # Warn if multi-turn is enabled but no user to drive interaction
+                        warnings.warn(
+                            f"max_invocations={self.max_invocations} > 1 but no user simulator provided. "
+                            f"Falling back to single-turn execution for task {task.id}."
+                        )
                     agents_to_run, agents_dict = self.setup_agents(agent_data, environment, task, user)
                     evaluators = self.setup_evaluators(environment, task, agents_to_run, user)
 
@@ -1027,9 +1128,9 @@ def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -
                     # Continue to next task repetition
                     continue
 
-                # 2. Execute agent system
+                # 2. Execute agent system with optional user interaction loop
                 try:
-                    final_answers = self.run_agents(agents_to_run, task, environment)
+                    final_answers = self.execution_loop(agents_to_run, task, environment, user)
                 except Exception as e:
                     execution_status = TaskExecutionStatus.TASK_EXECUTION_FAILED
                     error_info = {
@@ -1046,11 +1147,6 @@ def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -
                     # Continue with trace collection even if task failed
                     final_answers = None
 
-                # Record final answer in user's conversation history for complete trace
-                # This ensures user traces include the complete user-observable conversation
-                if user is not None and isinstance(final_answers, str):  # TODO change for multimodal model
-                    user.messages.add_message("assistant", final_answers)
-
                 # # Callbacks before evaluation
                 # for cb in self.callbacks:
                 #     cb.on_before_evaluation(self, task, agent_output)
diff --git a/maseval/core/user.py b/maseval/core/user.py
index 6dec001b..5721a79f 100644
--- a/maseval/core/user.py
+++ b/maseval/core/user.py
@@ -22,13 +22,27 @@ class User(ABC, TraceableMixin, ConfigurableMixin):
     The user only has access to the conversation history and does not see the full environment state,
     ensuring partial observability of environment and MAS.
 
+    Multi-Turn Interaction:
+        By default, users support single-turn interaction (max_turns=1). For benchmarks
+        that require multiple agent-user exchanges, set max_turns > 1.
+
+    Early Stopping (User Satisfaction):
+        For benchmarks where the termination criterion is "user satisfaction" rather than
+        a fixed number of turns, configure a stop_token. When the LLM-generated user
+        response contains this token, is_done() returns True, ending the interaction early.
+
+        Example: MACS benchmark uses "</stop>" to signal the user is satisfied with the
+        agent's response, allowing natural conversation endings before max_turns.
+
     Attributes:
         name (str): The name of the user.
         model (ModelAdapter): The language model used for generating responses.
         user_profile (Dict[str, Any]): A dictionary describing the user's persona.
         scenario (str): A description of the task the user is trying to accomplish.
         simulator (UserLLMSimulator): The simulator instance used to generate responses.
-        history (List[Dict[str, str]]): The conversation history between the user and the MAS.
+        messages (MessageHistory): The conversation history between the user and the MAS.
+        max_turns (int): Maximum number of user response turns.
+        stop_token (Optional[str]): Token that triggers early stopping when detected.
     """
 
     def __init__(
@@ -37,9 +51,11 @@ def __init__(
         model: ModelAdapter,
         user_profile: Dict[str, Any],
         scenario: str,
-        initial_prompt: str,
+        initial_prompt: Optional[str] = None,
         template: Optional[str] = None,
         max_try: int = 3,
+        max_turns: int = 1,
+        stop_token: Optional[str] = None,
     ):
         """Initializes the User.
 
@@ -50,11 +66,26 @@ def __init__(
                 preferences, and other relevant information.
             scenario (str): A description of the situation or task the user is trying to
                 accomplish.
-            initial_prompt (str): The initial message or prompt that starts the conversation.
+            initial_prompt (Optional[str], optional): The initial message that starts the
+                conversation. If provided, it's added to the message history as the first
+                user message (not counted as a turn). If None, the conversation starts
+                empty and you can call get_initial_query() to generate one via LLM.
+                Defaults to None.
             template (Optional[str], optional): A custom prompt template for the user
                 simulator. Defaults to None.
             max_try (int, optional): The maximum number of attempts for the simulator to
                 generate a valid response. Defaults to 3.
+            max_turns (int, optional): Maximum number of LLM-generated user responses
+                before is_done() returns True. The initial_prompt (if provided) is NOT
+                counted as a turn since it's not LLM-generated. Use max_turns=1 for
+                single-turn benchmarks, or higher values for multi-turn interaction.
+                Defaults to 1.
+            stop_token (Optional[str], optional): Token that signals user satisfaction,
+                enabling early termination. When the user's LLM-generated response contains
+                this token, is_done() returns True regardless of remaining turns. Use this
+                for benchmarks where termination is based on user satisfaction rather than
+                a fixed turn count. The token is stripped from the response. Defaults to
+                None (early stopping disabled).
         """
         self.name = name
         self.model = model
@@ -67,9 +98,19 @@ def __init__(
             template=template,
             max_try=max_try,
         )
-        self.messages = MessageHistory([{"role": "user", "content": initial_prompt}])
+        # Initialize message history - empty or with initial prompt
+        if initial_prompt is not None:
+            self.messages = MessageHistory([{"role": "user", "content": initial_prompt}])
+        else:
+            self.messages = MessageHistory()
         self.logs: list[Dict[str, Any]] = []
 
+        # Multi-turn configuration
+        self.max_turns = max_turns
+        self.stop_token = stop_token
+        self._turn_count = 0
+        self._stopped = False
+
     def simulate_response(self, question: str) -> str:
         """Simulates a user response to a given question from the MAS.
 
@@ -77,12 +118,20 @@ def simulate_response(self, question: str) -> str:
         generates a response using the UserLLMSimulator, appends the simulated
         response to the history, and returns the response.
 
+        If the user is already done (max_turns reached or stop_token detected),
+        returns an empty string without making an LLM call. If a stop_token is
+        detected in the response, triggers early stopping.
+
         Args:
             question (str): The question or message from the MAS to which the user should respond.
 
         Returns:
-            str: The simulated user's response.
+            str: The simulated user's response, or empty string if done.
         """
+        # Check if already done - saves LLM call
+        if self.is_done():
+            return ""
+
         # Record the assistant prompt and ask simulator. MessageHistory is iterable
         # and can be converted to a list for the simulator.
         self.messages.add_message("assistant", question)
@@ -107,8 +156,65 @@ def simulate_response(self, question: str) -> str:
         log_entry["response_preview"] = self._summarize_response(response)
         self.logs.append(log_entry)
 
-        self.messages.add_message("user", response)
-        return response
+        # Check for stop token and clean response if needed
+        _, clean_response = self._check_stop_token(response)
+
+        self.messages.add_message("user", clean_response)
+        self.increment_turn()
+        return clean_response
+
+    def get_initial_query(self) -> str:
+        """Generate an initial query using the LLM simulator.
+
+        Use this method when you want the user LLM to generate the first message
+        instead of providing a fixed initial_prompt. This is useful for scenarios
+        where the user should initiate the conversation based on their profile
+        and scenario.
+
+        This method:
+        - Calls the LLM simulator with an empty conversation history
+        - Adds the generated query to the message history as a "user" message
+        - Does NOT increment _turn_count (initial query is not counted as a turn)
+        - Checks for stop_token (in case user is immediately satisfied)
+
+        Returns:
+            str: The generated initial query.
+
+        Raises:
+            RuntimeError: If conversation already has messages (use simulate_response instead).
+        """
+        if len(self.messages) > 0:
+            raise RuntimeError(
+                "Cannot generate initial query: conversation already has messages. Use simulate_response() for subsequent turns."
+            )
+
+        start_time = time.time()
+        log_entry: Dict[str, Any] = {
+            "timestamp": datetime.now().isoformat(),
+            "question": "[initial_query]",
+            "status": "success",
+        }
+
+        try:
+            response = self.simulator(conversation_history=[])
+        except Exception as exc:  # pragma: no cover
+            log_entry["duration_seconds"] = time.time() - start_time
+            log_entry["status"] = "error"
+            log_entry["error"] = str(exc)
+            log_entry["error_type"] = type(exc).__name__
+            self.logs.append(log_entry)
+            raise
+
+        log_entry["duration_seconds"] = time.time() - start_time
+        log_entry["response_preview"] = self._summarize_response(response)
+        self.logs.append(log_entry)
+
+        # Check for stop token (user might be immediately satisfied with scenario)
+        _, clean_response = self._check_stop_token(response)
+
+        # Add as initial user message (not counted as a turn)
+        self.messages.add_message("user", clean_response)
+        return clean_response
 
     def gather_traces(self) -> dict[str, Any]:
         """Gather execution traces from this user simulator.
@@ -135,6 +241,52 @@ def gather_traces(self) -> dict[str, Any]:
     def _summarize_response(response: str) -> str:
         return response[:2000]
 
+    def is_done(self) -> bool:
+        """Check if the user interaction should end.
+
+        The base implementation checks:
+        1. If max_turns has been reached
+        2. If the user previously indicated satisfaction (via stop_token)
+
+        Subclasses can override to add custom termination logic (e.g., LLM-based
+        satisfaction checks) by calling super().is_done() first.
+
+        Returns:
+            True if the user is done interacting, False to continue.
+        """
+        # Hard limit on turns
+        if self._turn_count >= self.max_turns:
+            return True
+
+        # User previously indicated they're done
+        if self._stopped:
+            return True
+
+        return False
+
+    def _check_stop_token(self, response: str) -> tuple[bool, str]:
+        """Check if response contains stop token and clean it up.
+
+        Args:
+            response: The user's response to check.
+
+        Returns:
+            Tuple of (should_stop, cleaned_response).
+        """
+        if self.stop_token and self.stop_token.lower() in response.lower():
+            self._stopped = True
+            # Remove the stop token from the response
+            cleaned = response.replace(self.stop_token, "").replace(self.stop_token.lower(), "").strip()
+            return True, cleaned if cleaned else "Thank you, that's all I needed!"
+        return False, response
+
+    def increment_turn(self) -> None:
+        """Increment the turn counter.
+
+        Call this after recording a user response in the message history.
+        """
+        self._turn_count += 1
+
     def gather_config(self) -> dict[str, Any]:
         """Gather configuration from this user simulator.
 
@@ -145,12 +297,16 @@ def gather_config(self) -> dict[str, Any]:
             - name: User name
             - profile: User profile data
             - scenario: Task scenario description
+            - max_turns: Maximum interaction turns
+            - stop_token: Early stopping token (if configured)
         """
         return {
             **super().gather_config(),
             "name": self.name,
             "profile": self.user_profile,
             "scenario": self.scenario,
+            "max_turns": self.max_turns,
+            "stop_token": self.stop_token,
         }
 
     @abstractmethod
diff --git a/tests/conftest.py b/tests/conftest.py
index cfcf2f05..19e9f06e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -226,10 +226,10 @@ def setup_evaluators(
         self.setup_evaluators_calls.append((environment, task, agents, user))
         return [DummyEvaluator(task, environment, user)]
 
-    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment) -> Any:
-        self.run_agents_calls.append((agents, task, environment))
+    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Any:
+        self.run_agents_calls.append((agents, task, environment, query))
         # Run the first agent and return final answer
-        return agents[0].run(task.query)
+        return agents[0].run(query)
 
     def evaluate(
         self, evaluators: Sequence[Evaluator], agents: Dict[str, AgentAdapter], final_answer: Any, traces: Dict[str, Any]
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index 3de4de96..9864da87 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -1,7 +1,7 @@
 """Unit tests for MACSUser."""
 
 import pytest
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 
 from maseval.benchmark.macs import MACSUser
 
@@ -122,18 +122,25 @@ def test_extract_profile_includes_full_scenario(self, sample_scenario):
 class TestConversationState:
     """Tests for conversation state management."""
 
-    def test_is_done_false_initially(self, macs_model, sample_scenario, initial_prompt):
-        """is_done is False at start."""
+    def test_is_done_false_initially_without_assistant_message(self, macs_model, sample_scenario, initial_prompt):
+        """is_done() returns False when no assistant message to evaluate.
+
+        When there's no assistant message yet (only the initial user message),
+        there's nothing to evaluate for satisfaction. The loop should continue
+        to get an agent response first.
+        """
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
             initial_prompt=initial_prompt,
         )
 
-        assert not user.is_done
+        # No assistant message yet, so is_done() returns False
+        # (nothing to evaluate, need to get agent response first)
+        assert not user.is_done()
 
     def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_prompt):
-        """is_done is True after max turns."""
+        """is_done() returns True after max turns."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
@@ -144,10 +151,10 @@ def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_prom
         # Manually increment turn count
         user._turn_count = 2
 
-        assert user.is_done
+        assert user.is_done()
 
     def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_prompt):
-        """is_done is True after </stop> detected."""
+        """is_done() returns True after </stop> detected."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
@@ -157,10 +164,12 @@ def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_pro
         # Manually set stopped flag
         user._stopped = True
 
-        assert user.is_done
+        assert user.is_done()
+
+    def test_is_done_returns_false_when_not_satisfied(self, macs_model, sample_scenario, initial_prompt):
+        """is_done() returns False when user is not satisfied with response."""
+        from unittest.mock import MagicMock
 
-    def test_turn_count_below_max_not_done(self, macs_model, sample_scenario, initial_prompt):
-        """Not done when turn count below max."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
@@ -168,9 +177,18 @@ def test_turn_count_below_max_not_done(self, macs_model, sample_scenario, initia
             max_turns=5,
         )
 
-        user._turn_count = 4  # One below max
+        # Mock the simulator to return a response without stop token
+        user.simulator = MagicMock(return_value="I need more information.")
+
+        # simulate_response() calls simulator, increments turn count, and checks for stop token
+        response = user.simulate_response("Here is your flight info.")
+
+        # The user's response should be added to messages
+        assert user._turn_count == 1
+        assert "I need more information" in response
 
-        assert not user.is_done
+        # is_done() is a cheap state check - no </stop> token was found
+        assert not user.is_done()
 
 
 # =============================================================================
@@ -229,9 +247,9 @@ def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt
 
         initial_count = user._turn_count
 
-        # Mock parent's simulate_response to return a simple response
-        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Yes, confirmed."):
-            user.simulate_response("When would you like to travel?")
+        # Replace the simulator with a mock that returns a controlled response
+        user.simulator = MagicMock(return_value="Yes, confirmed.")
+        user.simulate_response("When would you like to travel?")
 
         assert user._turn_count == initial_count + 1
 
@@ -244,11 +262,12 @@ def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
             initial_prompt=initial_prompt,
         )
 
-        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Thanks! </stop>"):
-            user.simulate_response("Your flight is booked!")
+        # Replace the simulator with a mock that returns a response with stop token
+        user.simulator = MagicMock(return_value="Thanks! </stop>")
+        user.simulate_response("Your flight is booked!")
 
         assert user._stopped
-        assert user.is_done
+        assert user.is_done()
 
     def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prompt):
         """Removes </stop> from response."""
@@ -259,8 +278,9 @@ def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prom
             initial_prompt=initial_prompt,
         )
 
-        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Perfect, thanks! </stop>"):
-            response = user.simulate_response("Booking confirmed!")
+        # Replace the simulator with a mock that returns a response with stop token
+        user.simulator = MagicMock(return_value="Perfect, thanks! </stop>")
+        response = user.simulate_response("Booking confirmed!")
 
         assert "</stop>" not in response
         assert "Perfect, thanks!" in response
@@ -303,9 +323,11 @@ def test_simulate_response_fallback_message(self, sample_scenario, initial_promp
             initial_prompt=initial_prompt,
         )
 
-        with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="</stop>"):
-            response = user.simulate_response("Booking complete!")
+        # Replace the simulator with a mock that returns only the stop token
+        user.simulator = MagicMock(return_value="</stop>")
+        response = user.simulate_response("Booking complete!")
 
+        # When response is only stop token, base class provides fallback message
         assert response == "Thank you, that's all I needed!"
         assert user._stopped
 
@@ -385,7 +407,7 @@ class TestMACSUserIntegration:
     """Integration tests for MACSUser."""
 
     def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
-        """Test complete conversation lifecycle."""
+        """Test complete conversation lifecycle with is_done() method."""
         responses = [
             "Yes, Monday works.",
             "I prefer Delta.",
@@ -400,7 +422,10 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
             max_turns=5,
         )
 
-        # Simulate multi-turn conversation
+        # Replace the simulator with a mock that cycles through responses
+        user.simulator = MagicMock(side_effect=responses)
+
+        # Simulate multi-turn conversation using simulate_response
         questions = [
             "When would you like to travel?",
             "Any airline preference?",
@@ -409,15 +434,14 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
         ]
 
         for i, question in enumerate(questions):
-            if user.is_done:
+            if user._stopped or user._turn_count >= user.max_turns:
                 break
-            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value=responses[i]):
-                response = user.simulate_response(question)
+            response = user.simulate_response(question)
             if i < len(questions) - 1:
                 assert response != ""
 
         # After stop token, should be done
-        assert user.is_done
+        assert user.is_done()
         assert user._turn_count == 4
 
     def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
@@ -430,13 +454,15 @@ def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
             max_turns=3,
         )
 
+        # Replace the simulator with a mock that returns a controlled response
+        user.simulator = MagicMock(return_value="Response")
+
         # Simulate 3 turns
         for i in range(3):
-            with patch.object(user.__class__.__bases__[0], "simulate_response", return_value="Response"):
-                user.simulate_response(f"Question {i}")
+            user.simulate_response(f"Question {i}")
 
         # Should be done after 3 turns
-        assert user.is_done
+        assert user.is_done()
         assert user._turn_count == 3
 
         # Additional calls should return empty
@@ -456,11 +482,13 @@ def test_reset_allows_new_conversation(self, sample_scenario, initial_prompt):
         # Max out turns
         user._turn_count = 2
         user._stopped = True
-        assert user.is_done
+        assert user.is_done()
 
         # Reset
         user.reset()
 
-        # Should be able to continue
-        assert not user.is_done
+        # After reset, hard limits are cleared but there's no assistant message
+        # to evaluate, so is_done() returns True (nothing to evaluate yet)
+        # This is correct - the execution_loop will call run_agents first
         assert user._turn_count == 0
+        assert not user._stopped
diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
index 5b1b08e6..9c226520 100644
--- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
+++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
@@ -61,8 +61,8 @@ def test_benchmark_complete_run_multiple_tasks(self):
         task_ids = [r["task_id"] for r in reports]
         assert len(set(task_ids)) == 3
 
-        # Verify queries match - call format is (agents, task, environment)
-        queries = [call[1].query for call in benchmark.run_agents_calls]
+        # Verify queries match - call format is (agents, task, environment, query)
+        queries = [call[3] for call in benchmark.run_agents_calls]
         assert queries == ["Task 1", "Task 2", "Task 3"]
 
     def test_benchmark_task_repetitions(self):

From e878ac111abbd764df1bfbf6375522bddc8d9841 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 16:32:11 +0000
Subject: [PATCH 11/34] updated user benchmark interaction

---
 CHANGELOG.md                                  |   8 +
 tests/conftest.py                             |  68 +--
 .../test_macs/test_macs_benchmark.py          |  45 +-
 .../test_macs/test_macs_user.py               |  30 ++
 .../test_benchmark/test_execution_loop.py     | 428 ++++++++++++++++
 tests/test_core/test_user.py                  | 464 ++++++++++++++++++
 tests/test_core/test_user_simulator.py        | 410 +++++++++++++++-
 7 files changed, 1394 insertions(+), 59 deletions(-)
 create mode 100644 tests/test_core/test_benchmark/test_execution_loop.py
 create mode 100644 tests/test_core/test_user.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2a44a62..2330e80d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `max_invocations` constructor parameter to `Benchmark` (default: 1 for backwards compatibility)
 - Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping
 - Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class
+- Added `get_initial_query()` method to `User` base class for LLM-generated initial messages
+- Added `initial_prompt` parameter in `User` base class to trigger the agentic system.
+- Comprehensive testing for multi-turn behavior and `Benchmark.execution_loop()` (PR: #13)
+
 - [LlamaIndex](https://github.com/run-llama/llama_index) integration: `LlamaIndexAgentAdapter` and `LlamaIndexUser` for evaluating LlamaIndex workflow-based agents (PR: #7)
   - Supports async workflow execution with proper event loop handling
 - Added a new example: The `5_a_day_benchmark` (PR: #10)
@@ -25,6 +29,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `get_tools()` returns a dict keyed by tool name
   - Added `get_tool(name)` method for single-tool lookup
   - Removed internal `_tools_dict` attribute (tools dict is now the source of truth)
+- **BREAKING:** `Benchmark.run_agents()` signature changed: added `query: str` parameter
+  - Subclasses must update their implementations to accept and use this parameter
+- Renamed `tests/test_core/test_user_simulator.py` to `tests/test_core/test_user.py` to clarify it tests the `User` class (not `UserLLMSimulator`)
 - Documentation formatting improved. Added darkmode and links to `Github` (PR: #11).
 - `FileResultLogger` now accepts `pathlib.Path` for argument `output_dir` and has an `overwrite` argument to prevent overwriting of existing logs files.
 - `Benchmark` class now has a `fail_on_setup_error` flag that raises errors observed during setup of task (PR: #10)
@@ -37,6 +44,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- Fixed `MACSBenchmark.run_agents()` to use the `query` parameter instead of always using `task.query`
 - `LlamaIndexAgentAdapter` now supports multiple LlamaIndex agent types including `ReActAgent` (workflow-based), `FunctionAgent`, and legacy agents by checking for `.chat()`, `.query()`, and `.run()` methods in priority order (PR: #10)
 - Consistent naming of agent `adapter` over `wrapper` (PR: #3)
 - Fixed an issue that `LiteLLM` interface and `Mixin`s were not shwon in documentation properly (#PR: 12)
diff --git a/tests/conftest.py b/tests/conftest.py
index 19e9f06e..a89bfd96 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -135,45 +135,49 @@ def create_tools(self) -> dict:
 
 
 class DummyUser(User):
-    """Minimal user simulator for testing."""
+    """Minimal user simulator for testing.
+
+    Properly inherits from User base class, allowing tests to verify base class
+    behavior. The simulator is replaced with a mock to avoid LLM calls.
+
+    Supports all base class features:
+    - max_turns / stop_token for multi-turn interaction
+    - is_done() / simulate_response() / get_initial_query()
+    - messages (MessageHistory) for conversation tracking
+    """
 
     def __init__(self, name: str, model: ModelAdapter, **kwargs):
-        # Initialize with minimal requirements
-        self.name = name
-        self.model = model
-        self.user_profile = kwargs.get("user_profile", {})
-        self.scenario = kwargs.get("scenario", "test scenario")
-        self.history = MessageHistory([{"role": "user", "content": kwargs.get("initial_prompt", "Hello")}])
-        # Don't initialize simulator to avoid LLM calls in tests
+        """Initialize DummyUser with proper base class inheritance.
+
+        Args:
+            name: User name
+            model: ModelAdapter instance
+            **kwargs: Forwarded to User base class:
+                - user_profile: Dict of user attributes
+                - scenario: Scenario description
+                - initial_prompt: Optional initial message
+                - max_turns: Max LLM-generated responses (default: 1)
+                - stop_token: Early termination token (default: None)
+        """
+        super().__init__(
+            name=name,
+            model=model,
+            user_profile=kwargs.get("user_profile", {}),
+            scenario=kwargs.get("scenario", "test scenario"),
+            initial_prompt=kwargs.get("initial_prompt"),
+            max_turns=kwargs.get("max_turns", 1),
+            stop_token=kwargs.get("stop_token"),
+        )
+        # Replace simulator with a mock to avoid LLM calls
+        # Tests can set simulator.return_value or side_effect as needed
+        from unittest.mock import MagicMock
+
+        self.simulator = MagicMock(return_value="Mock user response")
 
     def get_tool(self) -> Any:
         """Return a dummy tool for testing."""
         return None
 
-    def gather_traces(self) -> dict:
-        """Return minimal traces for testing."""
-        from datetime import datetime
-
-        return {
-            "type": self.__class__.__name__,
-            "gathered_at": datetime.now().isoformat(),
-            "name": self.name,
-            "message_count": len(self.history),
-            "history": self.history.to_list(),
-        }
-
-    def gather_config(self) -> dict:
-        """Return minimal config for testing."""
-        from datetime import datetime
-
-        return {
-            "type": self.__class__.__name__,
-            "gathered_at": datetime.now().isoformat(),
-            "name": self.name,
-            "user_profile": self.user_profile,
-            "scenario": self.scenario,
-        }
-
 
 class DummyEvaluator(Evaluator):
     """Minimal evaluator for testing."""
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
index 325a99fc..7c18bfcb 100644
--- a/tests/test_benchmarks/test_macs/test_macs_benchmark.py
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -36,6 +36,16 @@ def test_init_configures_benchmark(self, macs_model, sample_agent_data):
         assert benchmark.callbacks == callbacks
         assert benchmark.n_task_repeats == 3
 
+    def test_macs_default_max_invocations_is_five(self, macs_model, sample_agent_data):
+        """MACS benchmark defaults to max_invocations=5 per MACS paper.
+
+        This is a MACS-specific default that differs from the base class default of 1.
+        The MACS paper specifies up to 5 agent-user interaction rounds.
+        """
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+
+        assert benchmark.max_invocations == 5
+
     def test_setup_environment_creates_macs_environment(self, macs_model, sample_agent_data, sample_task):
         """setup_environment returns MACSEnvironment with tools."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
@@ -101,14 +111,15 @@ class IncompleteMACSBenchmark(MACSBenchmark):
 class TestRunAgents:
     """Tests for run_agents method."""
 
-    def test_run_agents_executes_agents(self, macs_model, sample_agent_data, sample_task):
-        """Agents are executed with query."""
+    def test_run_agents_executes_agents_with_query(self, macs_model, sample_agent_data, sample_task):
+        """Agents are executed with the query parameter."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
 
         agents_list, agents_dict = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
 
-        benchmark.run_agents(agents_list, sample_task, env)
+        # Pass explicit query parameter
+        benchmark.run_agents(agents_list, sample_task, env, query=sample_task.query)
 
         # Cast to MACSAgentAdapter to access run_calls
         mock_agent = agents_list[0]
@@ -116,13 +127,33 @@ def test_run_agents_executes_agents(self, macs_model, sample_agent_data, sample_
         assert len(mock_agent.run_calls) == 1
         assert mock_agent.run_calls[0] == sample_task.query
 
+    def test_run_agents_uses_query_parameter_not_task_query(self, macs_model, sample_agent_data, sample_task):
+        """run_agents uses the query parameter, not task.query directly.
+
+        This is critical for multi-turn interaction where the query changes
+        between invocations (e.g., user's response becomes the next query).
+        """
+        benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
+        env = benchmark.setup_environment(sample_agent_data, sample_task)
+        agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
+
+        # Pass a different query than task.query
+        custom_query = "This is a user response, not the task query"
+        benchmark.run_agents(agents_list, sample_task, env, query=custom_query)
+
+        mock_agent = agents_list[0]
+        assert isinstance(mock_agent, MACSAgentAdapter)
+        # Agent should receive the custom query, not task.query
+        assert mock_agent.run_calls[0] == custom_query
+        assert mock_agent.run_calls[0] != sample_task.query
+
     def test_run_agents_returns_answer(self, macs_model, sample_agent_data, sample_task):
         """Returns final answer(s) as MessageHistory."""
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model)
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
 
-        result = benchmark.run_agents(agents_list, sample_task, env)
+        result = benchmark.run_agents(agents_list, sample_task, env, query=sample_task.query)
 
         # run_agents returns MessageHistory from the agent run
         assert isinstance(result, MessageHistory)
@@ -136,7 +167,7 @@ def test_run_agents_single_agent(self, macs_model, sample_agent_data, sample_tas
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
 
-        result = benchmark.run_agents(agents_list, sample_task, env)
+        result = benchmark.run_agents(agents_list, sample_task, env, query=sample_task.query)
 
         assert isinstance(result, MessageHistory)
 
@@ -159,7 +190,7 @@ def setup_agents(
         env = benchmark.setup_environment(sample_agent_data, sample_task)
         agents_list, _ = benchmark.setup_agents(sample_agent_data, env, sample_task, None)
 
-        result = benchmark.run_agents(agents_list, sample_task, env)
+        result = benchmark.run_agents(agents_list, sample_task, env, query=sample_task.query)
 
         assert isinstance(result, list)
         assert len(result) == 2
@@ -446,7 +477,7 @@ def test_full_task_execution(self, sample_agent_data, sample_task):
         evaluators = benchmark.setup_evaluators(env, sample_task, agents_list, user)
 
         # Run phase
-        final_answer = benchmark.run_agents(agents_list, sample_task, env)
+        final_answer = benchmark.run_agents(agents_list, sample_task, env, query=sample_task.query)
 
         # Evaluate phase
         traces = {
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index 9864da87..4a549523 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -33,6 +33,36 @@ def test_init_with_defaults(self, macs_model, sample_scenario, initial_prompt):
         assert not user._stopped
         assert "full_scenario" in user.user_profile
 
+    def test_macs_default_max_turns_is_five(self, macs_model, sample_scenario, initial_prompt):
+        """MACS benchmark defaults to max_turns=5 per MACS paper.
+
+        This is a MACS-specific default that differs from the base class default of 1.
+        If the base class default changes, this test ensures MACS maintains its value.
+        """
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert user.max_turns == MACSUser.DEFAULT_MAX_TURNS
+        assert user.max_turns == 5
+
+    def test_macs_default_stop_token(self, macs_model, sample_scenario, initial_prompt):
+        """MACS uses '</stop>' as stop token per MACS paper.
+
+        This is a MACS-specific default. If the base class default changes,
+        this test ensures MACS maintains its value.
+        """
+        user = MACSUser(
+            model=macs_model,
+            scenario=sample_scenario,
+            initial_prompt=initial_prompt,
+        )
+
+        assert user.stop_token == MACSUser.DEFAULT_STOP_TOKEN
+        assert user.stop_token == "</stop>"
+
     def test_init_with_custom_params(self, macs_model, sample_scenario, initial_prompt):
         """Custom name and max_turns are respected."""
         user = MACSUser(
diff --git a/tests/test_core/test_benchmark/test_execution_loop.py b/tests/test_core/test_benchmark/test_execution_loop.py
new file mode 100644
index 00000000..b7c198f2
--- /dev/null
+++ b/tests/test_core/test_benchmark/test_execution_loop.py
@@ -0,0 +1,428 @@
+"""Tests for Benchmark.execution_loop() method.
+
+These tests verify the agent-user interaction orchestration, including:
+- Query source priority (user initial_prompt vs get_initial_query vs task.query)
+- Multi-turn interaction with max_invocations
+- Early stopping when user.is_done() returns True
+- Message recording (final_answer attached to user traces)
+"""
+
+import pytest
+from typing import Any, List, Optional, Tuple
+import warnings
+
+from maseval import Benchmark, Task, TaskCollection, User
+
+
+# =============================================================================
+# Test Fixtures and Helpers
+# =============================================================================
+
+
+class ExecutionLoopBenchmark(Benchmark):
+    """Benchmark implementation for testing execution_loop behavior."""
+
+    def __init__(self, *args, return_user: Optional[User] = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._return_user = return_user
+        self.run_agents_calls: List[Tuple[Any, ...]] = []
+
+    def setup_environment(self, agent_data, task):
+        from conftest import DummyEnvironment
+
+        return DummyEnvironment(task.environment_data)
+
+    def setup_user(self, agent_data, environment, task):
+        return self._return_user
+
+    def setup_agents(self, agent_data, environment, task, user):
+        from conftest import DummyAgent, DummyAgentAdapter
+
+        agent = DummyAgent()
+        adapter = DummyAgentAdapter(agent, "test_agent")
+        return [adapter], {"test_agent": adapter}
+
+    def setup_evaluators(self, environment, task, agents, user):
+        from conftest import DummyEvaluator
+
+        return [DummyEvaluator(task, environment, user)]
+
+    def run_agents(self, agents, task, environment, query):
+        self.run_agents_calls.append((agents, task, environment, query))
+        return agents[0].run(query)
+
+    def evaluate(self, evaluators, agents, final_answer, traces):
+        return [{"score": 1.0}]
+
+
+# =============================================================================
+# Tests: Execution Loop Without User
+# =============================================================================
+
+
+@pytest.mark.core
+class TestExecutionLoopNoUser:
+    """Tests for execution_loop without user simulator."""
+
+    def test_uses_task_query_without_user(self, dummy_model):
+        """Uses task.query when no user present."""
+        task = Task(query="What is the weather?", environment_data={})
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=None)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, None)
+
+        benchmark.execution_loop(agents, task, env, user=None)
+
+        assert len(benchmark.run_agents_calls) == 1
+        _, _, _, query = benchmark.run_agents_calls[0]
+        assert query == "What is the weather?"
+
+    def test_single_invocation_without_user(self, dummy_model):
+        """Single agent run without user (default max_invocations=1)."""
+        task = Task(query="Query", environment_data={})
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=None)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, None)
+
+        benchmark.execution_loop(agents, task, env, user=None)
+
+        assert len(benchmark.run_agents_calls) == 1
+
+    def test_returns_final_answer(self, dummy_model):
+        """Returns final answer from agent."""
+        task = Task(query="Test query", environment_data={})
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=None)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, None)
+
+        result = benchmark.execution_loop(agents, task, env, user=None)
+
+        assert result is not None
+        assert "Response to:" in result
+
+
+# =============================================================================
+# Tests: Execution Loop With User
+# =============================================================================
+
+
+@pytest.mark.core
+class TestExecutionLoopWithUser:
+    """Tests for execution_loop with user simulator."""
+
+    def test_uses_user_initial_prompt(self, dummy_model):
+        """Uses user's initial_prompt as first query."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query (should not be used)", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="User's initial message",
+            max_turns=5,
+        )
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # First query should be from user's initial_prompt, not task.query
+        _, _, _, query = benchmark.run_agents_calls[0]
+        assert query == "User's initial message"
+
+    def test_uses_get_initial_query_if_no_prompt(self, dummy_model):
+        """Calls get_initial_query() if no initial_prompt."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(name="test", model=dummy_model, max_turns=5)
+        # No initial_prompt, so messages is empty
+        user.simulator.return_value = "LLM generated initial query"
+
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # First query should be LLM-generated
+        _, _, _, query = benchmark.run_agents_calls[0]
+        assert query == "LLM generated initial query"
+
+    def test_multi_turn_interaction(self, dummy_model):
+        """Multiple agent-user exchanges up to max_invocations."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Start",
+            max_turns=5,
+        )
+        # User responds with different messages each turn
+        user.simulator.side_effect = ["Turn 1 response", "Turn 2 response", "Turn 3 response"]
+
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=user,
+            max_invocations=3,
+        )
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # Should have 3 agent invocations
+        assert len(benchmark.run_agents_calls) == 3
+
+        # Queries should be: initial, then user responses
+        queries = [call[3] for call in benchmark.run_agents_calls]
+        assert queries[0] == "Start"  # Initial prompt
+        assert queries[1] == "Turn 1 response"
+        assert queries[2] == "Turn 2 response"
+
+    def test_stops_when_user_done_via_max_turns(self, dummy_model):
+        """Stops early when user.is_done() returns True (max_turns reached first)."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Start",
+            max_turns=2,  # User done after 2 turns (limiting factor)
+        )
+        user.simulator.side_effect = ["Response 1", "Response 2", "Response 3"]
+
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=user,
+            max_invocations=5,  # Would allow 5, but user stops at 2
+        )
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # max_turns=2 is the limiting factor, so exactly 2 invocations
+        # Iteration 1: agent runs, simulate_response → turn_count=1, is_done? No
+        # Iteration 2: agent runs, simulate_response → turn_count=2, is_done? Yes → break
+        assert len(benchmark.run_agents_calls) == 2
+
+    def test_stops_when_user_done_via_stop_token(self, dummy_model):
+        """Stops early when user.is_done() returns True (stop_token)."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Start",
+            max_turns=10,
+            stop_token="</stop>",
+        )
+        # User stops on second response
+        user.simulator.side_effect = ["Continue please", "Thanks! </stop>"]
+
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=user,
+            max_invocations=5,
+        )
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # Should stop after user says </stop>
+        assert len(benchmark.run_agents_calls) == 2
+
+    def test_final_answer_in_user_messages(self, dummy_model):
+        """Agent's final_answer is recorded in user messages."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Help me",
+            max_turns=1,
+        )
+        user.simulator.return_value = "Thanks"
+
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # User messages should include the agent's response
+        messages = list(user.messages)
+        assistant_messages = [m for m in messages if m["role"] == "assistant"]
+        assert len(assistant_messages) >= 1
+        # The assistant message should be the agent's response
+        assert "Response to:" in assistant_messages[0]["content"]
+
+    def test_user_response_becomes_next_query(self, dummy_model):
+        """User's response is passed to next agent invocation."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Initial",
+            max_turns=3,  # Allow 3 turns
+        )
+        # Need 3 responses: after invocation 1, 2, and 3
+        user.simulator.side_effect = ["User reply 1", "User reply 2", "User reply 3"]
+
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=user,
+            max_invocations=3,  # Will stop after 3 due to max_invocations
+        )
+
+        env = benchmark.setup_environment({}, task)
+        agents, _ = benchmark.setup_agents({}, env, task, user)
+
+        benchmark.execution_loop(agents, task, env, user=user)
+
+        # Should have 3 invocations limited by max_invocations
+        assert len(benchmark.run_agents_calls) == 3
+
+        # First query is initial prompt
+        _, _, _, query1 = benchmark.run_agents_calls[0]
+        assert query1 == "Initial"
+
+        # Second invocation should use first user reply
+        _, _, _, query2 = benchmark.run_agents_calls[1]
+        assert query2 == "User reply 1"
+
+        # Third invocation should use second user reply
+        _, _, _, query3 = benchmark.run_agents_calls[2]
+        assert query3 == "User reply 2"
+
+
+# =============================================================================
+# Tests: Max Invocations Configuration
+# =============================================================================
+
+
+@pytest.mark.core
+class TestMaxInvocations:
+    """Tests for max_invocations parameter."""
+
+    def test_default_max_invocations_is_one(self):
+        """Default is single invocation."""
+        benchmark = ExecutionLoopBenchmark(agent_data={})
+        assert benchmark.max_invocations == 1
+
+    def test_custom_max_invocations(self):
+        """Custom max_invocations is stored."""
+        benchmark = ExecutionLoopBenchmark(agent_data={}, max_invocations=5)
+        assert benchmark.max_invocations == 5
+
+    def test_warning_max_invocations_without_user(self):
+        """Warning issued when max_invocations > 1 but no user."""
+        task = Task(query="Test", environment_data={})
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=None,
+            max_invocations=5,
+        )
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            benchmark.run(TaskCollection([task]))
+
+            # Check for warning about max_invocations without user
+            warning_messages = [str(warning.message) for warning in w]
+            assert any("max_invocations" in msg and "no user simulator" in msg.lower() for msg in warning_messages)
+
+
+# =============================================================================
+# Tests: Integration with run()
+# =============================================================================
+
+
+@pytest.mark.core
+class TestBenchmarkRunWithUser:
+    """Tests for run() integration with users."""
+
+    def test_run_with_user_uses_execution_loop(self, dummy_model):
+        """run() delegates to execution_loop."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="User query",
+            max_turns=1,
+        )
+        user.simulator.return_value = "Done"
+
+        benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
+
+        benchmark.run(TaskCollection([task]))
+
+        # Verify run_agents was called with user's initial prompt
+        assert len(benchmark.run_agents_calls) == 1
+        _, _, _, query = benchmark.run_agents_calls[0]
+        assert query == "User query"
+
+    def test_complete_traces_with_user(self, dummy_model):
+        """Traces include complete user conversation."""
+        from conftest import DummyUser
+
+        task = Task(query="Task query", environment_data={})
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Hello",
+            max_turns=2,
+        )
+        user.simulator.side_effect = ["Reply 1", "Reply 2"]
+
+        benchmark = ExecutionLoopBenchmark(
+            agent_data={},
+            return_user=user,
+            max_invocations=2,
+        )
+
+        reports = benchmark.run(TaskCollection([task]))
+
+        # Check that user traces are in the report
+        assert len(reports) == 1
+        traces = reports[0]["traces"]
+        assert "user" in traces
+
+        # User traces should have the conversation
+        user_traces = traces["user"]
+        assert "messages" in user_traces
+        # Should have exactly: initial + 2 exchanges (initial, agent1, user1, agent2, user2)
+        assert user_traces["message_count"] == 5
+
+        # Verify exact message sequence
+        messages = user_traces["messages"]
+        assert messages[0]["role"] == "user"
+        assert messages[0]["content"] == "Hello"
+        assert messages[1]["role"] == "assistant"
+        assert "Response to:" in messages[1]["content"]
+        assert messages[2]["role"] == "user"
+        assert messages[2]["content"] == "Reply 1"
+        assert messages[3]["role"] == "assistant"
+        assert messages[4]["role"] == "user"
+        assert messages[4]["content"] == "Reply 2"
diff --git a/tests/test_core/test_user.py b/tests/test_core/test_user.py
new file mode 100644
index 00000000..0fb5deb0
--- /dev/null
+++ b/tests/test_core/test_user.py
@@ -0,0 +1,464 @@
+"""Test User class functionality.
+
+These tests verify the User base class (maseval.core.user.User) behavior:
+- Conversation history management (MessageHistory)
+- Multi-turn interaction (max_turns, turn counting)
+- Early stopping via stop tokens
+- Optional initial prompts and LLM-generated initial queries
+
+Note: This tests the User class, NOT the UserLLMSimulator class.
+UserLLMSimulator is tested in test_llm_simulator.py.
+"""
+
+import pytest
+
+
+@pytest.mark.core
+class TestUser:
+    """Tests for User base class basics."""
+
+    def test_user_simulate_response_updates_messages(self, dummy_user):
+        """Test that simulate_response adds to message history."""
+        initial_len = len(dummy_user.messages)
+
+        # simulate_response adds assistant message, then user response
+        dummy_user.simulate_response("How can I help?")
+
+        # Should have added 2 messages: assistant question + user response
+        assert len(dummy_user.messages) == initial_len + 2
+
+    def test_user_messages_includes_both_sides(self, dummy_user):
+        """Test that messages includes both user and assistant messages."""
+        # Simulate a response (adds assistant + user messages)
+        dummy_user.simulate_response("Question for user")
+
+        messages = list(dummy_user.messages)
+        roles = [m["role"] for m in messages]
+        assert "user" in roles
+        assert "assistant" in roles
+
+    def test_user_gather_traces_includes_interactions(self, dummy_user):
+        """Test that gather_traces() includes conversation history."""
+        traces = dummy_user.gather_traces()
+
+        assert "type" in traces
+        assert "gathered_at" in traces
+        assert "name" in traces
+        assert "message_count" in traces
+        assert "messages" in traces
+
+        assert traces["name"] == "test_user"
+        assert isinstance(traces["messages"], list)
+        assert traces["message_count"] == len(traces["messages"])
+
+    def test_user_gather_config_includes_profile(self, dummy_user):
+        """Test that gather_config() includes user profile."""
+        config = dummy_user.gather_config()
+
+        assert "type" in config
+        assert "gathered_at" in config
+        assert "name" in config
+
+        assert config["name"] == "test_user"
+
+    def test_user_initialization(self, dummy_model):
+        """Test that User can be initialized with required parameters."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test_user",
+            model=dummy_model,
+            user_profile={"role": "customer"},
+            scenario="test scenario",
+            initial_prompt="Hello",
+        )
+
+        assert user.name == "test_user"
+        assert user.user_profile == {"role": "customer"}
+        assert user.scenario == "test scenario"
+        assert len(user.messages) == 1
+
+
+# =============================================================================
+# Multi-Turn Configuration Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserMultiTurn:
+    """Tests for max_turns behavior."""
+
+    def test_default_max_turns_is_one(self, dummy_model):
+        """Default single-turn mode."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user.max_turns == 1
+
+    def test_custom_max_turns(self, dummy_model):
+        """Custom max_turns is stored correctly."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=5)
+        assert user.max_turns == 5
+
+    def test_is_done_after_max_turns(self, dummy_model):
+        """is_done() returns True when turn count >= max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=2)
+        user._turn_count = 2
+
+        assert user.is_done()
+
+    def test_is_done_before_max_turns(self, dummy_model):
+        """is_done() returns False when turn count < max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user._turn_count = 1
+
+        assert not user.is_done()
+
+    def test_simulate_response_increments_turn_count(self, dummy_model):
+        """Each simulate_response() call increments _turn_count."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=5)
+        initial_count = user._turn_count
+
+        user.simulate_response("Question 1")
+        assert user._turn_count == initial_count + 1
+
+        user.simulate_response("Question 2")
+        assert user._turn_count == initial_count + 2
+
+    def test_simulate_response_returns_empty_when_done(self, dummy_model):
+        """Returns empty string when is_done() is True."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=1)
+        user._turn_count = 1  # Already at max
+
+        response = user.simulate_response("More questions?")
+        assert response == ""
+
+    def test_turn_count_starts_at_zero(self, dummy_model):
+        """Turn count starts at 0."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user._turn_count == 0
+
+
+# =============================================================================
+# Stop Token Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserStopToken:
+    """Tests for stop_token early termination."""
+
+    def test_no_stop_token_by_default(self, dummy_model):
+        """stop_token is None by default."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user.stop_token is None
+
+    def test_custom_stop_token(self, dummy_model):
+        """Custom stop_token is stored."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</done>")
+        assert user.stop_token == "</done>"
+
+    def test_stop_token_detection_sets_stopped(self, dummy_model):
+        """Detecting stop token sets _stopped = True."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Thanks! </stop>"
+
+        user.simulate_response("Here's your answer")
+
+        assert user._stopped
+
+    def test_stop_token_removed_from_response(self, dummy_model):
+        """Stop token is stripped from returned response."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Perfect, thanks! </stop>"
+
+        response = user.simulate_response("Booking confirmed!")
+
+        assert "</stop>" not in response
+        assert "Perfect, thanks!" in response
+
+    def test_is_done_true_after_stop_token(self, dummy_model):
+        """is_done() returns True after stop token detected."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Done </stop>"
+
+        user.simulate_response("Result")
+
+        assert user.is_done()
+
+    def test_stop_token_case_insensitive(self, dummy_model):
+        """Stop token detection is case-insensitive."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</STOP>", max_turns=5)
+        user.simulator.return_value = "Thanks! </stop>"  # lowercase
+
+        user.simulate_response("Answer")
+
+        assert user._stopped
+
+    def test_fallback_message_when_only_stop_token(self, dummy_model):
+        """Provides fallback when response is only stop token."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "</stop>"
+
+        response = user.simulate_response("Done!")
+
+        assert response == "Thank you, that's all I needed!"
+        assert user._stopped
+
+    def test_stop_token_response_counts_as_turn(self, dummy_model):
+        """The response containing stop token still counts as a turn."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Thank you, all is clear </stop>"
+
+        initial_turn_count = user._turn_count
+        user.simulate_response("Here is your result")
+
+        # Turn count should increment even though stop token was detected
+        assert user._turn_count == initial_turn_count + 1
+        assert user._stopped
+        assert user.is_done()
+
+
+# =============================================================================
+# Optional Initial Prompt Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserInitialPrompt:
+    """Tests for optional initial_prompt behavior."""
+
+    def test_with_initial_prompt_adds_message(self, dummy_model):
+        """Providing initial_prompt adds it to messages."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="I need help booking a flight",
+        )
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["role"] == "user"
+        assert user.messages[0]["content"] == "I need help booking a flight"
+
+    def test_without_initial_prompt_empty_messages(self, dummy_model):
+        """No initial_prompt means empty message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        # No initial_prompt provided
+
+        assert len(user.messages) == 0
+
+    def test_get_initial_query_generates_message(self, dummy_model):
+        """get_initial_query() uses LLM to generate first message."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        user.simulator.return_value = "I want to book a hotel"
+
+        query = user.get_initial_query()
+
+        assert query == "I want to book a hotel"
+        user.simulator.assert_called_once()
+
+    def test_get_initial_query_adds_to_messages(self, dummy_model):
+        """Generated query is added to message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        user.simulator.return_value = "Help me please"
+
+        user.get_initial_query()
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["role"] == "user"
+        assert user.messages[0]["content"] == "Help me please"
+
+    def test_get_initial_query_raises_if_messages_exist(self, dummy_model):
+        """get_initial_query() raises if messages already exist."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Already have a message",
+        )
+
+        with pytest.raises(RuntimeError, match="already has messages"):
+            user.get_initial_query()
+
+    def test_get_initial_query_not_counted_as_turn(self, dummy_model):
+        """Initial query doesn't increment turn count."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "Initial query"
+
+        user.get_initial_query()
+
+        assert user._turn_count == 0  # Not incremented
+
+
+# =============================================================================
+# Message History Completeness Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserMessageHistory:
+    """Tests for complete message tracing."""
+
+    def test_initial_message_in_history(self, dummy_model):
+        """Initial prompt is in message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Hello agent",
+        )
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["content"] == "Hello agent"
+
+    def test_assistant_message_recorded(self, dummy_model):
+        """simulate_response() records assistant message before responding."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "User reply"
+
+        user.simulate_response("Agent says hello")
+
+        # Should have: assistant message + user response
+        assert len(user.messages) == 2
+        assert user.messages[0]["role"] == "assistant"
+        assert user.messages[0]["content"] == "Agent says hello"
+
+    def test_user_response_recorded(self, dummy_model):
+        """simulate_response() records user response."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "Thanks for the help"
+
+        user.simulate_response("Here's your answer")
+
+        assert user.messages[-1]["role"] == "user"
+        assert user.messages[-1]["content"] == "Thanks for the help"
+
+    def test_full_conversation_tracked(self, dummy_model):
+        """Multiple exchanges create complete conversation trace."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="I need a flight",
+            max_turns=3,
+        )
+        user.simulator.side_effect = ["Monday works", "Yes, book it"]
+
+        # Two agent-user exchanges
+        user.simulate_response("When do you want to travel?")
+        user.simulate_response("Shall I book it?")
+
+        messages = list(user.messages)
+        assert len(messages) == 5  # initial + 2*(assistant + user)
+
+        # Verify order
+        assert messages[0]["role"] == "user"  # initial
+        assert messages[1]["role"] == "assistant"
+        assert messages[2]["role"] == "user"
+        assert messages[3]["role"] == "assistant"
+        assert messages[4]["role"] == "user"
+
+    def test_gather_traces_includes_all_messages(self, dummy_model):
+        """gather_traces() includes complete conversation."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Hello",
+            max_turns=2,
+        )
+        user.simulator.return_value = "Got it"
+
+        user.simulate_response("Agent response")
+
+        traces = user.gather_traces()
+
+        assert traces["message_count"] == 3
+        assert len(traces["messages"]) == 3
+
+
+# =============================================================================
+# Config Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserConfig:
+    """Tests for gather_config updates."""
+
+    def test_config_includes_max_turns(self, dummy_model):
+        """gather_config() includes max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=7)
+
+        config = user.gather_config()
+
+        assert config["max_turns"] == 7
+
+    def test_config_includes_stop_token(self, dummy_model):
+        """gather_config() includes stop_token."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</end>")
+
+        config = user.gather_config()
+
+        assert config["stop_token"] == "</end>"
+
+    def test_config_includes_none_stop_token(self, dummy_model):
+        """gather_config() includes stop_token even when None."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+
+        config = user.gather_config()
+
+        assert "stop_token" in config
+        assert config["stop_token"] is None
diff --git a/tests/test_core/test_user_simulator.py b/tests/test_core/test_user_simulator.py
index 2b3a2473..11c8b1a6 100644
--- a/tests/test_core/test_user_simulator.py
+++ b/tests/test_core/test_user_simulator.py
@@ -1,33 +1,33 @@
 """Test User simulator functionality.
 
-These tests verify that User simulator correctly manages conversation history.
+These tests verify that User simulator correctly manages conversation history,
+multi-turn interaction, and early stopping via stop tokens.
 """
 
 import pytest
+from unittest.mock import MagicMock
 
 
 @pytest.mark.core
 class TestUserSimulator:
-    """Tests for User simulator."""
+    """Tests for User simulator basics."""
 
-    def test_user_simulate_response_updates_history(self, dummy_user):
-        """Test that simulate_response adds to history."""
-        initial_len = len(dummy_user.history)
+    def test_user_simulate_response_updates_messages(self, dummy_user):
+        """Test that simulate_response adds to message history."""
+        initial_len = len(dummy_user.messages)
 
-        # Note: We don't call simulate_response as it would require LLM call
-        # Instead, test manual history manipulation
-        dummy_user.history.add_message("assistant", "How can I help?")
-        dummy_user.history.add_message("user", "I need help")
+        # simulate_response adds assistant message, then user response
+        dummy_user.simulate_response("How can I help?")
 
-        assert len(dummy_user.history) == initial_len + 2
+        # Should have added 2 messages: assistant question + user response
+        assert len(dummy_user.messages) == initial_len + 2
 
-    def test_user_history_includes_both_sides(self, dummy_user):
-        """Test that history includes both user and assistant messages."""
-        # Add a conversation
-        dummy_user.history.add_message("assistant", "Question for user")
-        dummy_user.history.add_message("user", "User response")
+    def test_user_messages_includes_both_sides(self, dummy_user):
+        """Test that messages includes both user and assistant messages."""
+        # Simulate a response (adds assistant + user messages)
+        dummy_user.simulate_response("Question for user")
 
-        messages = list(dummy_user.history)
+        messages = list(dummy_user.messages)
         roles = [m["role"] for m in messages]
         assert "user" in roles
         assert "assistant" in roles
@@ -40,11 +40,11 @@ def test_user_gather_traces_includes_interactions(self, dummy_user):
         assert "gathered_at" in traces
         assert "name" in traces
         assert "message_count" in traces
-        assert "history" in traces
+        assert "messages" in traces
 
         assert traces["name"] == "test_user"
-        assert isinstance(traces["history"], list)
-        assert traces["message_count"] == len(traces["history"])
+        assert isinstance(traces["messages"], list)
+        assert traces["message_count"] == len(traces["messages"])
 
     def test_user_gather_config_includes_profile(self, dummy_user):
         """Test that gather_config() includes user profile."""
@@ -71,4 +71,374 @@ def test_user_initialization(self, dummy_model):
         assert user.name == "test_user"
         assert user.user_profile == {"role": "customer"}
         assert user.scenario == "test scenario"
-        assert len(user.history) == 1
+        assert len(user.messages) == 1
+
+
+# =============================================================================
+# Multi-Turn Configuration Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserMultiTurn:
+    """Tests for max_turns behavior."""
+
+    def test_default_max_turns_is_one(self, dummy_model):
+        """Default single-turn mode."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user.max_turns == 1
+
+    def test_custom_max_turns(self, dummy_model):
+        """Custom max_turns is stored correctly."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=5)
+        assert user.max_turns == 5
+
+    def test_is_done_after_max_turns(self, dummy_model):
+        """is_done() returns True when turn count >= max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=2)
+        user._turn_count = 2
+
+        assert user.is_done()
+
+    def test_is_done_before_max_turns(self, dummy_model):
+        """is_done() returns False when turn count < max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user._turn_count = 1
+
+        assert not user.is_done()
+
+    def test_simulate_response_increments_turn_count(self, dummy_model):
+        """Each simulate_response() call increments _turn_count."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=5)
+        initial_count = user._turn_count
+
+        user.simulate_response("Question 1")
+        assert user._turn_count == initial_count + 1
+
+        user.simulate_response("Question 2")
+        assert user._turn_count == initial_count + 2
+
+    def test_simulate_response_returns_empty_when_done(self, dummy_model):
+        """Returns empty string when is_done() is True."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=1)
+        user._turn_count = 1  # Already at max
+
+        response = user.simulate_response("More questions?")
+        assert response == ""
+
+    def test_turn_count_starts_at_zero(self, dummy_model):
+        """Turn count starts at 0."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user._turn_count == 0
+
+
+# =============================================================================
+# Stop Token Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserStopToken:
+    """Tests for stop_token early termination."""
+
+    def test_no_stop_token_by_default(self, dummy_model):
+        """stop_token is None by default."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        assert user.stop_token is None
+
+    def test_custom_stop_token(self, dummy_model):
+        """Custom stop_token is stored."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</done>")
+        assert user.stop_token == "</done>"
+
+    def test_stop_token_detection_sets_stopped(self, dummy_model):
+        """Detecting stop token sets _stopped = True."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Thanks! </stop>"
+
+        user.simulate_response("Here's your answer")
+
+        assert user._stopped
+
+    def test_stop_token_removed_from_response(self, dummy_model):
+        """Stop token is stripped from returned response."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Perfect, thanks! </stop>"
+
+        response = user.simulate_response("Booking confirmed!")
+
+        assert "</stop>" not in response
+        assert "Perfect, thanks!" in response
+
+    def test_is_done_true_after_stop_token(self, dummy_model):
+        """is_done() returns True after stop token detected."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "Done </stop>"
+
+        user.simulate_response("Result")
+
+        assert user.is_done()
+
+    def test_stop_token_case_insensitive(self, dummy_model):
+        """Stop token detection is case-insensitive."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</STOP>", max_turns=5)
+        user.simulator.return_value = "Thanks! </stop>"  # lowercase
+
+        user.simulate_response("Answer")
+
+        assert user._stopped
+
+    def test_fallback_message_when_only_stop_token(self, dummy_model):
+        """Provides fallback when response is only stop token."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user.simulator.return_value = "</stop>"
+
+        response = user.simulate_response("Done!")
+
+        assert response == "Thank you, that's all I needed!"
+        assert user._stopped
+
+
+# =============================================================================
+# Optional Initial Prompt Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserInitialPrompt:
+    """Tests for optional initial_prompt behavior."""
+
+    def test_with_initial_prompt_adds_message(self, dummy_model):
+        """Providing initial_prompt adds it to messages."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="I need help booking a flight",
+        )
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["role"] == "user"
+        assert user.messages[0]["content"] == "I need help booking a flight"
+
+    def test_without_initial_prompt_empty_messages(self, dummy_model):
+        """No initial_prompt means empty message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        # No initial_prompt provided
+
+        assert len(user.messages) == 0
+
+    def test_get_initial_query_generates_message(self, dummy_model):
+        """get_initial_query() uses LLM to generate first message."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        user.simulator.return_value = "I want to book a hotel"
+
+        query = user.get_initial_query()
+
+        assert query == "I want to book a hotel"
+        user.simulator.assert_called_once()
+
+    def test_get_initial_query_adds_to_messages(self, dummy_model):
+        """Generated query is added to message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+        user.simulator.return_value = "Help me please"
+
+        user.get_initial_query()
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["role"] == "user"
+        assert user.messages[0]["content"] == "Help me please"
+
+    def test_get_initial_query_raises_if_messages_exist(self, dummy_model):
+        """get_initial_query() raises if messages already exist."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Already have a message",
+        )
+
+        with pytest.raises(RuntimeError, match="already has messages"):
+            user.get_initial_query()
+
+    def test_get_initial_query_not_counted_as_turn(self, dummy_model):
+        """Initial query doesn't increment turn count."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "Initial query"
+
+        user.get_initial_query()
+
+        assert user._turn_count == 0  # Not incremented
+
+
+# =============================================================================
+# Message History Completeness Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserMessageHistory:
+    """Tests for complete message tracing."""
+
+    def test_initial_message_in_history(self, dummy_model):
+        """Initial prompt is in message history."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Hello agent",
+        )
+
+        assert len(user.messages) == 1
+        assert user.messages[0]["content"] == "Hello agent"
+
+    def test_assistant_message_recorded(self, dummy_model):
+        """simulate_response() records assistant message before responding."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "User reply"
+
+        user.simulate_response("Agent says hello")
+
+        # Should have: assistant message + user response
+        assert len(user.messages) == 2
+        assert user.messages[0]["role"] == "assistant"
+        assert user.messages[0]["content"] == "Agent says hello"
+
+    def test_user_response_recorded(self, dummy_model):
+        """simulate_response() records user response."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=3)
+        user.simulator.return_value = "Thanks for the help"
+
+        user.simulate_response("Here's your answer")
+
+        assert user.messages[-1]["role"] == "user"
+        assert user.messages[-1]["content"] == "Thanks for the help"
+
+    def test_full_conversation_tracked(self, dummy_model):
+        """Multiple exchanges create complete conversation trace."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="I need a flight",
+            max_turns=3,
+        )
+        user.simulator.side_effect = ["Monday works", "Yes, book it"]
+
+        # Two agent-user exchanges
+        user.simulate_response("When do you want to travel?")
+        user.simulate_response("Shall I book it?")
+
+        messages = list(user.messages)
+        assert len(messages) == 5  # initial + 2*(assistant + user)
+
+        # Verify order
+        assert messages[0]["role"] == "user"  # initial
+        assert messages[1]["role"] == "assistant"
+        assert messages[2]["role"] == "user"
+        assert messages[3]["role"] == "assistant"
+        assert messages[4]["role"] == "user"
+
+    def test_gather_traces_includes_all_messages(self, dummy_model):
+        """gather_traces() includes complete conversation."""
+        from conftest import DummyUser
+
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            initial_prompt="Hello",
+            max_turns=2,
+        )
+        user.simulator.return_value = "Got it"
+
+        user.simulate_response("Agent response")
+
+        traces = user.gather_traces()
+
+        assert traces["message_count"] == 3
+        assert len(traces["messages"]) == 3
+
+
+# =============================================================================
+# Config Tests
+# =============================================================================
+
+
+@pytest.mark.core
+class TestUserConfig:
+    """Tests for gather_config updates."""
+
+    def test_config_includes_max_turns(self, dummy_model):
+        """gather_config() includes max_turns."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, max_turns=7)
+
+        config = user.gather_config()
+
+        assert config["max_turns"] == 7
+
+    def test_config_includes_stop_token(self, dummy_model):
+        """gather_config() includes stop_token."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, stop_token="</end>")
+
+        config = user.gather_config()
+
+        assert config["stop_token"] == "</end>"
+
+    def test_config_includes_none_stop_token(self, dummy_model):
+        """gather_config() includes stop_token even when None."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model)
+
+        config = user.gather_config()
+
+        assert "stop_token" in config
+        assert config["stop_token"] is None

From cb918cf9d2b3340e4cd0c9bcef16e0b70c53c1fa Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 16:52:41 +0000
Subject: [PATCH 12/34] moved macs example

---
 examples/macs_benchmark/.gitignore              | 2 ++
 examples/{ => macs_benchmark}/macs_benchmark.py | 0
 2 files changed, 2 insertions(+)
 create mode 100644 examples/macs_benchmark/.gitignore
 rename examples/{ => macs_benchmark}/macs_benchmark.py (100%)

diff --git a/examples/macs_benchmark/.gitignore b/examples/macs_benchmark/.gitignore
new file mode 100644
index 00000000..7d3ba574
--- /dev/null
+++ b/examples/macs_benchmark/.gitignore
@@ -0,0 +1,2 @@
+results/
+data/
diff --git a/examples/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
similarity index 100%
rename from examples/macs_benchmark.py
rename to examples/macs_benchmark/macs_benchmark.py

From f6e2d1e42a98b5f64ee065175a1bac1c93164eba Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 20:20:37 +0000
Subject: [PATCH 13/34] added model factory abstract method to benchmark.

---
 maseval/core/benchmark.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 233d9f97..e4c54803 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -9,6 +9,7 @@
 from .task import Task, TaskCollection
 from .environment import Environment
 from .agent import AgentAdapter
+from .model import ModelAdapter
 from .callback_handler import CallbackHandler
 from .callback import BenchmarkCallback
 from .user import User
@@ -734,6 +735,44 @@ def setup_evaluators(self, environment, task, agents, user):
         """
         pass
 
+    @abstractmethod
+    def get_model_adapter(self, model_id: str) -> ModelAdapter:
+        """Provide a ModelAdapter for benchmark components that require LLM access.
+
+        Many benchmark components beyond the agents themselves require access to language
+        models. Common examples include:
+
+        - **Tool simulators**: Simulating tool responses when real APIs aren't available
+        - **User simulators**: Generating realistic user responses in multi-turn dialogues
+        - **Judges/Evaluators**: Using LLMs to assess agent performance against criteria
+        - **Reward models**: Computing scores for reinforcement learning
+
+        This method centralizes model provisioning, giving you control over which models
+        are used throughout the benchmark. Implement this to return a configured ModelAdapter
+        for the requested model.
+
+        Args:
+            model_id: The model identifier to use (e.g., "gemini-2.5-flash",
+                "openrouter/google/gemini-2.5-flash", "gpt-4o"). This is passed by the
+                benchmark when setting up components that need model access.
+
+        Returns:
+            A ModelAdapter instance configured for the specified model. For proper tracing,
+            return a fresh adapter for each call rather than reusing instances. You can
+            still share the underlying API client for efficiency.
+
+        How to use:
+            ```python
+            def get_model_adapter(self, model_id: str) -> ModelAdapter:
+                # Return an adapter for the requested model
+                return GoogleGenAIModelAdapter(self.client, model_id=model_id)
+            ```
+
+            The benchmark calls this method when setting up tools, user simulators,
+            and evaluators. Each call creates a fresh adapter with its own trace log.
+        """
+        pass
+
     @abstractmethod
     def evaluate(
         self,

From c1f8b2fc0a1623cbabd4134e91f2baec0d2d626a Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 20:20:42 +0000
Subject: [PATCH 14/34] fixed llama index documentation

---
 docs/interface/agents/llamaindex.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/interface/agents/llamaindex.md b/docs/interface/agents/llamaindex.md
index a95f7aaf..84315ac0 100644
--- a/docs/interface/agents/llamaindex.md
+++ b/docs/interface/agents/llamaindex.md
@@ -2,8 +2,8 @@
 
 Adapter implementing commonly used functions for `LlamaIndex`'s workflow-based agent system.
 
-- [Documentation](https://docs.llamaindex.ai/)
-- [Code Repository](https://developers.llamaindex.ai/python/framework/)
+- [Documentation](https://developers.llamaindex.ai/python/framework/)
+- [Code Repository](https://github.com/run-llama/llama_index/)
 
 ## Installation
 

From aceacdef29ad9d7edd41137130922f6a085e6d46 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 21:48:04 +0000
Subject: [PATCH 15/34] updated benchmark model adapter factory

---
 maseval/core/benchmark.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index e4c54803..1bf7953c 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -736,7 +736,7 @@ def setup_evaluators(self, environment, task, agents, user):
         pass
 
     @abstractmethod
-    def get_model_adapter(self, model_id: str) -> ModelAdapter:
+    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:
         """Provide a ModelAdapter for benchmark components that require LLM access.
 
         Many benchmark components beyond the agents themselves require access to language
@@ -755,6 +755,9 @@ def get_model_adapter(self, model_id: str) -> ModelAdapter:
             model_id: The model identifier to use (e.g., "gemini-2.5-flash",
                 "openrouter/google/gemini-2.5-flash", "gpt-4o"). This is passed by the
                 benchmark when setting up components that need model access.
+            **kwargs: Additional arguments for adapter creation or registration. Common kwargs:
+                - register_category: Category for trace registration (e.g., "models")
+                - register_name: Name for trace registration (e.g., "evaluator_user_gsr")
 
         Returns:
             A ModelAdapter instance configured for the specified model. For proper tracing,
@@ -762,10 +765,18 @@ def get_model_adapter(self, model_id: str) -> ModelAdapter:
             still share the underlying API client for efficiency.
 
         How to use:
+            For proper tracing, register the adapter after creation using the kwargs:
+
             ```python
-            def get_model_adapter(self, model_id: str) -> ModelAdapter:
-                # Return an adapter for the requested model
-                return GoogleGenAIModelAdapter(self.client, model_id=model_id)
+            def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:
+                adapter = GoogleGenAIModelAdapter(self.client, model_id=model_id)
+
+                # Register for tracing if registration info provided
+                category = kwargs.get("register_category", "models")
+                name = kwargs.get("register_name", model_id)
+                self.register(category, name, adapter)
+
+                return adapter
             ```
 
             The benchmark calls this method when setting up tools, user simulators,

From 055e9b8a6771e0a4431377b4005b9c16f2773446 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 22:22:02 +0000
Subject: [PATCH 16/34] added model factory pattern to macs benchmark

---
 CHANGELOG.md                                  |   1 +
 examples/macs_benchmark/macs_benchmark.py     | 218 +++++++++++++++---
 maseval/benchmark/macs/__init__.py            |   2 +
 maseval/benchmark/macs/data_loader.py         |  63 ++++-
 maseval/benchmark/macs/macs.py                | 177 ++++++++++++--
 tests/conftest.py                             |   4 +
 tests/test_benchmarks/test_macs/conftest.py   |  97 +++++++-
 .../test_macs/test_macs_benchmark.py          |  11 +-
 .../test_macs/test_macs_environment.py        |  85 +++----
 .../test_macs/test_macs_integration.py        |  16 +-
 .../test_macs/test_macs_user.py               |   2 +-
 .../test_benchmark/test_execution_loop.py     |   5 +
 tests/test_core/test_user_simulator.py        |   1 -
 13 files changed, 569 insertions(+), 113 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2330e80d..41de392d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class
 - Added `get_initial_query()` method to `User` base class for LLM-generated initial messages
 - Added `initial_prompt` parameter in `User` base class to trigger the agentic system.
+- Added abstract `get_model_adapter(model_id, **kwargs)` method to `Benchmark` base class as universal model factory to be used throughout the benchmarks.
 - Comprehensive testing for multi-turn behavior and `Benchmark.execution_loop()` (PR: #13)
 
 - [LlamaIndex](https://github.com/run-llama/llama_index) integration: `LlamaIndexAgentAdapter` and `LlamaIndexUser` for evaluating LlamaIndex workflow-based agents (PR: #7)
diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index a1ead4ce..6d152d74 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -62,6 +62,7 @@
     MACSGenericTool,
     MACSUser,
     compute_benchmark_metrics,
+    configure_model_ids,
     ensure_data_exists,
     load_agent_config,
     load_tasks,
@@ -72,22 +73,36 @@
 # Model Setup
 # =============================================================================
 
+# Shared client for all model adapters (reuses connection)
+_google_client: Optional[GoogleGenAIClient] = None
 
-def create_model(model_id: str = "gemini-2.5-flash") -> GoogleGenAIModelAdapter:
-    """Create a Google GenAI model adapter.
+
+def get_google_client() -> GoogleGenAIClient:
+    """Get or create the shared Google GenAI client."""
+    global _google_client
+    if _google_client is None:
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError("GOOGLE_API_KEY environment variable is required")
+        _google_client = GoogleGenAIClient(api_key=api_key)
+    return _google_client
+
+
+def create_model(
+    model_id: str = "gemini-2.5-flash",
+) -> GoogleGenAIModelAdapter:
+    """Create a Google GenAI model adapter with separate tracing.
+
+    All adapters share the same underlying client connection for efficiency,
+    but each has its own trace logs for debugging.
 
     Args:
         model_id: Model identifier (default: gemini-2.5-flash)
 
     Returns:
-        Configured GoogleGenAIModelAdapter
+        Configured GoogleGenAIModelAdapter with independent trace logs
     """
-    api_key = os.getenv("GOOGLE_API_KEY")
-    if not api_key:
-        raise ValueError("GOOGLE_API_KEY environment variable is required")
-
-    client = GoogleGenAIClient(api_key=api_key)
-    return GoogleGenAIModelAdapter(client, model_id=model_id)
+    return GoogleGenAIModelAdapter(get_google_client(), model_id=model_id)
 
 
 # =============================================================================
@@ -140,22 +155,53 @@ def forward(self, question: str) -> str:
 class SmolagentsMACSBenchmark(MACSBenchmark):
     """MACS Benchmark implementation for smolagents with multi-agent hierarchy."""
 
+    def get_model_adapter(self, model_id: str, **kwargs):
+        """Create a model adapter for the given model ID.
+
+        Each component (tool, user, evaluator) gets its own adapter for separate tracing.
+        Registration is handled via kwargs passed by the base class.
+
+        Args:
+            model_id: The model identifier (e.g., "gemini-2.5-flash")
+            **kwargs: Optional registration info (register_name)
+
+        Returns:
+            Configured GoogleGenAIModelAdapter
+        """
+        adapter = create_model(model_id=model_id)
+        if "register_name" in kwargs:
+            self.register("models", kwargs["register_name"], adapter)
+        return adapter
+
     def setup_user(
         self,
         agent_data: Dict[str, Any],
         environment: Environment,
         task: Task,
     ) -> SmolagentsMACSUser:
-        """Create smolagents-compatible user simulator."""
+        """Create smolagents-compatible user simulator.
+
+        Extends base MACSUser with smolagents-specific tool integration.
+        Model ID is read from task.user_data["model_id"].
+        """
         scenario = task.metadata.get("scenario", "")
+        user_model_id = self._get_user_model_id(task)
+
+        # Create dedicated model for user via get_model_adapter
+        user_model = self.get_model_adapter(user_model_id, register_name="user_simulator")
 
-        return SmolagentsMACSUser(
+        user = SmolagentsMACSUser(
             name="Simulated User",
-            model=self._model,
+            model=user_model,
             scenario=scenario,
             initial_prompt=task.query,
         )
 
+        # Register the user's simulator for tracing
+        self.register("simulators", "user_simulator", user.simulator)
+
+        return user
+
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
@@ -168,6 +214,8 @@ def setup_agents(
         Implements the exact agent topology from agents.json:
         - Travel/Mortgage: 2-level hierarchy (supervisor -> specialists)
         - Software: 3-level hierarchy (supervisor -> deploy_agent -> infra/app agents)
+
+        Each tool has its own ModelAdapter for separate tracing.
         """
         # Create smolagents model
         smol_model = OpenAIServerModel(
@@ -182,11 +230,15 @@ def setup_agents(
         primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
         # Wrap all generic tools for smolagents and register them for tracing
+        # Each tool has its own model from MACSEnvironment.create_tools()
         tool_wrappers: Dict[str, SmolagentsToolWrapper] = {}
         for name, tool in environment.tools.items():
             wrapper = SmolagentsToolWrapper(tool)
             tool_wrappers[name] = wrapper
             self.register("tools", name, wrapper)
+            # Register the tool's model and simulator for tracing
+            self.register("models", f"model_tool_{name}", tool.model)
+            self.register("simulators", f"simulator_tool_{name}", tool.simulator)
 
         # Helper to get tools for an agent
         def get_agent_tools(agent_spec: Dict[str, Any]) -> List[SmolagentsTool]:
@@ -246,19 +298,84 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
 # =============================================================================
 
 
+def _create_pydantic_model_from_inputs(name: str, inputs: Dict[str, Any]):
+    """Create a Pydantic model from MACS tool inputs specification.
+
+    Args:
+        name: Name for the generated model class
+        inputs: Dict mapping param names to {type, description}
+
+    Returns:
+        A Pydantic BaseModel class with the specified fields
+    """
+    from pydantic import Field, create_model
+
+    # Map MACS types to Python types
+    type_mapping = {
+        "string": str,
+        "number": float,
+        "integer": int,
+        "boolean": bool,
+        "array": list,
+        "object": dict,
+    }
+
+    fields = {}
+    for param_name, param_spec in inputs.items():
+        param_type = param_spec.get("type", "string")
+        python_type = type_mapping.get(param_type, str)
+        description = param_spec.get("description", "")
+
+        # All fields are optional with empty string default for flexibility
+        fields[param_name] = (python_type, Field(default="", description=description))
+
+    return create_model(f"{name}Input", **fields)
+
+
+def _create_langgraph_tool(generic_tool: MACSGenericTool) -> StructuredTool:
+    """Create a LangGraph StructuredTool from a MACSGenericTool.
+
+    This creates a proper Pydantic schema from the tool's inputs specification,
+    which LangGraph/LangChain requires for tool calling.
+
+    Args:
+        generic_tool: The MACS generic tool to wrap
+
+    Returns:
+        A LangGraph-compatible StructuredTool
+    """
+    # Create the args schema from tool inputs
+    args_schema = _create_pydantic_model_from_inputs(generic_tool.name, generic_tool.inputs)
+
+    # Create a wrapper function that calls the generic tool
+    def tool_func(**kwargs) -> str:
+        return generic_tool(**kwargs)
+
+    # Set function metadata for better tool descriptions
+    tool_func.__name__ = generic_tool.name
+    tool_func.__doc__ = generic_tool.description
+
+    return StructuredTool(
+        name=generic_tool.name,
+        description=generic_tool.description,
+        func=tool_func,
+        args_schema=args_schema,
+    )
+
+
 class LangGraphToolWrapper(ConfigurableMixin, TraceableMixin):
-    """LangGraph wrapper for MACSGenericTool."""
+    """LangGraph wrapper for MACSGenericTool.
+
+    This wrapper creates a LangGraph-compatible StructuredTool from a MACSGenericTool
+    by dynamically generating a Pydantic schema from the tool's input specification.
+    """
 
     def __init__(self, generic_tool: MACSGenericTool):
         self.generic_tool = generic_tool
-        self.tool = StructuredTool.from_function(
-            func=generic_tool,
-            name=generic_tool.name,
-            description=generic_tool.description,
-        )
+        self.tool = _create_langgraph_tool(generic_tool)
 
-    def __call__(self, *args, **kwargs):
-        return self.tool(*args, **kwargs)
+    def __call__(self, *args, **kwargs) -> str:
+        return self.generic_tool(**kwargs)
 
     def gather_traces(self) -> Dict[str, Any]:
         return self.generic_tool.gather_traces()
@@ -292,22 +409,53 @@ class AgentState(TypedDict):
 class LangGraphMACSBenchmark(MACSBenchmark):
     """MACS Benchmark implementation for langgraph with multi-agent hierarchy."""
 
+    def get_model_adapter(self, model_id: str, **kwargs):
+        """Create a model adapter for the given model ID.
+
+        Each component (tool, user, evaluator) gets its own adapter for separate tracing.
+        Registration is handled via kwargs passed by the base class.
+
+        Args:
+            model_id: The model identifier (e.g., "gemini-2.5-flash")
+            **kwargs: Optional registration info (register_name)
+
+        Returns:
+            Configured GoogleGenAIModelAdapter
+        """
+        adapter = create_model(model_id=model_id)
+        if "register_name" in kwargs:
+            self.register("models", kwargs["register_name"], adapter)
+        return adapter
+
     def setup_user(
         self,
         agent_data: Dict[str, Any],
         environment: Environment,
         task: Task,
     ) -> LangGraphMACSUser:
-        """Create langgraph-compatible user simulator."""
+        """Create langgraph-compatible user simulator.
+
+        Extends base MACSUser with LangGraph-specific tool integration.
+        Model ID is read from task.user_data["model_id"].
+        """
         scenario = task.metadata.get("scenario", "")
+        user_model_id = self._get_user_model_id(task)
 
-        return LangGraphMACSUser(
+        # Create dedicated model for user via get_model_adapter
+        user_model = self.get_model_adapter(user_model_id, register_name="user_simulator")
+
+        user = LangGraphMACSUser(
             name="Simulated User",
-            model=self._model,
+            model=user_model,
             scenario=scenario,
             initial_prompt=task.query,
         )
 
+        # Register the user's simulator for tracing
+        self.register("simulators", "user_simulator", user.simulator)
+
+        return user
+
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
@@ -318,6 +466,7 @@ def setup_agents(
         """Create langgraph multi-agent hierarchy.
 
         Uses subgraphs to implement the agent hierarchy from agents.json.
+        Each tool has its own ModelAdapter for separate tracing.
         """
         # Create LangChain model
         llm = ChatGoogleGenerativeAI(
@@ -331,11 +480,15 @@ def setup_agents(
         primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
         # Wrap all generic tools and register for tracing
+        # Each tool has its own model from MACSEnvironment.create_tools()
         tool_wrappers: Dict[str, LangGraphToolWrapper] = {}
         for name, tool in environment.tools.items():
             wrapper = LangGraphToolWrapper(tool)
             tool_wrappers[name] = wrapper
             self.register("tools", name, wrapper)
+            # Register the tool's model and simulator for tracing
+            self.register("models", f"model_tool_{name}", tool.model)
+            self.register("simulators", f"simulator_tool_{name}", tool.simulator)
 
         # Helper to get tools for an agent
         def get_agent_tools(agent_spec: Dict[str, Any]) -> List[StructuredTool]:
@@ -545,9 +698,6 @@ def run_benchmark(
     print("Ensuring MACS data is available...")
     ensure_data_exists(verbose=1)
 
-    # Create model for tool simulation and evaluation
-    model = create_model("gemini-2.5-flash")
-
     # Load data
     print(f"Loading {domain} domain tasks...")
     tasks = load_tasks(domain, limit=limit)
@@ -559,6 +709,16 @@ def run_benchmark(
             raise ValueError(f"Task with ID '{task_id}' not found in {domain} domain")
         print(f"Running single task: {task_id}")
 
+    # Configure model IDs for all benchmark components
+    # This sets model_id in environment_data, user_data, and evaluation_data
+    # for each task. The benchmark reads these when setting up components.
+    configure_model_ids(
+        tasks,
+        tool_model_id="gemini-2.5-flash",
+        user_model_id="gemini-2.5-flash",
+        evaluator_model_id="gemini-2.5-flash",
+    )
+
     agent_config = load_agent_config(domain)
 
     # Print agent hierarchy info
@@ -577,9 +737,11 @@ def run_benchmark(
     BenchmarkClass = get_benchmark_class(framework)
     benchmark = BenchmarkClass(
         agent_data=agent_config,
-        model=model,
         callbacks=[logger],
         n_task_repeats=n_task_repeats,
+        fail_on_setup_error=True,
+        fail_on_task_error=True,
+        fail_on_evaluation_error=True,
     )
 
     # Run benchmark
@@ -666,7 +828,7 @@ def main():
         "--output-dir",
         type=Path,
         default=None,
-        help="Output directory for results (default: examples/results/)",
+        help="Output directory for results (default: examples/macs_benchmark/results/)",
     )
 
     args = parser.parse_args()
diff --git a/maseval/benchmark/macs/__init__.py b/maseval/benchmark/macs/__init__.py
index f3dd47c8..abfa588d 100644
--- a/maseval/benchmark/macs/__init__.py
+++ b/maseval/benchmark/macs/__init__.py
@@ -19,6 +19,7 @@
 from .data_loader import (
     load_tasks,
     load_agent_config,
+    configure_model_ids,
     ensure_data_exists,
     process_data,
     download_original_data,
@@ -38,6 +39,7 @@
     # Data loading
     "load_tasks",
     "load_agent_config",
+    "configure_model_ids",
     "ensure_data_exists",
     "process_data",
     "download_original_data",
diff --git a/maseval/benchmark/macs/data_loader.py b/maseval/benchmark/macs/data_loader.py
index 0e3bdabe..df22f9f4 100644
--- a/maseval/benchmark/macs/data_loader.py
+++ b/maseval/benchmark/macs/data_loader.py
@@ -11,7 +11,7 @@
 import json
 from collections import defaultdict
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 from urllib.error import HTTPError, URLError
 from urllib.request import urlopen
 
@@ -500,6 +500,67 @@ def load_agent_config(
         return json.load(f)
 
 
+def configure_model_ids(
+    tasks: Union[TaskCollection, List[Task]],
+    *,
+    tool_model_id: Optional[str] = None,
+    user_model_id: Optional[str] = None,
+    evaluator_model_id: Optional[str] = None,
+) -> Union[TaskCollection, List[Task]]:
+    """Configure model IDs for benchmark components in task data.
+
+    This helper merges runtime model configuration into task data structures,
+    enabling benchmark components (tool simulators, user simulator, evaluators)
+    to access their model IDs through the standard task data flow.
+
+    Only sets values if not already present in the task data, allowing
+    task-specific overrides in the original data to take precedence.
+
+    Args:
+        tasks: TaskCollection or list of Tasks to configure
+        tool_model_id: Model ID for tool simulators (stored in environment_data)
+        user_model_id: Model ID for user simulator (stored in user_data)
+        evaluator_model_id: Model ID for evaluators (stored in evaluation_data)
+
+    Returns:
+        The same collection (mutated in place for convenience)
+
+    Example:
+        ```python
+        tasks = load_tasks("travel", limit=5)
+        configure_model_ids(
+            tasks,
+            tool_model_id="gemini-2.5-flash",
+            user_model_id="gemini-2.5-flash",
+            evaluator_model_id="gpt-4o",  # Use stronger model for evaluation
+        )
+        benchmark = MyMACSBenchmark(agent_data=agent_config)
+        results = benchmark.run(tasks)
+        ```
+
+    Note:
+        This demonstrates a general pattern for enriching tasks with runtime
+        configuration that isn't part of the static task definitions in tasks.json.
+    """
+    for task in tasks:
+        # Environment data: tool model ID
+        if "model_id" in task.environment_data and (not task.environment_data["model_id"] == tool_model_id):
+            raise ValueError(
+                f"Task {task.metadata.get('task_id', '')} already has tool `model_id` set to '{task.environment_data['model_id']}', cannot override with '{tool_model_id}'"
+            )
+        # User data: user model ID
+        if "model_id" in task.user_data and (not task.user_data["model_id"] == user_model_id):
+            raise ValueError(
+                f"Task {task.metadata.get('task_id', '')} already has user `model_id` set to '{task.user_data['model_id']}', cannot override with '{user_model_id}'"
+            )
+        # Evaluation data: evaluator model ID
+        if "model_id" in task.evaluation_data and (not task.evaluation_data["model_id"] == evaluator_model_id):
+            raise ValueError(
+                f"Task {task.metadata.get('task_id', '')} already has evaluator `model_id` set to '{task.evaluation_data['model_id']}', cannot override with '{evaluator_model_id}'"
+            )
+    return tasks
+
+
 # =============================================================================
 # CLI Entry Point
 # =============================================================================
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index 047a9920..b43e5a6a 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -9,11 +9,17 @@
 Usage:
     from maseval.benchmark.macs import (
         MACSBenchmark, MACSEnvironment, MACSEvaluator, MACSGenericTool,
-        load_tasks, load_agent_config,
+        load_tasks, load_agent_config, configure_model_ids,
     )
 
-    # Load data
+    # Load data and configure model IDs for components
     tasks = load_tasks("travel", limit=5)
+    configure_model_ids(
+        tasks,
+        tool_model_id="gemini-2.5-flash",
+        user_model_id="gemini-2.5-flash",
+        evaluator_model_id="gemini-2.5-flash",
+    )
     agent_config = load_agent_config("travel")
 
     # Create your framework-specific benchmark subclass
@@ -22,15 +28,22 @@ def setup_agents(self, agent_data, environment, task, user):
             # Your framework-specific agent creation
             ...
 
+        def get_model_adapter(self, model_id, **kwargs):
+            # Create and optionally register model adapters
+            adapter = MyModelAdapter(model_id)
+            if "register_name" in kwargs:
+                self.register("models", kwargs["register_name"], adapter)
+            return adapter
+
     # Run
-    benchmark = MyMACSBenchmark(agent_data=agent_config, model=my_model)
+    benchmark = MyMACSBenchmark(agent_data=agent_config)
     results = benchmark.run(tasks)
 """
 
 import json
 from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple
 
 from maseval import (
     AgentAdapter,
@@ -526,17 +539,17 @@ class MACSEnvironment(Environment):
     def __init__(
         self,
         task_data: Dict[str, Any],
-        model: ModelAdapter,
+        model_factory: Callable[[str], ModelAdapter],
         callbacks: Optional[List[Any]] = None,
     ):
         """Initialize environment.
 
         Args:
             task_data: Task data containing environment_data with tool specs
-            model: ModelAdapter for tool simulation
+            model_factory: Factory function that creates a ModelAdapter for a given model_name
             callbacks: Optional callbacks
         """
-        self._model = model
+        self._model_factory = model_factory
         super().__init__(task_data, callbacks)
 
     def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
@@ -548,6 +561,8 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
     def create_tools(self) -> Dict[str, MACSGenericTool]:  # type: ignore[override]
         """Create tools from task specifications.
 
+        Each tool gets its own ModelAdapter instance for separate tracing.
+
         Returns:
             Dict mapping tool names to MACSGenericTool instances
         """
@@ -556,7 +571,9 @@ def create_tools(self) -> Dict[str, MACSGenericTool]:  # type: ignore[override]
             for action in tool_group.get("actions", []):
                 name = action.get("name")
                 if name and name not in tools:
-                    tools[name] = MACSGenericTool(action, self._model)
+                    # Each tool gets its own model adapter for separate traces
+                    model = self._model_factory(f"tool_{name}")
+                    tools[name] = MACSGenericTool(action, model)
         return tools
 
     def get_tools_for_agent(self, agent_spec: Dict[str, Any]) -> Dict[str, MACSGenericTool]:
@@ -592,13 +609,31 @@ class MACSBenchmark(Benchmark):
     - Dual evaluator setup (user-side + system-side)
     - GSR metric aggregation
 
-    Users must subclass and implement setup_agents() for their framework.
+    Users must subclass and implement:
+    - setup_agents() for their agent framework
+    - get_model_adapter() to provide model adapters
+
+    Model IDs for components (tools, user, evaluators) are read from task data:
+    - task.environment_data["model_id"] for tool simulators
+    - task.user_data["model_id"] for user simulator
+    - task.evaluation_data["model_id"] for evaluators
+
+    Use configure_model_ids() to set these values after loading tasks:
+
+        from maseval.benchmark.macs import load_tasks, configure_model_ids
+
+        tasks = load_tasks("travel")
+        configure_model_ids(
+            tasks,
+            tool_model_id="gemini-2.5-flash",
+            user_model_id="gemini-2.5-flash",
+            evaluator_model_id="gemini-2.5-flash",
+        )
     """
 
     def __init__(
         self,
         agent_data: Dict[str, Any],
-        model: ModelAdapter,
         callbacks: Optional[List[Any]] = None,
         n_task_repeats: int = 1,
         max_invocations: int = 5,
@@ -607,24 +642,104 @@ def __init__(
         """Initialize benchmark.
 
         Args:
-            agent_data: Agent configuration from load_agent_config()
-            model: ModelAdapter for tool simulation and evaluation
+            agent_data: Agent configuration from load_agent_config().
             callbacks: Benchmark callbacks
             n_task_repeats: Repetitions per task
             max_invocations: Maximum agent-user interaction rounds (default: 5 per MACS paper)
         """
-        self._model = model
         super().__init__(agent_data, callbacks, n_task_repeats, max_invocations, **kwargs)
 
+    def _get_tool_model_id(self, task: Task) -> str:
+        """Get tool simulator model ID from task.environment_data.
+
+        Raises:
+            ValueError: If model_id not configured in task.environment_data
+        """
+        model_id = task.environment_data.get("model_id")
+        if model_id is None:
+            raise ValueError(
+                "Tool simulator model_id not configured in task.environment_data.\n"
+                "Use configure_model_ids() after loading tasks:\n\n"
+                "    from maseval.benchmark.macs import load_tasks, configure_model_ids\n\n"
+                "    tasks = load_tasks('travel')\n"
+                "    configure_model_ids(\n"
+                "        tasks,\n"
+                "        tool_model_id='gemini-2.5-flash',\n"
+                "        user_model_id='gemini-2.5-flash',\n"
+                "        evaluator_model_id='gemini-2.5-flash',\n"
+                "    )"
+            )
+        return model_id
+
+    def _get_user_model_id(self, task: Task) -> str:
+        """Get user simulator model ID from task.user_data.
+
+        Raises:
+            ValueError: If model_id not configured in task.user_data
+        """
+        model_id = task.user_data.get("model_id")
+        if model_id is None:
+            raise ValueError(
+                "User simulator model_id not configured in task.user_data.\n"
+                "Use configure_model_ids() after loading tasks:\n\n"
+                "    from maseval.benchmark.macs import load_tasks, configure_model_ids\n\n"
+                "    tasks = load_tasks('travel')\n"
+                "    configure_model_ids(\n"
+                "        tasks,\n"
+                "        tool_model_id='gemini-2.5-flash',\n"
+                "        user_model_id='gemini-2.5-flash',\n"
+                "        evaluator_model_id='gemini-2.5-flash',\n"
+                "    )"
+            )
+        return model_id
+
+    def _get_evaluator_model_id(self, task: Task) -> str:
+        """Get evaluator model ID from task.evaluation_data.
+
+        Raises:
+            ValueError: If model_id not configured in task.evaluation_data
+        """
+        model_id = task.evaluation_data.get("model_id")
+        if model_id is None:
+            raise ValueError(
+                "Evaluator model_id not configured in task.evaluation_data.\n"
+                "Use configure_model_ids() after loading tasks:\n\n"
+                "    from maseval.benchmark.macs import load_tasks, configure_model_ids\n\n"
+                "    tasks = load_tasks('travel')\n"
+                "    configure_model_ids(\n"
+                "        tasks,\n"
+                "        tool_model_id='gemini-2.5-flash',\n"
+                "        user_model_id='gemini-2.5-flash',\n"
+                "        evaluator_model_id='gemini-2.5-flash',\n"
+                "    )"
+            )
+        return model_id
+
     def setup_environment(
         self,
         agent_data: Dict[str, Any],
         task: Task,
     ) -> MACSEnvironment:
-        """Create environment for a task."""
+        """Create environment for a task.
+
+        Uses get_model_adapter() to create separate model adapters for each tool,
+        enabling independent tracing per tool.
+
+        Model ID is read from task.environment_data["model_id"].
+        """
+        tool_model_id = self._get_tool_model_id(task)
+
+        # Create a factory that captures the model_id from task data
+        # tool_name is passed by create_tools() with "tool_" prefix
+        def tool_model_factory(tool_name: str) -> ModelAdapter:
+            return self.get_model_adapter(
+                tool_model_id,
+                register_name=tool_name,
+            )
+
         return MACSEnvironment(
             task_data={"environment_data": task.environment_data},
-            model=self._model,
+            model_factory=tool_model_factory,
         )
 
     def setup_user(
@@ -637,6 +752,7 @@ def setup_user(
 
         Creates a MACSUser with scenario and query from the task.
         The user profile is automatically extracted from the scenario text.
+        Model ID is read from task.user_data["model_id"].
 
         Note: MACSUser.get_tool() raises NotImplementedError.
         Framework-specific subclasses in examples should wrap this user
@@ -651,8 +767,12 @@ def setup_user(
             MACSUser instance
         """
         scenario = task.metadata.get("scenario", "")
+        user_model_id = self._get_user_model_id(task)
         return MACSUser(
-            model=self._model,
+            model=self.get_model_adapter(
+                user_model_id,
+                register_name="user_simulator",
+            ),
             scenario=scenario,
             initial_prompt=task.query,
         )
@@ -685,10 +805,29 @@ def setup_evaluators(
         agents: Sequence[AgentAdapter],
         user: Optional[User],
     ) -> Sequence[Evaluator]:
-        """Create user-side and system-side evaluators."""
+        """Create user-side and system-side evaluators.
+
+        Each evaluator gets its own model adapter for separate tracing.
+        Model ID is read from task.evaluation_data["model_id"].
+        """
+        evaluator_model_id = self._get_evaluator_model_id(task)
         return [
-            MACSEvaluator(self._model, task, gsr_type="user"),
-            MACSEvaluator(self._model, task, gsr_type="system"),
+            MACSEvaluator(
+                self.get_model_adapter(
+                    evaluator_model_id,
+                    register_name="evaluator_user_gsr",
+                ),
+                task,
+                gsr_type="user",
+            ),
+            MACSEvaluator(
+                self.get_model_adapter(
+                    evaluator_model_id,
+                    register_name="evaluator_system_gsr",
+                ),
+                task,
+                gsr_type="system",
+            ),
         ]
 
     def run_agents(
diff --git a/tests/conftest.py b/tests/conftest.py
index a89bfd96..3ffa164b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -208,6 +208,10 @@ def __init__(self, *args, **kwargs):
         self.run_agents_calls = []
         self.evaluate_calls = []
 
+    def get_model_adapter(self, model_id: str, **kwargs):
+        """Create a dummy model adapter for testing."""
+        return DummyModelAdapter(model_id=model_id)
+
     def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environment:
         self.setup_environment_calls.append((agent_data, task))
         return DummyEnvironment(task.environment_data)
diff --git a/tests/test_benchmarks/test_macs/conftest.py b/tests/test_benchmarks/test_macs/conftest.py
index 6ce27d3b..7f5f3805 100644
--- a/tests/test_benchmarks/test_macs/conftest.py
+++ b/tests/test_benchmarks/test_macs/conftest.py
@@ -72,10 +72,62 @@ def _run_agent(self, query: str) -> MessageHistory:
 class ConcreteMACSBenchmark(MACSBenchmark):
     """Concrete MACSBenchmark implementation for testing.
 
-    MACSBenchmark is abstract (setup_agents must be implemented by users).
-    This provides a minimal implementation using MACSAgentAdapter.
+    MACSBenchmark is abstract (setup_agents and get_model_adapter must be implemented).
+    This provides a minimal implementation using MACSAgentAdapter and a configurable
+    model factory.
     """
 
+    def __init__(
+        self,
+        agent_data: Dict[str, Any],
+        model_factory: Optional[Any] = None,
+        **kwargs: Any,
+    ):
+        """Initialize with optional model factory.
+
+        Args:
+            agent_data: Agent configuration
+            model_factory: Either a callable that takes a model name and returns a ModelAdapter,
+                          or a single ModelAdapter instance (for convenience in simple tests).
+                          If not provided, creates DummyModelAdapter instances.
+            **kwargs: Additional arguments passed to MACSBenchmark
+        """
+        # Handle both callable factory and single model instance
+        if model_factory is None:
+            self._model_factory = lambda model_name: DummyModelAdapter(
+                model_id=f"test-model-{model_name}",
+                responses=['{"text": "Default response", "details": {}}'],
+            )
+        elif callable(model_factory):
+            self._model_factory = model_factory
+        else:
+            # Single model instance - create a factory that always returns it
+            self._model_factory = lambda model_name: model_factory
+        super().__init__(agent_data, **kwargs)
+
+    def get_model_adapter(self, model_id: str, **kwargs):
+        """Create a model adapter for the given component.
+
+        For testing, the factory is called with register_name (if provided) instead
+        of model_id, allowing test factories to return different responses based on
+        the component type (e.g., "user_simulator", "evaluator_user_gsr").
+        """
+        # Use register_name for factory lookup if provided, else fall back to model_id
+        factory_key = kwargs.get("register_name", model_id)
+        adapter = self._model_factory(factory_key)
+
+        # Register for tracing if registration info provided
+        # Skip registration if no explicit register_name - tests often share model instances
+        register_name = kwargs.get("register_name")
+        if register_name:
+            # Check if this adapter is already registered (same factory can return same instance)
+            try:
+                self.register("models", register_name, adapter)
+            except ValueError:
+                pass  # Already registered - that's fine for tests
+
+        return adapter
+
     def setup_agents(
         self,
         agent_data: Dict[str, Any],
@@ -104,6 +156,15 @@ def macs_model():
     return DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
 
 
+@pytest.fixture
+def macs_model_factory(macs_model):
+    """Model factory that always returns the same macs_model.
+
+    Used for MACSEnvironment which requires a callable factory.
+    """
+    return lambda model_name: macs_model
+
+
 @pytest.fixture
 def macs_model_evaluator():
     """Model configured for MACSEvaluator tests.
@@ -220,10 +281,15 @@ def sample_tool_specs():
 
 @pytest.fixture
 def sample_task():
-    """Sample MACS task with typical structure."""
+    """Sample MACS task with typical structure.
+
+    Includes model_id in environment_data, user_data, and evaluation_data
+    as required by MACSBenchmark.
+    """
     return Task(
         query="Book a flight to NYC",
         environment_data={
+            "model_id": "test-model",
             "tools": [
                 {
                     "tool_name": "flight_tools",
@@ -231,13 +297,17 @@ def sample_task():
                         {"name": "search_flights", "description": "Search flights"},
                     ],
                 }
-            ]
+            ],
+        },
+        user_data={
+            "model_id": "test-model",
         },
         evaluation_data={
+            "model_id": "test-model",
             "assertions": [
                 "user: Booking confirmed",
                 "agent: Database updated",
-            ]
+            ],
         },
         metadata={"scenario": "Business trip to NYC"},
     )
@@ -248,8 +318,9 @@ def sample_task_no_scenario():
     """Task without scenario in metadata."""
     return Task(
         query="Test query",
-        environment_data={"tools": []},
-        evaluation_data={"assertions": []},
+        environment_data={"model_id": "test-model", "tools": []},
+        user_data={"model_id": "test-model"},
+        evaluation_data={"model_id": "test-model", "assertions": []},
         metadata={},
     )
 
@@ -259,8 +330,9 @@ def sample_task_no_assertions():
     """Task with no assertions."""
     return Task(
         query="Simple query",
-        environment_data={},
-        evaluation_data={"assertions": []},
+        environment_data={"model_id": "test-model"},
+        user_data={"model_id": "test-model"},
+        evaluation_data={"model_id": "test-model", "assertions": []},
         metadata={"scenario": "Simple scenario"},
     )
 
@@ -271,6 +343,7 @@ def travel_task():
     return Task(
         query="I need to book a flight from San Francisco to New York for next Monday.",
         environment_data={
+            "model_id": "test-model",
             "tools": [
                 {
                     "tool_name": "travel_tools",
@@ -298,14 +371,16 @@ def travel_task():
                         },
                     ],
                 }
-            ]
+            ],
         },
+        user_data={"model_id": "test-model"},
         evaluation_data={
+            "model_id": "test-model",
             "assertions": [
                 "user: The user's flight booking request was acknowledged",
                 "user: The user received flight options or a confirmation",
                 "agent: The search_flights tool was called with correct parameters",
-            ]
+            ],
         },
         metadata={
             "scenario": """Goal: The user wants to book a flight from San Francisco to New York.
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
index 7c18bfcb..91800729 100644
--- a/tests/test_benchmarks/test_macs/test_macs_benchmark.py
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -8,7 +8,6 @@
 from maseval.benchmark.macs import (
     MACSBenchmark,
     MACSEnvironment,
-    MACSEvaluator,
     MACSUser,
     compute_benchmark_metrics,
 )
@@ -27,11 +26,10 @@ class TestMACSBenchmarkSetup:
     """Tests for MACSBenchmark initialization and setup methods."""
 
     def test_init_configures_benchmark(self, macs_model, sample_agent_data):
-        """Benchmark initializes with model, agent_data, and optional params."""
+        """Benchmark initializes with agent_data and optional params."""
         callbacks = [MagicMock()]
         benchmark = ConcreteMACSBenchmark(sample_agent_data, macs_model, callbacks=callbacks, n_task_repeats=3)
 
-        assert benchmark._model == macs_model
         assert benchmark.agent_data == sample_agent_data
         assert benchmark.callbacks == callbacks
         assert benchmark.n_task_repeats == 3
@@ -175,6 +173,13 @@ def test_run_agents_multiple_agents(self, macs_model, sample_agent_data, sample_
         """Multiple agents return list of answers."""
 
         class MultiAgentBenchmark(MACSBenchmark):
+            def __init__(self, agent_data, model_factory, **kwargs):
+                self._model_factory = model_factory if callable(model_factory) else lambda _: model_factory
+                super().__init__(agent_data, **kwargs)
+
+            def get_model_adapter(self, model_id: str, **kwargs):
+                return self._model_factory(model_id)
+
             def setup_agents(
                 self,
                 agent_data: Dict[str, Any],
diff --git a/tests/test_benchmarks/test_macs/test_macs_environment.py b/tests/test_benchmarks/test_macs/test_macs_environment.py
index 19ad2632..9cfe4fcb 100644
--- a/tests/test_benchmarks/test_macs/test_macs_environment.py
+++ b/tests/test_benchmarks/test_macs/test_macs_environment.py
@@ -17,24 +17,23 @@
 class TestMACSEnvironmentSetup:
     """Tests for MACSEnvironment initialization and state setup."""
 
-    def test_init_extracts_tool_specs(self, macs_model, sample_task_data):
+    def test_init_extracts_tool_specs(self, macs_model_factory, sample_task_data):
         """Initializes from task data and extracts tool_specs."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
-        assert env._model == macs_model
         assert hasattr(env, "state")
         assert hasattr(env, "tools")
         assert "tool_specs" in env.state
         assert len(env.state["tool_specs"]) == 2
 
-    def test_handles_empty_or_missing_tools(self, macs_model):
+    def test_handles_empty_or_missing_tools(self, macs_model_factory):
         """Handles missing environment_data or empty tools gracefully."""
         # Missing environment_data
-        env1 = MACSEnvironment({}, macs_model)
+        env1 = MACSEnvironment({}, macs_model_factory)
         assert env1.state["tool_specs"] == []
 
         # Empty tools
-        env2 = MACSEnvironment({"environment_data": {}}, macs_model)
+        env2 = MACSEnvironment({"environment_data": {}}, macs_model_factory)
         assert env2.state["tool_specs"] == []
 
 
@@ -47,24 +46,24 @@ def test_handles_empty_or_missing_tools(self, macs_model):
 class TestCreateTools:
     """Tests for create_tools method."""
 
-    def test_create_tools_from_specs(self, macs_model, sample_task_data):
+    def test_create_tools_from_specs(self, macs_model_factory, sample_task_data):
         """Creates MACSGenericTool instances from specs."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         assert len(env.tools) == 3  # search_flights, book_flight, search_hotels
         assert all(isinstance(tool, MACSGenericTool) for tool in env.tools.values())
 
-    def test_create_tools_keyed_by_name(self, macs_model, sample_task_data):
+    def test_create_tools_keyed_by_name(self, macs_model_factory, sample_task_data):
         """Tools dict is keyed by tool name."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         assert "search_flights" in env.tools
         assert "book_flight" in env.tools
         assert "search_hotels" in env.tools
 
-    def test_create_tools_correct_properties(self, macs_model, sample_task_data):
+    def test_create_tools_correct_properties(self, macs_model_factory, sample_task_data):
         """Created tools have correct properties."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         search_flights = env.tools["search_flights"]
         assert search_flights.name == "search_flights"
@@ -72,7 +71,7 @@ def test_create_tools_correct_properties(self, macs_model, sample_task_data):
         assert "origin" in search_flights.inputs
         assert "destination" in search_flights.inputs
 
-    def test_create_tools_deduplicates(self, macs_model):
+    def test_create_tools_deduplicates(self, macs_model_factory):
         """Duplicate tool names are deduplicated."""
         task_data = {
             "environment_data": {
@@ -88,20 +87,20 @@ def test_create_tools_deduplicates(self, macs_model):
                 ]
             }
         }
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         # Should only have one instance
         assert len(env.tools) == 1
         assert "duplicate_tool" in env.tools
 
-    def test_create_tools_empty_specs(self, macs_model):
+    def test_create_tools_empty_specs(self, macs_model_factory):
         """Empty specs returns empty dict."""
         task_data = {"environment_data": {"tools": []}}
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         assert env.tools == {}
 
-    def test_create_tools_empty_actions(self, macs_model):
+    def test_create_tools_empty_actions(self, macs_model_factory):
         """Handles tool groups with no actions."""
         task_data = {
             "environment_data": {
@@ -110,7 +109,7 @@ def test_create_tools_empty_actions(self, macs_model):
                 ]
             }
         }
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         assert env.tools == {}
 
@@ -124,9 +123,9 @@ def test_create_tools_empty_actions(self, macs_model):
 class TestGetToolsForAgent:
     """Tests for get_tools_for_agent method."""
 
-    def test_get_tools_for_agent(self, macs_model, sample_task_data, sample_agent_spec_flight):
+    def test_get_tools_for_agent(self, macs_model_factory, sample_task_data, sample_agent_spec_flight):
         """Returns tools matching agent spec."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_tools = env.get_tools_for_agent(sample_agent_spec_flight)
 
@@ -135,9 +134,9 @@ def test_get_tools_for_agent(self, macs_model, sample_task_data, sample_agent_sp
         assert "book_flight" in agent_tools
         assert "search_hotels" not in agent_tools
 
-    def test_get_tools_for_agent_all(self, macs_model, sample_task_data, sample_agent_spec_all):
+    def test_get_tools_for_agent_all(self, macs_model_factory, sample_task_data, sample_agent_spec_all):
         """Returns all tools when agent has access to all groups."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_tools = env.get_tools_for_agent(sample_agent_spec_all)
 
@@ -146,17 +145,17 @@ def test_get_tools_for_agent_all(self, macs_model, sample_task_data, sample_agen
         assert "book_flight" in agent_tools
         assert "search_hotels" in agent_tools
 
-    def test_get_tools_for_agent_no_match(self, macs_model, sample_task_data, sample_agent_spec_none):
+    def test_get_tools_for_agent_no_match(self, macs_model_factory, sample_task_data, sample_agent_spec_none):
         """Returns empty dict if no matching tool groups."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_tools = env.get_tools_for_agent(sample_agent_spec_none)
 
         assert agent_tools == {}
 
-    def test_get_tools_for_agent_partial(self, macs_model, sample_task_data):
+    def test_get_tools_for_agent_partial(self, macs_model_factory, sample_task_data):
         """Returns subset matching agent's tool groups."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_spec = {
             "agent_id": "hotel_agent",
@@ -167,18 +166,18 @@ def test_get_tools_for_agent_partial(self, macs_model, sample_task_data):
         assert len(agent_tools) == 1
         assert "search_hotels" in agent_tools
 
-    def test_get_tools_for_agent_returns_same_instances(self, macs_model, sample_task_data, sample_agent_spec_flight):
+    def test_get_tools_for_agent_returns_same_instances(self, macs_model_factory, sample_task_data, sample_agent_spec_flight):
         """Returns same tool instances as in env.tools."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_tools = env.get_tools_for_agent(sample_agent_spec_flight)
 
         # Same instance, not copies
         assert agent_tools["search_flights"] is env.tools["search_flights"]
 
-    def test_get_tools_for_agent_empty_tools_list(self, macs_model, sample_task_data):
+    def test_get_tools_for_agent_empty_tools_list(self, macs_model_factory, sample_task_data):
         """Handles agent with empty tools list."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent_spec = {"agent_id": "no_tools", "tools": []}
         agent_tools = env.get_tools_for_agent(agent_spec)
@@ -195,10 +194,10 @@ def test_get_tools_for_agent_empty_tools_list(self, macs_model, sample_task_data
 class TestMACSEnvironmentIntegration:
     """Integration tests for MACSEnvironment."""
 
-    def test_full_workflow(self, macs_model, sample_task_data):
+    def test_full_workflow(self, macs_model_factory, sample_task_data):
         """Test complete environment workflow."""
         # Create environment
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         # Verify tools created
         assert len(env.tools) == 3
@@ -220,7 +219,11 @@ def test_tools_are_callable(self, sample_task_data):
         """Created tools can be called."""
         # Use a model that returns valid JSON responses (ToolLLMSimulator expects {"text": ..., "details": ...})
         model = DummyModelAdapter(responses=['{"text": "Found flights: AA123, UA456", "details": {}}'])
-        env = MACSEnvironment(sample_task_data, model)
+
+        def model_factory(model_name):
+            return model
+
+        env = MACSEnvironment(sample_task_data, model_factory)
 
         search_flights = env.tools["search_flights"]
         result = search_flights(origin="LAX", destination="JFK")
@@ -228,9 +231,9 @@ def test_tools_are_callable(self, sample_task_data):
         # Should return the text from the response
         assert "Found flights" in result
 
-    def test_multiple_agents_share_tools(self, macs_model, sample_task_data):
+    def test_multiple_agents_share_tools(self, macs_model_factory, sample_task_data):
         """Multiple agents can share the same tool instances."""
-        env = MACSEnvironment(sample_task_data, macs_model)
+        env = MACSEnvironment(sample_task_data, macs_model_factory)
 
         agent1_spec = {"agent_id": "agent1", "tools": ["flight_tools"]}
         agent2_spec = {"agent_id": "agent2", "tools": ["flight_tools"]}
@@ -258,7 +261,7 @@ def test_multiple_agents_share_tools(self, macs_model, sample_task_data):
 class TestEdgeCases:
     """Edge case tests for MACSEnvironment."""
 
-    def test_tool_with_no_name(self, macs_model):
+    def test_tool_with_no_name(self, macs_model_factory):
         """Handles actions without name field."""
         task_data = {
             "environment_data": {
@@ -273,13 +276,13 @@ def test_tool_with_no_name(self, macs_model):
                 ]
             }
         }
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         # Should only create the valid tool
         assert len(env.tools) == 1
         assert "valid_tool" in env.tools
 
-    def test_callbacks_passed_to_parent(self, macs_model, sample_task_data):
+    def test_callbacks_passed_to_parent(self, macs_model_factory, sample_task_data):
         """Callbacks are passed to parent Environment."""
         from maseval.core.callback import EnvironmentCallback
 
@@ -288,12 +291,12 @@ class MockCallback(EnvironmentCallback):
             pass
 
         callbacks = [MockCallback(), MockCallback()]
-        env = MACSEnvironment(sample_task_data, macs_model, callbacks=callbacks)
+        env = MACSEnvironment(sample_task_data, macs_model_factory, callbacks=callbacks)
 
         assert len(env.callbacks) == 2
         assert all(isinstance(cb, EnvironmentCallback) for cb in env.callbacks)
 
-    def test_nested_tool_groups(self, macs_model):
+    def test_nested_tool_groups(self, macs_model_factory):
         """Handles deeply nested tool structures."""
         task_data = {
             "environment_data": {
@@ -318,7 +321,7 @@ def test_nested_tool_groups(self, macs_model):
                 ]
             }
         }
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         assert "tool1" in env.tools
         assert "nested" in env.tools["tool1"].inputs
diff --git a/tests/test_benchmarks/test_macs/test_macs_integration.py b/tests/test_benchmarks/test_macs/test_macs_integration.py
index 9c8b16c4..961b3761 100644
--- a/tests/test_benchmarks/test_macs/test_macs_integration.py
+++ b/tests/test_benchmarks/test_macs/test_macs_integration.py
@@ -6,14 +6,12 @@
 
 import json
 import pytest
-from unittest.mock import patch
 
 from maseval import Task
 from maseval.benchmark.macs import (
     MACSEnvironment,
     MACSEvaluator,
     MACSUser,
-    compute_benchmark_metrics,
 )
 
 from .conftest import ConcreteMACSBenchmark
@@ -100,14 +98,16 @@ def test_loaded_task_works_with_environment(self, macs_model, sample_agent_data)
         task = Task(
             query="Book a flight",
             environment_data={
+                "model_id": "test-model",
                 "tools": [
                     {
                         "tool_name": "flight_search",
                         "actions": [{"name": "search", "description": "Search flights"}],
                     }
-                ]
+                ],
             },
-            evaluation_data={"assertions": ["user: Booking done"]},
+            user_data={"model_id": "test-model"},
+            evaluation_data={"model_id": "test-model", "assertions": ["user: Booking done"]},
             metadata={"scenario": "Travel booking scenario", "task_id": "task-000001"},
         )
 
@@ -116,7 +116,7 @@ def test_loaded_task_works_with_environment(self, macs_model, sample_agent_data)
 
         assert "search" in env.tools
 
-    def test_loaded_agent_config_works_with_environment(self, macs_model):
+    def test_loaded_agent_config_works_with_environment(self, macs_model_factory):
         """Agent config works with tool assignment."""
         # Simulate loaded agent config
         agent_config = {
@@ -137,7 +137,7 @@ def test_loaded_agent_config_works_with_environment(self, macs_model):
             }
         }
 
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         # Get tools for agent from config
         agent_spec = agent_config["agents"][0]
@@ -167,10 +167,10 @@ def test_evaluator_handles_malformed_llm_response(self, travel_task, sample_conv
         assert result["gsr"] == 0.0
         assert "error" in result
 
-    def test_environment_handles_empty_tool_specs(self, macs_model):
+    def test_environment_handles_empty_tool_specs(self, macs_model_factory):
         """Environment handles tasks with no tools."""
         task_data = {"environment_data": {"tools": []}}
-        env = MACSEnvironment(task_data, macs_model)
+        env = MACSEnvironment(task_data, macs_model_factory)
 
         assert env.tools == {}
 
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index 4a549523..f31e2e26 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -1,7 +1,7 @@
 """Unit tests for MACSUser."""
 
 import pytest
-from unittest.mock import patch, MagicMock
+from unittest.mock import MagicMock
 
 from maseval.benchmark.macs import MACSUser
 
diff --git a/tests/test_core/test_benchmark/test_execution_loop.py b/tests/test_core/test_benchmark/test_execution_loop.py
index b7c198f2..c9fb9426 100644
--- a/tests/test_core/test_benchmark/test_execution_loop.py
+++ b/tests/test_core/test_benchmark/test_execution_loop.py
@@ -27,6 +27,11 @@ def __init__(self, *args, return_user: Optional[User] = None, **kwargs):
         self._return_user = return_user
         self.run_agents_calls: List[Tuple[Any, ...]] = []
 
+    def get_model_adapter(self, model_id: str, **kwargs):
+        from conftest import DummyModelAdapter
+
+        return DummyModelAdapter(model_id=model_id)
+
     def setup_environment(self, agent_data, task):
         from conftest import DummyEnvironment
 
diff --git a/tests/test_core/test_user_simulator.py b/tests/test_core/test_user_simulator.py
index 11c8b1a6..f6bc606a 100644
--- a/tests/test_core/test_user_simulator.py
+++ b/tests/test_core/test_user_simulator.py
@@ -5,7 +5,6 @@
 """
 
 import pytest
-from unittest.mock import MagicMock
 
 
 @pytest.mark.core

From b08bf550d65fc9e0c8c7b9ddedc60c2af48de57d Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 22:31:45 +0000
Subject: [PATCH 17/34] fixed typing issues

---
 examples/macs_benchmark/macs_benchmark.py      | 10 ++++++----
 maseval/benchmark/macs/data_loader.py          | 18 ++++++++++--------
 pyproject.toml                                 |  5 +++++
 .../test_macs/test_data_loader.py              |  7 ++++---
 .../test_macs/test_macs_benchmark.py           |  3 +++
 5 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index 6d152d74..5210bc51 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -30,13 +30,15 @@
 import argparse
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple
+from typing import Any, Dict, List, Literal, Optional, Tuple, TYPE_CHECKING
 
 # Third-party imports (both frameworks will be installed)
 from google.genai import Client as GoogleGenAIClient
 
 # smolagents imports
-from smolagents import Tool as SmolagentsTool, ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
+from smolagents import ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
+if TYPE_CHECKING:
+    from smolagents import Tool as SmolagentsTool
 
 # langgraph imports
 from langchain_core.tools import StructuredTool
@@ -241,7 +243,7 @@ def setup_agents(
             self.register("simulators", f"simulator_tool_{name}", tool.simulator)
 
         # Helper to get tools for an agent
-        def get_agent_tools(agent_spec: Dict[str, Any]) -> List[SmolagentsTool]:
+        def get_agent_tools(agent_spec: Dict[str, Any]) -> List["SmolagentsTool"]:
             """Get wrapped tools for an agent based on its tool groups."""
             agent_tools = environment.get_tools_for_agent(agent_spec)
             return [tool_wrappers[name] for name in agent_tools if name in tool_wrappers]
@@ -252,7 +254,7 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
             agent_spec = agent_lookup.get(agent_id, {})
 
             # Get this agent's tools
-            agent_tools: List[SmolagentsTool] = get_agent_tools(agent_spec)
+            agent_tools: List["SmolagentsTool"] = get_agent_tools(agent_spec)
             agent_tools.append(FinalAnswerTool())
 
             # Build managed agents from reachable_agents
diff --git a/maseval/benchmark/macs/data_loader.py b/maseval/benchmark/macs/data_loader.py
index df22f9f4..02106814 100644
--- a/maseval/benchmark/macs/data_loader.py
+++ b/maseval/benchmark/macs/data_loader.py
@@ -452,16 +452,18 @@ def load_tasks(
 
     tasks = []
     for t in tasks_list:
-        task_kwargs: Dict[str, Any] = {
-            "query": t["query"],
-            "environment_data": t.get("environment_data", {}),
-            "evaluation_data": t.get("evaluation_data", {}),
-            "metadata": t.get("metadata", {}),
-        }
+        metadata = t.get("metadata", {})
         # Store task ID in metadata (format: task-NNNNNN)
         if t.get("id"):
-            task_kwargs["metadata"]["task_id"] = t["id"]
-        tasks.append(Task(**task_kwargs))
+            metadata["task_id"] = t["id"]
+        tasks.append(
+            Task(
+                query=t["query"],
+                environment_data=t.get("environment_data", {}),
+                evaluation_data=t.get("evaluation_data", {}),
+                metadata=metadata,
+            )
+        )
 
     return TaskCollection(tasks)
 
diff --git a/pyproject.toml b/pyproject.toml
index b19b8ecc..96980860 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -203,3 +203,8 @@ output = "coverage.xml"
 
 [tool.coverage.html]
 directory = "htmlcov"
+
+[tool.ty.src]
+# Exclude notebooks and docs/examples from type checking
+# These contain local imports that are only valid when run from their directory
+exclude = ["docs/examples/", "examples/"]
diff --git a/tests/test_benchmarks/test_macs/test_data_loader.py b/tests/test_benchmarks/test_macs/test_data_loader.py
index 67e0a1b4..fdbfbbed 100644
--- a/tests/test_benchmarks/test_macs/test_data_loader.py
+++ b/tests/test_benchmarks/test_macs/test_data_loader.py
@@ -3,10 +3,11 @@
 import json
 import pytest
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, Generator
 from unittest.mock import patch, MagicMock
 from tempfile import TemporaryDirectory
 from urllib.error import URLError, HTTPError
+from email.message import Message
 
 from maseval.benchmark.macs.data_loader import (
     DEFAULT_DATA_DIR,
@@ -92,7 +93,7 @@ def sample_scenarios_data() -> Dict[str, Any]:
 
 
 @pytest.fixture
-def temp_data_dir() -> Path:
+def temp_data_dir() -> Generator[Path, None, None]:
     """Create a temporary directory for test data."""
     with TemporaryDirectory() as tmpdir:
         yield Path(tmpdir)
@@ -596,7 +597,7 @@ def test_download_file_http_error(self):
                 url="http://example.com/test.json",
                 code=404,
                 msg="Not Found",
-                hdrs=None,
+                hdrs=Message(),
                 fp=None,
             )
             with pytest.raises(RuntimeError, match="Failed to download"):
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
index 91800729..c0b33d01 100644
--- a/tests/test_benchmarks/test_macs/test_macs_benchmark.py
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -8,6 +8,7 @@
 from maseval.benchmark.macs import (
     MACSBenchmark,
     MACSEnvironment,
+    MACSEvaluator,
     MACSUser,
     compute_benchmark_metrics,
 )
@@ -81,6 +82,8 @@ def test_setup_evaluators_creates_user_and_system(self, macs_model, sample_agent
         evaluators = benchmark.setup_evaluators(env, sample_task, agents, None)
 
         assert len(evaluators) == 2
+        assert isinstance(evaluators[0], MACSEvaluator)
+        assert isinstance(evaluators[1], MACSEvaluator)
         assert evaluators[0].gsr_type == "user"
         assert evaluators[1].gsr_type == "system"
 

From 75925b2a2f5623185c881206d01c126535986444 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 22:37:09 +0000
Subject: [PATCH 18/34] fixed tests for GHA

---
 examples/macs_benchmark/macs_benchmark.py   |  1 +
 tests/test_benchmarks/test_macs/conftest.py | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index 5210bc51..af29ab4c 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -37,6 +37,7 @@
 
 # smolagents imports
 from smolagents import ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
+
 if TYPE_CHECKING:
     from smolagents import Tool as SmolagentsTool
 
diff --git a/tests/test_benchmarks/test_macs/conftest.py b/tests/test_benchmarks/test_macs/conftest.py
index 7f5f3805..1bb3db28 100644
--- a/tests/test_benchmarks/test_macs/conftest.py
+++ b/tests/test_benchmarks/test_macs/conftest.py
@@ -27,6 +27,24 @@
 from maseval.benchmark.macs import MACSBenchmark, MACSEnvironment
 
 
+# =============================================================================
+# Session-Scoped Setup
+# =============================================================================
+
+
+@pytest.fixture(scope="session", autouse=True)
+def ensure_macs_templates():
+    """Download MACS prompt templates before running any benchmark tests.
+
+    This fixture runs once per test session and ensures that the prompt template
+    files (user.txt, system.txt, user_simulator.txt) exist locally. These templates
+    are required by MACSEvaluator and MACSUser.
+    """
+    from maseval.benchmark.macs.data_loader import download_prompt_templates
+
+    download_prompt_templates()
+
+
 # =============================================================================
 # MACS-Specific Mock Components
 # =============================================================================

From b192afba6d473d36635b7755dda0b4dc2668f3b0 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 22:42:32 +0000
Subject: [PATCH 19/34] fixed gitignore bug for macs

---
 maseval/benchmark/macs/.gitignore             |  9 +++-
 .../macs/prompt_templates/user_simulator.txt  | 49 +++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 maseval/benchmark/macs/prompt_templates/user_simulator.txt

diff --git a/maseval/benchmark/macs/.gitignore b/maseval/benchmark/macs/.gitignore
index 0bfef8c7..fb90a54a 100644
--- a/maseval/benchmark/macs/.gitignore
+++ b/maseval/benchmark/macs/.gitignore
@@ -1,3 +1,8 @@
-(# Ignore generated data files created by the aws_collab data processor)
+# Ignore generated data files created by the aws_collab data processor
 **/*.json
-**/*.txt
+
+# Ignore downloaded prompt templates (user.txt, system.txt, issues.txt)
+# But NOT user_simulator.txt which is a custom template committed to the repo
+prompt_templates/user.txt
+prompt_templates/system.txt
+prompt_templates/issues.txt
diff --git a/maseval/benchmark/macs/prompt_templates/user_simulator.txt b/maseval/benchmark/macs/prompt_templates/user_simulator.txt
new file mode 100644
index 00000000..38cc7866
--- /dev/null
+++ b/maseval/benchmark/macs/prompt_templates/user_simulator.txt
@@ -0,0 +1,49 @@
+### ROLE
+You are simulating a human user interacting with an AI travel/service assistant. Generate the user's next response based on their profile and goals.
+
+### USER PROFILE
+{{user_profile}}
+
+### SCENARIO & GOALS
+The user's objective in this conversation:
+{{scenario}}
+
+### CONVERSATION HISTORY
+{{conversation_history}}
+
+### YOUR TASK
+Generate the user's next response. Follow these guidelines:
+
+1. **Check Goal Completion**: Review the Goals section carefully. If ALL goals have been satisfactorily addressed by the assistant in the conversation, end your response with the token `</stop>`.
+
+2. **Stay In Character**: Respond naturally as this specific user would, based on their profile.
+
+3. **Advance Goals**: If goals remain unmet, provide information or ask questions to help accomplish them.
+
+4. **Be Helpful**: Answer the assistant's questions directly. Provide relevant details from your profile when asked.
+
+### OUTPUT INSTRUCTIONS
+Respond with ONLY a valid JSON object containing:
+- "text": The user's response (include `</stop>` at the end if ALL goals are met)
+- "details": Key information extracted from your response
+
+Example when goals are still pending:
+```json
+{
+    "text": "Yes, I'd like to book the flight for December 20th. My preferred seats are 31A and 31B.",
+    "details": {
+        "date": "December 20th",
+        "seats": ["31A", "31B"]
+    }
+}
+```
+
+Example when ALL goals are satisfied:
+```json
+{
+    "text": "Perfect, thank you for booking the flight and providing the weather forecast. That's everything I needed! </stop>",
+    "details": {
+        "confirmation": "all_goals_met"
+    }
+}
+```

From 7b1131f0338e13a9cb17c0cc328616f7c21764d0 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 22:57:24 +0000
Subject: [PATCH 20/34] [skip ci] fixed formatting

---
 BENCHMARKS.md |  2 +-
 CHANGELOG.md  | 72 +++++++++++++++++++++++++++++++++++----------------
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index 10f2ba38..38647cc6 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -4,7 +4,7 @@ This document provides detailed information, sources, and licensing for all benc
 
 ---
 
-## 1. Multi-Agent Collaboriation Scenario Benchmark (MACS Benchmark)
+## 1. Multi-Agent Collaboration Scenario Benchmark (MACS Benchmark)
 
 This benchmark is designed to test and evaluate the collaborative problem-solving capabilities of multi-agent systems. The implementation in this library provides the necessary code to set up and run these scenarios.
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41de392d..2410a085 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,46 +9,74 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+**Benchmarks**
+
 - MACS Benchmark: Multi-Agent Collaboration Scenarios benchmark (PR: #13)
-- Added `execution_loop()` method to `Benchmark` base class enabling iterative agent-user interaction
-- Added `max_invocations` constructor parameter to `Benchmark` (default: 1 for backwards compatibility)
-- Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping
-- Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class
-- Added `get_initial_query()` method to `User` base class for LLM-generated initial messages
-- Added `initial_prompt` parameter in `User` base class to trigger the agentic system.
-- Added abstract `get_model_adapter(model_id, **kwargs)` method to `Benchmark` base class as universal model factory to be used throughout the benchmarks.
-- Comprehensive testing for multi-turn behavior and `Benchmark.execution_loop()` (PR: #13)
+
+**Benchmark**
+
+- Added `execution_loop()` method to `Benchmark` base class enabling iterative agent-user interaction (PR: #13)
+- Added `max_invocations` constructor parameter to `Benchmark` (default: 1 for backwards compatibility) (PR: #13)
+- Added abstract `get_model_adapter(model_id, **kwargs)` method to `Benchmark` base class as universal model factory to be used throughout the benchmarks. (PR: #13)
+
+**User**
+
+- Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping (PR: #13)
+- Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class (PR: #13)
+- Added `get_initial_query()` method to `User` base class for LLM-generated initial messages (PR: #13)
+- Added `initial_prompt` parameter in `User` base class to trigger the agentic system. (PR: #13)
+
+**Environment**
+
+- Added `Environment.get_tool(name)` method for single-tool lookup (PR: #13)
+
+**Interface**
 
 - [LlamaIndex](https://github.com/run-llama/llama_index) integration: `LlamaIndexAgentAdapter` and `LlamaIndexUser` for evaluating LlamaIndex workflow-based agents (PR: #7)
-  - Supports async workflow execution with proper event loop handling
-- Added a new example: The `5_a_day_benchmark` (PR: #10)
 - The `logs` property inside `SmolAgentAdapter` and `LanggraphAgentAdapter` are now properly filled. (PR: #3)
 
+**Examples**
+
+- Added a new example: The `5_a_day_benchmark` (PR: #10)
+
 ### Changed
 
-- **BREAKING:** `Environment.create_tools()` now returns `Dict[str, Any]` instead of `list`
-  - `get_tools()` returns a dict keyed by tool name
-  - Added `get_tool(name)` method for single-tool lookup
-  - Removed internal `_tools_dict` attribute (tools dict is now the source of truth)
-- **BREAKING:** `Benchmark.run_agents()` signature changed: added `query: str` parameter
-  - Subclasses must update their implementations to accept and use this parameter
-- Renamed `tests/test_core/test_user_simulator.py` to `tests/test_core/test_user.py` to clarify it tests the `User` class (not `UserLLMSimulator`)
-- Documentation formatting improved. Added darkmode and links to `Github` (PR: #11).
-- `FileResultLogger` now accepts `pathlib.Path` for argument `output_dir` and has an `overwrite` argument to prevent overwriting of existing logs files.
+**Environment**
+
+- `Environment.create_tools()` now returns `Dict[str, Any]` instead of `list` (PR: #13)
+
+**Benchmark**
+
+- `Benchmark.run_agents()` signature changed: added `query: str` parameter (PR: #13)
+- `Benchmark.run()` now uses `execution_loop()` internally to handle agent-user interaction cycles (PR: #13)
 - `Benchmark` class now has a `fail_on_setup_error` flag that raises errors observed during setup of task (PR: #10)
+
+**Callback**
+
+- `FileResultLogger` now accepts `pathlib.Path` for argument `output_dir` and has an `overwrite` argument to prevent overwriting of existing logs files.
+
+**Evaluator**
+
 - The `Evaluator` class now has a `filter_traces` base method to conveniently adapt the same evaluator to different entities in the traces (PR: #10).
+
+**Other**
+
+- Documentation formatting improved. Added darkmode and links to `Github` (PR: #11).
 - Improved Quick Start Guide in `docs/getting-started/quickstart.md`. (PR: #10)
 - `maseval.interface.agents` structure changed. Tools requiring framework imports (beyond just typing) now in `<framework>_optional.py` and imported dynamically from `<framework>.py`. (PR: #12)
 - Various formatting improvements in the documentation (PR: #12)
 - Added documentation for View Source Code pattern in `CONTRIBUTING.md` and `_optional.py` pattern in interface README (PR: #12)
-- `Benchmark.run()` now uses `execution_loop()` internally to handle agent-user interaction cycles
 
 ### Fixed
 
-- Fixed `MACSBenchmark.run_agents()` to use the `query` parameter instead of always using `task.query`
+**Interface**
+
 - `LlamaIndexAgentAdapter` now supports multiple LlamaIndex agent types including `ReActAgent` (workflow-based), `FunctionAgent`, and legacy agents by checking for `.chat()`, `.query()`, and `.run()` methods in priority order (PR: #10)
+
+**Other**
+
 - Consistent naming of agent `adapter` over `wrapper` (PR: #3)
-- Fixed an issue that `LiteLLM` interface and `Mixin`s were not shwon in documentation properly (#PR: 12)
+- Fixed an issue that `LiteLLM` interface and `Mixin`s were not shown in documentation properly (#PR: 12)
 
 ### Removed
 

From 1efd5c058c338e6bed0814edddf3dd23cd49338e Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 23:10:18 +0000
Subject: [PATCH 21/34] [skip ci] formatting fixes

---
 docs/examples/index.md            | 10 +++++-----
 maseval/benchmark/macs/.gitignore |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/examples/index.md b/docs/examples/index.md
index a1c58768..cc5b22be 100644
--- a/docs/examples/index.md
+++ b/docs/examples/index.md
@@ -2,8 +2,8 @@
 
 Learn MASEval through hands-on examples covering common use cases and benchmarks.
 
-| Example                                                                                                                             | Description                                             |
-| ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
-| [Tutorial](tutorial.ipynb)                                                                                                          | Introduction to MASEval's core concepts and basic usage |
-| [Five-a-Day Benchmark](five_a_day_benchmark.ipynb)                                                                                  | Building a custom benchmark from scratch                |
-| [Multi-Agent Collaboration Scenario Benchmark (MACS)](https://github.com/parameterlab/MASEval/blob/main/examples/macs_benchmark.py) | An adaptation of the `maseval.benchmark.MACSBenchmark`. |
+| Example                                                                                                                                            | Description                                             |
+| -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| [Tutorial](tutorial.ipynb)                                                                                                                         | Introduction to MASEval's core concepts and basic usage |
+| [Five-a-Day Benchmark](five_a_day_benchmark.ipynb)                                                                                                 | Building a custom benchmark from scratch                |
+| [Multi-Agent Collaboration Scenario Benchmark (MACS)](https://github.com/parameterlab/MASEval/blob/main/examples/macs_benchmark/macs_benchmark.py) | An adaptation of the `maseval.benchmark.MACSBenchmark`. |
diff --git a/maseval/benchmark/macs/.gitignore b/maseval/benchmark/macs/.gitignore
index fb90a54a..5104a8b5 100644
--- a/maseval/benchmark/macs/.gitignore
+++ b/maseval/benchmark/macs/.gitignore
@@ -1,5 +1,5 @@
-# Ignore generated data files created by the aws_collab data processor
-**/*.json
+# Ignore generated data files created by the MACS benchmark
+data/*
 
 # Ignore downloaded prompt templates (user.txt, system.txt, issues.txt)
 # But NOT user_simulator.txt which is a custom template committed to the repo

From 5663ca965b44ae2c39c4dd27303e9a1281210b6b Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Thu, 4 Dec 2025 23:47:06 +0000
Subject: [PATCH 22/34] small clean up to user turn counting

---
 maseval/benchmark/macs/macs.py                |  13 +-
 maseval/core/benchmark.py                     |  19 +--
 maseval/core/user.py                          |  64 +++++-----
 tests/conftest.py                             |  11 +-
 tests/test_benchmarks/test_macs/conftest.py   |   2 +-
 .../test_macs/test_macs_user.py               | 112 +++++++++---------
 .../test_benchmark/test_execution_loop.py     |  51 ++++----
 tests/test_core/test_user.py                  |  53 +++++----
 tests/test_core/test_user_simulator.py        |  53 +++++----
 .../test_llamaindex_integration.py            |   4 +-
 .../test_smolagents_integration.py            |   2 +-
 11 files changed, 197 insertions(+), 187 deletions(-)

diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index b43e5a6a..46016461 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -413,7 +413,7 @@ def __init__(
         self,
         model: ModelAdapter,
         scenario: str,
-        initial_prompt: str,
+        initial_query: str,
         name: str = "Simulated User",
         template: Optional[str] = None,
         max_turns: int = DEFAULT_MAX_TURNS,
@@ -424,7 +424,7 @@ def __init__(
         Args:
             model: ModelAdapter for LLM-based response generation
             scenario: Full scenario text (contains goals and user background)
-            initial_prompt: The initial query to the agent
+            initial_query: The initial query to the agent
             name: User name for identification (default: "Simulated User")
             template: Optional custom prompt template (uses MACS-specific default)
             max_turns: Maximum conversation turns (default: 5, per MACS paper)
@@ -442,7 +442,7 @@ def __init__(
             model=model,
             user_profile=user_profile,
             scenario=scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             template=template,
             max_turns=max_turns,
             stop_token=stop_token,
@@ -467,12 +467,15 @@ def get_tool(self) -> Any:
 
     def reset(self) -> None:
         """Reset the conversation state for a new interaction."""
-        self._turn_count = 0
         self._stopped = False
         # Keep only the initial user message
         if len(self.messages) > 0:
             initial = self.messages[0]
             self.messages = MessageHistory([initial])
+            self._turn_count = 1  # Initial message counts as first turn
+        else:
+            self.messages = MessageHistory()
+            self._turn_count = 0
 
     @staticmethod
     def _extract_user_profile(scenario: str) -> Dict[str, Any]:
@@ -774,7 +777,7 @@ def setup_user(
                 register_name="user_simulator",
             ),
             scenario=scenario,
-            initial_prompt=task.query,
+            initial_query=task.query,
         )
 
     @abstractmethod
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 1bf7953c..37f7359d 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -913,15 +913,8 @@ def execution_loop(
         """Execute agents with optional user interaction loop.
 
         This method orchestrates the agent-user interaction pattern. When a user is
-        present, the user initiates the conversation by providing the first query to
-        agents. If no user is present, ``task.query`` is used as the initial query.
-
-        Query Source Priority:
-            1. **User with initial_prompt**: Uses the user's initial message (fixed string
-               provided at User construction).
-            2. **User without initial_prompt**: Calls ``user.get_initial_query()`` to
-               generate the first message via LLM based on user profile and scenario.
-            3. **No user**: Falls back to ``task.query``.
+        present, the user initiates the conversation using `User.get_intial_query`.
+        If no user is present, ``task.query`` is used as the initial query.
 
         Interaction Flow:
             By default, agents execute once (``max_invocations=1``). For multi-turn
@@ -956,14 +949,8 @@ def __init__(self, ...):
 
         # Determine initial query text
         if user is not None:
-            if len(user.messages) > 0:
-                # User has initial_prompt - use it
-                query_text = user.messages[-1].get("content", task.query)
-            else:
-                # No initial_prompt - generate one via LLM
-                query_text = user.get_initial_query()
+            query_text = user.get_initial_query()
         else:
-            # No user - use task query directly
             query_text = task.query
 
         for _ in range(self.max_invocations):
diff --git a/maseval/core/user.py b/maseval/core/user.py
index 5721a79f..5fa396a4 100644
--- a/maseval/core/user.py
+++ b/maseval/core/user.py
@@ -51,7 +51,7 @@ def __init__(
         model: ModelAdapter,
         user_profile: Dict[str, Any],
         scenario: str,
-        initial_prompt: Optional[str] = None,
+        initial_query: Optional[str] = None,
         template: Optional[str] = None,
         max_try: int = 3,
         max_turns: int = 1,
@@ -66,20 +66,18 @@ def __init__(
                 preferences, and other relevant information.
             scenario (str): A description of the situation or task the user is trying to
                 accomplish.
-            initial_prompt (Optional[str], optional): The initial message that starts the
-                conversation. If provided, it's added to the message history as the first
-                user message (not counted as a turn). If None, the conversation starts
-                empty and you can call get_initial_query() to generate one via LLM.
-                Defaults to None.
+            initial_query (Optional[str], optional): A pre-set query to start the
+                conversation. If provided, it becomes the first user message. If None,
+                call get_initial_query() to generate one from the model based on the
+                user profile and scenario. Defaults to None.
             template (Optional[str], optional): A custom prompt template for the user
                 simulator. Defaults to None.
             max_try (int, optional): The maximum number of attempts for the simulator to
                 generate a valid response. Defaults to 3.
-            max_turns (int, optional): Maximum number of LLM-generated user responses
-                before is_done() returns True. The initial_prompt (if provided) is NOT
-                counted as a turn since it's not LLM-generated. Use max_turns=1 for
-                single-turn benchmarks, or higher values for multi-turn interaction.
-                Defaults to 1.
+            max_turns (int, optional): Maximum number of user messages in the
+                conversation. Each user message counts as one turn, including the
+                initial_query. Use max_turns=1 for single-turn benchmarks, or higher
+                values for multi-turn interaction. Defaults to 1.
             stop_token (Optional[str], optional): Token that signals user satisfaction,
                 enabling early termination. When the user's LLM-generated response contains
                 this token, is_done() returns True regardless of remaining turns. Use this
@@ -98,17 +96,19 @@ def __init__(
             template=template,
             max_try=max_try,
         )
-        # Initialize message history - empty or with initial prompt
-        if initial_prompt is not None:
-            self.messages = MessageHistory([{"role": "user", "content": initial_prompt}])
+        # Initialize message history - empty or with initial query
+        if initial_query is not None:
+            self.messages = MessageHistory([{"role": "user", "content": initial_query}])
+            self._initial_turn_count = 1  # Initial query counts as first turn
         else:
             self.messages = MessageHistory()
+            self._initial_turn_count = 0
         self.logs: list[Dict[str, Any]] = []
 
         # Multi-turn configuration
         self.max_turns = max_turns
         self.stop_token = stop_token
-        self._turn_count = 0
+        self._turn_count = self._initial_turn_count
         self._stopped = False
 
     def simulate_response(self, question: str) -> str:
@@ -164,30 +164,33 @@ def simulate_response(self, question: str) -> str:
         return clean_response
 
     def get_initial_query(self) -> str:
-        """Generate an initial query using the LLM simulator.
+        """Get the initial query for the conversation.
 
-        Use this method when you want the user LLM to generate the first message
-        instead of providing a fixed initial_prompt. This is useful for scenarios
-        where the user should initiate the conversation based on their profile
-        and scenario.
+        If an initial_query was provided at construction, returns it.
+        Otherwise, generates one using the LLM simulator based on the user's
+        profile and scenario.
 
         This method:
-        - Calls the LLM simulator with an empty conversation history
-        - Adds the generated query to the message history as a "user" message
-        - Does NOT increment _turn_count (initial query is not counted as a turn)
-        - Checks for stop_token (in case user is immediately satisfied)
+        - Returns the existing initial query if one was provided
+        - Or calls the LLM simulator to generate one
+        - Ensures the query is in the message history
+        - Counts the initial query as the first turn
 
         Returns:
-            str: The generated initial query.
+            str: The initial query (either pre-set or LLM-generated).
 
         Raises:
-            RuntimeError: If conversation already has messages (use simulate_response instead).
+            RuntimeError: If called after conversation has progressed beyond
+                the initial message.
         """
+        # If we already have an initial query in messages, return it
         if len(self.messages) > 0:
-            raise RuntimeError(
-                "Cannot generate initial query: conversation already has messages. Use simulate_response() for subsequent turns."
-            )
+            first_message = self.messages[0]
+            if first_message.get("role") == "user":
+                return first_message.get("content", "")
+            raise RuntimeError("Cannot get initial query: conversation has progressed. Use simulate_response() for subsequent turns.")
 
+        # Generate initial query via LLM
         start_time = time.time()
         log_entry: Dict[str, Any] = {
             "timestamp": datetime.now().isoformat(),
@@ -212,8 +215,9 @@ def get_initial_query(self) -> str:
         # Check for stop token (user might be immediately satisfied with scenario)
         _, clean_response = self._check_stop_token(response)
 
-        # Add as initial user message (not counted as a turn)
+        # Add as initial user message and count as first turn
         self.messages.add_message("user", clean_response)
+        self.increment_turn()
         return clean_response
 
     def gather_traces(self) -> dict[str, Any]:
diff --git a/tests/conftest.py b/tests/conftest.py
index 3ffa164b..214c5188 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -155,8 +155,8 @@ def __init__(self, name: str, model: ModelAdapter, **kwargs):
             **kwargs: Forwarded to User base class:
                 - user_profile: Dict of user attributes
                 - scenario: Scenario description
-                - initial_prompt: Optional initial message
-                - max_turns: Max LLM-generated responses (default: 1)
+                - initial_query: Optional initial message
+                - max_turns: Max interaction turns (default: 1)
                 - stop_token: Early termination token (default: None)
         """
         super().__init__(
@@ -164,7 +164,7 @@ def __init__(self, name: str, model: ModelAdapter, **kwargs):
             model=model,
             user_profile=kwargs.get("user_profile", {}),
             scenario=kwargs.get("scenario", "test scenario"),
-            initial_prompt=kwargs.get("initial_prompt"),
+            initial_query=kwargs.get("initial_query"),
             max_turns=kwargs.get("max_turns", 1),
             stop_token=kwargs.get("stop_token"),
         )
@@ -281,13 +281,14 @@ def dummy_environment():
 
 @pytest.fixture
 def dummy_user(dummy_model):
-    """Create a dummy user."""
+    """Create a dummy user with an initial query."""
     return DummyUser(
         name="test_user",
         model=dummy_model,
         user_profile={"role": "tester"},
         scenario="test scenario",
-        initial_prompt="Hello",
+        initial_query="Hello",
+        max_turns=2,  # Allow at least one simulate_response after initial query
     )
 
 
diff --git a/tests/test_benchmarks/test_macs/conftest.py b/tests/test_benchmarks/test_macs/conftest.py
index 1bb3db28..1b0ff359 100644
--- a/tests/test_benchmarks/test_macs/conftest.py
+++ b/tests/test_benchmarks/test_macs/conftest.py
@@ -574,7 +574,7 @@ def minimal_scenario():
 
 
 @pytest.fixture
-def initial_prompt():
+def initial_query():
     """Sample initial user query."""
     return "I need to book a flight to New York for Monday."
 
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index f31e2e26..584f88dc 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -17,23 +17,23 @@
 class TestMACSUserInit:
     """Tests for MACSUser initialization."""
 
-    def test_init_with_defaults(self, macs_model, sample_scenario, initial_prompt):
+    def test_init_with_defaults(self, macs_model, sample_scenario, initial_query):
         """Initialization with required args uses proper defaults."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         assert user.model == macs_model
         assert user.scenario == sample_scenario
         assert user.name == "Simulated User"
         assert user.max_turns == 5
-        assert user._turn_count == 0
+        assert user._turn_count == 1  # Initial query counts as first turn
         assert not user._stopped
         assert "full_scenario" in user.user_profile
 
-    def test_macs_default_max_turns_is_five(self, macs_model, sample_scenario, initial_prompt):
+    def test_macs_default_max_turns_is_five(self, macs_model, sample_scenario, initial_query):
         """MACS benchmark defaults to max_turns=5 per MACS paper.
 
         This is a MACS-specific default that differs from the base class default of 1.
@@ -42,13 +42,13 @@ def test_macs_default_max_turns_is_five(self, macs_model, sample_scenario, initi
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         assert user.max_turns == MACSUser.DEFAULT_MAX_TURNS
         assert user.max_turns == 5
 
-    def test_macs_default_stop_token(self, macs_model, sample_scenario, initial_prompt):
+    def test_macs_default_stop_token(self, macs_model, sample_scenario, initial_query):
         """MACS uses '</stop>' as stop token per MACS paper.
 
         This is a MACS-specific default. If the base class default changes,
@@ -57,18 +57,18 @@ def test_macs_default_stop_token(self, macs_model, sample_scenario, initial_prom
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         assert user.stop_token == MACSUser.DEFAULT_STOP_TOKEN
         assert user.stop_token == "</stop>"
 
-    def test_init_with_custom_params(self, macs_model, sample_scenario, initial_prompt):
+    def test_init_with_custom_params(self, macs_model, sample_scenario, initial_query):
         """Custom name and max_turns are respected."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             name="Test User",
             max_turns=10,
         )
@@ -76,14 +76,14 @@ def test_init_with_custom_params(self, macs_model, sample_scenario, initial_prom
         assert user.name == "Test User"
         assert user.max_turns == 10
 
-    def test_init_loads_template(self, macs_model, sample_scenario, initial_prompt):
+    def test_init_loads_template(self, macs_model, sample_scenario, initial_query):
         """Loads user_simulator.txt template."""
         assert MACSUser.TEMPLATE_PATH.exists(), f"Template not found at {MACSUser.TEMPLATE_PATH}"
 
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
         assert user is not None
 
@@ -152,7 +152,7 @@ def test_extract_profile_includes_full_scenario(self, sample_scenario):
 class TestConversationState:
     """Tests for conversation state management."""
 
-    def test_is_done_false_initially_without_assistant_message(self, macs_model, sample_scenario, initial_prompt):
+    def test_is_done_false_initially_without_assistant_message(self, macs_model, sample_scenario, initial_query):
         """is_done() returns False when no assistant message to evaluate.
 
         When there's no assistant message yet (only the initial user message),
@@ -162,19 +162,19 @@ def test_is_done_false_initially_without_assistant_message(self, macs_model, sam
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         # No assistant message yet, so is_done() returns False
         # (nothing to evaluate, need to get agent response first)
         assert not user.is_done()
 
-    def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_prompt):
+    def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_query):
         """is_done() returns True after max turns."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=2,
         )
 
@@ -183,12 +183,12 @@ def test_is_done_after_max_turns(self, macs_model, sample_scenario, initial_prom
 
         assert user.is_done()
 
-    def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_prompt):
+    def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_query):
         """is_done() returns True after </stop> detected."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         # Manually set stopped flag
@@ -196,14 +196,14 @@ def test_is_done_after_stop_token(self, macs_model, sample_scenario, initial_pro
 
         assert user.is_done()
 
-    def test_is_done_returns_false_when_not_satisfied(self, macs_model, sample_scenario, initial_prompt):
+    def test_is_done_returns_false_when_not_satisfied(self, macs_model, sample_scenario, initial_query):
         """is_done() returns False when user is not satisfied with response."""
         from unittest.mock import MagicMock
 
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=5,
         )
 
@@ -214,7 +214,8 @@ def test_is_done_returns_false_when_not_satisfied(self, macs_model, sample_scena
         response = user.simulate_response("Here is your flight info.")
 
         # The user's response should be added to messages
-        assert user._turn_count == 1
+        # initial_query is turn 1, this simulate_response is turn 2
+        assert user._turn_count == 2
         assert "I need more information" in response
 
         # is_done() is a cheap state check - no </stop> token was found
@@ -230,25 +231,25 @@ def test_is_done_returns_false_when_not_satisfied(self, macs_model, sample_scena
 class TestReset:
     """Tests for reset method."""
 
-    def test_reset_clears_turn_count(self, macs_model, sample_scenario, initial_prompt):
-        """reset() clears turn count."""
+    def test_reset_clears_turn_count(self, macs_model, sample_scenario, initial_query):
+        """reset() resets turn count to initial value."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
         user._turn_count = 3
 
         user.reset()
 
-        assert user._turn_count == 0
+        assert user._turn_count == 1  # Back to initial (counts initial query)
 
-    def test_reset_clears_stopped(self, macs_model, sample_scenario, initial_prompt):
+    def test_reset_clears_stopped(self, macs_model, sample_scenario, initial_query):
         """reset() clears stopped flag."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
         user._stopped = True
 
@@ -266,13 +267,13 @@ def test_reset_clears_stopped(self, macs_model, sample_scenario, initial_prompt)
 class TestResponseSimulation:
     """Tests for simulate_response method."""
 
-    def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt):
+    def test_simulate_response_increments_turn(self, sample_scenario, initial_query):
         """Turn count increments on simulate_response call."""
         model = DummyModelAdapter(responses=['{"text": "Yes, confirmed.", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         initial_count = user._turn_count
@@ -283,13 +284,13 @@ def test_simulate_response_increments_turn(self, sample_scenario, initial_prompt
 
         assert user._turn_count == initial_count + 1
 
-    def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
+    def test_simulate_response_detects_stop(self, sample_scenario, initial_query):
         """Detects </stop> token."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         # Replace the simulator with a mock that returns a response with stop token
@@ -299,13 +300,13 @@ def test_simulate_response_detects_stop(self, sample_scenario, initial_prompt):
         assert user._stopped
         assert user.is_done()
 
-    def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prompt):
+    def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_query):
         """Removes </stop> from response."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         # Replace the simulator with a mock that returns a response with stop token
@@ -315,13 +316,13 @@ def test_simulate_response_cleans_stop_token(self, sample_scenario, initial_prom
         assert "</stop>" not in response
         assert "Perfect, thanks!" in response
 
-    def test_simulate_response_returns_empty_when_done(self, sample_scenario, initial_prompt):
+    def test_simulate_response_returns_empty_when_done(self, sample_scenario, initial_query):
         """Returns empty string when is_done is True."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
         user._stopped = True  # Already done
 
@@ -329,13 +330,13 @@ def test_simulate_response_returns_empty_when_done(self, sample_scenario, initia
 
         assert response == ""
 
-    def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, initial_prompt):
+    def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, initial_query):
         """Returns empty string when max turns reached."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=3,
         )
         user._turn_count = 3  # At max
@@ -344,13 +345,13 @@ def test_simulate_response_returns_empty_at_max_turns(self, sample_scenario, ini
 
         assert response == ""
 
-    def test_simulate_response_fallback_message(self, sample_scenario, initial_prompt):
+    def test_simulate_response_fallback_message(self, sample_scenario, initial_query):
         """Provides fallback when response is only stop token."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         # Replace the simulator with a mock that returns only the stop token
@@ -371,12 +372,12 @@ def test_simulate_response_fallback_message(self, sample_scenario, initial_promp
 class TestToolInterface:
     """Tests for get_tool method."""
 
-    def test_get_tool_raises_not_implemented(self, macs_model, sample_scenario, initial_prompt):
+    def test_get_tool_raises_not_implemented(self, macs_model, sample_scenario, initial_query):
         """Base MACSUser.get_tool() raises NotImplementedError."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         with pytest.raises(NotImplementedError) as exc_info:
@@ -395,12 +396,12 @@ def test_get_tool_raises_not_implemented(self, macs_model, sample_scenario, init
 class TestTracing:
     """Tests for gather_traces method."""
 
-    def test_gather_traces_includes_macs_fields(self, macs_model, sample_scenario, initial_prompt):
+    def test_gather_traces_includes_macs_fields(self, macs_model, sample_scenario, initial_query):
         """Traces include MACS-specific fields."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=7,
         )
         user._turn_count = 3
@@ -412,12 +413,12 @@ def test_gather_traces_includes_macs_fields(self, macs_model, sample_scenario, i
         assert traces["turns_used"] == 3
         assert traces["stopped_by_user"] is True
 
-    def test_gather_traces_inherits_base_fields(self, macs_model, sample_scenario, initial_prompt):
+    def test_gather_traces_inherits_base_fields(self, macs_model, sample_scenario, initial_query):
         """Traces include base User fields."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
         )
 
         traces = user.gather_traces()
@@ -436,7 +437,7 @@ def test_gather_traces_inherits_base_fields(self, macs_model, sample_scenario, i
 class TestMACSUserIntegration:
     """Integration tests for MACSUser."""
 
-    def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
+    def test_conversation_lifecycle(self, sample_scenario, initial_query):
         """Test complete conversation lifecycle with is_done() method."""
         responses = [
             "Yes, Monday works.",
@@ -448,7 +449,7 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=5,
         )
 
@@ -471,16 +472,18 @@ def test_conversation_lifecycle(self, sample_scenario, initial_prompt):
                 assert response != ""
 
         # After stop token, should be done
+        # initial_query counts as turn 1, so with 4 simulate_responses we'd have 5 turns
+        # But stop token was hit on 4th response, so turn_count is 5
         assert user.is_done()
-        assert user._turn_count == 4
+        assert user._turn_count == 5
 
-    def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
+    def test_max_turns_enforcement(self, sample_scenario, initial_query):
         """Test that max turns is enforced."""
         model = DummyModelAdapter(responses=["Response"] * 10)
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=3,
         )
 
@@ -499,13 +502,13 @@ def test_max_turns_enforcement(self, sample_scenario, initial_prompt):
         response = user.simulate_response("One more?")
         assert response == ""
 
-    def test_reset_allows_new_conversation(self, sample_scenario, initial_prompt):
+    def test_reset_allows_new_conversation(self, sample_scenario, initial_query):
         """Test that reset allows starting new conversation."""
         model = DummyModelAdapter(responses=['{"text": "Default response", "details": {}}'])
         user = MACSUser(
             model=model,
             scenario=sample_scenario,
-            initial_prompt=initial_prompt,
+            initial_query=initial_query,
             max_turns=2,
         )
 
@@ -517,8 +520,7 @@ def test_reset_allows_new_conversation(self, sample_scenario, initial_prompt):
         # Reset
         user.reset()
 
-        # After reset, hard limits are cleared but there's no assistant message
-        # to evaluate, so is_done() returns True (nothing to evaluate yet)
-        # This is correct - the execution_loop will call run_agents first
-        assert user._turn_count == 0
+        # After reset, turn count goes back to 1 (initial query counts)
+        # and stopped flag is cleared
+        assert user._turn_count == 1
         assert not user._stopped
diff --git a/tests/test_core/test_benchmark/test_execution_loop.py b/tests/test_core/test_benchmark/test_execution_loop.py
index c9fb9426..307aee9d 100644
--- a/tests/test_core/test_benchmark/test_execution_loop.py
+++ b/tests/test_core/test_benchmark/test_execution_loop.py
@@ -1,7 +1,7 @@
 """Tests for Benchmark.execution_loop() method.
 
 These tests verify the agent-user interaction orchestration, including:
-- Query source priority (user initial_prompt vs get_initial_query vs task.query)
+- Query source priority (user initial_query vs get_initial_query vs task.query)
 - Multi-turn interaction with max_invocations
 - Early stopping when user.is_done() returns True
 - Message recording (final_answer attached to user traces)
@@ -118,15 +118,15 @@ def test_returns_final_answer(self, dummy_model):
 class TestExecutionLoopWithUser:
     """Tests for execution_loop with user simulator."""
 
-    def test_uses_user_initial_prompt(self, dummy_model):
-        """Uses user's initial_prompt as first query."""
+    def test_uses_user_initial_query(self, dummy_model):
+        """Uses user's initial_query as first query."""
         from conftest import DummyUser
 
         task = Task(query="Task query (should not be used)", environment_data={})
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="User's initial message",
+            initial_query="User's initial message",
             max_turns=5,
         )
         benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
@@ -136,17 +136,17 @@ def test_uses_user_initial_prompt(self, dummy_model):
 
         benchmark.execution_loop(agents, task, env, user=user)
 
-        # First query should be from user's initial_prompt, not task.query
+        # First query should be from user's initial_query, not task.query
         _, _, _, query = benchmark.run_agents_calls[0]
         assert query == "User's initial message"
 
-    def test_uses_get_initial_query_if_no_prompt(self, dummy_model):
-        """Calls get_initial_query() if no initial_prompt."""
+    def test_uses_get_initial_query_if_no_initial_query(self, dummy_model):
+        """Calls get_initial_query() if no initial_query."""
         from conftest import DummyUser
 
         task = Task(query="Task query", environment_data={})
         user = DummyUser(name="test", model=dummy_model, max_turns=5)
-        # No initial_prompt, so messages is empty
+        # No initial_query, so messages is empty
         user.simulator.return_value = "LLM generated initial query"
 
         benchmark = ExecutionLoopBenchmark(agent_data={}, return_user=user)
@@ -168,7 +168,7 @@ def test_multi_turn_interaction(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Start",
+            initial_query="Start",
             max_turns=5,
         )
         # User responds with different messages each turn
@@ -202,15 +202,15 @@ def test_stops_when_user_done_via_max_turns(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Start",
-            max_turns=2,  # User done after 2 turns (limiting factor)
+            initial_query="Start",  # Counts as turn 1
+            max_turns=3,  # User done after 3 user messages
         )
         user.simulator.side_effect = ["Response 1", "Response 2", "Response 3"]
 
         benchmark = ExecutionLoopBenchmark(
             agent_data={},
             return_user=user,
-            max_invocations=5,  # Would allow 5, but user stops at 2
+            max_invocations=5,  # Would allow 5, but user stops at 3 turns
         )
 
         env = benchmark.setup_environment({}, task)
@@ -218,9 +218,8 @@ def test_stops_when_user_done_via_max_turns(self, dummy_model):
 
         benchmark.execution_loop(agents, task, env, user=user)
 
-        # max_turns=2 is the limiting factor, so exactly 2 invocations
-        # Iteration 1: agent runs, simulate_response → turn_count=1, is_done? No
-        # Iteration 2: agent runs, simulate_response → turn_count=2, is_done? Yes → break
+        # max_turns=3 with initial_query counting as turn 1
+        # After 2 simulate_response calls, turn_count=3, is_done=True
         assert len(benchmark.run_agents_calls) == 2
 
     def test_stops_when_user_done_via_stop_token(self, dummy_model):
@@ -231,7 +230,7 @@ def test_stops_when_user_done_via_stop_token(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Start",
+            initial_query="Start",
             max_turns=10,
             stop_token="</stop>",
         )
@@ -260,8 +259,8 @@ def test_final_answer_in_user_messages(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Help me",
-            max_turns=1,
+            initial_query="Help me",
+            max_turns=2,  # Allow initial + one response
         )
         user.simulator.return_value = "Thanks"
 
@@ -287,10 +286,10 @@ def test_user_response_becomes_next_query(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Initial",
-            max_turns=3,  # Allow 3 turns
+            initial_query="Initial",  # Counts as turn 1
+            max_turns=4,  # Allow 4 turns total
         )
-        # Need 3 responses: after invocation 1, 2, and 3
+        # User responses for turn 2, 3, 4
         user.simulator.side_effect = ["User reply 1", "User reply 2", "User reply 3"]
 
         benchmark = ExecutionLoopBenchmark(
@@ -307,7 +306,7 @@ def test_user_response_becomes_next_query(self, dummy_model):
         # Should have 3 invocations limited by max_invocations
         assert len(benchmark.run_agents_calls) == 3
 
-        # First query is initial prompt
+        # First query is initial query
         _, _, _, query1 = benchmark.run_agents_calls[0]
         assert query1 == "Initial"
 
@@ -374,7 +373,7 @@ def test_run_with_user_uses_execution_loop(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="User query",
+            initial_query="User query",
             max_turns=1,
         )
         user.simulator.return_value = "Done"
@@ -396,8 +395,8 @@ def test_complete_traces_with_user(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Hello",
-            max_turns=2,
+            initial_query="Hello",  # Turn 1
+            max_turns=3,  # Allow 3 user messages total
         )
         user.simulator.side_effect = ["Reply 1", "Reply 2"]
 
@@ -417,7 +416,7 @@ def test_complete_traces_with_user(self, dummy_model):
         # User traces should have the conversation
         user_traces = traces["user"]
         assert "messages" in user_traces
-        # Should have exactly: initial + 2 exchanges (initial, agent1, user1, agent2, user2)
+        # Should have: initial + 2 exchanges (initial, agent1, user1, agent2, user2)
         assert user_traces["message_count"] == 5
 
         # Verify exact message sequence
diff --git a/tests/test_core/test_user.py b/tests/test_core/test_user.py
index 0fb5deb0..0d006ce4 100644
--- a/tests/test_core/test_user.py
+++ b/tests/test_core/test_user.py
@@ -70,7 +70,7 @@ def test_user_initialization(self, dummy_model):
             model=dummy_model,
             user_profile={"role": "customer"},
             scenario="test scenario",
-            initial_prompt="Hello",
+            initial_query="Hello",
         )
 
         assert user.name == "test_user"
@@ -143,13 +143,20 @@ def test_simulate_response_returns_empty_when_done(self, dummy_model):
         response = user.simulate_response("More questions?")
         assert response == ""
 
-    def test_turn_count_starts_at_zero(self, dummy_model):
-        """Turn count starts at 0."""
+    def test_turn_count_starts_at_zero_without_initial_query(self, dummy_model):
+        """Turn count starts at 0 when no initial_query provided."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model)
         assert user._turn_count == 0
 
+    def test_turn_count_starts_at_one_with_initial_query(self, dummy_model):
+        """Turn count starts at 1 when initial_query is provided."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, initial_query="Hello")
+        assert user._turn_count == 1
+
 
 # =============================================================================
 # Stop Token Tests
@@ -253,29 +260,29 @@ def test_stop_token_response_counts_as_turn(self, dummy_model):
 
 
 @pytest.mark.core
-class TestUserInitialPrompt:
-    """Tests for optional initial_prompt behavior."""
+class TestUserInitialQuery:
+    """Tests for optional initial_query behavior."""
 
-    def test_with_initial_prompt_adds_message(self, dummy_model):
-        """Providing initial_prompt adds it to messages."""
+    def test_with_initial_query_adds_message(self, dummy_model):
+        """Providing initial_query adds it to messages."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="I need help booking a flight",
+            initial_query="I need help booking a flight",
         )
 
         assert len(user.messages) == 1
         assert user.messages[0]["role"] == "user"
         assert user.messages[0]["content"] == "I need help booking a flight"
 
-    def test_without_initial_prompt_empty_messages(self, dummy_model):
-        """No initial_prompt means empty message history."""
+    def test_without_initial_query_empty_messages(self, dummy_model):
+        """No initial_query means empty message history."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model)
-        # No initial_prompt provided
+        # No initial_query provided
 
         assert len(user.messages) == 0
 
@@ -304,21 +311,21 @@ def test_get_initial_query_adds_to_messages(self, dummy_model):
         assert user.messages[0]["role"] == "user"
         assert user.messages[0]["content"] == "Help me please"
 
-    def test_get_initial_query_raises_if_messages_exist(self, dummy_model):
-        """get_initial_query() raises if messages already exist."""
+    def test_get_initial_query_returns_existing_query(self, dummy_model):
+        """get_initial_query() returns existing initial query if present."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Already have a message",
+            initial_query="Already have a message",
         )
 
-        with pytest.raises(RuntimeError, match="already has messages"):
-            user.get_initial_query()
+        query = user.get_initial_query()
+        assert query == "Already have a message"
 
-    def test_get_initial_query_not_counted_as_turn(self, dummy_model):
-        """Initial query doesn't increment turn count."""
+    def test_get_initial_query_counts_as_turn(self, dummy_model):
+        """Initial query increments turn count."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model, max_turns=3)
@@ -326,7 +333,7 @@ def test_get_initial_query_not_counted_as_turn(self, dummy_model):
 
         user.get_initial_query()
 
-        assert user._turn_count == 0  # Not incremented
+        assert user._turn_count == 1  # Counts as first turn
 
 
 # =============================================================================
@@ -339,13 +346,13 @@ class TestUserMessageHistory:
     """Tests for complete message tracing."""
 
     def test_initial_message_in_history(self, dummy_model):
-        """Initial prompt is in message history."""
+        """Initial query is in message history."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Hello agent",
+            initial_query="Hello agent",
         )
 
         assert len(user.messages) == 1
@@ -384,7 +391,7 @@ def test_full_conversation_tracked(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="I need a flight",
+            initial_query="I need a flight",
             max_turns=3,
         )
         user.simulator.side_effect = ["Monday works", "Yes, book it"]
@@ -410,7 +417,7 @@ def test_gather_traces_includes_all_messages(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Hello",
+            initial_query="Hello",
             max_turns=2,
         )
         user.simulator.return_value = "Got it"
diff --git a/tests/test_core/test_user_simulator.py b/tests/test_core/test_user_simulator.py
index f6bc606a..dda61aac 100644
--- a/tests/test_core/test_user_simulator.py
+++ b/tests/test_core/test_user_simulator.py
@@ -64,7 +64,7 @@ def test_user_initialization(self, dummy_model):
             model=dummy_model,
             user_profile={"role": "customer"},
             scenario="test scenario",
-            initial_prompt="Hello",
+            initial_query="Hello",
         )
 
         assert user.name == "test_user"
@@ -137,13 +137,20 @@ def test_simulate_response_returns_empty_when_done(self, dummy_model):
         response = user.simulate_response("More questions?")
         assert response == ""
 
-    def test_turn_count_starts_at_zero(self, dummy_model):
-        """Turn count starts at 0."""
+    def test_turn_count_starts_at_zero_without_initial_query(self, dummy_model):
+        """Turn count starts at 0 when no initial_query provided."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model)
         assert user._turn_count == 0
 
+    def test_turn_count_starts_at_one_with_initial_query(self, dummy_model):
+        """Turn count starts at 1 when initial_query is provided."""
+        from conftest import DummyUser
+
+        user = DummyUser(name="test", model=dummy_model, initial_query="Hello")
+        assert user._turn_count == 1
+
 
 # =============================================================================
 # Stop Token Tests
@@ -232,29 +239,29 @@ def test_fallback_message_when_only_stop_token(self, dummy_model):
 
 
 @pytest.mark.core
-class TestUserInitialPrompt:
-    """Tests for optional initial_prompt behavior."""
+class TestUserInitialQuery:
+    """Tests for optional initial_query behavior."""
 
-    def test_with_initial_prompt_adds_message(self, dummy_model):
-        """Providing initial_prompt adds it to messages."""
+    def test_with_initial_query_adds_message(self, dummy_model):
+        """Providing initial_query adds it to messages."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="I need help booking a flight",
+            initial_query="I need help booking a flight",
         )
 
         assert len(user.messages) == 1
         assert user.messages[0]["role"] == "user"
         assert user.messages[0]["content"] == "I need help booking a flight"
 
-    def test_without_initial_prompt_empty_messages(self, dummy_model):
-        """No initial_prompt means empty message history."""
+    def test_without_initial_query_empty_messages(self, dummy_model):
+        """No initial_query means empty message history."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model)
-        # No initial_prompt provided
+        # No initial_query provided
 
         assert len(user.messages) == 0
 
@@ -283,21 +290,21 @@ def test_get_initial_query_adds_to_messages(self, dummy_model):
         assert user.messages[0]["role"] == "user"
         assert user.messages[0]["content"] == "Help me please"
 
-    def test_get_initial_query_raises_if_messages_exist(self, dummy_model):
-        """get_initial_query() raises if messages already exist."""
+    def test_get_initial_query_returns_existing_query(self, dummy_model):
+        """get_initial_query() returns existing initial query if present."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Already have a message",
+            initial_query="Already have a message",
         )
 
-        with pytest.raises(RuntimeError, match="already has messages"):
-            user.get_initial_query()
+        query = user.get_initial_query()
+        assert query == "Already have a message"
 
-    def test_get_initial_query_not_counted_as_turn(self, dummy_model):
-        """Initial query doesn't increment turn count."""
+    def test_get_initial_query_counts_as_turn(self, dummy_model):
+        """Initial query increments turn count."""
         from conftest import DummyUser
 
         user = DummyUser(name="test", model=dummy_model, max_turns=3)
@@ -305,7 +312,7 @@ def test_get_initial_query_not_counted_as_turn(self, dummy_model):
 
         user.get_initial_query()
 
-        assert user._turn_count == 0  # Not incremented
+        assert user._turn_count == 1  # Counts as first turn
 
 
 # =============================================================================
@@ -318,13 +325,13 @@ class TestUserMessageHistory:
     """Tests for complete message tracing."""
 
     def test_initial_message_in_history(self, dummy_model):
-        """Initial prompt is in message history."""
+        """Initial query is in message history."""
         from conftest import DummyUser
 
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Hello agent",
+            initial_query="Hello agent",
         )
 
         assert len(user.messages) == 1
@@ -363,7 +370,7 @@ def test_full_conversation_tracked(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="I need a flight",
+            initial_query="I need a flight",
             max_turns=3,
         )
         user.simulator.side_effect = ["Monday works", "Yes, book it"]
@@ -389,7 +396,7 @@ def test_gather_traces_includes_all_messages(self, dummy_model):
         user = DummyUser(
             name="test",
             model=dummy_model,
-            initial_prompt="Hello",
+            initial_query="Hello",
             max_turns=2,
         )
         user.simulator.return_value = "Got it"
diff --git a/tests/test_interface/test_agent_integration/test_llamaindex_integration.py b/tests/test_interface/test_agent_integration/test_llamaindex_integration.py
index 498daa55..96c16cc4 100644
--- a/tests/test_interface/test_agent_integration/test_llamaindex_integration.py
+++ b/tests/test_interface/test_agent_integration/test_llamaindex_integration.py
@@ -60,7 +60,7 @@ def test_llamaindex_user_creation():
         model=mock_model,
         user_profile={"role": "tester"},
         scenario="test scenario",
-        initial_prompt="test prompt",
+        initial_query="test prompt",
     )
 
     assert user is not None
@@ -79,7 +79,7 @@ def test_llamaindex_user_get_tool():
         model=mock_model,
         user_profile={"role": "tester"},
         scenario="test scenario",
-        initial_prompt="test prompt",
+        initial_query="test prompt",
     )
 
     tool = user.get_tool()
diff --git a/tests/test_interface/test_agent_integration/test_smolagents_integration.py b/tests/test_interface/test_agent_integration/test_smolagents_integration.py
index 3fdb8412..be687899 100644
--- a/tests/test_interface/test_agent_integration/test_smolagents_integration.py
+++ b/tests/test_interface/test_agent_integration/test_smolagents_integration.py
@@ -60,7 +60,7 @@ def test_smolagents_user_creation():
         model=mock_model,
         user_profile={"role": "tester"},
         scenario="test scenario",
-        initial_prompt="test prompt",
+        initial_query="test prompt",
     )
 
     assert user is not None

From 2ff59e86139966c00ff8f6c174ab0c7868f76105 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 00:07:12 +0000
Subject: [PATCH 23/34] [skip ci] user termination reason recorded

---
 maseval/__init__.py  |  3 ++-
 maseval/core/user.py | 43 +++++++++++++++++++++++++++++++++----------
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/maseval/__init__.py b/maseval/__init__.py
index e81693a2..250a370d 100644
--- a/maseval/__init__.py
+++ b/maseval/__init__.py
@@ -17,7 +17,7 @@
 from .core.callbacks import MessageTracingAgentCallback
 from .core.simulator import ToolLLMSimulator, UserLLMSimulator
 from .core.model import ModelAdapter
-from .core.user import User
+from .core.user import User, TerminationReason
 from .core.evaluator import Evaluator
 from .core.history import MessageHistory, ToolInvocationHistory
 from .core.tracing import TraceableMixin
@@ -37,6 +37,7 @@
     "ToolLLMSimulator",
     "UserLLMSimulator",
     "User",
+    "TerminationReason",
     "MessageHistory",
     "Evaluator",
     "ToolInvocationHistory",
diff --git a/maseval/core/user.py b/maseval/core/user.py
index 5fa396a4..6e672001 100644
--- a/maseval/core/user.py
+++ b/maseval/core/user.py
@@ -5,10 +5,20 @@
 from typing import Dict, Any, Optional
 from abc import ABC, abstractmethod
 from datetime import datetime
+from enum import Enum
 import time
 from .history import MessageHistory
 
 
+class TerminationReason(Enum):
+    """Reason why user interaction terminated."""
+
+    NOT_TERMINATED = "not_terminated"
+    MAX_TURNS = "max_turns"
+    USER_TERMINATED = "user_terminated"  # stop token detected
+    MAX_TURNS_AND_USER_TERMINATED = "max_turns_and_user_terminated"  # both conditions met
+
+
 class User(ABC, TraceableMixin, ConfigurableMixin):
     """A class representing a simulated user that can interact with a multi-agent system (MAS).
 
@@ -239,18 +249,39 @@ def gather_traces(self) -> dict[str, Any]:
             "message_count": len(self.messages),
             "messages": self.messages.to_list(),
             "logs": self.logs,
+            "termination_reason": self.termination_reason.value,
         }
 
     @staticmethod
     def _summarize_response(response: str) -> str:
         return response[:2000]
 
+    @property
+    def termination_reason(self) -> TerminationReason:
+        """Get the reason why the user interaction terminated.
+
+        Returns:
+            TerminationReason indicating why is_done() returns True,
+            or NOT_TERMINATED if the interaction is still ongoing.
+        """
+        max_turns_reached = self._turn_count >= self.max_turns
+        user_terminated = self._stopped
+
+        if max_turns_reached and user_terminated:
+            return TerminationReason.MAX_TURNS_AND_USER_TERMINATED
+        elif max_turns_reached:
+            return TerminationReason.MAX_TURNS
+        elif user_terminated:
+            return TerminationReason.USER_TERMINATED
+        else:
+            return TerminationReason.NOT_TERMINATED
+
     def is_done(self) -> bool:
         """Check if the user interaction should end.
 
         The base implementation checks:
         1. If max_turns has been reached
-        2. If the user previously indicated satisfaction (via stop_token)
+        2. If the user previously indicated termination (via stop_token)
 
         Subclasses can override to add custom termination logic (e.g., LLM-based
         satisfaction checks) by calling super().is_done() first.
@@ -258,15 +289,7 @@ def is_done(self) -> bool:
         Returns:
             True if the user is done interacting, False to continue.
         """
-        # Hard limit on turns
-        if self._turn_count >= self.max_turns:
-            return True
-
-        # User previously indicated they're done
-        if self._stopped:
-            return True
-
-        return False
+        return self.termination_reason != TerminationReason.NOT_TERMINATED
 
     def _check_stop_token(self, response: str) -> tuple[bool, str]:
         """Check if response contains stop token and clean it up.

From d1b896a567215aaf774a732a263d9c370d6091ba Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 00:27:01 +0000
Subject: [PATCH 24/34] added better early stopping support to base User class

---
 CHANGELOG.md                                  |   4 +-
 maseval/benchmark/macs/.gitignore             |   1 -
 maseval/benchmark/macs/macs.py                |  12 +-
 .../macs/prompt_templates/user_simulator.txt  |  49 --
 maseval/core/simulator.py                     |  36 +-
 maseval/core/user.py                          |  20 +
 .../templates/user_llm_simulator_template.txt |   2 +-
 tests/conftest.py                             |   2 +
 .../test_macs/test_macs_user.py               |  13 +-
 .../test_benchmark/test_execution_loop.py     |   1 +
 tests/test_core/test_llm_simulator.py         |  55 +++
 tests/test_core/test_user.py                  |  82 +++-
 tests/test_core/test_user_simulator.py        | 450 ------------------
 13 files changed, 203 insertions(+), 524 deletions(-)
 delete mode 100644 maseval/benchmark/macs/prompt_templates/user_simulator.txt
 delete mode 100644 tests/test_core/test_user_simulator.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2410a085..9f008ed4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,10 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **User**
 
-- Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping (PR: #13)
+- Added `max_turns` and `stop_token` parameters to `User` base class for multi-turn support with early stopping. Same applied to `UserLLMSimulator`. (PR: #13)
 - Added `is_done()`, `_check_stop_token()`, and `increment_turn()` methods to `User` base class (PR: #13)
 - Added `get_initial_query()` method to `User` base class for LLM-generated initial messages (PR: #13)
-- Added `initial_prompt` parameter in `User` base class to trigger the agentic system. (PR: #13)
+- Added `initial_query` parameter in `User` base class to trigger the agentic system. (PR: #13)
 
 **Environment**
 
diff --git a/maseval/benchmark/macs/.gitignore b/maseval/benchmark/macs/.gitignore
index 5104a8b5..7426499f 100644
--- a/maseval/benchmark/macs/.gitignore
+++ b/maseval/benchmark/macs/.gitignore
@@ -2,7 +2,6 @@
 data/*
 
 # Ignore downloaded prompt templates (user.txt, system.txt, issues.txt)
-# But NOT user_simulator.txt which is a custom template committed to the repo
 prompt_templates/user.txt
 prompt_templates/system.txt
 prompt_templates/issues.txt
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index 46016461..de100694 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -407,7 +407,7 @@ class MACSUser(User):
 
     DEFAULT_MAX_TURNS = 5
     DEFAULT_STOP_TOKEN = "</stop>"
-    TEMPLATE_PATH = Path(__file__).parent / "prompt_templates" / "user_simulator.txt"
+    DEFAULT_EARLY_STOPPING_CONDITION = "ALL goals have been satisfactorily addressed by the assistant"
 
     def __init__(
         self,
@@ -418,6 +418,7 @@ def __init__(
         template: Optional[str] = None,
         max_turns: int = DEFAULT_MAX_TURNS,
         stop_token: str = DEFAULT_STOP_TOKEN,
+        early_stopping_condition: str = DEFAULT_EARLY_STOPPING_CONDITION,
     ):
         """Initialize MACS user simulator.
 
@@ -426,14 +427,12 @@ def __init__(
             scenario: Full scenario text (contains goals and user background)
             initial_query: The initial query to the agent
             name: User name for identification (default: "Simulated User")
-            template: Optional custom prompt template (uses MACS-specific default)
+            template: Optional custom prompt template (uses base UserLLMSimulator template)
             max_turns: Maximum conversation turns (default: 5, per MACS paper)
             stop_token: Token indicating user satisfaction (default: "</stop>")
+            early_stopping_condition: Description of when to emit stop token
+                (default: "ALL goals have been satisfactorily addressed by the assistant")
         """
-        # Load MACS-specific user simulator template if not provided
-        if template is None and self.TEMPLATE_PATH.exists():
-            template = self.TEMPLATE_PATH.read_text()
-
         # Extract user profile from scenario text
         user_profile = self._extract_user_profile(scenario)
 
@@ -446,6 +445,7 @@ def __init__(
             template=template,
             max_turns=max_turns,
             stop_token=stop_token,
+            early_stopping_condition=early_stopping_condition,
         )
 
     def get_tool(self) -> Any:
diff --git a/maseval/benchmark/macs/prompt_templates/user_simulator.txt b/maseval/benchmark/macs/prompt_templates/user_simulator.txt
deleted file mode 100644
index 38cc7866..00000000
--- a/maseval/benchmark/macs/prompt_templates/user_simulator.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-### ROLE
-You are simulating a human user interacting with an AI travel/service assistant. Generate the user's next response based on their profile and goals.
-
-### USER PROFILE
-{{user_profile}}
-
-### SCENARIO & GOALS
-The user's objective in this conversation:
-{{scenario}}
-
-### CONVERSATION HISTORY
-{{conversation_history}}
-
-### YOUR TASK
-Generate the user's next response. Follow these guidelines:
-
-1. **Check Goal Completion**: Review the Goals section carefully. If ALL goals have been satisfactorily addressed by the assistant in the conversation, end your response with the token `</stop>`.
-
-2. **Stay In Character**: Respond naturally as this specific user would, based on their profile.
-
-3. **Advance Goals**: If goals remain unmet, provide information or ask questions to help accomplish them.
-
-4. **Be Helpful**: Answer the assistant's questions directly. Provide relevant details from your profile when asked.
-
-### OUTPUT INSTRUCTIONS
-Respond with ONLY a valid JSON object containing:
-- "text": The user's response (include `</stop>` at the end if ALL goals are met)
-- "details": Key information extracted from your response
-
-Example when goals are still pending:
-```json
-{
-    "text": "Yes, I'd like to book the flight for December 20th. My preferred seats are 31A and 31B.",
-    "details": {
-        "date": "December 20th",
-        "seats": ["31A", "31B"]
-    }
-}
-```
-
-Example when ALL goals are satisfied:
-```json
-{
-    "text": "Perfect, thank you for booking the flight and providing the weather forecast. That's everything I needed! </stop>",
-    "details": {
-        "confirmation": "all_goals_met"
-    }
-}
-```
diff --git a/maseval/core/simulator.py b/maseval/core/simulator.py
index 65fb5565..5525ed8c 100644
--- a/maseval/core/simulator.py
+++ b/maseval/core/simulator.py
@@ -253,6 +253,8 @@ def __init__(
         template: Optional[str] = None,
         max_try: int = 3,
         generation_params: Optional[Dict[str, Any]] = None,
+        stop_token: Optional[str] = None,
+        early_stopping_condition: Optional[str] = None,
     ):
         """
         Initializes the UserLLMSimulator.
@@ -261,11 +263,29 @@ def __init__(
             model (ModelAdapter): The language model to use for generation.
             user_profile (Dict[str, str]): A dictionary containing the user's profile.
             scenario (str): The scenario for the user.
-            template (str, optional): A prompt template. Defaults to the one in the library. See `maseval.utils.templates.user_llm_simulator_template.txt`.
+            template (str, optional): A prompt template. Defaults to the one in the library.
+                See `maseval.utils.templates.user_llm_simulator_template.txt`.
             max_try (int, optional): Maximum number of model calls to attempt. Defaults to 3.
-            generation_params (Dict[str, Any], optional): Default generation parameters for the model. This overwrites the ModelAdapter's defaults if provided.
+            generation_params (Dict[str, Any], optional): Default generation parameters for the model.
+                This overwrites the ModelAdapter's defaults if provided.
                 Both can be overridden at call time. Defaults to None.
+            stop_token (Optional[str], optional): Token to include in responses when early
+                stopping condition is met. Must be provided together with early_stopping_condition.
+                Defaults to None.
+            early_stopping_condition (Optional[str], optional): A description of when the
+                user should stop the conversation (e.g., "all goals have been accomplished").
+                Must be provided together with stop_token. Defaults to None.
+
+        Raises:
+            ValueError: If only one of stop_token or early_stopping_condition is provided.
         """
+        # Validate early stopping configuration
+        if (stop_token is None) != (early_stopping_condition is None):
+            raise ValueError(
+                "stop_token and early_stopping_condition must both be set or both be None. "
+                f"Got stop_token={stop_token!r}, early_stopping_condition={early_stopping_condition!r}"
+            )
+
         if template is None:
             template_path = os.path.join(os.path.dirname(__file__), "utils", "templates", "user_llm_simulator_template.txt")
             with open(template_path, "r") as f:
@@ -274,6 +294,8 @@ def __init__(
         self.user_profile = user_profile
         self.scenario = scenario
         self.generation_params = generation_params or {}
+        self.stop_token = stop_token
+        self.early_stopping_condition = early_stopping_condition
 
     def __call__(
         self,
@@ -318,10 +340,20 @@ def _fill_prompt_template(self, **kwargs) -> str:
         for message in conversation_history:
             formatted_history += f"{message['role']}: {message['content']}\n"
 
+        # Build early stopping instructions if configured
+        early_stopping_instructions = ""
+        if self.stop_token and self.early_stopping_condition:
+            early_stopping_instructions = (
+                f"\n### EARLY STOPPING\n"
+                f"If the following condition is satisfied: {self.early_stopping_condition}\n"
+                f"Then end your response with the token `{self.stop_token}` to signal that the conversation should end.\n"
+            )
+
         replacements = {
             "user_profile": json.dumps(self.user_profile, indent=2),
             "scenario": self.scenario,
             "conversation_history": formatted_history,
+            "early_stopping_instructions": early_stopping_instructions,
         }
         for k, v in replacements.items():
             prompt = prompt.replace("{{" + k + "}}", str(v))
diff --git a/maseval/core/user.py b/maseval/core/user.py
index 6e672001..e815a170 100644
--- a/maseval/core/user.py
+++ b/maseval/core/user.py
@@ -53,6 +53,7 @@ class User(ABC, TraceableMixin, ConfigurableMixin):
         messages (MessageHistory): The conversation history between the user and the MAS.
         max_turns (int): Maximum number of user response turns.
         stop_token (Optional[str]): Token that triggers early stopping when detected.
+        early_stopping_condition (Optional[str]): Description of when to emit the stop token.
     """
 
     def __init__(
@@ -66,6 +67,7 @@ def __init__(
         max_try: int = 3,
         max_turns: int = 1,
         stop_token: Optional[str] = None,
+        early_stopping_condition: Optional[str] = None,
     ):
         """Initializes the User.
 
@@ -94,17 +96,34 @@ def __init__(
                 for benchmarks where termination is based on user satisfaction rather than
                 a fixed turn count. The token is stripped from the response. Defaults to
                 None (early stopping disabled).
+            early_stopping_condition (Optional[str], optional): A description of when the
+                user should stop the conversation (e.g., "all goals have been accomplished").
+                Used with stop_token to instruct the LLM when to emit the stop token.
+                Must be provided if stop_token is set. Defaults to None.
+
+        Raises:
+            ValueError: If only one of stop_token or early_stopping_condition is provided.
         """
+        # Validate early stopping configuration
+        if (stop_token is None) != (early_stopping_condition is None):
+            raise ValueError(
+                "stop_token and early_stopping_condition must both be set or both be None. "
+                f"Got stop_token={stop_token!r}, early_stopping_condition={early_stopping_condition!r}"
+            )
+
         self.name = name
         self.model = model
         self.user_profile = user_profile
         self.scenario = scenario
+        
         self.simulator = UserLLMSimulator(
             model=self.model,
             user_profile=self.user_profile,
             scenario=self.scenario,
             template=template,
             max_try=max_try,
+            stop_token=stop_token,
+            early_stopping_condition=early_stopping_condition,
         )
         # Initialize message history - empty or with initial query
         if initial_query is not None:
@@ -118,6 +137,7 @@ def __init__(
         # Multi-turn configuration
         self.max_turns = max_turns
         self.stop_token = stop_token
+        self.early_stopping_condition = early_stopping_condition
         self._turn_count = self._initial_turn_count
         self._stopped = False
 
diff --git a/maseval/core/utils/templates/user_llm_simulator_template.txt b/maseval/core/utils/templates/user_llm_simulator_template.txt
index 996a179e..b30abe7b 100644
--- a/maseval/core/utils/templates/user_llm_simulator_template.txt
+++ b/maseval/core/utils/templates/user_llm_simulator_template.txt
@@ -12,7 +12,7 @@ The user's objective in this conversation is to accomplish the following tasks:
 This is the conversation so far. You will provide the user's next turn.
 {{conversation_history}}
 
-
+{{early_stopping_instructions}}
 ### YOUR TASK
 Based on the user's profile, their goals, and the agent's last message, generate the next response from the user's perspective. Follow these steps:
 1.  Decide what the user would say next to advance their goals. This could be providing new information, answering a question, or asking for clarification.
diff --git a/tests/conftest.py b/tests/conftest.py
index 214c5188..75b147cd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -158,6 +158,7 @@ def __init__(self, name: str, model: ModelAdapter, **kwargs):
                 - initial_query: Optional initial message
                 - max_turns: Max interaction turns (default: 1)
                 - stop_token: Early termination token (default: None)
+                - early_stopping_condition: Description of when to stop (default: None)
         """
         super().__init__(
             name=name,
@@ -167,6 +168,7 @@ def __init__(self, name: str, model: ModelAdapter, **kwargs):
             initial_query=kwargs.get("initial_query"),
             max_turns=kwargs.get("max_turns", 1),
             stop_token=kwargs.get("stop_token"),
+            early_stopping_condition=kwargs.get("early_stopping_condition"),
         )
         # Replace simulator with a mock to avoid LLM calls
         # Tests can set simulator.return_value or side_effect as needed
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index 584f88dc..e69083a7 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -76,16 +76,19 @@ def test_init_with_custom_params(self, macs_model, sample_scenario, initial_quer
         assert user.name == "Test User"
         assert user.max_turns == 10
 
-    def test_init_loads_template(self, macs_model, sample_scenario, initial_query):
-        """Loads user_simulator.txt template."""
-        assert MACSUser.TEMPLATE_PATH.exists(), f"Template not found at {MACSUser.TEMPLATE_PATH}"
-
+    def test_init_uses_base_template_with_early_stopping(self, macs_model, sample_scenario, initial_query):
+        """Uses base UserLLMSimulator template with early stopping configured."""
         user = MACSUser(
             model=macs_model,
             scenario=sample_scenario,
             initial_query=initial_query,
         )
-        assert user is not None
+        # Verify early stopping is configured
+        assert user.stop_token == MACSUser.DEFAULT_STOP_TOKEN
+        assert user.early_stopping_condition == MACSUser.DEFAULT_EARLY_STOPPING_CONDITION
+        # Simulator should have the same config
+        assert user.simulator.stop_token == "</stop>"
+        assert "goals have been satisfactorily addressed" in user.simulator.early_stopping_condition
 
 
 # =============================================================================
diff --git a/tests/test_core/test_benchmark/test_execution_loop.py b/tests/test_core/test_benchmark/test_execution_loop.py
index 307aee9d..d6629fc1 100644
--- a/tests/test_core/test_benchmark/test_execution_loop.py
+++ b/tests/test_core/test_benchmark/test_execution_loop.py
@@ -233,6 +233,7 @@ def test_stops_when_user_done_via_stop_token(self, dummy_model):
             initial_query="Start",
             max_turns=10,
             stop_token="</stop>",
+            early_stopping_condition="goals are met",
         )
         # User stops on second response
         user.simulator.side_effect = ["Continue please", "Thanks! </stop>"]
diff --git a/tests/test_core/test_llm_simulator.py b/tests/test_core/test_llm_simulator.py
index bfe6194f..449bc5d6 100644
--- a/tests/test_core/test_llm_simulator.py
+++ b/tests/test_core/test_llm_simulator.py
@@ -152,3 +152,58 @@ def test_llm_simulator_gather_traces(self, dummy_model):
         assert "failed_calls" in traces
         assert "logs" in traces
         assert traces["successful_calls"] == 1
+
+
+@pytest.mark.core
+class TestUserLLMSimulatorValidation:
+    """Tests for UserLLMSimulator early stopping validation."""
+
+    def test_stop_token_without_condition_raises(self, dummy_model):
+        """ValueError raised when stop_token set but early_stopping_condition is None."""
+        from maseval.core.simulator import UserLLMSimulator
+
+        with pytest.raises(ValueError, match="must both be set or both be None"):
+            UserLLMSimulator(
+                model=dummy_model,
+                user_profile={"name": "test"},
+                scenario="test scenario",
+                stop_token="</stop>",
+            )
+
+    def test_condition_without_stop_token_raises(self, dummy_model):
+        """ValueError raised when early_stopping_condition set but stop_token is None."""
+        from maseval.core.simulator import UserLLMSimulator
+
+        with pytest.raises(ValueError, match="must both be set or both be None"):
+            UserLLMSimulator(
+                model=dummy_model,
+                user_profile={"name": "test"},
+                scenario="test scenario",
+                early_stopping_condition="goals are met",
+            )
+
+    def test_both_none_is_valid(self, dummy_model):
+        """No error when both stop_token and early_stopping_condition are None."""
+        from maseval.core.simulator import UserLLMSimulator
+
+        simulator = UserLLMSimulator(
+            model=dummy_model,
+            user_profile={"name": "test"},
+            scenario="test scenario",
+        )
+        assert simulator.stop_token is None
+        assert simulator.early_stopping_condition is None
+
+    def test_both_set_is_valid(self, dummy_model):
+        """No error when both stop_token and early_stopping_condition are set."""
+        from maseval.core.simulator import UserLLMSimulator
+
+        simulator = UserLLMSimulator(
+            model=dummy_model,
+            user_profile={"name": "test"},
+            scenario="test scenario",
+            stop_token="</stop>",
+            early_stopping_condition="all goals accomplished",
+        )
+        assert simulator.stop_token == "</stop>"
+        assert simulator.early_stopping_condition == "all goals accomplished"
diff --git a/tests/test_core/test_user.py b/tests/test_core/test_user.py
index 0d006ce4..1db0ab19 100644
--- a/tests/test_core/test_user.py
+++ b/tests/test_core/test_user.py
@@ -173,19 +173,50 @@ def test_no_stop_token_by_default(self, dummy_model):
 
         user = DummyUser(name="test", model=dummy_model)
         assert user.stop_token is None
+        assert user.early_stopping_condition is None
+
+    def test_stop_token_without_condition_raises(self, dummy_model):
+        """ValueError raised when stop_token set but early_stopping_condition is None."""
+        from conftest import DummyUser
+
+        with pytest.raises(ValueError, match="must both be set or both be None"):
+            DummyUser(name="test", model=dummy_model, stop_token="</stop>")
+
+    def test_condition_without_stop_token_raises(self, dummy_model):
+        """ValueError raised when early_stopping_condition set but stop_token is None."""
+        from conftest import DummyUser
+
+        with pytest.raises(ValueError, match="must both be set or both be None"):
+            DummyUser(
+                name="test",
+                model=dummy_model,
+                early_stopping_condition="goals are met",
+            )
 
     def test_custom_stop_token(self, dummy_model):
         """Custom stop_token is stored."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</done>")
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</done>",
+            early_stopping_condition="goals are met",
+        )
         assert user.stop_token == "</done>"
+        assert user.early_stopping_condition == "goals are met"
 
     def test_stop_token_detection_sets_stopped(self, dummy_model):
         """Detecting stop token sets _stopped = True."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</stop>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "Thanks! </stop>"
 
         user.simulate_response("Here's your answer")
@@ -196,7 +227,13 @@ def test_stop_token_removed_from_response(self, dummy_model):
         """Stop token is stripped from returned response."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</stop>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "Perfect, thanks! </stop>"
 
         response = user.simulate_response("Booking confirmed!")
@@ -208,7 +245,13 @@ def test_is_done_true_after_stop_token(self, dummy_model):
         """is_done() returns True after stop token detected."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</stop>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "Done </stop>"
 
         user.simulate_response("Result")
@@ -219,7 +262,13 @@ def test_stop_token_case_insensitive(self, dummy_model):
         """Stop token detection is case-insensitive."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</STOP>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</STOP>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "Thanks! </stop>"  # lowercase
 
         user.simulate_response("Answer")
@@ -230,7 +279,13 @@ def test_fallback_message_when_only_stop_token(self, dummy_model):
         """Provides fallback when response is only stop token."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</stop>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "</stop>"
 
         response = user.simulate_response("Done!")
@@ -242,7 +297,13 @@ def test_stop_token_response_counts_as_turn(self, dummy_model):
         """The response containing stop token still counts as a turn."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</stop>",
+            early_stopping_condition="goals are met",
+            max_turns=5,
+        )
         user.simulator.return_value = "Thank you, all is clear </stop>"
 
         initial_turn_count = user._turn_count
@@ -453,7 +514,12 @@ def test_config_includes_stop_token(self, dummy_model):
         """gather_config() includes stop_token."""
         from conftest import DummyUser
 
-        user = DummyUser(name="test", model=dummy_model, stop_token="</end>")
+        user = DummyUser(
+            name="test",
+            model=dummy_model,
+            stop_token="</end>",
+            early_stopping_condition="goals are met",
+        )
 
         config = user.gather_config()
 
diff --git a/tests/test_core/test_user_simulator.py b/tests/test_core/test_user_simulator.py
deleted file mode 100644
index dda61aac..00000000
--- a/tests/test_core/test_user_simulator.py
+++ /dev/null
@@ -1,450 +0,0 @@
-"""Test User simulator functionality.
-
-These tests verify that User simulator correctly manages conversation history,
-multi-turn interaction, and early stopping via stop tokens.
-"""
-
-import pytest
-
-
-@pytest.mark.core
-class TestUserSimulator:
-    """Tests for User simulator basics."""
-
-    def test_user_simulate_response_updates_messages(self, dummy_user):
-        """Test that simulate_response adds to message history."""
-        initial_len = len(dummy_user.messages)
-
-        # simulate_response adds assistant message, then user response
-        dummy_user.simulate_response("How can I help?")
-
-        # Should have added 2 messages: assistant question + user response
-        assert len(dummy_user.messages) == initial_len + 2
-
-    def test_user_messages_includes_both_sides(self, dummy_user):
-        """Test that messages includes both user and assistant messages."""
-        # Simulate a response (adds assistant + user messages)
-        dummy_user.simulate_response("Question for user")
-
-        messages = list(dummy_user.messages)
-        roles = [m["role"] for m in messages]
-        assert "user" in roles
-        assert "assistant" in roles
-
-    def test_user_gather_traces_includes_interactions(self, dummy_user):
-        """Test that gather_traces() includes conversation history."""
-        traces = dummy_user.gather_traces()
-
-        assert "type" in traces
-        assert "gathered_at" in traces
-        assert "name" in traces
-        assert "message_count" in traces
-        assert "messages" in traces
-
-        assert traces["name"] == "test_user"
-        assert isinstance(traces["messages"], list)
-        assert traces["message_count"] == len(traces["messages"])
-
-    def test_user_gather_config_includes_profile(self, dummy_user):
-        """Test that gather_config() includes user profile."""
-        config = dummy_user.gather_config()
-
-        assert "type" in config
-        assert "gathered_at" in config
-        assert "name" in config
-
-        assert config["name"] == "test_user"
-
-    def test_user_initialization(self, dummy_model):
-        """Test that User can be initialized with required parameters."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test_user",
-            model=dummy_model,
-            user_profile={"role": "customer"},
-            scenario="test scenario",
-            initial_query="Hello",
-        )
-
-        assert user.name == "test_user"
-        assert user.user_profile == {"role": "customer"}
-        assert user.scenario == "test scenario"
-        assert len(user.messages) == 1
-
-
-# =============================================================================
-# Multi-Turn Configuration Tests
-# =============================================================================
-
-
-@pytest.mark.core
-class TestUserMultiTurn:
-    """Tests for max_turns behavior."""
-
-    def test_default_max_turns_is_one(self, dummy_model):
-        """Default single-turn mode."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        assert user.max_turns == 1
-
-    def test_custom_max_turns(self, dummy_model):
-        """Custom max_turns is stored correctly."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=5)
-        assert user.max_turns == 5
-
-    def test_is_done_after_max_turns(self, dummy_model):
-        """is_done() returns True when turn count >= max_turns."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=2)
-        user._turn_count = 2
-
-        assert user.is_done()
-
-    def test_is_done_before_max_turns(self, dummy_model):
-        """is_done() returns False when turn count < max_turns."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=3)
-        user._turn_count = 1
-
-        assert not user.is_done()
-
-    def test_simulate_response_increments_turn_count(self, dummy_model):
-        """Each simulate_response() call increments _turn_count."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=5)
-        initial_count = user._turn_count
-
-        user.simulate_response("Question 1")
-        assert user._turn_count == initial_count + 1
-
-        user.simulate_response("Question 2")
-        assert user._turn_count == initial_count + 2
-
-    def test_simulate_response_returns_empty_when_done(self, dummy_model):
-        """Returns empty string when is_done() is True."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=1)
-        user._turn_count = 1  # Already at max
-
-        response = user.simulate_response("More questions?")
-        assert response == ""
-
-    def test_turn_count_starts_at_zero_without_initial_query(self, dummy_model):
-        """Turn count starts at 0 when no initial_query provided."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        assert user._turn_count == 0
-
-    def test_turn_count_starts_at_one_with_initial_query(self, dummy_model):
-        """Turn count starts at 1 when initial_query is provided."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, initial_query="Hello")
-        assert user._turn_count == 1
-
-
-# =============================================================================
-# Stop Token Tests
-# =============================================================================
-
-
-@pytest.mark.core
-class TestUserStopToken:
-    """Tests for stop_token early termination."""
-
-    def test_no_stop_token_by_default(self, dummy_model):
-        """stop_token is None by default."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        assert user.stop_token is None
-
-    def test_custom_stop_token(self, dummy_model):
-        """Custom stop_token is stored."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</done>")
-        assert user.stop_token == "</done>"
-
-    def test_stop_token_detection_sets_stopped(self, dummy_model):
-        """Detecting stop token sets _stopped = True."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
-        user.simulator.return_value = "Thanks! </stop>"
-
-        user.simulate_response("Here's your answer")
-
-        assert user._stopped
-
-    def test_stop_token_removed_from_response(self, dummy_model):
-        """Stop token is stripped from returned response."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
-        user.simulator.return_value = "Perfect, thanks! </stop>"
-
-        response = user.simulate_response("Booking confirmed!")
-
-        assert "</stop>" not in response
-        assert "Perfect, thanks!" in response
-
-    def test_is_done_true_after_stop_token(self, dummy_model):
-        """is_done() returns True after stop token detected."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
-        user.simulator.return_value = "Done </stop>"
-
-        user.simulate_response("Result")
-
-        assert user.is_done()
-
-    def test_stop_token_case_insensitive(self, dummy_model):
-        """Stop token detection is case-insensitive."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</STOP>", max_turns=5)
-        user.simulator.return_value = "Thanks! </stop>"  # lowercase
-
-        user.simulate_response("Answer")
-
-        assert user._stopped
-
-    def test_fallback_message_when_only_stop_token(self, dummy_model):
-        """Provides fallback when response is only stop token."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</stop>", max_turns=5)
-        user.simulator.return_value = "</stop>"
-
-        response = user.simulate_response("Done!")
-
-        assert response == "Thank you, that's all I needed!"
-        assert user._stopped
-
-
-# =============================================================================
-# Optional Initial Prompt Tests
-# =============================================================================
-
-
-@pytest.mark.core
-class TestUserInitialQuery:
-    """Tests for optional initial_query behavior."""
-
-    def test_with_initial_query_adds_message(self, dummy_model):
-        """Providing initial_query adds it to messages."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test",
-            model=dummy_model,
-            initial_query="I need help booking a flight",
-        )
-
-        assert len(user.messages) == 1
-        assert user.messages[0]["role"] == "user"
-        assert user.messages[0]["content"] == "I need help booking a flight"
-
-    def test_without_initial_query_empty_messages(self, dummy_model):
-        """No initial_query means empty message history."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        # No initial_query provided
-
-        assert len(user.messages) == 0
-
-    def test_get_initial_query_generates_message(self, dummy_model):
-        """get_initial_query() uses LLM to generate first message."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        user.simulator.return_value = "I want to book a hotel"
-
-        query = user.get_initial_query()
-
-        assert query == "I want to book a hotel"
-        user.simulator.assert_called_once()
-
-    def test_get_initial_query_adds_to_messages(self, dummy_model):
-        """Generated query is added to message history."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-        user.simulator.return_value = "Help me please"
-
-        user.get_initial_query()
-
-        assert len(user.messages) == 1
-        assert user.messages[0]["role"] == "user"
-        assert user.messages[0]["content"] == "Help me please"
-
-    def test_get_initial_query_returns_existing_query(self, dummy_model):
-        """get_initial_query() returns existing initial query if present."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test",
-            model=dummy_model,
-            initial_query="Already have a message",
-        )
-
-        query = user.get_initial_query()
-        assert query == "Already have a message"
-
-    def test_get_initial_query_counts_as_turn(self, dummy_model):
-        """Initial query increments turn count."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=3)
-        user.simulator.return_value = "Initial query"
-
-        user.get_initial_query()
-
-        assert user._turn_count == 1  # Counts as first turn
-
-
-# =============================================================================
-# Message History Completeness Tests
-# =============================================================================
-
-
-@pytest.mark.core
-class TestUserMessageHistory:
-    """Tests for complete message tracing."""
-
-    def test_initial_message_in_history(self, dummy_model):
-        """Initial query is in message history."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test",
-            model=dummy_model,
-            initial_query="Hello agent",
-        )
-
-        assert len(user.messages) == 1
-        assert user.messages[0]["content"] == "Hello agent"
-
-    def test_assistant_message_recorded(self, dummy_model):
-        """simulate_response() records assistant message before responding."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=3)
-        user.simulator.return_value = "User reply"
-
-        user.simulate_response("Agent says hello")
-
-        # Should have: assistant message + user response
-        assert len(user.messages) == 2
-        assert user.messages[0]["role"] == "assistant"
-        assert user.messages[0]["content"] == "Agent says hello"
-
-    def test_user_response_recorded(self, dummy_model):
-        """simulate_response() records user response."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=3)
-        user.simulator.return_value = "Thanks for the help"
-
-        user.simulate_response("Here's your answer")
-
-        assert user.messages[-1]["role"] == "user"
-        assert user.messages[-1]["content"] == "Thanks for the help"
-
-    def test_full_conversation_tracked(self, dummy_model):
-        """Multiple exchanges create complete conversation trace."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test",
-            model=dummy_model,
-            initial_query="I need a flight",
-            max_turns=3,
-        )
-        user.simulator.side_effect = ["Monday works", "Yes, book it"]
-
-        # Two agent-user exchanges
-        user.simulate_response("When do you want to travel?")
-        user.simulate_response("Shall I book it?")
-
-        messages = list(user.messages)
-        assert len(messages) == 5  # initial + 2*(assistant + user)
-
-        # Verify order
-        assert messages[0]["role"] == "user"  # initial
-        assert messages[1]["role"] == "assistant"
-        assert messages[2]["role"] == "user"
-        assert messages[3]["role"] == "assistant"
-        assert messages[4]["role"] == "user"
-
-    def test_gather_traces_includes_all_messages(self, dummy_model):
-        """gather_traces() includes complete conversation."""
-        from conftest import DummyUser
-
-        user = DummyUser(
-            name="test",
-            model=dummy_model,
-            initial_query="Hello",
-            max_turns=2,
-        )
-        user.simulator.return_value = "Got it"
-
-        user.simulate_response("Agent response")
-
-        traces = user.gather_traces()
-
-        assert traces["message_count"] == 3
-        assert len(traces["messages"]) == 3
-
-
-# =============================================================================
-# Config Tests
-# =============================================================================
-
-
-@pytest.mark.core
-class TestUserConfig:
-    """Tests for gather_config updates."""
-
-    def test_config_includes_max_turns(self, dummy_model):
-        """gather_config() includes max_turns."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, max_turns=7)
-
-        config = user.gather_config()
-
-        assert config["max_turns"] == 7
-
-    def test_config_includes_stop_token(self, dummy_model):
-        """gather_config() includes stop_token."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model, stop_token="</end>")
-
-        config = user.gather_config()
-
-        assert config["stop_token"] == "</end>"
-
-    def test_config_includes_none_stop_token(self, dummy_model):
-        """gather_config() includes stop_token even when None."""
-        from conftest import DummyUser
-
-        user = DummyUser(name="test", model=dummy_model)
-
-        config = user.gather_config()
-
-        assert "stop_token" in config
-        assert config["stop_token"] is None

From 7ee0b624279d8645b426cdebd86e8dd593a9c3ce Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 00:28:37 +0000
Subject: [PATCH 25/34] fixed test typing error

---
 maseval/core/user.py                              | 2 +-
 tests/test_benchmarks/test_macs/test_macs_user.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/maseval/core/user.py b/maseval/core/user.py
index e815a170..9b7a87a9 100644
--- a/maseval/core/user.py
+++ b/maseval/core/user.py
@@ -115,7 +115,7 @@ def __init__(
         self.model = model
         self.user_profile = user_profile
         self.scenario = scenario
-        
+
         self.simulator = UserLLMSimulator(
             model=self.model,
             user_profile=self.user_profile,
diff --git a/tests/test_benchmarks/test_macs/test_macs_user.py b/tests/test_benchmarks/test_macs/test_macs_user.py
index e69083a7..d48b60bd 100644
--- a/tests/test_benchmarks/test_macs/test_macs_user.py
+++ b/tests/test_benchmarks/test_macs/test_macs_user.py
@@ -88,6 +88,7 @@ def test_init_uses_base_template_with_early_stopping(self, macs_model, sample_sc
         assert user.early_stopping_condition == MACSUser.DEFAULT_EARLY_STOPPING_CONDITION
         # Simulator should have the same config
         assert user.simulator.stop_token == "</stop>"
+        assert user.simulator.early_stopping_condition is not None
         assert "goals have been satisfactorily addressed" in user.simulator.early_stopping_condition
 
 

From 9ea018226469b3b11feda752fe10eb88e81231e6 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 11:18:05 +0000
Subject: [PATCH 26/34] updated examples

---
 .../five_a_day_benchmark.ipynb                | 14 ++++++++---
 .../five_a_day_benchmark.py                   | 24 +++++++++++--------
 examples/introduction/tutorial.ipynb          | 14 ++++++++---
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
index 8f8c81bc..217e4840 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
@@ -124,7 +124,7 @@
     "from smolagents import ToolCallingAgent, LiteLLMModel, FinalAnswerTool\n",
     "\n",
     "# MASEval core components\n",
-    "from maseval import Benchmark, Environment, Task, TaskCollection, AgentAdapter, Evaluator\n",
+    "from maseval import Benchmark, Environment, Task, TaskCollection, AgentAdapter, Evaluator, ModelAdapter\n",
     "from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
     "\n",
     "# Import evaluators module (dynamically loaded later)\n",
@@ -663,11 +663,19 @@
     "\n",
     "        return evaluator_instances\n",
     "\n",
-    "    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment) -> Sequence[Any]:\n",
+    "    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:\n",
     "        \"\"\"Execute agents and return their final answers.\"\"\"\n",
-    "        answers = [agent.run(task.query) for agent in agents]\n",
+    "        answers = [agent.run(query) for agent in agents]\n",
     "        return answers\n",
     "\n",
+    "    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
+    "        \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
+    "        \n",
+    "        This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n",
+    "        so this method is not called during execution.\n",
+    "        \"\"\"\n",
+    "        raise NotImplementedError(\"This benchmark doesn't use model adapters for tools/users/evaluators.\")\n",
+    "\n",
     "    def evaluate(\n",
     "        self,\n",
     "        evaluators: Sequence[Evaluator],\n",
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.py b/examples/five_a_day_benchmark/five_a_day_benchmark.py
index 1c7c5115..40b45069 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.py
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.py
@@ -26,7 +26,7 @@
 
 from utils import derive_seed, sanitize_name  # type: ignore[unresolved-import]
 
-from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection, AgentAdapter
+from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection, AgentAdapter, ModelAdapter
 from maseval.core.callbacks.result_logger import FileResultLogger
 
 # Import tool implementations
@@ -263,7 +263,7 @@ def build_smolagents_single_agent(
         tools=tools,
         name=sanitized_name,
         instructions=primary_spec["agent_instruction"],
-        verbosity_level=2,
+        verbosity_level=0,
     )
 
     return SmolAgentAdapter(agent, primary_spec["agent_id"])
@@ -401,7 +401,7 @@ def build_smolagents_multi_agent(
             name=sanitized_name,
             description=agent_spec["agent_instruction"],
             instructions=agent_spec["agent_instruction"],
-            verbosity_level=2,
+            verbosity_level=0,
         )
         specialist_agents.append(specialist)
 
@@ -418,7 +418,7 @@ def build_smolagents_multi_agent(
         managed_agents=specialist_agents if specialist_agents else None,
         name=sanitized_primary_name,
         instructions=primary_spec["agent_instruction"],
-        verbosity_level=2,
+        verbosity_level=0,
     )
 
     return SmolAgentAdapter(agent, primary_spec["agent_id"])
@@ -740,10 +740,6 @@ def setup_agents(
         builder = get_agent_builder(framework, agent_type)
         agent_adapter = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
 
-        # Use .visualize() when smolagents # TODO remove
-        if framework == "smolagents":
-            agent_adapter.agent.visualize()
-
         return [agent_adapter], {primary_agent_id: agent_adapter}
 
     def setup_evaluators(self, environment, task, agents, user) -> Sequence[Evaluator]:
@@ -762,11 +758,19 @@ def setup_evaluators(self, environment, task, agents, user) -> Sequence[Evaluato
 
         return evaluator_instances
 
-    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment) -> Sequence[Any]:
+    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:
         """Execute agents and return their final answers."""
-        answers = [agent.run(task.query) for agent in agents]
+        answers = [agent.run(query) for agent in agents]
         return answers
 
+    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:
+        """Return a model adapter for benchmark components that need LLM access.
+
+        This benchmark doesn't use simulated tools, user simulators, or LLM judges,
+        so this method is not called during execution.
+        """
+        raise NotImplementedError("This benchmark doesn't use model adapters for tools/users/evaluators.")
+
     def evaluate(
         self,
         evaluators: Sequence[Evaluator],
diff --git a/examples/introduction/tutorial.ipynb b/examples/introduction/tutorial.ipynb
index 3b4b0846..058f9f47 100644
--- a/examples/introduction/tutorial.ipynb
+++ b/examples/introduction/tutorial.ipynb
@@ -509,7 +509,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from maseval import AgentAdapter\n",
+    "from maseval import AgentAdapter, ModelAdapter\n",
     "from typing import Sequence, Tuple\n",
     "\n",
     "\n",
@@ -546,13 +546,21 @@
     "        \"\"\"Create evaluators for the task.\"\"\"\n",
     "        return [FinancialAccuracyEvaluator(task, environment, user), EmailSentEvaluator(task, environment, user)]\n",
     "\n",
-    "    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment) -> Any:\n",
+    "    def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Any:\n",
     "        \"\"\"Execute the agent and return the final answer.\"\"\"\n",
     "        # Run the main agent with the task query\n",
     "        agent = agents[0]\n",
-    "        result = agent.run(task.query)\n",
+    "        result = agent.run(query)\n",
     "        return result\n",
     "\n",
+    "    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
+    "        \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
+    "        \n",
+    "        This tutorial doesn't use simulated tools, user simulators, or LLM judges,\n",
+    "        so this method is not called during execution.\n",
+    "        \"\"\"\n",
+    "        raise NotImplementedError(\"This tutorial doesn't use model adapters for tools/users/evaluators.\")\n",
+    "\n",
     "    def evaluate(\n",
     "        self, evaluators: Sequence[Evaluator], agents: Dict[str, AgentAdapter], final_answer: Any, traces: Dict[str, Any]\n",
     "    ) -> List[Dict[str, Any]]:\n",

From aa29258bdcb008bc80ec966bba194a4cacd0b54c Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 11:18:13 +0000
Subject: [PATCH 27/34] updated agent instructions

---
 AGENTS.md | 101 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 92 insertions(+), 9 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 7036a867..4f319423 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -30,10 +30,10 @@ uv run pytest tests/
 
 ```bash
 # Format code
-ruff format .
+uv run ruff format .
 
 # Lint and auto-fix issues
-ruff check . --fix
+uv run ruff check . --fix
 ```
 
 ## Testing Instructions
@@ -45,14 +45,14 @@ ruff check . --fix
 
 ```bash
 # Run all tests
-pytest -v
+uv run pytest -v
 
 # Core tests only (minimal dependencies)
-pytest -m core -v
+uv run pytest -m core -v
 
 # Specific integration tests
-pytest -m smolagents -v
-pytest -m interface -v
+uv run pytest -m smolagents -v
+uv run pytest -m interface -v
 ```
 
 ## Dependency Management
@@ -209,7 +209,7 @@ Example workflow:
 uv sync --all-extras --all-groups
 
 # Before committing
-ruff format . && ruff check . --fix && pytest -v
+uv run ruff format . && uv run ruff check . --fix && uv run pytest -v
 
 # Run example
 uv run python examples/amazon_collab.py
@@ -221,7 +221,7 @@ uv sync --all-extras --all-groups
 uv add --optional <extra-name> <package-name>
 
 # Check specific test file
-pytest tests/test_core/test_agent.py -v
+uv run pytest tests/test_core/test_agent.py -v
 ```
 
 ## Type Hinting
@@ -239,4 +239,87 @@ For lists and dictionaries, use `Dict[...,...]`, `List[...]`, `Sequence[...]` et
 
 ## Changelog
 
-When the task is completed, add your changes to the Changelog.
+When you complete a task, document your changes in the Changelog. Multiple tasks contribute to a single PR, and PRs are compiled into release changelogs.
+
+### User-Facing Documentation
+
+Write changelog entries from the **user's perspective** - describe what the change means for someone using the library, not what you did internally. Focus on features, fixes, and improvements they'll notice or benefit from.
+
+### Task-Level Documentation
+
+Add an entry for your completed task under the `## Unreleased` section.
+
+### Important Rules
+
+- If you modified something already listed under "Added" in `Unreleased`, **update that existing entry** instead of adding a new one under "Changed"
+- Keep entries focused on user impact, not implementation details
+- Multiple task entries will be grouped together under the same PR
+- PR changelogs are then compiled into release notes between versions
+
+### Format
+
+Brief description of the user-facing change (PR: #PR_NUMBER_PLACEHOLDER)
+
+### Example (User-Facing)
+
+**Good:**
+
+- Added support for custom retry strategies in API client with argument `retry` for `Client.__init__`. (PR: #13)
+- Fixed timeout errors when processing large datasets in `func` (PR: #4)
+
+**Bad (not user-focused):**
+
+- Refactored retry logic into separate module
+- Updated error handling in data_processor.py
+
+## Docstrings
+
+Write docstrings for **users**, not about your implementation process.
+
+### Rules
+
+- Describe what the code does and how to use it
+- Explain parameters, return values, and behavior
+- Never write narratives: "I did...", "First we...", "Then I..."
+- Never include quality claims: "rigorously tested", "well-optimized"
+- Omit implementation details users don't need
+
+### Bad (narrative, claims, implementation details)
+
+```
+def calculate_average(numbers: list) -> float:
+    """
+    I implemented this to calculate averages. First I sum the numbers,
+    then divide by count. Rigorously tested and optimized.
+    """
+```
+
+### Good (clear, user-focused)
+
+```
+def calculate_average(numbers: list) -> float:
+    """
+    Calculate the arithmetic mean of numbers.
+
+    Args:
+        numbers: List of numeric values
+
+    Returns:
+        Average as float
+
+    Raises:
+        ValueError: If list is empty
+    """
+```
+
+## Early-Release Status
+
+**This project is early-release. Clean, maintainable code is the priority - not backwards compatibility.**
+
+- Break APIs if it improves design
+- Refactor poor implementations
+- Remove technical debt as soon as you identify it
+- Don't preserve bad patterns for compatibility reasons
+- Focus on getting it right, not keeping it the same
+
+We have zero obligation to maintain backwards compatibility. If you find code messy, propose a fix.

From bf6c100ef32a7e9ba173e743a37b5d5b54df1385 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 12:52:01 +0000
Subject: [PATCH 28/34] fixed examples

---
 examples/five_a_day_benchmark/data/tasks.json | 13 ++--
 .../evaluators/code_generation.py             | 28 +++++---
 .../five_a_day_benchmark.ipynb                |  2 +-
 .../five_a_day_benchmark.py                   | 67 +++++++++++++------
 .../tools/code_execution.py                   | 21 +++---
 examples/introduction/tutorial.ipynb          | 49 ++++++++++++--
 6 files changed, 122 insertions(+), 58 deletions(-)

diff --git a/examples/five_a_day_benchmark/data/tasks.json b/examples/five_a_day_benchmark/data/tasks.json
index 8c6bb176..4a4dcfa9 100644
--- a/examples/five_a_day_benchmark/data/tasks.json
+++ b/examples/five_a_day_benchmark/data/tasks.json
@@ -70,7 +70,7 @@
   {
     "query": "I want to split my Apple stock equally among my children. Can you look up the current stock price and tell me how much each child would get if I give them equal shares?",
     "environment_data": {
-      "tools": ["websearch", "family_info", "calculator", "banking"],
+      "tools": ["stock_price", "family_info", "calculator", "banking"],
       "family_info": {
         "children": [
           { "name": "Emma", "age": 16 },
@@ -102,15 +102,10 @@
       ]
     },
     "metadata": {
-      "description": "Tests multi-step reasoning with web search, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
-      "tools_required": ["websearch", "family_info", "calculator", "banking"],
+      "description": "Tests multi-step reasoning with stock price lookup, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
+      "tools_required": ["stock_price", "family_info", "calculator", "banking"],
       "complexity": "medium",
-      "skills_tested": [
-        "web_search",
-        "data_retrieval",
-        "arithmetic",
-        "multi_step_reasoning"
-      ],
+      "skills_tested": ["data_retrieval", "arithmetic", "multi_step_reasoning"],
       "task_id": "finance_calculation"
     }
   },
diff --git a/examples/five_a_day_benchmark/evaluators/code_generation.py b/examples/five_a_day_benchmark/evaluators/code_generation.py
index 79e00087..7f860b7e 100644
--- a/examples/five_a_day_benchmark/evaluators/code_generation.py
+++ b/examples/five_a_day_benchmark/evaluators/code_generation.py
@@ -12,7 +12,7 @@
 
 from maseval import Evaluator, Environment, Task, User
 from .utils import normalize_final_answer, call_llm_judge
-from examples.five_a_day_benchmark.tools import get_safe_python_exec_environment
+from tools import get_safe_python_exec_environment
 
 
 class UnitTestEvaluator(Evaluator):
@@ -51,12 +51,15 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
             expected_output = test_case["expected_output"]
 
             try:
-                result = self._execute_code(code, self.function_name, test_input)
+                result, printed_output = self._execute_code(code, self.function_name, test_input)
                 passed = result == expected_output
                 test_results.append(passed)
 
                 if not passed:
-                    errors.append(f"Test {i}: expected {expected_output}, got {result}")
+                    error_msg = f"Test {i}: expected {expected_output}, got {result}"
+                    if printed_output:
+                        error_msg += f" [stdout: {printed_output.strip()}]"
+                    errors.append(error_msg)
 
             except Exception as e:
                 test_results.append(False)
@@ -73,8 +76,12 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
             "errors": errors if errors else None,
         }
 
-    def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
-        """Execute code safely using RestrictedPython and return result."""
+    def _execute_code(self, code: str, function_name: str, test_input: Any) -> tuple[Any, str]:
+        """Execute code safely using RestrictedPython and return result with captured output.
+
+        Returns:
+            Tuple of (result, printed_output) where printed_output contains any print() calls.
+        """
         # Compile with RestrictedPython
         compile_result = compile_restricted(code, "<evaluator>", "exec")
 
@@ -90,15 +97,20 @@ def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
 
         code_obj = compile_result.code if hasattr(compile_result, "code") else compile_result
 
-        # Get shared safe execution environment
-        safe_env = get_safe_python_exec_environment(include_print_collector=False)
+        # Get safe execution environment (includes PrintCollector)
+        safe_env = get_safe_python_exec_environment()
 
         exec(code_obj, safe_env)
 
         if function_name not in safe_env:
             raise ValueError(f"Function '{function_name}' not found in code")
 
-        return safe_env[function_name](test_input)
+        result = safe_env[function_name](test_input)
+
+        # Collect any print output
+        printed_output = safe_env.get("_print", lambda: "")()
+
+        return result, printed_output
 
     def _extract_code_from_answer(self, answer: str) -> Optional[str]:
         """Extract Python code from final answer string."""
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
index 217e4840..903aab92 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
@@ -670,7 +670,7 @@
     "\n",
     "    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
     "        \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
-    "        \n",
+    "\n",
     "        This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n",
     "        so this method is not called during execution.\n",
     "        \"\"\"\n",
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.py b/examples/five_a_day_benchmark/five_a_day_benchmark.py
index 40b45069..68a154b0 100644
--- a/examples/five_a_day_benchmark/five_a_day_benchmark.py
+++ b/examples/five_a_day_benchmark/five_a_day_benchmark.py
@@ -236,7 +236,7 @@ def build_smolagents_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single smolagents agent.
 
     Args:
@@ -247,7 +247,7 @@ def build_smolagents_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        SmolAgentAdapter wrapping the created agent
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from smolagents import ToolCallingAgent
     from maseval.interface.agents.smolagents import SmolAgentAdapter
@@ -266,7 +266,8 @@ def build_smolagents_single_agent(
         verbosity_level=0,
     )
 
-    return SmolAgentAdapter(agent, primary_spec["agent_id"])
+    adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_langgraph_single_agent(
@@ -275,7 +276,7 @@ def build_langgraph_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single langgraph agent.
 
     Args:
@@ -286,7 +287,7 @@ def build_langgraph_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        LangGraphAgentAdapter wrapping the created graph
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from langchain_core.messages import SystemMessage
     from langgraph.graph import StateGraph, END
@@ -323,7 +324,8 @@ def call_model(state: AgentState):
     workflow.add_edge("tools", "agent")
 
     graph = workflow.compile()
-    return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_llamaindex_single_agent(
@@ -332,7 +334,7 @@ def build_llamaindex_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single llamaindex agent.
 
     Args:
@@ -343,7 +345,7 @@ def build_llamaindex_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        LlamaIndexAgentAdapter wrapping the created agent
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from llama_index.core.agent.workflow.react_agent import ReActAgent
     from maseval.interface.agents.llamaindex import LlamaIndexAgentAdapter
@@ -361,7 +363,8 @@ def build_llamaindex_single_agent(
         system_prompt=primary_spec.get("agent_instruction"),
     )
 
-    return LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
+    adapter = LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_smolagents_multi_agent(
@@ -370,7 +373,7 @@ def build_smolagents_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build smolagents multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -381,12 +384,15 @@ def build_smolagents_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        SmolAgentAdapter wrapping the orchestrator agent
+        Tuple of (primary_adapter, all_adapters_dict) where all_adapters_dict
+        includes the orchestrator and all specialists for trace registration.
     """
     from smolagents import ToolCallingAgent, FinalAnswerTool
     from maseval.interface.agents.smolagents import SmolAgentAdapter
 
     specialist_agents = []
+    specialist_adapters_dict: Dict[str, Any] = {}
+
     for agent_spec in specialist_specs:
         specialist_seed = agent_spec.get("seed")
         specialist_model = get_model(model_id, "smolagents", temperature, specialist_seed)
@@ -404,6 +410,8 @@ def build_smolagents_multi_agent(
             verbosity_level=0,
         )
         specialist_agents.append(specialist)
+        # Create adapter for each specialist for trace registration
+        specialist_adapters_dict[agent_spec["agent_id"]] = SmolAgentAdapter(specialist, agent_spec["agent_id"])
 
     primary_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
     primary_tools = [adapter.tool for adapter in primary_adapters.values()]
@@ -421,7 +429,11 @@ def build_smolagents_multi_agent(
         verbosity_level=0,
     )
 
-    return SmolAgentAdapter(agent, primary_spec["agent_id"])
+    primary_adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
+
+    # Return primary adapter and dict of all adapters (including primary) for trace registration
+    all_adapters = {primary_spec["agent_id"]: primary_adapter, **specialist_adapters_dict}
+    return primary_adapter, all_adapters
 
 
 def build_langgraph_multi_agent(
@@ -430,7 +442,7 @@ def build_langgraph_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build langgraph multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -441,7 +453,8 @@ def build_langgraph_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        LangGraphAgentAdapter wrapping the multi-agent graph
+        Tuple of (primary_adapter, all_adapters_dict). Note: LangGraph multi-agent
+        compiles specialists into graph nodes, so only the graph is traceable.
     """
     from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
     from langchain_core.tools import tool as create_tool
@@ -578,7 +591,8 @@ def route_after_orchestrator(state: MultiAgentState):
         workflow.add_edge(agent_id, "orchestrator")
 
     graph = workflow.compile()
-    return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_llamaindex_multi_agent(
@@ -587,7 +601,7 @@ def build_llamaindex_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build llamaindex multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -598,7 +612,8 @@ def build_llamaindex_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        LlamaIndexAgentAdapter wrapping the orchestrator agent
+        Tuple of (primary_adapter, all_adapters_dict). Note: LlamaIndex multi-agent
+        uses handoff tools, so only the orchestrator is directly traceable.
     """
     from llama_index.core.agent.workflow.react_agent import ReActAgent
     from llama_index.core.tools import FunctionTool
@@ -666,7 +681,8 @@ async def run_specialist():
         system_prompt=primary_spec.get("agent_instruction"),
     )
 
-    return LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
+    adapter = LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def get_agent_builder(framework: str, agent_type: str):
@@ -723,7 +739,13 @@ def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environme
     def setup_agents(
         self, agent_data: Dict[str, Any], environment: Environment, task: Task, user=None
     ) -> tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-        """Create framework-specific agent with tools from environment."""
+        """Create framework-specific agent with tools from environment.
+
+        Returns:
+            Tuple of (agents_to_run, agents_dict):
+            - agents_to_run: List of adapters for agents that should be executed
+            - agents_dict: Dict of all adapters for trace registration (includes specialists)
+        """
         framework = agent_data["framework"]
         agent_type = agent_data["agent_type"]
         model_id = agent_data["model_config"]["model_id"]
@@ -736,11 +758,12 @@ def setup_agents(
         primary_spec = next(a for a in agents_specs if a["agent_id"] == primary_agent_id)
         specialist_specs = [a for a in agents_specs if a["agent_id"] != primary_agent_id]
 
-        # Build agent using unified interface
+        # Build agent using unified interface - now returns (primary_adapter, all_adapters_dict)
         builder = get_agent_builder(framework, agent_type)
-        agent_adapter = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
+        primary_adapter, all_adapters_dict = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
 
-        return [agent_adapter], {primary_agent_id: agent_adapter}
+        # Return primary adapter to run, and all adapters for trace registration
+        return [primary_adapter], all_adapters_dict
 
     def setup_evaluators(self, environment, task, agents, user) -> Sequence[Evaluator]:
         """Create evaluators based on task's evaluation_data.evaluators list."""
diff --git a/examples/five_a_day_benchmark/tools/code_execution.py b/examples/five_a_day_benchmark/tools/code_execution.py
index fc3b0b5e..bf4dd910 100644
--- a/examples/five_a_day_benchmark/tools/code_execution.py
+++ b/examples/five_a_day_benchmark/tools/code_execution.py
@@ -73,29 +73,24 @@ def get_safe_guards() -> dict:
     }
 
 
-def get_safe_python_exec_environment(include_print_collector: bool = False) -> dict:
+def get_safe_python_exec_environment() -> dict:
     """Get a complete safe execution environment for RestrictedPython.
 
-    Args:
-        include_print_collector: If True, includes PrintCollector for capturing print output.
-                                If False, print goes to stdout (useful for evaluators).
+    Always includes PrintCollector for capturing print output. After exec(),
+    retrieve captured output via: env.get('_print', lambda: '')().
 
     Returns:
         A dictionary suitable for use as globals in exec() with RestrictedPython.
     """
-    env = {
+    from RestrictedPython.PrintCollector import PrintCollector
+
+    return {
         **safe_globals,
         "__builtins__": get_safe_builtins(),
         **get_safe_guards(),
+        "_print_": PrintCollector,
     }
 
-    if include_print_collector:
-        from RestrictedPython.PrintCollector import PrintCollector
-
-        env["_print_"] = PrintCollector
-
-    return env
-
 
 class CodeExecutionState:
     """Shared state for code execution tools.
@@ -106,7 +101,7 @@ class CodeExecutionState:
     def __init__(self, test_cases: list[dict[str, Any]] | None = None):
         self.test_cases = test_cases or []
         # Get shared safe execution environment with print collector for capturing output
-        self.safe_env = get_safe_python_exec_environment(include_print_collector=True)
+        self.safe_env = get_safe_python_exec_environment()
 
 
 class PythonExecutorExecuteTool(BaseTool):
diff --git a/examples/introduction/tutorial.ipynb b/examples/introduction/tutorial.ipynb
index 058f9f47..367291f7 100644
--- a/examples/introduction/tutorial.ipynb
+++ b/examples/introduction/tutorial.ipynb
@@ -104,6 +104,32 @@
     "        return result\n",
     "\n",
     "\n",
+    "class SimpleInboxTool(Tool):\n",
+    "    \"\"\"A simple tool to read the email inbox.\"\"\"\n",
+    "\n",
+    "    name = \"get_inbox\"\n",
+    "    description = \"Retrieve all emails in the inbox. Returns sender, subject, and body for each email.\"\n",
+    "    inputs = {}\n",
+    "    output_type = \"string\"\n",
+    "\n",
+    "    def __init__(self, inbox: List[Dict], **kwargs):\n",
+    "        super().__init__(**kwargs)\n",
+    "        self.inbox = inbox\n",
+    "\n",
+    "    def forward(self) -> str:\n",
+    "        \"\"\"Return all emails in inbox as formatted string.\"\"\"\n",
+    "        if not self.inbox:\n",
+    "            return \"Inbox is empty.\"\n",
+    "\n",
+    "        result = \"Email Inbox:\\n\"\n",
+    "        for i, email in enumerate(self.inbox, 1):\n",
+    "            result += f\"\\n--- Email {i} ---\\n\"\n",
+    "            result += f\"From: {email['from']}\\n\"\n",
+    "            result += f\"Subject: {email['subject']}\\n\"\n",
+    "            result += f\"Body: {email['body']}\\n\"\n",
+    "        return result\n",
+    "\n",
+    "\n",
     "class SimpleEmailTool(Tool):\n",
     "    \"\"\"A simple tool to send emails.\"\"\"\n",
     "\n",
@@ -154,14 +180,25 @@
     "    {\"date\": \"2025-11-16\", \"description\": \"Property Maintenance\", \"amount\": -450, \"type\": \"expense\"},\n",
     "]\n",
     "\n",
+    "# Sample email inbox\n",
+    "email_inbox = [\n",
+    "    {\n",
+    "        \"from\": \"sarah.johnson@email.com\",\n",
+    "        \"to\": \"sean.crane85@mymail-online.biz\",\n",
+    "        \"subject\": \"Rental Payment Confirmation\",\n",
+    "        \"body\": \"Hi Sean, I just transferred the deposit ($2,000) and first month's rent ($1,500) to your account. Can you please confirm you received it? Thanks, Sarah\",\n",
+    "    }\n",
+    "]\n",
+    "\n",
     "# List to track sent emails\n",
     "sent_emails = []\n",
     "\n",
     "# Create tool instances\n",
     "banking_tool = SimpleBankingTool(transactions=banking_transactions)\n",
+    "inbox_tool = SimpleInboxTool(inbox=email_inbox)\n",
     "email_tool = SimpleEmailTool(sent_emails=sent_emails)\n",
     "\n",
-    "print(f\"Created {len([banking_tool, email_tool])} tools\")"
+    "print(f\"Created {len([banking_tool, inbox_tool, email_tool])} tools\")"
    ]
   },
   {
@@ -188,7 +225,7 @@
     "\n",
     "# Create the agent with tools and instructions\n",
     "agent = ToolCallingAgent(\n",
-    "    tools=[banking_tool, email_tool],\n",
+    "    tools=[banking_tool, inbox_tool, email_tool],\n",
     "    model=model,\n",
     "    instructions=\"\"\"You are a helpful assistant that helps users with email and banking tasks.\n",
     "Use the available tools to retrieve information and take appropriate actions.\n",
@@ -389,15 +426,17 @@
     "\n",
     "    def create_tools(self) -> Dict[str, Any]:\n",
     "        \"\"\"Create tool instances from environment data, keyed by name.\"\"\"\n",
-    "        # Get banking transactions from environment data\n",
+    "        # Get banking transactions and inbox from environment data\n",
     "        transactions = self.state.get(\"banking\", {}).get(\"bank_transactions\", [])\n",
+    "        inbox = self.state.get(\"email_inbox\", [])\n",
     "\n",
     "        # Create tool instances - track sent emails for evaluation\n",
     "        self.sent_emails: List[Dict] = []\n",
     "        banking_tool = SimpleBankingTool(transactions=transactions)\n",
+    "        inbox_tool = SimpleInboxTool(inbox=inbox)\n",
     "        email_tool = SimpleEmailTool(sent_emails=self.sent_emails)\n",
     "\n",
-    "        return {\"get_bank_transactions\": banking_tool, \"send_email\": email_tool}\n",
+    "        return {\"get_transactions\": banking_tool, \"get_inbox\": inbox_tool, \"send_email\": email_tool}\n",
     "\n",
     "\n",
     "print(\"Environment class defined!\")"
@@ -555,7 +594,7 @@
     "\n",
     "    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
     "        \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
-    "        \n",
+    "\n",
     "        This tutorial doesn't use simulated tools, user simulators, or LLM judges,\n",
     "        so this method is not called during execution.\n",
     "        \"\"\"\n",

From 1d32b31dba826d196c8067c60257610cb729249e Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 13:06:26 +0000
Subject: [PATCH 29/34] fixed small issues in macs example

---
 examples/macs_benchmark/macs_benchmark.py | 12 ++++-----
 maseval/benchmark/macs/data_loader.py     | 33 ++++++++++++++---------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index af29ab4c..2157ed72 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -30,16 +30,14 @@
 import argparse
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, TYPE_CHECKING
+from typing import Any, Dict, List, Literal, Optional, Tuple
 
 # Third-party imports (both frameworks will be installed)
 from google.genai import Client as GoogleGenAIClient
 
 # smolagents imports
 from smolagents import ToolCallingAgent, OpenAIServerModel, FinalAnswerTool
-
-if TYPE_CHECKING:
-    from smolagents import Tool as SmolagentsTool
+from smolagents import Tool as SmolagentsTool
 
 # langgraph imports
 from langchain_core.tools import StructuredTool
@@ -197,7 +195,7 @@ def setup_user(
             name="Simulated User",
             model=user_model,
             scenario=scenario,
-            initial_prompt=task.query,
+            initial_query=task.query,
         )
 
         # Register the user's simulator for tracing
@@ -234,13 +232,13 @@ def setup_agents(
 
         # Wrap all generic tools for smolagents and register them for tracing
         # Each tool has its own model from MACSEnvironment.create_tools()
+        # Models are already registered by the environment via get_model_adapter()
         tool_wrappers: Dict[str, SmolagentsToolWrapper] = {}
         for name, tool in environment.tools.items():
             wrapper = SmolagentsToolWrapper(tool)
             tool_wrappers[name] = wrapper
             self.register("tools", name, wrapper)
-            # Register the tool's model and simulator for tracing
-            self.register("models", f"model_tool_{name}", tool.model)
+            # Register the tool's simulator for tracing
             self.register("simulators", f"simulator_tool_{name}", tool.simulator)
 
         # Helper to get tools for an agent
diff --git a/maseval/benchmark/macs/data_loader.py b/maseval/benchmark/macs/data_loader.py
index 02106814..fa1d4181 100644
--- a/maseval/benchmark/macs/data_loader.py
+++ b/maseval/benchmark/macs/data_loader.py
@@ -546,20 +546,29 @@ def configure_model_ids(
     """
     for task in tasks:
         # Environment data: tool model ID
-        if "model_id" in task.environment_data and (not task.environment_data["model_id"] == tool_model_id):
-            raise ValueError(
-                f"Task {task.metadata.get('task_id', '')} already has tool `model_id` set to '{task.environment_data['model_id']}', cannot override with '{tool_model_id}'"
-            )
+        if tool_model_id is not None:
+            if "model_id" in task.environment_data and task.environment_data["model_id"] != tool_model_id:
+                raise ValueError(
+                    f"Task {task.metadata.get('task_id', '')} already has tool `model_id` set to '{task.environment_data['model_id']}', cannot override with '{tool_model_id}'"
+                )
+            task.environment_data["model_id"] = tool_model_id
+
         # User data: user model ID
-        if "model_id" in task.user_data and (not task.user_data["model_id"] == user_model_id):
-            raise ValueError(
-                f"Task {task.metadata.get('task_id', '')} already has user `model_id` set to '{task.user_data['model_id']}', cannot override with '{user_model_id}'"
-            )
+        if user_model_id is not None:
+            if "model_id" in task.user_data and task.user_data["model_id"] != user_model_id:
+                raise ValueError(
+                    f"Task {task.metadata.get('task_id', '')} already has user `model_id` set to '{task.user_data['model_id']}', cannot override with '{user_model_id}'"
+                )
+            task.user_data["model_id"] = user_model_id
+
         # Evaluation data: evaluator model ID
-        if "model_id" in task.evaluation_data and (not task.evaluation_data["model_id"] == evaluator_model_id):
-            raise ValueError(
-                f"Task {task.metadata.get('task_id', '')} already has evaluator `model_id` set to '{task.evaluation_data['model_id']}', cannot override with '{evaluator_model_id}'"
-            )
+        if evaluator_model_id is not None:
+            if "model_id" in task.evaluation_data and task.evaluation_data["model_id"] != evaluator_model_id:
+                raise ValueError(
+                    f"Task {task.metadata.get('task_id', '')} already has evaluator `model_id` set to '{task.evaluation_data['model_id']}', cannot override with '{evaluator_model_id}'"
+                )
+            task.evaluation_data["model_id"] = evaluator_model_id
+
     return tasks
 
 

From 3353a3282f999ee2baf4f6a7e56f1b62b01ee2a9 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 13:14:13 +0000
Subject: [PATCH 30/34] changed llm simulator to raise error

---
 CHANGELOG.md                          |  4 ++
 maseval/__init__.py                   |  3 +-
 maseval/core/simulator.py             | 66 +++++++++++++++++++++------
 tests/test_core/test_llm_simulator.py | 21 ++++++---
 4 files changed, 72 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f008ed4..a42c4908 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -59,6 +59,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - The `Evaluator` class now has a `filter_traces` base method to conveniently adapt the same evaluator to different entities in the traces (PR: #10).
 
+**Simulator**
+
+- The `LLMSimulator` now throws an exception when json cannot be decoded instead of returning the error message as text to the agent (PR: #13).
+
 **Other**
 
 - Documentation formatting improved. Added darkmode and links to `Github` (PR: #11).
diff --git a/maseval/__init__.py b/maseval/__init__.py
index 250a370d..85a22268 100644
--- a/maseval/__init__.py
+++ b/maseval/__init__.py
@@ -15,7 +15,7 @@
 from .core.callback_handler import CallbackHandler
 from .core.callback import BenchmarkCallback, EnvironmentCallback, AgentCallback
 from .core.callbacks import MessageTracingAgentCallback
-from .core.simulator import ToolLLMSimulator, UserLLMSimulator
+from .core.simulator import ToolLLMSimulator, UserLLMSimulator, SimulatorError
 from .core.model import ModelAdapter
 from .core.user import User, TerminationReason
 from .core.evaluator import Evaluator
@@ -36,6 +36,7 @@
     "MessageTracingAgentCallback",
     "ToolLLMSimulator",
     "UserLLMSimulator",
+    "SimulatorError",
     "User",
     "TerminationReason",
     "MessageHistory",
diff --git a/maseval/core/simulator.py b/maseval/core/simulator.py
index 5525ed8c..c891b937 100644
--- a/maseval/core/simulator.py
+++ b/maseval/core/simulator.py
@@ -9,6 +9,42 @@
 from enum import Enum
 
 
+class SimulatorError(Exception):
+    """Raised when a simulator fails to produce a valid result after all retries.
+
+    This exception is raised when the LLM simulator exhausts all retry attempts
+    without successfully parsing the model output. The benchmark catches this
+    exception and records it as a task execution failure.
+
+    Attributes:
+        message: Description of the failure.
+        attempts: Number of attempts made before failing.
+        last_error: The last error encountered during parsing.
+        logs: The complete log of all attempts for debugging.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        attempts: int = 0,
+        last_error: Optional[str] = None,
+        logs: Optional[List[Dict[str, Any]]] = None,
+    ):
+        self.message = message
+        self.attempts = attempts
+        self.last_error = last_error
+        self.logs = logs or []
+        super().__init__(self.message)
+
+    def __str__(self) -> str:
+        parts = [self.message]
+        if self.attempts > 0:
+            parts.append(f"(attempts: {self.attempts})")
+        if self.last_error:
+            parts.append(f"Last error: {self.last_error}")
+        return " ".join(parts)
+
+
 class LLMSimulator(ABC, TraceableMixin):
     """
     A base class for simulators that use an LLM.
@@ -103,7 +139,22 @@ def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **kwargs)
                 # )
             self.logs.append(entry)
 
-        return parsed_result if parsed_result is not None else self._get_error_result()
+        if parsed_result is not None:
+            return parsed_result
+
+        # All attempts failed - raise exception with details
+        last_error = None
+        for log in reversed(self.logs):
+            if log.get("id") == request_id and log.get("error"):
+                last_error = log["error"]
+                break
+
+        raise SimulatorError(
+            message=f"{self.__class__.__name__} failed to parse model output after {self.max_try} attempts",
+            attempts=self.max_try,
+            last_error=last_error,
+            logs=[log for log in self.logs if log.get("id") == request_id],
+        )
 
     def _call_model_and_parse(self, prompt: str) -> Any:
         """
@@ -126,13 +177,6 @@ def _parse_output(self, output: str) -> Any:
         """
         pass
 
-    @abstractmethod
-    def _get_error_result(self) -> Any:
-        """
-        Returns the error result when parsing fails.
-        """
-        pass
-
     def gather_traces(self) -> dict[str, Any]:
         """Gather execution traces from this simulator.
 
@@ -236,9 +280,6 @@ def _fill_prompt_template(self, **kwargs) -> str:
             prompt = prompt.replace("{{" + k + "}}", v)
         return prompt
 
-    def _get_error_result(self) -> tuple[str, Dict[str, Any]]:
-        return "Error: Failed to decode LLM output after multiple attempts.", {"raw_output": None}
-
 
 class UserLLMSimulator(LLMSimulator):
     """
@@ -358,6 +399,3 @@ def _fill_prompt_template(self, **kwargs) -> str:
         for k, v in replacements.items():
             prompt = prompt.replace("{{" + k + "}}", str(v))
         return prompt
-
-    def _get_error_result(self) -> str:
-        return "Error: Failed to get a response from the user simulator."
diff --git a/tests/test_core/test_llm_simulator.py b/tests/test_core/test_llm_simulator.py
index 449bc5d6..504d9da3 100644
--- a/tests/test_core/test_llm_simulator.py
+++ b/tests/test_core/test_llm_simulator.py
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from maseval.core.simulator import ToolLLMSimulator, SimulatorCallStatus
+from maseval.core.simulator import ToolLLMSimulator, SimulatorCallStatus, SimulatorError
 
 
 @pytest.mark.core
@@ -44,7 +44,7 @@ def test_llm_simulator_retry_logic(self, dummy_model):
         assert len(simulator.logs) == 2
 
     def test_llm_simulator_parsing_error_retry(self, dummy_model):
-        """Test that parsing errors trigger retries."""
+        """Test that parsing errors trigger retries and raise SimulatorError on exhaustion."""
         from conftest import DummyModelAdapter
 
         # All responses are invalid JSON
@@ -58,11 +58,15 @@ def test_llm_simulator_parsing_error_retry(self, dummy_model):
             max_try=3,
         )
 
-        result = simulator(actual_inputs={"param": "test"})
+        # Should raise SimulatorError after max_try attempts
+        with pytest.raises(SimulatorError) as exc_info:
+            simulator(actual_inputs={"param": "test"})
 
-        # Should fail after max_try attempts
-        assert result is not None  # Returns error result
-        assert len(simulator.logs) == 3  # All 3 attempts logged
+        # Verify exception details
+        assert exc_info.value.attempts == 3
+        assert exc_info.value.last_error is not None
+        assert len(exc_info.value.logs) == 3  # All 3 attempts in exception logs
+        assert len(simulator.logs) == 3  # All 3 attempts logged in simulator
 
     def test_llm_simulator_max_attempts_respected(self, dummy_model):
         """Test that max_try limit is respected."""
@@ -78,10 +82,13 @@ def test_llm_simulator_max_attempts_respected(self, dummy_model):
             max_try=2,  # Only allow 2 attempts
         )
 
-        _ = simulator(actual_inputs={"param": "test"})
+        # Should raise after 2 attempts
+        with pytest.raises(SimulatorError) as exc_info:
+            simulator(actual_inputs={"param": "test"})
 
         # Should stop after 2 attempts, not continue to 10
         assert len(simulator.logs) == 2
+        assert exc_info.value.attempts == 2
 
     def test_llm_simulator_history_structure(self, dummy_model):
         """Test that history entries have correct structure."""

From fce3d1f82142f9cf80e64cea64737e79027aed69 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 15:10:38 +0000
Subject: [PATCH 31/34] refined exception handling

---
 CHANGELOG.md                                  |  13 +
 examples/macs_benchmark/macs_benchmark.py     |  28 +-
 maseval/__init__.py                           |  38 +-
 maseval/benchmark/macs/macs.py                | 109 ++++-
 maseval/core/benchmark.py                     |  95 +++-
 maseval/core/exceptions.py                    | 414 ++++++++++++++++++
 maseval/core/simulator.py                     | 165 ++++++-
 scripts/run_macs.sh                           |   5 +
 .../test_macs/test_macs_benchmark.py          | 168 ++++++-
 .../test_benchmark_lifecycle.py               |  13 +-
 tests/test_core/test_exceptions.py            | 404 +++++++++++++++++
 11 files changed, 1407 insertions(+), 45 deletions(-)
 create mode 100644 maseval/core/exceptions.py
 create mode 100755 scripts/run_macs.sh
 create mode 100644 tests/test_core/test_exceptions.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a42c4908..300a8509 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+**Exceptions and Error Classification**
+
+- Added `AgentError`, `EnvironmentError`, `UserError` exception hierarchy in `maseval.core.exceptions` for classifying execution failures by responsibility (PR: #13)
+- Added `TaskExecutionStatus.AGENT_ERROR`, `ENVIRONMENT_ERROR`, `USER_ERROR`, `UNKNOWN_EXECUTION_ERROR` for fine-grained error classification enabling fair scoring (PR: #13)
+- Added validation helpers: `validate_argument_type()`, `validate_required_arguments()`, `validate_no_extra_arguments()`, `validate_arguments_from_schema()` for tool implementers (PR: #13)
+- Added `ToolSimulatorError` and `UserSimulatorError` exception subclasses that inherit from both `SimulatorError` and the appropriate MASEval error type (PR: #13)
+
 **Benchmarks**
 
 - MACS Benchmark: Multi-Agent Collaboration Scenarios benchmark (PR: #13)
@@ -41,6 +48,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+**Exception Handling**
+
+- Benchmark now classifies execution errors into `AGENT_ERROR` (agent's fault), `ENVIRONMENT_ERROR` (tool/infra failure), `USER_ERROR` (user simulator failure), or `UNKNOWN_EXECUTION_ERROR` (unclassified) instead of generic `TASK_EXECUTION_FAILED` (PR: #13)
+- `ToolLLMSimulator` now raises `ToolSimulatorError` (classified as `ENVIRONMENT_ERROR`) on failure (PR: #13)
+- `UserLLMSimulator` now raises `UserSimulatorError` (classified as `USER_ERROR`) on failure (PR: #13)
+
 **Environment**
 
 - `Environment.create_tools()` now returns `Dict[str, Any]` instead of `list` (PR: #13)
diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index 2157ed72..092c97bc 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -753,16 +753,36 @@ def run_benchmark(
     summary = compute_benchmark_metrics(results)
 
     # Print summary
-    print("\n" + "=" * 50)
+    print("\n" + "=" * 60)
     print("BENCHMARK SUMMARY")
-    print("=" * 50)
+    print("=" * 60)
     print(f"Framework: {framework}")
     print(f"Domain: {domain}")
-    print(f"Total Tasks: {summary['total_tasks']}")
+    total = summary["total_tasks"]
+    print(f"Total Tasks: {total}")
+    print(f"Scored Tasks: {summary['scored_tasks']}")
     print(f"Successful Tasks (Overall GSR=1.0): {summary['successful_tasks']}")
     print(f"Success Rate: {summary['success_rate']:.2%}")
 
-    print("\nMean Metrics:")
+    # Show status breakdown with counts and proportions
+    print("\nStatus Breakdown:")
+    status_counts = summary.get("status_counts", {})
+    for status, count in sorted(status_counts.items()):
+        pct = (count / total * 100) if total > 0 else 0
+        print(f"  {status:<25} {count:>4} ({pct:5.1f}%)")
+
+    # Show excluded tasks if any
+    excluded = summary.get("excluded", {})
+    total_excluded = sum(excluded.values())
+    if total_excluded > 0:
+        pct_excluded = (total_excluded / total * 100) if total > 0 else 0
+        print(f"\nExcluded from scoring: {total_excluded} ({pct_excluded:.1f}%)")
+        for status, count in sorted(excluded.items()):
+            if count > 0:
+                pct = (count / total * 100) if total > 0 else 0
+                print(f"  {status:<25} {count:>4} ({pct:5.1f}%)")
+
+    print("\nMean Metrics (scored tasks only):")
     for metric, value in summary["mean_metrics"].items():
         print(f"  {metric:<25} {value:.4f}")
 
diff --git a/maseval/__init__.py b/maseval/__init__.py
index 85a22268..11ea20d3 100644
--- a/maseval/__init__.py
+++ b/maseval/__init__.py
@@ -15,33 +15,67 @@
 from .core.callback_handler import CallbackHandler
 from .core.callback import BenchmarkCallback, EnvironmentCallback, AgentCallback
 from .core.callbacks import MessageTracingAgentCallback
-from .core.simulator import ToolLLMSimulator, UserLLMSimulator, SimulatorError
+from .core.simulator import (
+    ToolLLMSimulator,
+    UserLLMSimulator,
+    SimulatorError,
+    ToolSimulatorError,
+    UserSimulatorError,
+)
 from .core.model import ModelAdapter
 from .core.user import User, TerminationReason
 from .core.evaluator import Evaluator
 from .core.history import MessageHistory, ToolInvocationHistory
 from .core.tracing import TraceableMixin
+from .core.exceptions import (
+    MASEvalError,
+    AgentError,
+    EnvironmentError,
+    UserError,
+    validate_argument_type,
+    validate_required_arguments,
+    validate_no_extra_arguments,
+    validate_arguments_from_schema,
+)
 
 __all__ = [
+    # Tasks
     "Task",
     "TaskCollection",
+    # Core abstractions
     "Environment",
     "AgentAdapter",
     "Benchmark",
     "TaskExecutionStatus",
+    # Callbacks
     "CallbackHandler",
     "BenchmarkCallback",
     "EnvironmentCallback",
     "AgentCallback",
     "MessageTracingAgentCallback",
+    # Simulators
     "ToolLLMSimulator",
     "UserLLMSimulator",
     "SimulatorError",
+    "ToolSimulatorError",
+    "UserSimulatorError",
+    # User simulation
     "User",
     "TerminationReason",
-    "MessageHistory",
+    # Evaluation
     "Evaluator",
+    # History and tracing
+    "MessageHistory",
     "ToolInvocationHistory",
     "ModelAdapter",
     "TraceableMixin",
+    # Exceptions and validation
+    "MASEvalError",
+    "AgentError",
+    "EnvironmentError",
+    "UserError",
+    "validate_argument_type",
+    "validate_required_arguments",
+    "validate_no_extra_arguments",
+    "validate_arguments_from_schema",
 ]
diff --git a/maseval/benchmark/macs/macs.py b/maseval/benchmark/macs/macs.py
index de100694..abda8182 100644
--- a/maseval/benchmark/macs/macs.py
+++ b/maseval/benchmark/macs/macs.py
@@ -56,6 +56,9 @@ def get_model_adapter(self, model_id, **kwargs):
     ToolInvocationHistory,
     ToolLLMSimulator,
     User,
+    AgentError,
+    EnvironmentError,
+    validate_arguments_from_schema,
 )
 from maseval.core.config import ConfigurableMixin
 from maseval.core.tracing import TraceableMixin
@@ -85,6 +88,12 @@ def __init__(self, generic_tool: MACSGenericTool):
 
             def forward(self, **kwargs) -> str:
                 return self.generic_tool(**kwargs)
+
+    Error Classification:
+        - AgentError: Raised when agent provides invalid arguments (wrong types,
+          missing required args, constraint violations). Agent's fault.
+        - EnvironmentError: Raised when tool infrastructure fails after input
+          validation (LLM simulator fails, internal error). Not agent's fault.
     """
 
     def __init__(self, spec: Dict[str, Any], model: ModelAdapter):
@@ -125,8 +134,54 @@ def _schema_to_inputs(schema: Dict[str, Any]) -> Dict[str, Any]:
         return inputs
 
     def __call__(self, **kwargs) -> str:
-        """Execute the tool with simulated response."""
-        response, details = self.simulator(actual_inputs=kwargs)
+        """Execute the tool with simulated response.
+
+        Args:
+            **kwargs: Tool arguments provided by the agent.
+
+        Returns:
+            Simulated tool response string.
+
+        Raises:
+            AgentError: If agent provides invalid arguments (wrong types, missing
+                required args). This is the agent's fault.
+            EnvironmentError: If tool infrastructure fails after validation (LLM
+                simulator fails, internal error). Not the agent's fault.
+        """
+        # 1. VALIDATE INPUTS (agent's responsibility to get this right)
+        try:
+            validate_arguments_from_schema(
+                kwargs,
+                self.input_schema,
+                component=self.name,
+                strict=False,  # Allow extra args (some agents add metadata)
+            )
+        except AgentError:
+            # Re-raise AgentError as-is
+            raise
+        except (TypeError, ValueError, KeyError) as e:
+            # Convert other validation errors to AgentError
+            raise AgentError(
+                f"Invalid arguments for tool '{self.name}': {e}",
+                component=self.name,
+            ) from e
+
+        # 2. EXECUTE (our responsibility - if this fails after validation, it's on us)
+        try:
+            # ToolLLMSimulator raises ToolSimulatorError (subclass of EnvironmentError)
+            # on failure, so it's automatically classified correctly
+            response, details = self.simulator(actual_inputs=kwargs)
+        except EnvironmentError:
+            # Re-raise EnvironmentError as-is (includes ToolSimulatorError)
+            raise
+        except Exception as e:
+            # Any other error in our tool code is our fault
+            raise EnvironmentError(
+                f"Tool '{self.name}' internal error: {e}",
+                component=self.name,
+            ) from e
+
+        # 3. RECORD INVOCATION
         self.history.add_invocation(
             inputs=kwargs,
             outputs=response,
@@ -910,26 +965,69 @@ def evaluate(
 def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
     """Compute summary metrics across all benchmark results.
 
+    Infrastructure errors (environment errors, user simulator errors, evaluation errors,
+    unknown errors) are excluded from scoring metrics to ensure fair evaluation. Only
+    tasks that completed execution (successfully or with agent errors) are included in
+    the success rate and mean metric calculations.
+
     Args:
         results: List of result dicts from benchmark.run()
 
     Returns:
-        Dict with total_tasks, successful_tasks, success_rate, mean_metrics
+        Dict with:
+            - total_tasks: Total number of tasks attempted
+            - scored_tasks: Tasks included in scoring (excludes infrastructure errors)
+            - successful_tasks: Tasks with overall_gsr=1.0
+            - success_rate: successful_tasks / scored_tasks
+            - mean_metrics: Mean of each metric across scored tasks
+            - excluded: Dict with counts of excluded tasks by category
+            - status_counts: Dict with counts of each status type
     """
+    # Status values that indicate infrastructure failures (not agent's fault)
+    INFRASTRUCTURE_STATUSES = {
+        "environment_error",
+        "user_error",
+        "unknown_execution_error",
+        "evaluation_failed",
+        "setup_failed",
+    }
+
     if not results:
         return {
             "total_tasks": 0,
+            "scored_tasks": 0,
             "successful_tasks": 0,
             "success_rate": 0.0,
             "mean_metrics": {},
+            "excluded": {
+                "environment_error": 0,
+                "user_error": 0,
+                "unknown_execution_error": 0,
+                "evaluation_failed": 0,
+                "setup_failed": 0,
+            },
+            "status_counts": {},
         }
 
     total_tasks = len(results)
     metric_sums: Dict[str, float] = {}
     metric_counts: Dict[str, int] = {}
     successful_tasks = 0
+    scored_tasks = 0
+    status_counts: Dict[str, int] = {}
+    excluded_counts: Dict[str, int] = {s: 0 for s in INFRASTRUCTURE_STATUSES}
 
     for res in results:
+        status = res.get("status", "unknown")
+        status_counts[status] = status_counts.get(status, 0) + 1
+
+        # Skip infrastructure failures from scoring
+        if status in INFRASTRUCTURE_STATUSES:
+            excluded_counts[status] = excluded_counts.get(status, 0) + 1
+            continue
+
+        # Task is included in scoring
+        scored_tasks += 1
         evals = res.get("eval") or []
         found_success = False
 
@@ -945,12 +1043,15 @@ def compute_benchmark_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
         if found_success:
             successful_tasks += 1
 
-    success_rate = successful_tasks / total_tasks if total_tasks > 0 else 0.0
+    success_rate = successful_tasks / scored_tasks if scored_tasks > 0 else 0.0
     mean_metrics = {k: metric_sums[k] / metric_counts[k] if metric_counts[k] else 0.0 for k in metric_sums}
 
     return {
         "total_tasks": total_tasks,
+        "scored_tasks": scored_tasks,
         "successful_tasks": successful_tasks,
         "success_rate": success_rate,
         "mean_metrics": mean_metrics,
+        "excluded": excluded_counts,
+        "status_counts": status_counts,
     }
diff --git a/maseval/core/benchmark.py b/maseval/core/benchmark.py
index 37f7359d..14f6432f 100644
--- a/maseval/core/benchmark.py
+++ b/maseval/core/benchmark.py
@@ -21,6 +21,7 @@
     TqdmProgressBarCallback,
     RichProgressBarCallback,
 )
+from .exceptions import AgentError, EnvironmentError, UserError
 
 
 class TaskExecutionStatus(Enum):
@@ -29,18 +30,36 @@ class TaskExecutionStatus(Enum):
     This enum tracks the execution state of a task through the benchmark lifecycle,
     enabling graceful failure handling and comprehensive result reporting.
 
+    The status distinguishes between errors caused by the agent (agent's fault) and
+    errors caused by the evaluation infrastructure (environment, user simulator).
+    This enables fair scoring by excluding infrastructure failures.
+
     Attributes:
-        SUCCESS: Task executed and evaluated successfully
-        TASK_EXECUTION_FAILED: Agent execution raised an exception
-        EVALUATION_FAILED: Task executed but evaluation raised an exception
-        SETUP_FAILED: Setup phase (environment, agents, evaluators) raised an exception
+        SUCCESS: Task executed and evaluated successfully.
+        AGENT_ERROR: Agent violated contract at a boundary (agent's fault, counts against score).
+        ENVIRONMENT_ERROR: Environment/tool infrastructure failed (not agent's fault, exclude from scoring).
+        USER_ERROR: User simulator failed (not agent's fault, exclude from scoring).
+        UNKNOWN_EXECUTION_ERROR: Unclassified execution error (e.g., agent framework internal failure).
+        EVALUATION_FAILED: Task executed but evaluation raised an exception.
+        SETUP_FAILED: Setup phase (environment, agents, evaluators) raised an exception.
+
+    Scoring Guidance:
+        - Include in agent score: SUCCESS, AGENT_ERROR
+        - Exclude from agent score: ENVIRONMENT_ERROR, USER_ERROR, UNKNOWN_EXECUTION_ERROR
+        - Handle separately: EVALUATION_FAILED, SETUP_FAILED
     """
 
     SUCCESS = "success"
-    TASK_EXECUTION_FAILED = "task_execution_failed"
+    AGENT_ERROR = "agent_error"
+    ENVIRONMENT_ERROR = "environment_error"
+    USER_ERROR = "user_error"
+    UNKNOWN_EXECUTION_ERROR = "unknown_execution_error"
     EVALUATION_FAILED = "evaluation_failed"
     SETUP_FAILED = "setup_failed"
 
+    # Deprecated: kept for backward compatibility, use specific error types instead
+    TASK_EXECUTION_FAILED = "task_execution_failed"
+
 
 class Benchmark(ABC):
     """Abstract base class for orchestrating multi-agent system execution and evaluation.
@@ -1168,8 +1187,63 @@ def run(self, tasks: Union[Task, TaskCollection, Iterable[Union[Task, dict]]]) -
                 # 2. Execute agent system with optional user interaction loop
                 try:
                     final_answers = self.execution_loop(agents_to_run, task, environment, user)
+                except AgentError as e:
+                    # Agent violated contract at boundary (agent's fault)
+                    execution_status = TaskExecutionStatus.AGENT_ERROR
+                    error_info = {
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "component": e.component,
+                        "details": e.details,
+                        "traceback": "".join(__import__("traceback").format_exception(type(e), e, e.__traceback__)),
+                    }
+
+                    if self.fail_on_task_error:
+                        # Clear registry before re-raising
+                        self.clear_registry()
+                        raise
+
+                    # Continue with trace collection even if task failed
+                    final_answers = None
+                except EnvironmentError as e:
+                    # Environment/tool infrastructure failed (not agent's fault)
+                    execution_status = TaskExecutionStatus.ENVIRONMENT_ERROR
+                    error_info = {
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "component": e.component,
+                        "details": e.details,
+                        "traceback": "".join(__import__("traceback").format_exception(type(e), e, e.__traceback__)),
+                    }
+
+                    if self.fail_on_task_error:
+                        # Clear registry before re-raising
+                        self.clear_registry()
+                        raise
+
+                    # Continue with trace collection even if task failed
+                    final_answers = None
+                except UserError as e:
+                    # User simulator failed (not agent's fault)
+                    execution_status = TaskExecutionStatus.USER_ERROR
+                    error_info = {
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "component": e.component,
+                        "details": e.details,
+                        "traceback": "".join(__import__("traceback").format_exception(type(e), e, e.__traceback__)),
+                    }
+
+                    if self.fail_on_task_error:
+                        # Clear registry before re-raising
+                        self.clear_registry()
+                        raise
+
+                    # Continue with trace collection even if task failed
+                    final_answers = None
                 except Exception as e:
-                    execution_status = TaskExecutionStatus.TASK_EXECUTION_FAILED
+                    # Unclassified error (e.g., agent framework internal failure)
+                    execution_status = TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR
                     error_info = {
                         "error_type": type(e).__name__,
                         "error_message": str(e),
@@ -1332,11 +1406,16 @@ def get_failed_tasks(
 
         # Normalize status_filter to a set of status values (strings)
         if status_filter is None:
-            # All non-success statuses
+            # All non-success statuses (includes all classified and unclassified failures)
             filter_values = {
-                TaskExecutionStatus.TASK_EXECUTION_FAILED.value,
+                TaskExecutionStatus.AGENT_ERROR.value,
+                TaskExecutionStatus.ENVIRONMENT_ERROR.value,
+                TaskExecutionStatus.USER_ERROR.value,
+                TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR.value,
                 TaskExecutionStatus.EVALUATION_FAILED.value,
                 TaskExecutionStatus.SETUP_FAILED.value,
+                # Include deprecated status for backward compatibility
+                TaskExecutionStatus.TASK_EXECUTION_FAILED.value,
             }
         elif isinstance(status_filter, TaskExecutionStatus):
             filter_values = {status_filter.value}
diff --git a/maseval/core/exceptions.py b/maseval/core/exceptions.py
new file mode 100644
index 00000000..77fc8c08
--- /dev/null
+++ b/maseval/core/exceptions.py
@@ -0,0 +1,414 @@
+"""Exception hierarchy for MASEval error classification.
+
+MASEval distinguishes between errors caused by the agent (agent's fault) and errors
+caused by the evaluation infrastructure (environment, user simulator, etc.). This
+distinction enables fair scoring by excluding infrastructure failures from agent
+performance metrics.
+
+Exception Hierarchy:
+    MASEvalError (base)
+    ├── AgentError           - Agent violated contract at boundary (agent's fault)
+    ├── EnvironmentError     - Environment/tool infrastructure failed (not agent's fault)
+    └── UserError            - User simulator failed (not agent's fault)
+
+Usage in Tools:
+    The boundary between agent responsibility and environment responsibility is
+    INPUT VALIDATION. If validation passes but execution fails, it's an environment error.
+
+    ```python
+    def my_tool(a: int, b: int) -> int:
+        # 1. Validate inputs - agent's responsibility
+        if not isinstance(a, int):
+            raise AgentError(f"Expected int for 'a', got {type(a).__name__}")
+
+        # 2. Execute - our responsibility (if this fails, it's on us)
+        try:
+            return some_external_api_call(a, b)
+        except ExternalAPIError as e:
+            raise EnvironmentError(f"API call failed: {e}") from e
+    ```
+
+Usage in Benchmark Results:
+    After running a benchmark, filter results by error type for fair scoring:
+
+    ```python
+    results = benchmark.run(tasks)
+
+    # Tasks where agent is accountable
+    scoreable = [r for r in results if r["status"] in ("success", "agent_error")]
+
+    # Infrastructure failures to investigate separately
+    infra_failures = [r for r in results if r["status"] in ("environment_error", "user_error")]
+    ```
+"""
+
+from typing import Any, Dict, List, Optional
+
+
+class MASEvalError(Exception):
+    """Base exception for all MASEval-controlled component failures.
+
+    This is the base class for exceptions that occur at boundaries we control
+    (tools, environment, user simulator). Errors from agent framework internals
+    should NOT use this hierarchy - they remain as generic exceptions and are
+    classified as UNKNOWN_EXECUTION_ERROR.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        component: Optional[str] = None,
+        details: Optional[Dict[str, Any]] = None,
+    ):
+        """Initialize MASEvalError.
+
+        Args:
+            message: Human-readable error description.
+            component: Name of the component that raised the error (e.g., tool name).
+            details: Additional structured information about the error.
+        """
+        self.message = message
+        self.component = component
+        self.details = details or {}
+        super().__init__(message)
+
+    def __str__(self) -> str:
+        if self.component:
+            return f"[{self.component}] {self.message}"
+        return self.message
+
+
+class AgentError(MASEvalError):
+    """Agent violated the contract at a boundary we control.
+
+    Raised when the agent provides invalid inputs to components we control.
+    This is the agent's fault - these tasks count against their score.
+
+    The `suggestion` field provides agent-friendly hints for self-correction
+    that some agent frameworks may use for automatic recovery.
+
+    When to raise:
+        - Agent passed wrong argument types to a tool
+        - Agent passed arguments that violate documented constraints
+        - Agent is missing required arguments
+        - Agent called a tool with semantically invalid input
+        - Agent exceeded documented limits (max retries, rate limits, etc.)
+
+    Examples:
+        ```python
+        # Wrong type with suggestion
+        raise AgentError(
+            "Expected int for 'count', got str",
+            component="search_tool",
+            suggestion="Provide count as a number, e.g., count=10"
+        )
+
+        # Missing required argument
+        raise AgentError(
+            "Missing required argument 'query'",
+            component="search_tool",
+            suggestion="Include query='your search terms'"
+        )
+
+        # Constraint violation
+        raise AgentError(
+            "Argument 'limit' must be positive, got -5",
+            component="fetch_tool",
+            suggestion="Use a positive value, e.g., limit=10"
+        )
+        ```
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        component: Optional[str] = None,
+        details: Optional[Dict[str, Any]] = None,
+        suggestion: Optional[str] = None,
+    ):
+        """Initialize AgentError.
+
+        Args:
+            message: Human-readable error description explaining what went wrong.
+            component: Name of the component that raised the error (e.g., tool name).
+            details: Additional structured information about the error.
+            suggestion: Agent-friendly hint for correcting the error. Some agent
+                frameworks use this for automatic retry with corrected inputs.
+        """
+        super().__init__(message, component=component, details=details)
+        self.suggestion = suggestion
+
+    def __str__(self) -> str:
+        base = super().__str__()
+        if self.suggestion:
+            return f"{base}. Suggestion: {self.suggestion}"
+        return base
+
+
+class EnvironmentError(MASEvalError):
+    """Environment or tool infrastructure failed.
+
+    Raised when our code fails AFTER validating agent inputs. This indicates
+    a problem with the evaluation infrastructure, not the agent's behavior.
+    These tasks should be excluded from agent scoring.
+
+    When to raise:
+        - Tool implementation has a bug
+        - External API/database our tool depends on failed
+        - ToolLLMSimulator failed to parse model output
+        - Model adapter for tool simulation failed
+        - Resource exhaustion in environment components
+        - File I/O errors in environment setup
+
+    Examples:
+        ```python
+        # Tool bug
+        raise EnvironmentError("Internal error in calculation", component="calc_tool")
+
+        # External dependency failed
+        raise EnvironmentError("Database connection failed", component="db_tool")
+
+        # Simulator failed
+        raise EnvironmentError(
+            "Failed to parse LLM response after 3 attempts",
+            component="flight_search",
+            details={"attempts": 3, "last_error": "Invalid JSON"}
+        )
+        ```
+
+    Note:
+        Python has a built-in `EnvironmentError` (alias for `OSError`), but it's
+        rarely used directly. This class shadows it intentionally for clean semantics.
+        If you need the built-in, use `OSError` explicitly.
+    """
+
+    pass
+
+
+class UserError(MASEvalError):
+    """User simulator failed.
+
+    Raised when the user simulation infrastructure fails. This is NOT the
+    agent's fault - these tasks should be excluded from agent scoring.
+
+    When to raise:
+        - UserLLMSimulator couldn't reach the LLM API
+        - User model returned unparseable response after retries
+        - User simulator configuration error
+        - User profile data is malformed
+
+    Examples:
+        ```python
+        # API failure
+        raise UserError("OpenAI API unreachable", component="user_simulator")
+
+        # Parse failure
+        raise UserError(
+            "Failed to parse user response after 3 attempts",
+            component="user_simulator",
+            details={"attempts": 3, "last_error": "Missing 'text' field"}
+        )
+        ```
+    """
+
+    pass
+
+
+# =============================================================================
+# Convenience functions for tool implementers
+# =============================================================================
+
+
+def validate_argument_type(
+    value: Any,
+    expected_type: str,
+    arg_name: str,
+    component: Optional[str] = None,
+) -> None:
+    """Validate that a value matches an expected JSON schema type.
+
+    Raises AgentError if validation fails.
+
+    Args:
+        value: The value to validate.
+        expected_type: JSON schema type ("string", "integer", "number", "boolean", "array", "object").
+        arg_name: Name of the argument (for error message).
+        component: Optional component name for error context.
+
+    Raises:
+        AgentError: If value doesn't match expected type.
+
+    Example:
+        ```python
+        def my_tool(count: int, name: str):
+            validate_argument_type(count, "integer", "count", "my_tool")
+            validate_argument_type(name, "string", "name", "my_tool")
+            # ... tool logic
+        ```
+    """
+    type_map = {
+        "string": (str,),
+        "integer": (int,),
+        "number": (int, float),
+        "boolean": (bool,),
+        "array": (list,),
+        "object": (dict,),
+    }
+
+    # Special case: integer should not accept bool (bool is subclass of int in Python)
+    if expected_type == "integer" and isinstance(value, bool):
+        raise AgentError(
+            f"Argument '{arg_name}' expected integer, got boolean",
+            component=component,
+            details={"argument": arg_name, "expected": expected_type, "actual": type(value).__name__},
+            suggestion=f"Provide {arg_name} as an integer, e.g., 10 (not true/false)",
+        )
+
+    expected_types = type_map.get(expected_type)
+    if expected_types is None:
+        # Unknown type - accept anything
+        return
+
+    if not isinstance(value, expected_types):
+        # Build a suggestion based on expected type
+        type_hints = {
+            "string": 'a string, e.g., "example"',
+            "integer": "an integer, e.g., 10",
+            "number": "a number, e.g., 3.14",
+            "boolean": "a boolean, e.g., true or false",
+            "array": "a list, e.g., [1, 2, 3]",
+            "object": "an object, e.g., {}",
+        }
+        hint = type_hints.get(expected_type, f"a {expected_type}")
+        raise AgentError(
+            f"Argument '{arg_name}' expected {expected_type}, got {type(value).__name__}",
+            component=component,
+            details={"argument": arg_name, "expected": expected_type, "actual": type(value).__name__},
+            suggestion=f"Provide {arg_name} as {hint}",
+        )
+
+
+def validate_required_arguments(
+    kwargs: Dict[str, Any],
+    required: List[str],
+    component: Optional[str] = None,
+) -> None:
+    """Validate that all required arguments are present.
+
+    Raises AgentError if any required argument is missing.
+
+    Args:
+        kwargs: The keyword arguments dict to validate.
+        required: List of required argument names.
+        component: Optional component name for error context.
+
+    Raises:
+        AgentError: If any required argument is missing.
+
+    Example:
+        ```python
+        def my_tool(**kwargs):
+            validate_required_arguments(kwargs, ["query", "limit"], "my_tool")
+            # ... tool logic
+        ```
+    """
+    missing = [arg for arg in required if arg not in kwargs]
+    if missing:
+        raise AgentError(
+            f"Missing required argument(s): {', '.join(missing)}",
+            component=component,
+            details={"missing": missing, "required": required},
+            suggestion=f"Include the following argument(s): {', '.join(missing)}",
+        )
+
+
+def validate_no_extra_arguments(
+    kwargs: Dict[str, Any],
+    allowed: List[str],
+    component: Optional[str] = None,
+) -> None:
+    """Validate that no unexpected arguments are present.
+
+    Raises AgentError if any argument is not in the allowed list.
+
+    Args:
+        kwargs: The keyword arguments dict to validate.
+        allowed: List of allowed argument names.
+        component: Optional component name for error context.
+
+    Raises:
+        AgentError: If any unexpected argument is present.
+
+    Example:
+        ```python
+        def my_tool(**kwargs):
+            validate_no_extra_arguments(kwargs, ["query", "limit"], "my_tool")
+            # ... tool logic
+        ```
+    """
+    extra = [arg for arg in kwargs if arg not in allowed]
+    if extra:
+        raise AgentError(
+            f"Unexpected argument(s): {', '.join(extra)}",
+            component=component,
+            details={"unexpected": extra, "allowed": allowed},
+            suggestion=f"Remove the unexpected argument(s): {', '.join(extra)}. Valid arguments: {', '.join(allowed)}",
+        )
+
+
+def validate_arguments_from_schema(
+    kwargs: Dict[str, Any],
+    schema: Dict[str, Any],
+    component: Optional[str] = None,
+    *,
+    strict: bool = False,
+) -> None:
+    """Validate arguments against a JSON schema.
+
+    This is the main validation function for tool implementers. It validates:
+    - Required arguments are present
+    - Argument types match the schema
+    - No extra arguments (if strict=True)
+
+    Args:
+        kwargs: The keyword arguments dict to validate.
+        schema: JSON schema with 'properties' and optionally 'required'.
+        component: Optional component name for error context.
+        strict: If True, reject arguments not in schema. Default False.
+
+    Raises:
+        AgentError: If validation fails.
+
+    Example:
+        ```python
+        SCHEMA = {
+            "properties": {
+                "query": {"type": "string"},
+                "limit": {"type": "integer"},
+            },
+            "required": ["query"],
+        }
+
+        def my_tool(**kwargs):
+            validate_arguments_from_schema(kwargs, SCHEMA, "my_tool")
+            # ... tool logic
+        ```
+    """
+    properties = schema.get("properties", {})
+    required = schema.get("required", [])
+
+    # Check required arguments
+    validate_required_arguments(kwargs, required, component)
+
+    # Check for extra arguments if strict
+    if strict:
+        validate_no_extra_arguments(kwargs, list(properties.keys()), component)
+
+    # Validate types for provided arguments
+    for arg_name, value in kwargs.items():
+        if arg_name in properties:
+            expected_type = properties[arg_name].get("type")
+            if expected_type:
+                validate_argument_type(value, expected_type, arg_name, component)
diff --git a/maseval/core/simulator.py b/maseval/core/simulator.py
index c891b937..edeed8b7 100644
--- a/maseval/core/simulator.py
+++ b/maseval/core/simulator.py
@@ -5,16 +5,21 @@
 from datetime import datetime
 from .model import ModelAdapter
 from .tracing import TraceableMixin
+from .exceptions import EnvironmentError, UserError
 import uuid
 from enum import Enum
 
 
 class SimulatorError(Exception):
-    """Raised when a simulator fails to produce a valid result after all retries.
+    """Base exception for simulator failures.
 
-    This exception is raised when the LLM simulator exhausts all retry attempts
-    without successfully parsing the model output. The benchmark catches this
-    exception and records it as a task execution failure.
+    This exception is raised when an LLM simulator exhausts all retry attempts
+    without successfully parsing the model output.
+
+    Note:
+        Subclasses (ToolSimulatorError, UserSimulatorError) inherit from the
+        appropriate MASEval exception type for proper error classification.
+        Use those specific subclasses in concrete simulators.
 
     Attributes:
         message: Description of the failure.
@@ -29,15 +34,20 @@ def __init__(
         attempts: int = 0,
         last_error: Optional[str] = None,
         logs: Optional[List[Dict[str, Any]]] = None,
+        component: Optional[str] = None,
     ):
         self.message = message
         self.attempts = attempts
         self.last_error = last_error
         self.logs = logs or []
+        self.component = component
         super().__init__(self.message)
 
     def __str__(self) -> str:
-        parts = [self.message]
+        parts = []
+        if self.component:
+            parts.append(f"[{self.component}]")
+        parts.append(self.message)
         if self.attempts > 0:
             parts.append(f"(attempts: {self.attempts})")
         if self.last_error:
@@ -45,11 +55,85 @@ def __str__(self) -> str:
         return " ".join(parts)
 
 
+class ToolSimulatorError(SimulatorError, EnvironmentError):
+    """Tool simulator failed - not the agent's fault.
+
+    Raised when ToolLLMSimulator fails after exhausting retries.
+    This inherits from EnvironmentError, so it's classified as
+    ENVIRONMENT_ERROR in benchmark results.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        attempts: int = 0,
+        last_error: Optional[str] = None,
+        logs: Optional[List[Dict[str, Any]]] = None,
+        component: Optional[str] = None,
+    ):
+        # Initialize SimulatorError (sets message, attempts, last_error, logs, component)
+        SimulatorError.__init__(
+            self,
+            message=message,
+            attempts=attempts,
+            last_error=last_error,
+            logs=logs,
+            component=component,
+        )
+        # Initialize EnvironmentError for MASEval classification
+        EnvironmentError.__init__(
+            self,
+            message=message,
+            component=component,
+            details={"attempts": attempts, "last_error": last_error},
+        )
+
+
+class UserSimulatorError(SimulatorError, UserError):
+    """User simulator failed - not the agent's fault.
+
+    Raised when UserLLMSimulator fails after exhausting retries.
+    This inherits from UserError, so it's classified as
+    USER_ERROR in benchmark results.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        attempts: int = 0,
+        last_error: Optional[str] = None,
+        logs: Optional[List[Dict[str, Any]]] = None,
+        component: Optional[str] = None,
+    ):
+        # Initialize SimulatorError (sets message, attempts, last_error, logs, component)
+        SimulatorError.__init__(
+            self,
+            message=message,
+            attempts=attempts,
+            last_error=last_error,
+            logs=logs,
+            component=component,
+        )
+        # Initialize UserError for MASEval classification
+        UserError.__init__(
+            self,
+            message=message,
+            component=component,
+            details={"attempts": attempts, "last_error": last_error},
+        )
+
+
 class LLMSimulator(ABC, TraceableMixin):
     """
     A base class for simulators that use an LLM.
+
+    Subclasses should override `_create_error` to return the appropriate
+    exception type (ToolSimulatorError, UserSimulatorError, etc.).
     """
 
+    # Override in subclasses to specify component name for error messages
+    _component_name: Optional[str] = None
+
     def __init__(
         self,
         model: ModelAdapter,
@@ -77,6 +161,34 @@ def __init__(
         # Entry schema: {id, timestamp, input, raw_output, parsed_output, status}
         self.logs: list[dict[str, Any]] = []
 
+    def _create_error(
+        self,
+        message: str,
+        attempts: int,
+        last_error: Optional[str],
+        logs: List[Dict[str, Any]],
+    ) -> SimulatorError:
+        """Create the appropriate error type for this simulator.
+
+        Override in subclasses to return ToolSimulatorError or UserSimulatorError.
+
+        Args:
+            message: Error description.
+            attempts: Number of attempts made.
+            last_error: The last error encountered.
+            logs: Complete log of attempts.
+
+        Returns:
+            SimulatorError (or subclass) instance.
+        """
+        return SimulatorError(
+            message=message,
+            attempts=attempts,
+            last_error=last_error,
+            logs=logs,
+            component=self._component_name,
+        )
+
     def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **kwargs) -> Any:
         """
         Generates a simulated output.
@@ -149,7 +261,7 @@ def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **kwargs)
                 last_error = log["error"]
                 break
 
-        raise SimulatorError(
+        raise self._create_error(
             message=f"{self.__class__.__name__} failed to parse model output after {self.max_try} attempts",
             attempts=self.max_try,
             last_error=last_error,
@@ -214,6 +326,9 @@ class SimulatorCallStatus(Enum):
 class ToolLLMSimulator(LLMSimulator):
     """
     A simulator that uses an LLM to generate plausible tool outputs.
+
+    Raises ToolSimulatorError on failure, which is classified as
+    ENVIRONMENT_ERROR (not the agent's fault).
     """
 
     def __init__(
@@ -246,10 +361,27 @@ def __init__(
                 template = f.read()
         super().__init__(model, template, max_try)
         self.tool_name = tool_name
+        self._component_name = tool_name  # For error messages
         self.tool_description = tool_description
         self.tool_inputs = tool_inputs
         self.generation_params = generation_params or {}
 
+    def _create_error(
+        self,
+        message: str,
+        attempts: int,
+        last_error: Optional[str],
+        logs: List[Dict[str, Any]],
+    ) -> ToolSimulatorError:
+        """Create ToolSimulatorError for tool simulation failures."""
+        return ToolSimulatorError(
+            message=message,
+            attempts=attempts,
+            last_error=last_error,
+            logs=logs,
+            component=self.tool_name,
+        )
+
     def __call__(self, generation_params: Optional[Dict[str, Any]] = None, **actual_inputs: Dict[str, Any]) -> Tuple[str, Dict[str, Any]]:
         return super().__call__(generation_params=generation_params, **actual_inputs)
 
@@ -284,8 +416,13 @@ def _fill_prompt_template(self, **kwargs) -> str:
 class UserLLMSimulator(LLMSimulator):
     """
     A simulator that uses an LLM to act as the user.
+
+    Raises UserSimulatorError on failure, which is classified as
+    USER_ERROR (not the agent's fault).
     """
 
+    _component_name = "user_simulator"
+
     def __init__(
         self,
         model: ModelAdapter,
@@ -338,6 +475,22 @@ def __init__(
         self.stop_token = stop_token
         self.early_stopping_condition = early_stopping_condition
 
+    def _create_error(
+        self,
+        message: str,
+        attempts: int,
+        last_error: Optional[str],
+        logs: List[Dict[str, Any]],
+    ) -> UserSimulatorError:
+        """Create UserSimulatorError for user simulation failures."""
+        return UserSimulatorError(
+            message=message,
+            attempts=attempts,
+            last_error=last_error,
+            logs=logs,
+            component="user_simulator",
+        )
+
     def __call__(
         self,
         conversation_history: List[Dict[str, str]],
diff --git a/scripts/run_macs.sh b/scripts/run_macs.sh
new file mode 100755
index 00000000..8c56ff3e
--- /dev/null
+++ b/scripts/run_macs.sh
@@ -0,0 +1,5 @@
+# !/bin/bash
+
+pwd
+
+uv run python examples/macs_benchmark/macs_benchmark.py --framework smolagents --domain travel --limit 2
\ No newline at end of file
diff --git a/tests/test_benchmarks/test_macs/test_macs_benchmark.py b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
index c0b33d01..eeceddcd 100644
--- a/tests/test_benchmarks/test_macs/test_macs_benchmark.py
+++ b/tests/test_benchmarks/test_macs/test_macs_benchmark.py
@@ -361,51 +361,63 @@ def test_empty_results(self):
         result = compute_benchmark_metrics([])
 
         assert result["total_tasks"] == 0
+        assert result["scored_tasks"] == 0
         assert result["successful_tasks"] == 0
         assert result["success_rate"] == 0.0
         assert result["mean_metrics"] == {}
+        assert result["excluded"] == {
+            "environment_error": 0,
+            "user_error": 0,
+            "unknown_execution_error": 0,
+            "evaluation_failed": 0,
+            "setup_failed": 0,
+        }
+        assert result["status_counts"] == {}
 
     def test_single_successful_result(self):
         """Single successful result counted."""
-        results = [{"eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
+        results = [{"status": "completed", "eval": [{"overall_gsr": 1.0, "user_gsr": 1.0, "system_gsr": 1.0}]}]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 1
+        assert metrics["scored_tasks"] == 1
         assert metrics["successful_tasks"] == 1
         assert metrics["success_rate"] == 1.0
 
     def test_single_failed_result(self):
         """Single failed result counted."""
-        results = [{"eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
+        results = [{"status": "completed", "eval": [{"overall_gsr": 0.0, "user_gsr": 0.0, "system_gsr": 0.0}]}]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 1
+        assert metrics["scored_tasks"] == 1
         assert metrics["successful_tasks"] == 0
         assert metrics["success_rate"] == 0.0
 
     def test_multiple_results(self):
         """Multiple results aggregated correctly."""
         results = [
-            {"eval": [{"overall_gsr": 1.0}]},  # Success
-            {"eval": [{"overall_gsr": 0.0}]},  # Fail
-            {"eval": [{"overall_gsr": 1.0}]},  # Success
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},  # Success
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},  # Fail
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},  # Success
         ]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 3
+        assert metrics["scored_tasks"] == 3
         assert metrics["successful_tasks"] == 2
         assert metrics["success_rate"] == pytest.approx(2 / 3)
 
     def test_success_rate_calculation(self):
-        """success_rate = successful/total."""
+        """success_rate = successful/scored (not total)."""
         results = [
-            {"eval": [{"overall_gsr": 1.0}]},
-            {"eval": [{"overall_gsr": 1.0}]},
-            {"eval": [{"overall_gsr": 0.0}]},
-            {"eval": [{"overall_gsr": 0.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -415,8 +427,8 @@ def test_success_rate_calculation(self):
     def test_mean_metrics_calculation(self):
         """Mean of numeric metrics computed."""
         results = [
-            {"eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
-            {"eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0, "partial_gsr": 0.8}]},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0, "partial_gsr": 0.4}]},
         ]
 
         metrics = compute_benchmark_metrics(results)
@@ -427,27 +439,29 @@ def test_mean_metrics_calculation(self):
     def test_handles_missing_eval(self):
         """Handles results with no eval key."""
         results = [
-            {"eval": [{"overall_gsr": 1.0}]},
-            {"no_eval_key": True},  # Missing eval
-            {"eval": None},  # None eval
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "completed", "no_eval_key": True},  # Missing eval
+            {"status": "completed", "eval": None},  # None eval
         ]
 
         metrics = compute_benchmark_metrics(results)
 
         assert metrics["total_tasks"] == 3
+        assert metrics["scored_tasks"] == 3
         assert metrics["successful_tasks"] == 1
 
     def test_handles_non_numeric_values(self):
         """Non-numeric values in eval are ignored for mean."""
         results = [
             {
+                "status": "completed",
                 "eval": [
                     {
                         "overall_gsr": 1.0,
                         "report": [{"assertion": "A"}],  # Non-numeric
                         "status": "success",  # String
                     }
-                ]
+                ],
             }
         ]
 
@@ -458,6 +472,128 @@ def test_handles_non_numeric_values(self):
         assert "report" not in metrics["mean_metrics"]
         assert "status" not in metrics["mean_metrics"]
 
+    def test_excludes_environment_errors_from_scoring(self):
+        """Environment errors are excluded from scoring."""
+        results = [
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "environment_error", "eval": None},  # Should be excluded
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 3
+        assert metrics["scored_tasks"] == 2  # Only completed tasks
+        assert metrics["successful_tasks"] == 1
+        assert metrics["success_rate"] == 0.5  # 1/2, not 1/3
+        assert metrics["excluded"]["environment_error"] == 1
+
+    def test_excludes_user_errors_from_scoring(self):
+        """User simulator errors are excluded from scoring."""
+        results = [
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "user_error", "eval": None},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 2
+        assert metrics["scored_tasks"] == 1
+        assert metrics["successful_tasks"] == 1
+        assert metrics["success_rate"] == 1.0  # Only the completed one
+        assert metrics["excluded"]["user_error"] == 1
+
+    def test_excludes_unknown_errors_from_scoring(self):
+        """Unknown execution errors are excluded from scoring."""
+        results = [
+            {"status": "unknown_execution_error", "eval": None},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 2
+        assert metrics["scored_tasks"] == 1
+        assert metrics["success_rate"] == 0.0
+        assert metrics["excluded"]["unknown_execution_error"] == 1
+
+    def test_excludes_setup_failed_from_scoring(self):
+        """Setup failures are excluded from scoring."""
+        results = [
+            {"status": "setup_failed", "eval": None},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 2
+        assert metrics["scored_tasks"] == 1
+        assert metrics["excluded"]["setup_failed"] == 1
+
+    def test_excludes_evaluation_failed_from_scoring(self):
+        """Evaluation failures are excluded from scoring."""
+        results = [
+            {"status": "evaluation_failed", "eval": None},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 2
+        assert metrics["scored_tasks"] == 1
+        assert metrics["success_rate"] == 1.0  # Only the completed one
+        assert metrics["excluded"]["evaluation_failed"] == 1
+
+    def test_includes_agent_errors_in_scoring(self):
+        """Agent errors ARE included in scoring (agent's fault)."""
+        results = [
+            {"status": "agent_error", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 2
+        assert metrics["scored_tasks"] == 2  # Agent errors count!
+        assert metrics["successful_tasks"] == 1
+        assert metrics["success_rate"] == 0.5
+
+    def test_status_counts_tracked(self):
+        """Status counts are tracked for all tasks."""
+        results = [
+            {"status": "completed", "eval": [{"overall_gsr": 1.0}]},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0}]},
+            {"status": "agent_error", "eval": None},
+            {"status": "environment_error", "eval": None},
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["status_counts"]["completed"] == 2
+        assert metrics["status_counts"]["agent_error"] == 1
+        assert metrics["status_counts"]["environment_error"] == 1
+
+    def test_mixed_errors_comprehensive(self):
+        """Comprehensive test with various error types."""
+        results = [
+            {"status": "completed", "eval": [{"overall_gsr": 1.0, "accuracy": 0.9}]},
+            {"status": "completed", "eval": [{"overall_gsr": 0.0, "accuracy": 0.3}]},
+            {"status": "agent_error", "eval": [{"overall_gsr": 0.0, "accuracy": 0.0}]},
+            {"status": "environment_error", "eval": None},  # Excluded
+            {"status": "user_error", "eval": None},  # Excluded
+            {"status": "evaluation_failed", "eval": None},  # Excluded
+            {"status": "setup_failed", "eval": None},  # Excluded
+        ]
+
+        metrics = compute_benchmark_metrics(results)
+
+        assert metrics["total_tasks"] == 7
+        assert metrics["scored_tasks"] == 3  # completed(2) + agent_error(1)
+        assert metrics["successful_tasks"] == 1
+        assert metrics["success_rate"] == pytest.approx(1 / 3)
+        assert metrics["mean_metrics"]["accuracy"] == pytest.approx((0.9 + 0.3 + 0.0) / 3)
+        assert sum(metrics["excluded"].values()) == 4
+
 
 # =============================================================================
 # Integration Tests
diff --git a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
index 9c226520..8a15fca0 100644
--- a/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
+++ b/tests/test_core/test_benchmark/test_benchmark_lifecycle.py
@@ -331,7 +331,9 @@ def setup_agents(self, agent_data, environment, task, user):
 
         assert len(reports) == 1
         report = reports[0]
-        assert report["status"] == TaskExecutionStatus.TASK_EXECUTION_FAILED.value
+        # RuntimeError from agent framework is classified as UNKNOWN_EXECUTION_ERROR
+        # (we can't determine if it's agent's fault or infrastructure failure)
+        assert report["status"] == TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR.value
         assert "error" in report
         assert report["error"]["error_type"] == "RuntimeError"
         assert "Agent execution failed!" in report["error"]["error_message"]
@@ -515,12 +517,13 @@ def setup_agents(self, agent_data, environment, task, user):
         assert len(failed_external) == 1
         assert [t.id for t in failed] == [t.id for t in failed_external]
 
-        # Get only task execution failures
-        exec_failed = benchmark.get_failed_tasks(TaskExecutionStatus.TASK_EXECUTION_FAILED)
+        # RuntimeError from agent framework is classified as UNKNOWN_EXECUTION_ERROR
+        # (we can't determine if it's agent's fault or infrastructure failure)
+        exec_failed = benchmark.get_failed_tasks(TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR)
         assert len(exec_failed) == 1
 
         # External reports version
-        exec_failed_external = benchmark.get_failed_tasks(TaskExecutionStatus.TASK_EXECUTION_FAILED, reports=reports)
+        exec_failed_external = benchmark.get_failed_tasks(TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR, reports=reports)
         assert len(exec_failed_external) == 1
 
         # No evaluation failures in this test
@@ -532,7 +535,7 @@ def setup_agents(self, agent_data, environment, task, user):
         reports_copy.append(
             {
                 "task_id": "fake-task",
-                "status": TaskExecutionStatus.TASK_EXECUTION_FAILED.value,
+                "status": TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR.value,
                 "error": "Fake error",
             }
         )
diff --git a/tests/test_core/test_exceptions.py b/tests/test_core/test_exceptions.py
new file mode 100644
index 00000000..0f66eeab
--- /dev/null
+++ b/tests/test_core/test_exceptions.py
@@ -0,0 +1,404 @@
+"""Tests for exception classification in benchmark execution.
+
+These tests verify that different exception types (AgentError, EnvironmentError,
+UserError) are correctly classified into their respective TaskExecutionStatus
+values, enabling fair scoring by distinguishing agent faults from infrastructure
+failures.
+"""
+
+import pytest
+from maseval import (
+    TaskCollection,
+    TaskExecutionStatus,
+    AgentError,
+    EnvironmentError,
+    UserError,
+    validate_argument_type,
+    validate_required_arguments,
+    validate_no_extra_arguments,
+    validate_arguments_from_schema,
+)
+
+
+class TestExceptionClassification:
+    """Tests for exception classification in benchmark execution."""
+
+    def test_agent_error_classified_correctly(self):
+        """Test that AgentError is classified as AGENT_ERROR."""
+        from conftest import DummyBenchmark, DummyAgentAdapter
+
+        class AgentErrorRaisingAgent:
+            def run(self, query: str) -> str:
+                raise AgentError("Invalid tool argument", component="test_tool")
+
+        class AgentErrorAdapter(DummyAgentAdapter):
+            def _run_agent(self, query: str) -> str:
+                return self.agent.run(query)
+
+        class AgentErrorBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user):
+                agent = AgentErrorRaisingAgent()
+                adapter = AgentErrorAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = AgentErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        assert len(reports) == 1
+        assert reports[0]["status"] == TaskExecutionStatus.AGENT_ERROR.value
+        assert reports[0]["error"]["error_type"] == "AgentError"
+        assert reports[0]["error"]["component"] == "test_tool"
+
+    def test_environment_error_classified_correctly(self):
+        """Test that EnvironmentError is classified as ENVIRONMENT_ERROR."""
+        from conftest import DummyBenchmark, DummyAgentAdapter
+
+        class EnvironmentErrorRaisingAgent:
+            def run(self, query: str) -> str:
+                raise EnvironmentError("Database connection failed", component="db_tool")
+
+        class EnvironmentErrorAdapter(DummyAgentAdapter):
+            def _run_agent(self, query: str) -> str:
+                return self.agent.run(query)
+
+        class EnvironmentErrorBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user):
+                agent = EnvironmentErrorRaisingAgent()
+                adapter = EnvironmentErrorAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = EnvironmentErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        assert len(reports) == 1
+        assert reports[0]["status"] == TaskExecutionStatus.ENVIRONMENT_ERROR.value
+        assert reports[0]["error"]["error_type"] == "EnvironmentError"
+        assert reports[0]["error"]["component"] == "db_tool"
+
+    def test_user_error_classified_correctly(self):
+        """Test that UserError is classified as USER_ERROR."""
+        from conftest import DummyBenchmark, DummyAgentAdapter
+
+        class UserErrorRaisingAgent:
+            def run(self, query: str) -> str:
+                raise UserError("User simulator API failed", component="user_simulator")
+
+        class UserErrorAdapter(DummyAgentAdapter):
+            def _run_agent(self, query: str) -> str:
+                return self.agent.run(query)
+
+        class UserErrorBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user):
+                agent = UserErrorRaisingAgent()
+                adapter = UserErrorAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = UserErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        assert len(reports) == 1
+        assert reports[0]["status"] == TaskExecutionStatus.USER_ERROR.value
+        assert reports[0]["error"]["error_type"] == "UserError"
+        assert reports[0]["error"]["component"] == "user_simulator"
+
+    def test_generic_exception_classified_as_unknown(self):
+        """Test that generic exceptions are classified as UNKNOWN_EXECUTION_ERROR."""
+        from conftest import DummyBenchmark, DummyAgentAdapter
+
+        class GenericErrorRaisingAgent:
+            def run(self, query: str) -> str:
+                raise ValueError("Some internal error")
+
+        class GenericErrorAdapter(DummyAgentAdapter):
+            def _run_agent(self, query: str) -> str:
+                return self.agent.run(query)
+
+        class GenericErrorBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user):
+                agent = GenericErrorRaisingAgent()
+                adapter = GenericErrorAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = GenericErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        assert len(reports) == 1
+        assert reports[0]["status"] == TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR.value
+        assert reports[0]["error"]["error_type"] == "ValueError"
+
+    def test_error_details_included_in_report(self):
+        """Test that error details are included in the report."""
+        from conftest import DummyBenchmark, DummyAgentAdapter
+
+        class DetailedAgentErrorRaisingAgent:
+            def run(self, query: str) -> str:
+                raise AgentError(
+                    "Invalid argument type",
+                    component="my_tool",
+                    details={"expected": "int", "actual": "str", "argument": "count"},
+                )
+
+        class DetailedAgentErrorAdapter(DummyAgentAdapter):
+            def _run_agent(self, query: str) -> str:
+                return self.agent.run(query)
+
+        class DetailedAgentErrorBenchmark(DummyBenchmark):
+            def setup_agents(self, agent_data, environment, task, user):
+                agent = DetailedAgentErrorRaisingAgent()
+                adapter = DetailedAgentErrorAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list([{"query": "Test", "environment_data": {}}])
+        benchmark = DetailedAgentErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        assert len(reports) == 1
+        error = reports[0]["error"]
+        assert error["component"] == "my_tool"
+        assert error["details"]["expected"] == "int"
+        assert error["details"]["actual"] == "str"
+
+
+class TestAgentErrorSuggestion:
+    """Tests for AgentError suggestion field."""
+
+    def test_agent_error_with_suggestion(self):
+        """AgentError can include a suggestion for self-correction."""
+        error = AgentError(
+            "Expected int for 'count', got str",
+            component="search_tool",
+            suggestion="Provide count as an integer, e.g., count=10",
+        )
+        assert error.suggestion == "Provide count as an integer, e.g., count=10"
+        assert "Suggestion:" in str(error)
+        assert "count=10" in str(error)
+
+    def test_agent_error_without_suggestion(self):
+        """AgentError works without suggestion."""
+        error = AgentError("Simple error")
+        assert error.suggestion is None
+        assert "Suggestion:" not in str(error)
+
+    def test_validation_helpers_include_suggestions(self):
+        """Validation helpers include helpful suggestions in errors."""
+        # Type validation
+        try:
+            validate_argument_type("hello", "integer", "count")
+        except AgentError as e:
+            assert e.suggestion is not None
+            assert "count" in e.suggestion
+            assert "integer" in e.suggestion
+
+        # Required arguments
+        try:
+            validate_required_arguments({}, ["query"])
+        except AgentError as e:
+            assert e.suggestion is not None
+            assert "query" in e.suggestion
+
+        # Extra arguments
+        try:
+            validate_no_extra_arguments({"extra": 1}, ["allowed"])
+        except AgentError as e:
+            assert e.suggestion is not None
+            assert "extra" in e.suggestion
+
+
+class TestValidationHelpers:
+    """Tests for validation helper functions."""
+
+    def test_validate_argument_type_valid(self):
+        """Test that valid arguments pass validation."""
+        # These should not raise
+        validate_argument_type("hello", "string", "name")
+        validate_argument_type(42, "integer", "count")
+        validate_argument_type(3.14, "number", "value")
+        validate_argument_type(True, "boolean", "flag")
+        validate_argument_type([1, 2, 3], "array", "items")
+        validate_argument_type({"key": "value"}, "object", "config")
+
+    def test_validate_argument_type_invalid(self):
+        """Test that invalid arguments raise AgentError."""
+        with pytest.raises(AgentError, match="expected string"):
+            validate_argument_type(42, "string", "name")
+
+        with pytest.raises(AgentError, match="expected integer"):
+            validate_argument_type("not an int", "integer", "count")
+
+        with pytest.raises(AgentError, match="expected boolean"):
+            validate_argument_type(1, "boolean", "flag")  # int != bool
+
+    def test_validate_argument_type_bool_not_int(self):
+        """Test that bool is not accepted as integer."""
+        with pytest.raises(AgentError, match="expected integer, got boolean"):
+            validate_argument_type(True, "integer", "count")
+
+    def test_validate_required_arguments_present(self):
+        """Test that present required arguments pass validation."""
+        # Should not raise
+        validate_required_arguments({"a": 1, "b": 2}, ["a", "b"])
+        validate_required_arguments({"a": 1, "b": 2, "c": 3}, ["a"])
+
+    def test_validate_required_arguments_missing(self):
+        """Test that missing required arguments raise AgentError."""
+        with pytest.raises(AgentError, match="Missing required argument"):
+            validate_required_arguments({"a": 1}, ["a", "b"])
+
+    def test_validate_no_extra_arguments_valid(self):
+        """Test that no extra arguments pass validation."""
+        # Should not raise
+        validate_no_extra_arguments({"a": 1, "b": 2}, ["a", "b", "c"])
+        validate_no_extra_arguments({}, ["a", "b"])
+
+    def test_validate_no_extra_arguments_invalid(self):
+        """Test that extra arguments raise AgentError."""
+        with pytest.raises(AgentError, match="Unexpected argument"):
+            validate_no_extra_arguments({"a": 1, "extra": 2}, ["a"])
+
+    def test_validate_arguments_from_schema_valid(self):
+        """Test that valid arguments pass schema validation."""
+        schema = {
+            "properties": {
+                "name": {"type": "string"},
+                "count": {"type": "integer"},
+            },
+            "required": ["name"],
+        }
+
+        # Should not raise
+        validate_arguments_from_schema({"name": "test", "count": 5}, schema)
+        validate_arguments_from_schema({"name": "test"}, schema)  # count optional
+
+    def test_validate_arguments_from_schema_missing_required(self):
+        """Test that missing required arguments raise AgentError."""
+        schema = {
+            "properties": {
+                "name": {"type": "string"},
+            },
+            "required": ["name"],
+        }
+
+        with pytest.raises(AgentError, match="Missing required"):
+            validate_arguments_from_schema({}, schema)
+
+    def test_validate_arguments_from_schema_wrong_type(self):
+        """Test that wrong types raise AgentError."""
+        schema = {
+            "properties": {
+                "count": {"type": "integer"},
+            },
+        }
+
+        with pytest.raises(AgentError, match="expected integer"):
+            validate_arguments_from_schema({"count": "not an int"}, schema)
+
+    def test_validate_arguments_from_schema_strict_mode(self):
+        """Test that strict mode rejects extra arguments."""
+        schema = {
+            "properties": {
+                "name": {"type": "string"},
+            },
+        }
+
+        # Non-strict (default) allows extra args
+        validate_arguments_from_schema({"name": "test", "extra": 1}, schema, strict=False)
+
+        # Strict mode rejects extra args
+        with pytest.raises(AgentError, match="Unexpected argument"):
+            validate_arguments_from_schema({"name": "test", "extra": 1}, schema, strict=True)
+
+
+class TestFilteringByErrorType:
+    """Tests for filtering failed tasks by error type."""
+
+    def test_filter_agent_errors_only(self):
+        """Test filtering to get only agent errors."""
+        from conftest import DummyBenchmark, DummyAgentAdapter, DummyAgent
+
+        class MixedErrorBenchmark(DummyBenchmark):
+            task_counter = 0
+
+            def setup_agents(self, agent_data, environment, task, user):
+                self.task_counter += 1
+
+                class DynamicAgent:
+                    def __init__(self, error_type):
+                        self.error_type = error_type
+
+                    def run(self, query: str) -> str:
+                        if self.error_type == "agent":
+                            raise AgentError("Agent fault")
+                        elif self.error_type == "env":
+                            raise EnvironmentError("Env fault")
+                        return "success"
+
+                if self.task_counter == 1:
+                    agent = DynamicAgent("agent")
+                elif self.task_counter == 2:
+                    agent = DynamicAgent("env")
+                else:
+                    agent = DummyAgent()
+
+                adapter = DummyAgentAdapter(agent, "agent")
+                return [adapter], {"agent": adapter}
+
+        tasks = TaskCollection.from_list(
+            [
+                {"query": "Task 1", "environment_data": {}},
+                {"query": "Task 2", "environment_data": {}},
+                {"query": "Task 3", "environment_data": {}},
+            ]
+        )
+
+        benchmark = MixedErrorBenchmark(agent_data={})
+        reports = benchmark.run(tasks)
+
+        # Should have 1 success, 1 agent error, 1 env error
+        statuses = [r["status"] for r in reports]
+        assert TaskExecutionStatus.AGENT_ERROR.value in statuses
+        assert TaskExecutionStatus.ENVIRONMENT_ERROR.value in statuses
+        assert TaskExecutionStatus.SUCCESS.value in statuses
+
+        # Filter only agent errors
+        agent_errors = benchmark.get_failed_tasks(TaskExecutionStatus.AGENT_ERROR)
+        assert len(agent_errors) == 1
+
+        # Filter only environment errors
+        env_errors = benchmark.get_failed_tasks(TaskExecutionStatus.ENVIRONMENT_ERROR)
+        assert len(env_errors) == 1
+
+        # Filter multiple types
+        all_failures = benchmark.get_failed_tasks(
+            [
+                TaskExecutionStatus.AGENT_ERROR,
+                TaskExecutionStatus.ENVIRONMENT_ERROR,
+            ]
+        )
+        assert len(all_failures) == 2
+
+    def test_scoring_guidance(self):
+        """Test the scoring guidance from TaskExecutionStatus docstring."""
+        # Include in agent score: SUCCESS, AGENT_ERROR
+        scoreable_statuses = {
+            TaskExecutionStatus.SUCCESS.value,
+            TaskExecutionStatus.AGENT_ERROR.value,
+        }
+
+        # Exclude from agent score: ENVIRONMENT_ERROR, USER_ERROR, UNKNOWN_EXECUTION_ERROR
+        exclude_statuses = {
+            TaskExecutionStatus.ENVIRONMENT_ERROR.value,
+            TaskExecutionStatus.USER_ERROR.value,
+            TaskExecutionStatus.UNKNOWN_EXECUTION_ERROR.value,
+        }
+
+        # Verify no overlap
+        assert scoreable_statuses.isdisjoint(exclude_statuses)
+
+        # Verify all execution statuses are accounted for (excluding setup/eval)
+        all_exec_statuses = scoreable_statuses | exclude_statuses
+        assert TaskExecutionStatus.SETUP_FAILED.value not in all_exec_statuses
+        assert TaskExecutionStatus.EVALUATION_FAILED.value not in all_exec_statuses

From 45fcad28a6a297113d581bd358d6e56b6f4c8f87 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 15:27:06 +0000
Subject: [PATCH 32/34] added better documentation of exceptions

---
 CHANGELOG.md                          |   6 +-
 docs/guides/exception-handling.md     | 257 ++++++++++++++++++++++++++
 docs/guides/index.md                  |   1 +
 docs/reference/exceptions.md          |  48 +++++
 mkdocs.yml                            |   2 +
 tests/test_core/test_llm_simulator.py |  24 ++-
 6 files changed, 329 insertions(+), 9 deletions(-)
 create mode 100644 docs/guides/exception-handling.md
 create mode 100644 docs/reference/exceptions.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 300a8509..b856f86e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,7 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added `AgentError`, `EnvironmentError`, `UserError` exception hierarchy in `maseval.core.exceptions` for classifying execution failures by responsibility (PR: #13)
 - Added `TaskExecutionStatus.AGENT_ERROR`, `ENVIRONMENT_ERROR`, `USER_ERROR`, `UNKNOWN_EXECUTION_ERROR` for fine-grained error classification enabling fair scoring (PR: #13)
 - Added validation helpers: `validate_argument_type()`, `validate_required_arguments()`, `validate_no_extra_arguments()`, `validate_arguments_from_schema()` for tool implementers (PR: #13)
-- Added `ToolSimulatorError` and `UserSimulatorError` exception subclasses that inherit from both `SimulatorError` and the appropriate MASEval error type (PR: #13)
+- Added `ToolSimulatorError` and `UserSimulatorError` exception subclasses for simulator-specific context while inheriting proper classification (PR: #13)
+
+**Documentation**
+
+- Added Exception Handling guide explaining error classification, fair scoring, and rerunning failed tasks (PR: #13)
 
 **Benchmarks**
 
diff --git a/docs/guides/exception-handling.md b/docs/guides/exception-handling.md
new file mode 100644
index 00000000..819db1e4
--- /dev/null
+++ b/docs/guides/exception-handling.md
@@ -0,0 +1,257 @@
+# Exception Handling
+
+## Overview
+
+When running benchmarks, tasks can fail for different reasons. MASEval provides an exception hierarchy that distinguishes between **agent failures** and **infrastructure failures**. This distinction gives you the option to analyze different failure modes separately, which can be useful for fair scoring or debugging.
+
+!!! info "Why Distinguish Failure Types?"
+
+    Consider a scenario where the agent provides correct inputs but the database connection times out. Without distinguishing failure types, this would appear as an agent failure. The exception hierarchy allows separating these cases when analysis requires it.
+
+## Error Types
+
+MASEval defines three error categories:
+
+| Exception          | Source         | Default Scoring | Example                         |
+| ------------------ | -------------- | --------------- | ------------------------------- |
+| `AgentError`       | Agent input    | Included        | Agent passed wrong type to tool |
+| `EnvironmentError` | Infrastructure | Excluded        | Database connection failed      |
+| `UserError`        | User simulator | Excluded        | LLM API unreachable             |
+
+### AgentError
+
+Indicates the agent violated a contract at a controlled boundary:
+
+```python
+from maseval import AgentError
+
+def calculate(a: int, b: int, operation: str) -> int:
+    # Validate inputs
+    if not isinstance(a, int):
+        raise AgentError(
+            f"Expected int for 'a', got {type(a).__name__}",
+            component="calculate",
+            suggestion="Provide a as an integer, e.g., a=10"
+        )
+
+    if operation not in ("add", "subtract", "multiply"):
+        raise AgentError(
+            f"Unknown operation: {operation}",
+            component="calculate",
+            suggestion="Use one of: add, subtract, multiply"
+        )
+
+    # ... execution logic
+```
+
+The optional `suggestion` field provides agent-friendly hints. Some agent frameworks use error messages for automatic retry attempts.
+
+### EnvironmentError
+
+Indicates infrastructure failure after input validation passed:
+
+```python
+from maseval import EnvironmentError
+
+def fetch_data(query: str) -> dict:
+    # Input validation passed, now execute
+    try:
+        return database.query(query)
+    except DatabaseTimeoutError as e:
+        raise EnvironmentError(
+            "Database query timed out",
+            component="fetch_data",
+            details={"timeout": 30, "query_length": len(query)}
+        ) from e
+```
+
+The `details` dict can include debugging information for developers.
+
+### UserError
+
+Indicates user simulation infrastructure failure:
+
+```python
+from maseval import UserError
+
+class SimulatedUser:
+    def respond(self, agent_message: str) -> str:
+        try:
+            return self.llm.generate(agent_message)
+        except APIError as e:
+            raise UserError(
+                "User simulator LLM failed",
+                component="user_simulator",
+                details={"error": str(e)}
+            ) from e
+```
+
+## The Boundary Pattern
+
+One approach to exception handling places the boundary between agent responsibility and infrastructure responsibility at input validation:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                     TOOL EXECUTION                          │
+├─────────────────────────────────────────────────────────────┤
+│                                                             │
+│   ┌─────────────────┐                                       │
+│   │  INPUT          │  Agent passes arguments               │
+│   │  VALIDATION     │                                       │
+│   │                 │  ❌ Fails → AgentError                │
+│   │                 │  ✓ Passes ↓                           │
+│   └─────────────────┘                                       │
+│           │                                                 │
+│           ▼                                                 │
+│   ┌─────────────────┐                                       │
+│   │  EXECUTION      │  Tool runs its logic                  │
+│   │                 │                                       │
+│   │                 │  ❌ Fails → EnvironmentError          │
+│   │                 │  ✓ Passes → Result                    │
+│   └─────────────────┘                                       │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+```
+
+With this pattern:
+
+- Validation failures indicate agent-provided bad input (`AgentError`)
+- Execution failures after validation indicate infrastructure issues (`EnvironmentError`)
+
+## Validation Helpers
+
+MASEval provides optional utilities for input validation:
+
+```python
+from maseval import (
+    validate_argument_type,
+    validate_required_arguments,
+    validate_arguments_from_schema,
+)
+
+SCHEMA = {
+    "properties": {
+        "query": {"type": "string"},
+        "limit": {"type": "integer"},
+    },
+    "required": ["query"],
+}
+
+def search(**kwargs):
+    validate_arguments_from_schema(kwargs, SCHEMA, component="search")
+    # Execution logic...
+```
+
+These helpers raise `AgentError` with automatic suggestions:
+
+```
+AgentError: [search] Argument 'limit' expected integer, got string.
+Suggestion: Provide limit as an integer, e.g., 10
+```
+
+## Task Execution Status
+
+Each completed task has a status indicating what happened:
+
+| Status                    | Description                    |
+| ------------------------- | ------------------------------ |
+| `success`                 | Task completed normally        |
+| `agent_error`             | AgentError was raised          |
+| `environment_error`       | EnvironmentError was raised    |
+| `user_error`              | UserError was raised           |
+| `evaluation_failed`       | Evaluator raised an exception  |
+| `setup_failed`            | Task setup raised an exception |
+| `unknown_execution_error` | Unclassified exception         |
+
+## Scoring Considerations
+
+When computing benchmark metrics, distinguishing between failure modes, provides the option to exclude those infrastructure failures from the scoring.
+
+The recommended use is to count `agent_error` as agentic failure and others as benchmarking failure, i.e include the former but exclude the letter from scoring.
+
+```python
+results = benchmark.run(tasks)
+summary = compute_benchmark_metrics(results)
+```
+
+Example output:
+
+```
+Total Tasks: 100
+Scored Tasks: 92
+Success Rate: 65.22%
+
+Status Breakdown:
+  success                    60
+  agent_error                 8
+  environment_error           5
+  user_error                  2
+  ...
+```
+
+The success rate (65.22%) reflects `60 / 92` rather than `60 / 100`.
+
+## Rerunning Failed Tasks
+
+Infrastructure errors are often transient. Tasks with infrastructure failures can be rerun:
+
+```python
+results = benchmark.run(tasks)
+
+# Identify infrastructure failures
+infra_failed_ids = [
+    r["task_id"] for r in results
+    if r["status"] in ("environment_error", "user_error", "unknown_execution_error")
+]
+
+if infra_failed_ids:
+    # Filter and rerun
+    retry_tasks = tasks.filter(lambda t: t.id in infra_failed_ids)
+    retry_results = benchmark.run(retry_tasks)
+
+    # Merge results
+    final_results = [
+        r for r in results if r["task_id"] not in infra_failed_ids
+    ] + retry_results
+```
+
+## Error Message Audiences
+
+Different exception types serve different audiences:
+
+| Exception          | Primary Audience | Message Characteristics           |
+| ------------------ | ---------------- | --------------------------------- |
+| `AgentError`       | Agent/Framework  | Actionable, with suggestion field |
+| `EnvironmentError` | Developer        | Technical, debugging-oriented     |
+| `UserError`        | Developer        | Identifies simulator issue        |
+
+Examples:
+
+```python
+# AgentError - agent-facing
+AgentError(
+    "Expected string for 'query', got int",
+    suggestion="Provide query as a string"
+)
+
+# EnvironmentError - developer-facing
+EnvironmentError(
+    "Connection failed after 3 retries",
+    details={"host": "api.example.com", "timeout": 30}
+)
+```
+
+## Summary
+
+MASEval's exception hierarchy provides:
+
+- **`AgentError`**: Signals agent input violations
+- **`EnvironmentError`**: Signals infrastructure failures
+- **`UserError`**: Signals user simulator failures
+
+This distinction enables:
+
+- Separating failure analysis by source
+- Optional exclusion of infrastructure failures from scoring
+- Targeted rerunning of transient failures
+- Different error message styles for different audiences
diff --git a/docs/guides/index.md b/docs/guides/index.md
index 531e81dc..630a06d1 100644
--- a/docs/guides/index.md
+++ b/docs/guides/index.md
@@ -6,3 +6,4 @@ Guides provide an in-depth exploration of MASEval's features and best practices.
 | ---------------------------------------------- | ------------------------------------------------------------- |
 | [Message Tracing](message-tracing.md)          | Capture and inspect agent conversations during benchmark runs |
 | [Configuration Gathering](config-gathering.md) | Collect and export configuration for reproducibility          |
+| [Exception Handling](exception-handling.md)    | Distinguish agent errors from infrastructure failures         |
diff --git a/docs/reference/exceptions.md b/docs/reference/exceptions.md
new file mode 100644
index 00000000..ce938230
--- /dev/null
+++ b/docs/reference/exceptions.md
@@ -0,0 +1,48 @@
+# Exceptions
+
+Exception classes for error classification in benchmark execution.
+
+[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/core/exceptions.py){ .md-source-file }
+
+## Exception Hierarchy
+
+```
+MASEvalError (base)
+├── AgentError           - Agent violated contract (agent's fault)
+├── EnvironmentError     - Environment/tool failed (not agent's fault)
+└── UserError            - User simulator failed (not agent's fault)
+
+SimulatorError (base for simulators)
+├── ToolSimulatorError   - Also inherits EnvironmentError
+└── UserSimulatorError   - Also inherits UserError
+```
+
+## Core Exceptions
+
+::: maseval.core.exceptions.MASEvalError
+
+::: maseval.core.exceptions.AgentError
+
+::: maseval.core.exceptions.EnvironmentError
+
+::: maseval.core.exceptions.UserError
+
+## Simulator Exceptions
+
+::: maseval.core.simulator.SimulatorError
+
+::: maseval.core.simulator.ToolSimulatorError
+
+::: maseval.core.simulator.UserSimulatorError
+
+## Validation Helpers
+
+These functions simplify input validation and raise `AgentError` with helpful suggestions:
+
+::: maseval.core.exceptions.validate_argument_type
+
+::: maseval.core.exceptions.validate_required_arguments
+
+::: maseval.core.exceptions.validate_no_extra_arguments
+
+::: maseval.core.exceptions.validate_arguments_from_schema
diff --git a/mkdocs.yml b/mkdocs.yml
index 6864f49d..8161f8a2 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -88,6 +88,7 @@ nav:
   - Guides:
       - guides/index.md
       - Message Tracing: guides/message-tracing.md
+      - Exception Handling: guides/exception-handling.md
   - Examples:
       - examples/index.md
       - Tiny Tutorial: examples/tutorial.ipynb
@@ -100,6 +101,7 @@ nav:
           - Callback: reference/callback.md
           - Environment: reference/environment.md
           - Evaluator: reference/evaluator.md
+          - Exceptions: reference/exceptions.md
           - History: reference/history.md
           - Model: reference/model.md
           - Simulator: reference/simulator.md
diff --git a/tests/test_core/test_llm_simulator.py b/tests/test_core/test_llm_simulator.py
index 504d9da3..6def7239 100644
--- a/tests/test_core/test_llm_simulator.py
+++ b/tests/test_core/test_llm_simulator.py
@@ -3,8 +3,14 @@
 These tests verify that LLMSimulator retry logic and tracing work correctly.
 """
 
+from typing import cast
+
 import pytest
-from maseval.core.simulator import ToolLLMSimulator, SimulatorCallStatus, SimulatorError
+from maseval.core.simulator import (
+    ToolLLMSimulator,
+    SimulatorCallStatus,
+    ToolSimulatorError,
+)
 
 
 @pytest.mark.core
@@ -58,14 +64,15 @@ def test_llm_simulator_parsing_error_retry(self, dummy_model):
             max_try=3,
         )
 
-        # Should raise SimulatorError after max_try attempts
-        with pytest.raises(SimulatorError) as exc_info:
+        # Should raise ToolSimulatorError after max_try attempts
+        with pytest.raises(ToolSimulatorError) as exc_info:
             simulator(actual_inputs={"param": "test"})
 
         # Verify exception details
-        assert exc_info.value.attempts == 3
-        assert exc_info.value.last_error is not None
-        assert len(exc_info.value.logs) == 3  # All 3 attempts in exception logs
+        err = cast(ToolSimulatorError, exc_info.value)
+        assert err.attempts == 3
+        assert err.last_error is not None
+        assert len(err.logs) == 3  # All 3 attempts in exception logs
         assert len(simulator.logs) == 3  # All 3 attempts logged in simulator
 
     def test_llm_simulator_max_attempts_respected(self, dummy_model):
@@ -83,12 +90,13 @@ def test_llm_simulator_max_attempts_respected(self, dummy_model):
         )
 
         # Should raise after 2 attempts
-        with pytest.raises(SimulatorError) as exc_info:
+        with pytest.raises(ToolSimulatorError) as exc_info:
             simulator(actual_inputs={"param": "test"})
 
         # Should stop after 2 attempts, not continue to 10
+        err = cast(ToolSimulatorError, exc_info.value)
         assert len(simulator.logs) == 2
-        assert exc_info.value.attempts == 2
+        assert err.attempts == 2
 
     def test_llm_simulator_history_structure(self, dummy_model):
         """Test that history entries have correct structure."""

From 07229c9e4c4bf806d90c0b8ba2ca8c56ef2b800e Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 15:58:07 +0000
Subject: [PATCH 33/34] fixed small bug for macs with langgraph

---
 examples/macs_benchmark/macs_benchmark.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/macs_benchmark/macs_benchmark.py b/examples/macs_benchmark/macs_benchmark.py
index 092c97bc..d06ce503 100644
--- a/examples/macs_benchmark/macs_benchmark.py
+++ b/examples/macs_benchmark/macs_benchmark.py
@@ -274,7 +274,7 @@ def build_agent(agent_id: str, depth: int = 0) -> ToolCallingAgent:
                 name=agent_spec.get("agent_name", agent_id),
                 description=agent_spec.get("agent_instruction", ""),
                 max_steps=25,  # Allow more steps for complex multi-agent tasks
-                verbosity_level=2,
+                verbosity_level=0,
             )
 
             return agent
@@ -449,7 +449,7 @@ def setup_user(
             name="Simulated User",
             model=user_model,
             scenario=scenario,
-            initial_prompt=task.query,
+            initial_query=task.query,
         )
 
         # Register the user's simulator for tracing
@@ -480,15 +480,15 @@ def setup_agents(
         agent_lookup = {a["agent_id"]: a for a in agents_config}
         primary_agent_id = agent_data.get("primary_agent_id", "supervisor")
 
-        # Wrap all generic tools and register for tracing
+        # Wrap all generic tools for LangGraph and register them for tracing
         # Each tool has its own model from MACSEnvironment.create_tools()
+        # Models are already registered by the environment via get_model_adapter()
         tool_wrappers: Dict[str, LangGraphToolWrapper] = {}
         for name, tool in environment.tools.items():
             wrapper = LangGraphToolWrapper(tool)
             tool_wrappers[name] = wrapper
             self.register("tools", name, wrapper)
-            # Register the tool's model and simulator for tracing
-            self.register("models", f"model_tool_{name}", tool.model)
+            # Register the tool's simulator for tracing
             self.register("simulators", f"simulator_tool_{name}", tool.simulator)
 
         # Helper to get tools for an agent

From 7bb1faa3805439a3923cbaadd0182d887185b862 Mon Sep 17 00:00:00 2001
From: cemde <c.emde@me.com>
Date: Fri, 5 Dec 2025 16:03:47 +0000
Subject: [PATCH 34/34] removed debugging file

---
 scripts/run_macs.sh | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100755 scripts/run_macs.sh

diff --git a/scripts/run_macs.sh b/scripts/run_macs.sh
deleted file mode 100755
index 8c56ff3e..00000000
--- a/scripts/run_macs.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-# !/bin/bash
-
-pwd
-
-uv run python examples/macs_benchmark/macs_benchmark.py --framework smolagents --domain travel --limit 2
\ No newline at end of file