Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/guides/usage-tracking.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ Tools, environments, and other components can track arbitrary usage by inheritin
from maseval import Usage, UsageTrackableMixin

class BloombergEnvironment(Environment, UsageTrackableMixin):
def __init__(self, task_data):
super().__init__(task_data)
def __init__(self, environment_data):
super().__init__(environment_data)
self._usage_records = []

def _call_bloomberg(self, query):
Expand Down
328 changes: 310 additions & 18 deletions examples/five_a_day_benchmark/five_a_day_benchmark.ipynb

Large diffs are not rendered by default.

23 changes: 8 additions & 15 deletions examples/five_a_day_benchmark/five_a_day_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,25 +132,25 @@ class FiveADayEnvironment(Environment):
with task-specific data, and converts them to the target framework (smolagents, langgraph, llamaindex).
"""

def __init__(self, task_data: Dict[str, Any], framework: str, callbacks: Optional[List] = None):
def __init__(self, environment_data: Dict[str, Any], framework: str, callbacks: Optional[List] = None):
"""Initialize environment with framework info.

Args:
task_data: Task configuration dictionary
environment_data: Environment configuration dictionary
framework: Target framework ('smolagents', 'langgraph', 'llamaindex')
callbacks: Optional callback handlers
"""
self.framework = framework
super().__init__(task_data, callbacks)
super().__init__(environment_data, callbacks)

def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize environment state from task data.
def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize environment state from environment data.

Creates state objects for tools that require them (e.g., EmailState, BankingState).
State objects are stored alongside raw environment data for tool initialization.
"""

env_data = task_data["environment_data"].copy()
env_data = environment_data.copy()
tool_names = env_data.get("tools", [])

# Initialize state objects for tools that need them
Expand Down Expand Up @@ -731,16 +731,9 @@ class FiveADayBenchmark(Benchmark):

def setup_environment(self, agent_data: Dict[str, Any], task: Task, seed_generator: SeedGenerator) -> Environment:
"""Create environment from task data."""
# Pass full task data to environment
task_data = {
"environment_data": task.environment_data,
"query": task.query,
"evaluation_data": task.evaluation_data,
"metadata": task.metadata,
}

# Pass environment data to environment
framework = agent_data["framework"]
environment = FiveADayEnvironment(task_data, framework)
environment = FiveADayEnvironment(task.environment_data, framework)

# Register all tools with the benchmark for tracing
for tool_name, tool_adapter in environment.get_tools().items():
Expand Down
44 changes: 30 additions & 14 deletions examples/introduction/tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -360,14 +360,14 @@
" tasks_data = json.load(f)\n",
"\n",
"# Get the first task (Email & Banking)\n",
"task_data = tasks_data[0]\n",
"task_dict = tasks_data[0]\n",
"\n",
"print(\"Task Query:\")\n",
"print(task_data[\"query\"])\n",
"print(task_dict[\"query\"])\n",
"print(\"\\nTools Required:\")\n",
"print(task_data[\"environment_data\"][\"tools\"])\n",
"print(task_dict[\"environment_data\"][\"tools\"])\n",
"print(\"\\nEvaluators:\")\n",
"print(task_data[\"evaluation_data\"][\"evaluators\"])"
"print(task_dict[\"evaluation_data\"][\"evaluators\"])"
]
},
{
Expand All @@ -389,11 +389,11 @@
"source": [
"# Create a Task instance\n",
"task = Task(\n",
" query=task_data[\"query\"],\n",
" id=task_data[\"metadata\"][\"task_id\"],\n",
" environment_data=task_data[\"environment_data\"],\n",
" evaluation_data=task_data[\"evaluation_data\"],\n",
" metadata=task_data[\"metadata\"],\n",
" query=task_dict[\"query\"],\n",
" id=task_dict[\"metadata\"][\"task_id\"],\n",
" environment_data=task_dict[\"environment_data\"],\n",
" evaluation_data=task_dict[\"evaluation_data\"],\n",
" metadata=task_dict[\"metadata\"],\n",
")\n",
"\n",
"print(f\"Created task: {task.id}\")\n",
Expand Down Expand Up @@ -421,9 +421,9 @@
"class SimpleEnvironment(Environment):\n",
" \"\"\"Simplified environment for the Email & Banking task.\"\"\"\n",
"\n",
" def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:\n",
" \"\"\"Initialize environment state from task data.\"\"\"\n",
" return task_data.copy()\n",
" def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:\n",
" \"\"\"Initialize environment state from environment data.\"\"\"\n",
" return environment_data.copy()\n",
"\n",
" def create_tools(self) -> Dict[str, Any]:\n",
" \"\"\"Create tool instances from environment data, keyed by name.\"\"\"\n",
Expand Down Expand Up @@ -634,7 +634,23 @@
"id": "b3ee60a7",
"metadata": {},
"outputs": [],
"source": "# Create benchmark instance\nagent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n\nbenchmark = SimpleBenchmark(progress_bar=False)\n\n# Create task queue\ntasks = TaskQueue([task])\n\n# Run the benchmark\nprint(\"Running benchmark...\\n\")\nreports = benchmark.run(tasks=tasks, agent_data=agent_data)\n\nprint(\"\\n\" + \"=\" * 60)\nprint(\"BENCHMARK COMPLETE\")\nprint(\"=\" * 60)"
"source": [
"# Create benchmark instance\n",
"agent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n",
"\n",
"benchmark = SimpleBenchmark(progress_bar=False)\n",
"\n",
"# Create task queue\n",
"tasks = TaskQueue([task])\n",
"\n",
"# Run the benchmark\n",
"print(\"Running benchmark...\\n\")\n",
"reports = benchmark.run(tasks=tasks, agent_data=agent_data)\n",
"\n",
"print(\"\\n\" + \"=\" * 60)\n",
"print(\"BENCHMARK COMPLETE\")\n",
"print(\"=\" * 60)"
]
},
{
"cell_type": "markdown",
Expand Down Expand Up @@ -730,4 +746,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion maseval/benchmark/converse/converse.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def setup_environment(
A :class:`ConverseEnvironment` initialised with the task's data.
"""
_ = agent_data, seed_generator
return ConverseEnvironment(task_data=task.environment_data)
return ConverseEnvironment(environment_data=task.environment_data)

def setup_user(
self,
Expand Down
20 changes: 10 additions & 10 deletions maseval/benchmark/converse/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,26 +67,26 @@ def __call__(self, *args: Any, **kwargs: Any) -> str:
class ConverseEnvironment(Environment):
"""Environment exposing tools that can be abused in social-engineering attacks."""

def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialise environment state from the task's environment data.

Args:
task_data: Dictionary with keys such as ``persona_text``,
environment_data: Dictionary with keys such as ``persona_text``,
``options_text``, ``domain``, ``emails``, ``calendar``,
``general_info``, ``banking``, ``medical``.

Returns:
Mutable state dictionary used by the tools during execution.
"""
return {
"persona_text": task_data.get("persona_text", ""),
"options_text": task_data.get("options_text", ""),
"domain": task_data.get("domain", ""),
"general_info": task_data.get("general_info", ""),
"emails": task_data.get("emails", []),
"calendar": task_data.get("calendar", []),
"banking": task_data.get("banking", ""),
"medical": task_data.get("medical", ""),
"persona_text": environment_data.get("persona_text", ""),
"options_text": environment_data.get("options_text", ""),
"domain": environment_data.get("domain", ""),
"general_info": environment_data.get("general_info", ""),
"emails": environment_data.get("emails", []),
"calendar": environment_data.get("calendar", []),
"banking": environment_data.get("banking", ""),
"medical": environment_data.get("medical", ""),
"sent_emails": [],
"insurance_actions": [],
"financial_actions": [],
Expand Down
20 changes: 10 additions & 10 deletions maseval/benchmark/gaia2/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ class Gaia2Environment(Environment):

def __init__(
self,
task_data: Dict[str, Any],
environment_data: Dict[str, Any],
callbacks: Optional[List[Any]] = None,
judge_engine_config: Optional[Any] = None,
):
"""Initialize Gaia2 environment.

Args:
task_data: Task data containing:
environment_data: Environment data containing:
- scenario: ARE BenchmarkScenario object
- capability: Capability type (execution, search, etc.)
- universe_id: Universe identifier
Expand All @@ -53,14 +53,14 @@ def __init__(
which LLM model and provider the ARE judge uses for semantic comparison.
Passed explicitly from ``setup_environment()`` (lives in ``evaluation_data``).
"""
self._scenario = task_data.get("scenario")
self._scenario = environment_data.get("scenario")
self._judge_engine_config = judge_engine_config
self._are_env: Any = None
self._tool_wrappers: Dict[str, Gaia2GenericTool] = {}

super().__init__(task_data, callbacks)
super().__init__(environment_data, callbacks)

def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize ARE scenario and start simulation.

Delegates to ARE's ``preprocess_scenario()`` for faithful preprocessing:
Expand All @@ -74,7 +74,7 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
7. Start the agent-mode simulation.

Args:
task_data: Task data with scenario, capability, universe_id
environment_data: Environment data with scenario, capability, universe_id

Returns:
State dictionary with scenario metadata
Expand All @@ -101,9 +101,9 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
MAX_TIME_SCENARIO_DURATION,
)

scenario = task_data.get("scenario")
scenario = environment_data.get("scenario")
if scenario is None:
raise ValueError("Task data must contain 'scenario' with ARE BenchmarkScenario")
raise ValueError("Environment data must contain 'scenario' with ARE BenchmarkScenario")

# Determine scenario duration (matching ARE's get_scenario_duration)
# ARE scenarios/config.py:18: MAX_SCENARIO_DURATION = 1800 (30 min)
Expand Down Expand Up @@ -167,8 +167,8 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
return {
"scenario_id": getattr(scenario, "scenario_id", None),
"duration": scenario.duration,
"capability": task_data.get("capability"),
"universe_id": task_data.get("universe_id"),
"capability": environment_data.get("capability"),
"universe_id": environment_data.get("universe_id"),
"start_time": getattr(scenario, "start_time", None),
}

Expand Down
2 changes: 1 addition & 1 deletion maseval/benchmark/gaia2/gaia2.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def setup_environment(
Gaia2Environment instance
"""
judge_engine_config = task.evaluation_data.get("judge_engine_config")
return Gaia2Environment(task_data=task.environment_data, judge_engine_config=judge_engine_config)
return Gaia2Environment(environment_data=task.environment_data, judge_engine_config=judge_engine_config)

def setup_user( # type: ignore[override]
self,
Expand Down
14 changes: 7 additions & 7 deletions maseval/benchmark/macs/macs.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,24 +639,24 @@ class MACSEnvironment(Environment):

def __init__(
self,
task_data: Dict[str, Any],
environment_data: Dict[str, Any],
model_factory: Callable[[str], ModelAdapter],
callbacks: Optional[List[Any]] = None,
):
"""Initialize environment.

Args:
task_data: Task data containing environment_data with tool specs
environment_data: Environment data dict with tool specs
model_factory: Factory function that creates a ModelAdapter for a given model_name
callbacks: Optional callbacks
"""
self._model_factory = model_factory
super().__init__(task_data, callbacks)
super().__init__(environment_data, callbacks)

def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize state from task data."""
def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize state from environment data."""
return {
"tool_specs": task_data.get("environment_data", {}).get("tools", []),
"tool_specs": environment_data.get("tools", []),
}

def create_tools(self) -> Dict[str, MACSGenericTool]: # type: ignore[override]
Expand Down Expand Up @@ -844,7 +844,7 @@ def tool_model_factory(tool_name: str) -> ModelAdapter:
)

return MACSEnvironment(
task_data={"environment_data": task.environment_data},
environment_data=task.environment_data,
model_factory=tool_model_factory,
)

Expand Down
31 changes: 14 additions & 17 deletions maseval/benchmark/mmlu/mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,27 +97,26 @@ class MMLUEnvironment(Environment):
the task context (question, choices, etc.).
"""

def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize state from task data.
def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:
"""Initialize state from environment data.

Args:
task_data: Must contain ``"query"`` (str) and ``"environment_data"``
(dict with ``"choices"``, ``"full_prompt"``, ``"use_full_prompt"``).
environment_data: Must contain ``"query"`` (str), ``"choices"`` (list),
``"use_full_prompt"`` (bool), and optionally ``"full_prompt"`` (str).
"""
env_data = task_data["environment_data"]
use_full_prompt = env_data["use_full_prompt"]
if use_full_prompt and "full_prompt" not in env_data:
use_full_prompt = environment_data["use_full_prompt"]
if use_full_prompt and "full_prompt" not in environment_data:
raise ValueError(
"use_full_prompt=True but 'full_prompt' is missing from environment_data. "
"Ensure the dataset includes few-shot prompts or set use_full_prompt=False."
)
state: Dict[str, Any] = {
"query": task_data["query"],
"choices": env_data["choices"],
"query": environment_data["query"],
"choices": environment_data["choices"],
"use_full_prompt": use_full_prompt,
}
if "full_prompt" in env_data:
state["full_prompt"] = env_data["full_prompt"]
if "full_prompt" in environment_data:
state["full_prompt"] = environment_data["full_prompt"]
return state

def create_tools(self) -> Dict[str, Any]:
Expand Down Expand Up @@ -299,14 +298,12 @@ def setup_environment(
seed_generator: SeedGenerator,
) -> MMLUEnvironment:
"""Create environment for a task."""
task_data = {
environment_data = {
**task.environment_data,
"query": task.query,
"environment_data": {
**task.environment_data,
"use_full_prompt": self.use_full_prompt,
},
"use_full_prompt": self.use_full_prompt,
}
return MMLUEnvironment(task_data)
return MMLUEnvironment(environment_data)

def setup_evaluators(
self,
Expand Down
Loading
Loading