Skip to content

Commit 53fff0a

Browse files
cemde and claude committed
docs: rename task_data to environment_data in examples and guides
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ad003db commit 53fff0a

4 files changed

Lines changed: 17 additions & 144 deletions

File tree

docs/guides/usage-tracking.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -322,8 +322,8 @@ Tools, environments, and other components can track arbitrary usage by inheritin
322322
from maseval import Usage, UsageTrackableMixin
323323

324324
class BloombergEnvironment(Environment, UsageTrackableMixin):
325-
def __init__(self, task_data):
326-
super().__init__(task_data)
325+
def __init__(self, environment_data):
326+
super().__init__(environment_data)
327327
self._usage_records = []
328328

329329
def _call_bloomberg(self, query):

examples/five_a_day_benchmark/five_a_day_benchmark.ipynb

Lines changed: 4 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -258,26 +258,7 @@
258258
"id": "3d869e2f",
259259
"metadata": {},
260260
"outputs": [],
261-
"source": [
262-
"# Load Task 0 for demonstration in Part 1\n",
263-
"task_data, agent_configs = load_benchmark_data(\n",
264-
" config_type=\"multi\",\n",
265-
" framework=\"smolagents\",\n",
266-
" model_id=\"gemini-2.5-flash\",\n",
267-
" temperature=0.7,\n",
268-
")\n",
269-
"\n",
270-
"# Extract the first (and only) task and config\n",
271-
"task_0: Task = task_data[0]\n",
272-
"config_0: Dict[str, Any] = agent_configs[0]\n",
273-
"\n",
274-
"print(\"=\" * 60)\n",
275-
"print(\"TASK 0: Email & Banking\")\n",
276-
"print(\"=\" * 60)\n",
277-
"print(f\"\\nUser Query:\\n{task_0.query}\\n\")\n",
278-
"print(f\"Required Tools: {task_0.environment_data['tools']}\")\n",
279-
"print(f\"\\nEvaluators: {task_0.evaluation_data['evaluators']}\")"
280-
]
261+
"source": "# Load Task 0 for demonstration in Part 1\ntasks, agent_configs = load_benchmark_data(\n config_type=\"multi\",\n framework=\"smolagents\",\n model_id=\"gemini-2.5-flash\",\n temperature=0.7,\n)\n\n# Extract the first (and only) task and config\ntask_0: Task = tasks[0]\nconfig_0: Dict[str, Any] = agent_configs[0]\n\nprint(\"=\" * 60)\nprint(\"TASK 0: Email & Banking\")\nprint(\"=\" * 60)\nprint(f\"\\nUser Query:\\n{task_0.query}\\n\")\nprint(f\"Required Tools: {task_0.environment_data['tools']}\")\nprint(f\"\\nEvaluators: {task_0.evaluation_data['evaluators']}\")"
281262
},
282263
{
283264
"cell_type": "markdown",
@@ -407,55 +388,7 @@
407388
"id": "5d95d447",
408389
"metadata": {},
409390
"outputs": [],
410-
"source": [
411-
"class FiveADayEnvironment(Environment):\n",
412-
" \"\"\"Environment that creates framework-specific tools from task data.\"\"\"\n",
413-
"\n",
414-
" def __init__(self, task_data: Dict[str, Any], callbacks: List | None = None):\n",
415-
" super().__init__(task_data, callbacks)\n",
416-
"\n",
417-
" def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:\n",
418-
" \"\"\"Initialize environment state from task data.\"\"\"\n",
419-
" env_data = task_data[\"environment_data\"].copy()\n",
420-
" tool_names = env_data.get(\"tools\", [])\n",
421-
"\n",
422-
" # Create state objects (e.g., email inboxes, bank accounts)\n",
423-
" states = get_states(tool_names, env_data)\n",
424-
" env_data.update(states)\n",
425-
"\n",
426-
" return env_data\n",
427-
"\n",
428-
" def create_tools(self) -> Dict[str, Any]:\n",
429-
" \"\"\"Create and convert tools to framework-specific format, keyed by name.\"\"\"\n",
430-
" tools_dict: Dict[str, Any] = {}\n",
431-
"\n",
432-
" # Map tool names to their collection classes\n",
433-
" tool_mapping = {\n",
434-
" \"email\": (EmailToolCollection, lambda: (self.state[\"email_state\"],)),\n",
435-
" \"banking\": (BankingToolCollection, lambda: (self.state[\"banking_state\"],)),\n",
436-
" \"calculator\": (CalculatorToolCollection, lambda: ()),\n",
437-
" \"python_executor\": (CodeExecutionToolCollection, lambda: (self.state[\"python_executor_state\"],)),\n",
438-
" \"family_info\": (FamilyInfoToolCollection, lambda: (self.state[\"family_info\"],)),\n",
439-
" \"stock_price\": (StockPriceToolCollection, lambda: (self.state[\"stock_price_lookup\"],)),\n",
440-
" \"calendar\": (CalendarToolCollection, lambda: (self.state[\"calendar_state\"],)),\n",
441-
" \"hotel_search\": (HotelSearchToolCollection, lambda: (self.state[\"hotel_search_state\"],)),\n",
442-
" \"my_calendar_mcp\": (MCPCalendarToolCollection, lambda: (self.state[\"my_calendar_mcp_state\"],)),\n",
443-
" \"other_calendar_mcp\": (MCPCalendarToolCollection, lambda: (self.state[\"other_calendar_mcp_state\"],)),\n",
444-
" }\n",
445-
"\n",
446-
" for tool_name in self.state[\"tools\"]:\n",
447-
" if tool_name in tool_mapping:\n",
448-
" ToolClass, get_init_args = tool_mapping[tool_name]\n",
449-
" tool_instance = ToolClass(*get_init_args())\n",
450-
"\n",
451-
" # Get base tools and convert to framework format\n",
452-
" for base_tool in tool_instance.get_sub_tools():\n",
453-
" framework_tool = base_tool.to_smolagents()\n",
454-
" tool_key = getattr(base_tool, \"name\", None) or str(type(base_tool).__name__)\n",
455-
" tools_dict[tool_key] = framework_tool\n",
456-
"\n",
457-
" return tools_dict"
458-
]
391+
"source": "class FiveADayEnvironment(Environment):\n \"\"\"Environment that creates framework-specific tools from environment data.\"\"\"\n\n def __init__(self, environment_data: Dict[str, Any], callbacks: List | None = None):\n super().__init__(environment_data, callbacks)\n\n def setup_state(self, environment_data: Dict[str, Any]) -> Dict[str, Any]:\n \"\"\"Initialize environment state from environment data.\"\"\"\n env_data = environment_data.copy()\n tool_names = env_data.get(\"tools\", [])\n\n # Create state objects (e.g., email inboxes, bank accounts)\n states = get_states(tool_names, env_data)\n env_data.update(states)\n\n return env_data\n\n def create_tools(self) -> Dict[str, Any]:\n \"\"\"Create and convert tools to framework-specific format, keyed by name.\"\"\"\n tools_dict: Dict[str, Any] = {}\n\n # Map tool names to their collection classes\n tool_mapping = {\n \"email\": (EmailToolCollection, lambda: (self.state[\"email_state\"],)),\n \"banking\": (BankingToolCollection, lambda: (self.state[\"banking_state\"],)),\n \"calculator\": (CalculatorToolCollection, lambda: ()),\n \"python_executor\": (CodeExecutionToolCollection, lambda: (self.state[\"python_executor_state\"],)),\n \"family_info\": (FamilyInfoToolCollection, lambda: (self.state[\"family_info\"],)),\n \"stock_price\": (StockPriceToolCollection, lambda: (self.state[\"stock_price_lookup\"],)),\n \"calendar\": (CalendarToolCollection, lambda: (self.state[\"calendar_state\"],)),\n \"hotel_search\": (HotelSearchToolCollection, lambda: (self.state[\"hotel_search_state\"],)),\n \"my_calendar_mcp\": (MCPCalendarToolCollection, lambda: (self.state[\"my_calendar_mcp_state\"],)),\n \"other_calendar_mcp\": (MCPCalendarToolCollection, lambda: (self.state[\"other_calendar_mcp_state\"],)),\n }\n\n for tool_name in self.state[\"tools\"]:\n if tool_name in tool_mapping:\n ToolClass, get_init_args = tool_mapping[tool_name]\n tool_instance = ToolClass(*get_init_args())\n\n # Get base tools and convert to 
framework format\n for base_tool in tool_instance.get_sub_tools():\n framework_tool = base_tool.to_smolagents()\n tool_key = getattr(base_tool, \"name\", None) or str(type(base_tool).__name__)\n tools_dict[tool_key] = framework_tool\n\n return tools_dict"
459392
},
460393
{
461394
"cell_type": "markdown",
@@ -499,7 +432,7 @@
499432
"id": "5fbb228f",
500433
"metadata": {},
501434
"outputs": [],
502-
"source": "# Build the agents for task 0\n# Note: model_config is already set by load_benchmark_data()\n\n# Create environment from task data\nenvironment_0 = FiveADayEnvironment(\n {\n \"environment_data\": task_0.environment_data,\n \"query\": task_0.query,\n \"evaluation_data\": task_0.evaluation_data,\n \"metadata\": task_0.metadata,\n }\n)\n\n# Build agents using the build_agents function (no seeds for this demo)\nagents_to_run, agents_to_monitor = build_agents(config_0, environment_0)\n\nprint(f\"\\nBuilt Agents for Task: {task_0.id}\")\nprint(f\"{'=' * 60}\")\nprint(f\"\\nAgents to run: {[agent.name for agent in agents_to_run]}\")\nprint(f\"Agents to monitor: {list(agents_to_monitor.keys())}\")\n\n# Print details for each agent\nfor agent in agents_to_run:\n print(f\"\\n Agent: {agent.name}\")\n # smolagents stores tools as a dict with string keys\n print(f\" Tools: {list(agent.tools.keys())}\")\n if hasattr(agent, \"managed_agents\") and agent.managed_agents:\n # managed_agents is also a dict with string keys\n print(f\" Managed agents: {list(agent.managed_agents.keys())}\")\n for agent_name, managed in agent.managed_agents.items():\n print(f\" - {managed.name}: {list(managed.tools.keys())}\")\n\nprint(\"\\nAll agents built successfully.\")"
435+
"source": "# Build the agents for task 0\n# Note: model_config is already set by load_benchmark_data()\n\n# Create environment from environment data\nenvironment_0 = FiveADayEnvironment(task_0.environment_data)\n\n# Build agents using the build_agents function (no seeds for this demo)\nagents_to_run, agents_to_monitor = build_agents(config_0, environment_0)\n\nprint(f\"\\nBuilt Agents for Task: {task_0.id}\")\nprint(f\"{'=' * 60}\")\nprint(f\"\\nAgents to run: {[agent.name for agent in agents_to_run]}\")\nprint(f\"Agents to monitor: {list(agents_to_monitor.keys())}\")\n\n# Print details for each agent\nfor agent in agents_to_run:\n print(f\"\\n Agent: {agent.name}\")\n # smolagents stores tools as a dict with string keys\n print(f\" Tools: {list(agent.tools.keys())}\")\n if hasattr(agent, \"managed_agents\") and agent.managed_agents:\n # managed_agents is also a dict with string keys\n print(f\" Managed agents: {list(agent.managed_agents.keys())}\")\n for agent_name, managed in agent.managed_agents.items():\n print(f\" - {managed.name}: {list(managed.tools.keys())}\")\n\nprint(\"\\nAll agents built successfully.\")"
503436
},
504437
{
505438
"cell_type": "markdown",
@@ -523,7 +456,7 @@
523456
"id": "70c66cd0",
524457
"metadata": {},
525458
"outputs": [],
526-
"source": "class FiveADayBenchmark(Benchmark):\n \"\"\"5-A-Day benchmark with multi-agent support.\"\"\"\n\n def setup_environment(self, agent_data: Dict[str, Any], task: Task, seed_generator: SeedGenerator) -> Environment:\n \"\"\"Create environment from task data.\"\"\"\n task_data = {\n \"environment_data\": task.environment_data,\n \"query\": task.query,\n \"evaluation_data\": task.evaluation_data,\n \"metadata\": task.metadata,\n }\n\n environment = FiveADayEnvironment(task_data)\n\n # Register all tools for tracing\n for tool_name, tool_adapter in environment.get_tools().items():\n self.register(\"tools\", tool_name, tool_adapter)\n\n return environment\n\n def setup_agents(\n self,\n agent_data: Dict[str, Any],\n environment: Environment,\n task: Task,\n user,\n seed_generator: SeedGenerator,\n ) -> tuple[list[SmolAgentAdapter], Dict[str, SmolAgentAdapter]]:\n \"\"\"Create multi-agent system with orchestrator and specialists.\n\n Seeds are derived for each agent using the benchmark's seeding system\n with hierarchical paths. 
derive_seed() returns None if seeding is disabled.\n \"\"\"\n # Build seeds dict using seed_generator\n # Use child(\"agents\") to create logical paths like \"agents/primary_agent\"\n agent_gen = seed_generator.child(\"agents\")\n seeds = {}\n for agent_spec in agent_data[\"agents\"]:\n seeds[agent_spec[\"agent_id\"]] = agent_gen.derive_seed(agent_spec[\"agent_id\"])\n\n agents_to_run, agents_to_monitor = build_agents(agent_data, environment, seeds)\n\n # Create adapters for the primary agent(s) to run\n adapters_to_run = [SmolAgentAdapter(agent, agent.name) for agent in agents_to_run]\n\n # This ensures all agent traces are collected by the benchmark\n all_agents = {agent.name: agent for agent in agents_to_run} | agents_to_monitor\n adapters_to_monitor = {name: SmolAgentAdapter(agent, name) for name, agent in all_agents.items()}\n return adapters_to_run, adapters_to_monitor\n\n def setup_evaluators(self, environment, task, agents, user, seed_generator: SeedGenerator) -> Sequence[Evaluator]:\n \"\"\"Create evaluators based on task's evaluation criteria.\"\"\"\n if not task.evaluation_data[\"evaluators\"]:\n return []\n\n evaluator_instances = []\n for name in task.evaluation_data[\"evaluators\"]:\n evaluator_class = getattr(evaluators, name)\n evaluator_instances.append(evaluator_class(task, environment, user))\n\n return evaluator_instances\n\n def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:\n \"\"\"Execute agents and return their final answers.\"\"\"\n answers = [agent.run(query) for agent in agents]\n return answers\n\n def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n \"\"\"Return a model adapter for benchmark components that need LLM access.\n\n This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n so this method is not called during execution.\n \"\"\"\n raise NotImplementedError(\"This benchmark doesn't use model adapters for 
tools/users/evaluators.\")\n\n def evaluate(\n self,\n evaluators: Sequence[Evaluator],\n agents: Dict[str, AgentAdapter],\n final_answer: Any,\n traces: Dict[str, Any],\n ) -> list[Dict[str, Any]]:\n \"\"\"Evaluate agent performance.\"\"\"\n results = []\n for evaluator in evaluators:\n filtered_traces = evaluator.filter_traces(traces)\n results.append(evaluator(filtered_traces, final_answer))\n return results"
459+
"source": "class FiveADayBenchmark(Benchmark):\n \"\"\"5-A-Day benchmark with multi-agent support.\"\"\"\n\n def setup_environment(self, agent_data: Dict[str, Any], task: Task, seed_generator: SeedGenerator) -> Environment:\n \"\"\"Create environment from environment data.\"\"\"\n environment = FiveADayEnvironment(task.environment_data)\n\n # Register all tools for tracing\n for tool_name, tool_adapter in environment.get_tools().items():\n self.register(\"tools\", tool_name, tool_adapter)\n\n return environment\n\n def setup_agents(\n self,\n agent_data: Dict[str, Any],\n environment: Environment,\n task: Task,\n user,\n seed_generator: SeedGenerator,\n ) -> tuple[list[SmolAgentAdapter], Dict[str, SmolAgentAdapter]]:\n \"\"\"Create multi-agent system with orchestrator and specialists.\n\n Seeds are derived for each agent using the benchmark's seeding system\n with hierarchical paths. derive_seed() returns None if seeding is disabled.\n \"\"\"\n # Build seeds dict using seed_generator\n # Use child(\"agents\") to create logical paths like \"agents/primary_agent\"\n agent_gen = seed_generator.child(\"agents\")\n seeds = {}\n for agent_spec in agent_data[\"agents\"]:\n seeds[agent_spec[\"agent_id\"]] = agent_gen.derive_seed(agent_spec[\"agent_id\"])\n\n agents_to_run, agents_to_monitor = build_agents(agent_data, environment, seeds)\n\n # Create adapters for the primary agent(s) to run\n adapters_to_run = [SmolAgentAdapter(agent, agent.name) for agent in agents_to_run]\n\n # This ensures all agent traces are collected by the benchmark\n all_agents = {agent.name: agent for agent in agents_to_run} | agents_to_monitor\n adapters_to_monitor = {name: SmolAgentAdapter(agent, name) for name, agent in all_agents.items()}\n return adapters_to_run, adapters_to_monitor\n\n def setup_evaluators(self, environment, task, agents, user, seed_generator: SeedGenerator) -> Sequence[Evaluator]:\n \"\"\"Create evaluators based on task's evaluation criteria.\"\"\"\n if not 
task.evaluation_data[\"evaluators\"]:\n return []\n\n evaluator_instances = []\n for name in task.evaluation_data[\"evaluators\"]:\n evaluator_class = getattr(evaluators, name)\n evaluator_instances.append(evaluator_class(task, environment, user))\n\n return evaluator_instances\n\n def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:\n \"\"\"Execute agents and return their final answers.\"\"\"\n answers = [agent.run(query) for agent in agents]\n return answers\n\n def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n \"\"\"Return a model adapter for benchmark components that need LLM access.\n\n This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n so this method is not called during execution.\n \"\"\"\n raise NotImplementedError(\"This benchmark doesn't use model adapters for tools/users/evaluators.\")\n\n def evaluate(\n self,\n evaluators: Sequence[Evaluator],\n agents: Dict[str, AgentAdapter],\n final_answer: Any,\n traces: Dict[str, Any],\n ) -> list[Dict[str, Any]]:\n \"\"\"Evaluate agent performance.\"\"\"\n results = []\n for evaluator in evaluators:\n filtered_traces = evaluator.filter_traces(traces)\n results.append(evaluator(filtered_traces, final_answer))\n return results"
527460
},
528461
{
529462
"cell_type": "markdown",

0 commit comments

Comments (0)