You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Model adapters accept Optional[int], so None works fine
71
+
model =self.get_model_adapter(model_id, seed=agent_seed)
72
+
```
73
+
41
74
### Using Seeds in Setup Methods
42
75
43
76
All setup methods receive a `seed_generator` parameter. Use it to derive seeds for your components. When seeding is disabled (no `seed` passed to benchmark), `derive_seed()` returns `None`:
Copy file name to clipboardExpand all lines: examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
+1-1Lines changed: 1 addition & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -523,7 +523,7 @@
523
523
"id": "70c66cd0",
524
524
"metadata": {},
525
525
"outputs": [],
526
-
"source": "class FiveADayBenchmark(Benchmark):\n \"\"\"5-A-Day benchmark with multi-agent support.\"\"\"\n\n def setup_environment(self, agent_data: Dict[str, Any], task: Task, seed_generator: Optional[SeedGenerator] = None) -> Environment:\n \"\"\"Create environment from task data.\"\"\"\n task_data = {\n \"environment_data\": task.environment_data,\n \"query\": task.query,\n \"evaluation_data\": task.evaluation_data,\n \"metadata\": task.metadata,\n }\n\n environment = FiveADayEnvironment(task_data)\n\n # Register all tools for tracing\n for tool_name, tool_adapter in environment.get_tools().items():\n self.register(\"tools\", tool_name, tool_adapter)\n\n return environment\n\n def setup_agents(\n self,\n agent_data: Dict[str, Any],\n environment: Environment,\n task: Task,\n user=None,\n seed_generator: Optional[SeedGenerator] = None,\n ) -> tuple[list[SmolAgentAdapter], Dict[str, SmolAgentAdapter]]:\n \"\"\"Create multi-agent system with orchestrator and specialists.\n\n If seed_generator is provided, seeds are derived for each agent\n using the benchmark's seeding system with hierarchical paths.\n \"\"\"\n # Build seeds dict if seed_generator is available\n # Use child(\"agents\") to create logical paths like \"agents/primary_agent\"\n seeds = None\n if seed_generator is not None:\n agent_gen = seed_generator.child(\"agents\")\n seeds = {}\n for agent_spec in agent_data[\"agents\"]:\n seeds[agent_spec[\"agent_id\"]] = agent_gen.derive_seed(agent_spec[\"agent_id\"])\n\n agents_to_run, agents_to_monitor = build_agents(agent_data, environment, seeds)\n\n # Create adapters for the primary agent(s) to run\n adapters_to_run = [SmolAgentAdapter(agent, agent.name) for agent in agents_to_run]\n\n # This ensures all agent traces are collected by the benchmark\n all_agents = {agent.name: agent for agent in agents_to_run} | agents_to_monitor\n adapters_to_monitor = {name: SmolAgentAdapter(agent, name) for name, agent in all_agents.items()}\n return adapters_to_run, adapters_to_monitor\n\n def setup_evaluators(self, environment, task, agents, user, seed_generator: Optional[SeedGenerator] = None) -> Sequence[Evaluator]:\n \"\"\"Create evaluators based on task's evaluation criteria.\"\"\"\n if not task.evaluation_data[\"evaluators\"]:\n return []\n\n evaluator_instances = []\n for name in task.evaluation_data[\"evaluators\"]:\n evaluator_class = getattr(evaluators, name)\n evaluator_instances.append(evaluator_class(task, environment, user))\n\n return evaluator_instances\n\n def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:\n \"\"\"Execute agents and return their final answers.\"\"\"\n answers = [agent.run(query) for agent in agents]\n return answers\n\n def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n \"\"\"Return a model adapter for benchmark components that need LLM access.\n\n This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n so this method is not called during execution.\n \"\"\"\n raise NotImplementedError(\"This benchmark doesn't use model adapters for tools/users/evaluators.\")\n\n def evaluate(\n self,\n evaluators: Sequence[Evaluator],\n agents: Dict[str, AgentAdapter],\n final_answer: Any,\n traces: Dict[str, Any],\n ) -> list[Dict[str, Any]]:\n \"\"\"Evaluate agent performance.\"\"\"\n results = []\n for evaluator in evaluators:\n filtered_traces = evaluator.filter_traces(traces)\n results.append(evaluator(filtered_traces, final_answer))\n return results"
526
+
"source": "class FiveADayBenchmark(Benchmark):\n \"\"\"5-A-Day benchmark with multi-agent support.\"\"\"\n\n def setup_environment(self, agent_data: Dict[str, Any], task: Task, seed_generator: SeedGenerator) -> Environment:\n \"\"\"Create environment from task data.\"\"\"\n task_data = {\n \"environment_data\": task.environment_data,\n \"query\": task.query,\n \"evaluation_data\": task.evaluation_data,\n \"metadata\": task.metadata,\n }\n\n environment = FiveADayEnvironment(task_data)\n\n # Register all tools for tracing\n for tool_name, tool_adapter in environment.get_tools().items():\n self.register(\"tools\", tool_name, tool_adapter)\n\n return environment\n\n def setup_agents(\n self,\n agent_data: Dict[str, Any],\n environment: Environment,\n task: Task,\n user,\n seed_generator: SeedGenerator,\n ) -> tuple[list[SmolAgentAdapter], Dict[str, SmolAgentAdapter]]:\n \"\"\"Create multi-agent system with orchestrator and specialists.\n\n Seeds are derived for each agent using the benchmark's seeding system\n with hierarchical paths. derive_seed() returns None if seeding is disabled.\n \"\"\"\n # Build seeds dict using seed_generator\n # Use child(\"agents\") to create logical paths like \"agents/primary_agent\"\n agent_gen = seed_generator.child(\"agents\")\n seeds = {}\n for agent_spec in agent_data[\"agents\"]:\n seeds[agent_spec[\"agent_id\"]] = agent_gen.derive_seed(agent_spec[\"agent_id\"])\n\n agents_to_run, agents_to_monitor = build_agents(agent_data, environment, seeds)\n\n # Create adapters for the primary agent(s) to run\n adapters_to_run = [SmolAgentAdapter(agent, agent.name) for agent in agents_to_run]\n\n # This ensures all agent traces are collected by the benchmark\n all_agents = {agent.name: agent for agent in agents_to_run} | agents_to_monitor\n adapters_to_monitor = {name: SmolAgentAdapter(agent, name) for name, agent in all_agents.items()}\n return adapters_to_run, adapters_to_monitor\n\n def setup_evaluators(self, environment, task, agents, user, seed_generator: SeedGenerator) -> Sequence[Evaluator]:\n \"\"\"Create evaluators based on task's evaluation criteria.\"\"\"\n if not task.evaluation_data[\"evaluators\"]:\n return []\n\n evaluator_instances = []\n for name in task.evaluation_data[\"evaluators\"]:\n evaluator_class = getattr(evaluators, name)\n evaluator_instances.append(evaluator_class(task, environment, user))\n\n return evaluator_instances\n\n def run_agents(self, agents: Sequence[AgentAdapter], task: Task, environment: Environment, query: str) -> Sequence[Any]:\n \"\"\"Execute agents and return their final answers.\"\"\"\n answers = [agent.run(query) for agent in agents]\n return answers\n\n def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n \"\"\"Return a model adapter for benchmark components that need LLM access.\n\n This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n so this method is not called during execution.\n \"\"\"\n raise NotImplementedError(\"This benchmark doesn't use model adapters for tools/users/evaluators.\")\n\n def evaluate(\n self,\n evaluators: Sequence[Evaluator],\n agents: Dict[str, AgentAdapter],\n final_answer: Any,\n traces: Dict[str, Any],\n ) -> list[Dict[str, Any]]:\n \"\"\"Evaluate agent performance.\"\"\"\n results = []\n for evaluator in evaluators:\n filtered_traces = evaluator.filter_traces(traces)\n results.append(evaluator(filtered_traces, final_answer))\n return results"
0 commit comments