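Fix five_a_day_benchmark examples: use stock_price tool, capture print output in UnitTestEvaluator, return all agent adapters for trace registration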

cemde · cemde · commit bf6c100ef32a · 2025-12-05T12:52:01.000Z
diff --git a/examples/five_a_day_benchmark/data/tasks.json b/examples/five_a_day_benchmark/data/tasks.json
@@ -70,7 +70,7 @@
   {
     "query": "I want to split my Apple stock equally among my children. Can you look up the current stock price and tell me how much each child would get if I give them equal shares?",
     "environment_data": {
-      "tools": ["websearch", "family_info", "calculator", "banking"],
+      "tools": ["stock_price", "family_info", "calculator", "banking"],
       "family_info": {
         "children": [
           { "name": "Emma", "age": 16 },
@@ -102,15 +102,10 @@
       ]
     },
     "metadata": {
-      "description": "Tests multi-step reasoning with web search, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
-      "tools_required": ["websearch", "family_info", "calculator", "banking"],
+      "description": "Tests multi-step reasoning with stock price lookup, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
+      "tools_required": ["stock_price", "family_info", "calculator", "banking"],
       "complexity": "medium",
-      "skills_tested": [
-        "web_search",
-        "data_retrieval",
-        "arithmetic",
-        "multi_step_reasoning"
-      ],
+      "skills_tested": ["data_retrieval", "arithmetic", "multi_step_reasoning"],
       "task_id": "finance_calculation"
     }
   },
diff --git a/examples/five_a_day_benchmark/evaluators/code_generation.py b/examples/five_a_day_benchmark/evaluators/code_generation.py
@@ -12,7 +12,7 @@
 
 from maseval import Evaluator, Environment, Task, User
 from .utils import normalize_final_answer, call_llm_judge
-from examples.five_a_day_benchmark.tools import get_safe_python_exec_environment
+from tools import get_safe_python_exec_environment
 
 
 class UnitTestEvaluator(Evaluator):
@@ -51,12 +51,15 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
             expected_output = test_case["expected_output"]
 
             try:
-                result = self._execute_code(code, self.function_name, test_input)
+                result, printed_output = self._execute_code(code, self.function_name, test_input)
                 passed = result == expected_output
                 test_results.append(passed)
 
                 if not passed:
-                    errors.append(f"Test {i}: expected {expected_output}, got {result}")
+                    error_msg = f"Test {i}: expected {expected_output}, got {result}"
+                    if printed_output:
+                        error_msg += f" [stdout: {printed_output.strip()}]"
+                    errors.append(error_msg)
 
             except Exception as e:
                 test_results.append(False)
@@ -73,8 +76,12 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
             "errors": errors if errors else None,
         }
 
-    def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
-        """Execute code safely using RestrictedPython and return result."""
+    def _execute_code(self, code: str, function_name: str, test_input: Any) -> tuple[Any, str]:
+        """Execute code safely using RestrictedPython and return result with captured output.
+
+        Returns:
+            Tuple of (result, printed_output), where printed_output is the text collected from print() calls (an empty string if nothing was collected).
+        """
         # Compile with RestrictedPython
         compile_result = compile_restricted(code, "<evaluator>", "exec")
 
@@ -90,15 +97,20 @@ def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
 
         code_obj = compile_result.code if hasattr(compile_result, "code") else compile_result
 
-        # Get shared safe execution environment
-        safe_env = get_safe_python_exec_environment(include_print_collector=False)
+        # Get safe execution environment (includes PrintCollector)
+        safe_env = get_safe_python_exec_environment()
 
         exec(code_obj, safe_env)
 
         if function_name not in safe_env:
             raise ValueError(f"Function '{function_name}' not found in code")
 
-        return safe_env[function_name](test_input)
+        result = safe_env[function_name](test_input)
+
+        # Collect any print output
+        printed_output = safe_env.get("_print", lambda: "")()
+
+        return result, printed_output
 
     def _extract_code_from_answer(self, answer: str) -> Optional[str]:
         """Extract Python code from final answer string."""
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb b/examples/five_a_day_benchmark/five_a_day_benchmark.ipynb
@@ -670,7 +670,7 @@
     "\n",
     "    def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
     "        \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
-    "        \n",
+    "\n",
     "        This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n",
     "        so this method is not called during execution.\n",
     "        \"\"\"\n",
diff --git a/examples/five_a_day_benchmark/five_a_day_benchmark.py b/examples/five_a_day_benchmark/five_a_day_benchmark.py
@@ -236,7 +236,7 @@ def build_smolagents_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single smolagents agent.
 
     Args:
@@ -247,7 +247,7 @@ def build_smolagents_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        SmolAgentAdapter wrapping the created agent
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from smolagents import ToolCallingAgent
     from maseval.interface.agents.smolagents import SmolAgentAdapter
@@ -266,7 +266,8 @@ def build_smolagents_single_agent(
         verbosity_level=0,
     )
 
-    return SmolAgentAdapter(agent, primary_spec["agent_id"])
+    adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_langgraph_single_agent(
@@ -275,7 +276,7 @@ def build_langgraph_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single langgraph agent.
 
     Args:
@@ -286,7 +287,7 @@ def build_langgraph_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        LangGraphAgentAdapter wrapping the created graph
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from langchain_core.messages import SystemMessage
     from langgraph.graph import StateGraph, END
@@ -323,7 +324,8 @@ def call_model(state: AgentState):
     workflow.add_edge("tools", "agent")
 
     graph = workflow.compile()
-    return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_llamaindex_single_agent(
@@ -332,7 +334,7 @@ def build_llamaindex_single_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build a single llamaindex agent.
 
     Args:
@@ -343,7 +345,7 @@ def build_llamaindex_single_agent(
         specialist_specs: Empty list for single-agent (ignored)
 
     Returns:
-        LlamaIndexAgentAdapter wrapping the created agent
+        Tuple of (primary_adapter, all_adapters_dict) for consistent interface
     """
     from llama_index.core.agent.workflow.react_agent import ReActAgent
     from maseval.interface.agents.llamaindex import LlamaIndexAgentAdapter
@@ -361,7 +363,8 @@ def build_llamaindex_single_agent(
         system_prompt=primary_spec.get("agent_instruction"),
     )
 
-    return LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
+    adapter = LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_smolagents_multi_agent(
@@ -370,7 +373,7 @@ def build_smolagents_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build smolagents multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -381,12 +384,15 @@ def build_smolagents_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        SmolAgentAdapter wrapping the orchestrator agent
+        Tuple of (primary_adapter, all_adapters_dict) where all_adapters_dict
+        includes the orchestrator and all specialists for trace registration.
     """
     from smolagents import ToolCallingAgent, FinalAnswerTool
     from maseval.interface.agents.smolagents import SmolAgentAdapter
 
     specialist_agents = []
+    specialist_adapters_dict: Dict[str, Any] = {}
+
     for agent_spec in specialist_specs:
         specialist_seed = agent_spec.get("seed")
         specialist_model = get_model(model_id, "smolagents", temperature, specialist_seed)
@@ -404,6 +410,8 @@ def build_smolagents_multi_agent(
             verbosity_level=0,
         )
         specialist_agents.append(specialist)
+        # Create adapter for each specialist for trace registration
+        specialist_adapters_dict[agent_spec["agent_id"]] = SmolAgentAdapter(specialist, agent_spec["agent_id"])
 
     primary_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
     primary_tools = [adapter.tool for adapter in primary_adapters.values()]
@@ -421,7 +429,11 @@ def build_smolagents_multi_agent(
         verbosity_level=0,
     )
 
-    return SmolAgentAdapter(agent, primary_spec["agent_id"])
+    primary_adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
+
+    # Return primary adapter and dict of all adapters (including primary) for trace registration
+    all_adapters = {primary_spec["agent_id"]: primary_adapter, **specialist_adapters_dict}
+    return primary_adapter, all_adapters
 
 
 def build_langgraph_multi_agent(
@@ -430,7 +442,7 @@ def build_langgraph_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build langgraph multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -441,7 +453,8 @@ def build_langgraph_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        LangGraphAgentAdapter wrapping the multi-agent graph
+        Tuple of (primary_adapter, all_adapters_dict). Note: LangGraph multi-agent
+        compiles specialists into graph nodes, so only the graph is traceable.
     """
     from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
     from langchain_core.tools import tool as create_tool
@@ -578,7 +591,8 @@ def route_after_orchestrator(state: MultiAgentState):
         workflow.add_edge(agent_id, "orchestrator")
 
     graph = workflow.compile()
-    return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def build_llamaindex_multi_agent(
@@ -587,7 +601,7 @@ def build_llamaindex_multi_agent(
     all_tool_adapters: Dict[str, Any],
     primary_spec: Dict[str, Any],
     specialist_specs: List[Dict[str, Any]],
-) -> Any:
+) -> tuple[Any, Dict[str, Any]]:
     """Build llamaindex multi-agent setup with orchestrator and specialists.
 
     Args:
@@ -598,7 +612,8 @@ def build_llamaindex_multi_agent(
         specialist_specs: List of specialist agent specifications
 
     Returns:
-        LlamaIndexAgentAdapter wrapping the orchestrator agent
+        Tuple of (primary_adapter, all_adapters_dict). Note: LlamaIndex multi-agent
+        uses handoff tools, so only the orchestrator is directly traceable.
     """
     from llama_index.core.agent.workflow.react_agent import ReActAgent
     from llama_index.core.tools import FunctionTool
@@ -666,7 +681,8 @@ async def run_specialist():
         system_prompt=primary_spec.get("agent_instruction"),
     )
 
-    return LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
+    adapter = LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
+    return adapter, {primary_spec["agent_id"]: adapter}
 
 
 def get_agent_builder(framework: str, agent_type: str):
@@ -723,7 +739,13 @@ def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environme
     def setup_agents(
         self, agent_data: Dict[str, Any], environment: Environment, task: Task, user=None
     ) -> tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
-        """Create framework-specific agent with tools from environment."""
+        """Create framework-specific agent with tools from environment.
+
+        Returns:
+            Tuple of (agents_to_run, agents_dict):
+            - agents_to_run: List of adapters for agents that should be executed
+            - agents_dict: Dict of all adapters for trace registration (includes specialists)
+        """
         framework = agent_data["framework"]
         agent_type = agent_data["agent_type"]
         model_id = agent_data["model_config"]["model_id"]
@@ -736,11 +758,12 @@ def setup_agents(
         primary_spec = next(a for a in agents_specs if a["agent_id"] == primary_agent_id)
         specialist_specs = [a for a in agents_specs if a["agent_id"] != primary_agent_id]
 
-        # Build agent using unified interface
+        # Build agents via the unified builder interface, which returns (primary_adapter, all_adapters_dict)
         builder = get_agent_builder(framework, agent_type)
-        agent_adapter = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
+        primary_adapter, all_adapters_dict = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
 
-        return [agent_adapter], {primary_agent_id: agent_adapter}
+        # Return primary adapter to run, and all adapters for trace registration
+        return [primary_adapter], all_adapters_dict
 
     def setup_evaluators(self, environment, task, agents, user) -> Sequence[Evaluator]:
         """Create evaluators based on task's evaluation_data.evaluators list."""
diff --git a/examples/five_a_day_benchmark/tools/code_execution.py b/examples/five_a_day_benchmark/tools/code_execution.py
@@ -73,29 +73,24 @@ def get_safe_guards() -> dict:
     }
 
 
-def get_safe_python_exec_environment(include_print_collector: bool = False) -> dict:
+def get_safe_python_exec_environment() -> dict:
     """Get a complete safe execution environment for RestrictedPython.
 
-    Args:
-        include_print_collector: If True, includes PrintCollector for capturing print output.
-                                If False, print goes to stdout (useful for evaluators).
+    Always includes PrintCollector for capturing print output. After exec(),
+    retrieve captured output via: env.get('_print', lambda: '')().
 
     Returns:
         A dictionary suitable for use as globals in exec() with RestrictedPython.
     """
-    env = {
+    from RestrictedPython.PrintCollector import PrintCollector
+
+    return {
         **safe_globals,
         "__builtins__": get_safe_builtins(),
         **get_safe_guards(),
+        "_print_": PrintCollector,
     }
 
-    if include_print_collector:
-        from RestrictedPython.PrintCollector import PrintCollector
-
-        env["_print_"] = PrintCollector
-
-    return env
-
 
 class CodeExecutionState:
     """Shared state for code execution tools.
@@ -106,7 +101,7 @@ class CodeExecutionState:
     def __init__(self, test_cases: list[dict[str, Any]] | None = None):
         self.test_cases = test_cases or []
         # Get shared safe execution environment with print collector for capturing output
-        self.safe_env = get_safe_python_exec_environment(include_print_collector=True)
+        self.safe_env = get_safe_python_exec_environment()
 
 
 class PythonExecutorExecuteTool(BaseTool):
diff --git a/examples/introduction/tutorial.ipynb b/examples/introduction/tutorial.ipynb