Skip to content

Commit bf6c100

Browse files
committed
fixed examples
1 parent aa29258 commit bf6c100

6 files changed

Lines changed: 122 additions & 58 deletions

File tree

examples/five_a_day_benchmark/data/tasks.json

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
{
7171
"query": "I want to split my Apple stock equally among my children. Can you look up the current stock price and tell me how much each child would get if I give them equal shares?",
7272
"environment_data": {
73-
"tools": ["websearch", "family_info", "calculator", "banking"],
73+
"tools": ["stock_price", "family_info", "calculator", "banking"],
7474
"family_info": {
7575
"children": [
7676
{ "name": "Emma", "age": 16 },
@@ -102,15 +102,10 @@
102102
]
103103
},
104104
"metadata": {
105-
"description": "Tests multi-step reasoning with web search, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
106-
"tools_required": ["websearch", "family_info", "calculator", "banking"],
105+
"description": "Tests multi-step reasoning with stock price lookup, data retrieval, and mathematical computation. Agent must look up stock price, retrieve family information, and calculate per-child inheritance value.",
106+
"tools_required": ["stock_price", "family_info", "calculator", "banking"],
107107
"complexity": "medium",
108-
"skills_tested": [
109-
"web_search",
110-
"data_retrieval",
111-
"arithmetic",
112-
"multi_step_reasoning"
113-
],
108+
"skills_tested": ["data_retrieval", "arithmetic", "multi_step_reasoning"],
114109
"task_id": "finance_calculation"
115110
}
116111
},

examples/five_a_day_benchmark/evaluators/code_generation.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from maseval import Evaluator, Environment, Task, User
1414
from .utils import normalize_final_answer, call_llm_judge
15-
from examples.five_a_day_benchmark.tools import get_safe_python_exec_environment
15+
from tools import get_safe_python_exec_environment
1616

1717

1818
class UnitTestEvaluator(Evaluator):
@@ -51,12 +51,15 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
5151
expected_output = test_case["expected_output"]
5252

5353
try:
54-
result = self._execute_code(code, self.function_name, test_input)
54+
result, printed_output = self._execute_code(code, self.function_name, test_input)
5555
passed = result == expected_output
5656
test_results.append(passed)
5757

5858
if not passed:
59-
errors.append(f"Test {i}: expected {expected_output}, got {result}")
59+
error_msg = f"Test {i}: expected {expected_output}, got {result}"
60+
if printed_output:
61+
error_msg += f" [stdout: {printed_output.strip()}]"
62+
errors.append(error_msg)
6063

6164
except Exception as e:
6265
test_results.append(False)
@@ -73,8 +76,12 @@ def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -
7376
"errors": errors if errors else None,
7477
}
7578

76-
def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
77-
"""Execute code safely using RestrictedPython and return result."""
79+
def _execute_code(self, code: str, function_name: str, test_input: Any) -> tuple[Any, str]:
80+
"""Execute code safely using RestrictedPython and return result with captured output.
81+
82+
Returns:
83+
Tuple of (result, printed_output) where printed_output contains any print() calls.
84+
"""
7885
# Compile with RestrictedPython
7986
compile_result = compile_restricted(code, "<evaluator>", "exec")
8087

@@ -90,15 +97,20 @@ def _execute_code(self, code: str, function_name: str, test_input: Any) -> Any:
9097

9198
code_obj = compile_result.code if hasattr(compile_result, "code") else compile_result
9299

93-
# Get shared safe execution environment
94-
safe_env = get_safe_python_exec_environment(include_print_collector=False)
100+
# Get safe execution environment (includes PrintCollector)
101+
safe_env = get_safe_python_exec_environment()
95102

96103
exec(code_obj, safe_env)
97104

98105
if function_name not in safe_env:
99106
raise ValueError(f"Function '{function_name}' not found in code")
100107

101-
return safe_env[function_name](test_input)
108+
result = safe_env[function_name](test_input)
109+
110+
# Collect any print output
111+
printed_output = safe_env.get("_print", lambda: "")()
112+
113+
return result, printed_output
102114

103115
def _extract_code_from_answer(self, answer: str) -> Optional[str]:
104116
"""Extract Python code from final answer string."""

examples/five_a_day_benchmark/five_a_day_benchmark.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,7 @@
670670
"\n",
671671
" def get_model_adapter(self, model_id: str, **kwargs) -> ModelAdapter:\n",
672672
" \"\"\"Return a model adapter for benchmark components that need LLM access.\n",
673-
" \n",
673+
"\n",
674674
" This benchmark doesn't use simulated tools, user simulators, or LLM judges,\n",
675675
" so this method is not called during execution.\n",
676676
" \"\"\"\n",

examples/five_a_day_benchmark/five_a_day_benchmark.py

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def build_smolagents_single_agent(
236236
all_tool_adapters: Dict[str, Any],
237237
primary_spec: Dict[str, Any],
238238
specialist_specs: List[Dict[str, Any]],
239-
) -> Any:
239+
) -> tuple[Any, Dict[str, Any]]:
240240
"""Build a single smolagents agent.
241241
242242
Args:
@@ -247,7 +247,7 @@ def build_smolagents_single_agent(
247247
specialist_specs: Empty list for single-agent (ignored)
248248
249249
Returns:
250-
SmolAgentAdapter wrapping the created agent
250+
Tuple of (primary_adapter, all_adapters_dict) for consistent interface
251251
"""
252252
from smolagents import ToolCallingAgent
253253
from maseval.interface.agents.smolagents import SmolAgentAdapter
@@ -266,7 +266,8 @@ def build_smolagents_single_agent(
266266
verbosity_level=0,
267267
)
268268

269-
return SmolAgentAdapter(agent, primary_spec["agent_id"])
269+
adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
270+
return adapter, {primary_spec["agent_id"]: adapter}
270271

271272

272273
def build_langgraph_single_agent(
@@ -275,7 +276,7 @@ def build_langgraph_single_agent(
275276
all_tool_adapters: Dict[str, Any],
276277
primary_spec: Dict[str, Any],
277278
specialist_specs: List[Dict[str, Any]],
278-
) -> Any:
279+
) -> tuple[Any, Dict[str, Any]]:
279280
"""Build a single langgraph agent.
280281
281282
Args:
@@ -286,7 +287,7 @@ def build_langgraph_single_agent(
286287
specialist_specs: Empty list for single-agent (ignored)
287288
288289
Returns:
289-
LangGraphAgentAdapter wrapping the created graph
290+
Tuple of (primary_adapter, all_adapters_dict) for consistent interface
290291
"""
291292
from langchain_core.messages import SystemMessage
292293
from langgraph.graph import StateGraph, END
@@ -323,7 +324,8 @@ def call_model(state: AgentState):
323324
workflow.add_edge("tools", "agent")
324325

325326
graph = workflow.compile()
326-
return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
327+
adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
328+
return adapter, {primary_spec["agent_id"]: adapter}
327329

328330

329331
def build_llamaindex_single_agent(
@@ -332,7 +334,7 @@ def build_llamaindex_single_agent(
332334
all_tool_adapters: Dict[str, Any],
333335
primary_spec: Dict[str, Any],
334336
specialist_specs: List[Dict[str, Any]],
335-
) -> Any:
337+
) -> tuple[Any, Dict[str, Any]]:
336338
"""Build a single llamaindex agent.
337339
338340
Args:
@@ -343,7 +345,7 @@ def build_llamaindex_single_agent(
343345
specialist_specs: Empty list for single-agent (ignored)
344346
345347
Returns:
346-
LlamaIndexAgentAdapter wrapping the created agent
348+
Tuple of (primary_adapter, all_adapters_dict) for consistent interface
347349
"""
348350
from llama_index.core.agent.workflow.react_agent import ReActAgent
349351
from maseval.interface.agents.llamaindex import LlamaIndexAgentAdapter
@@ -361,7 +363,8 @@ def build_llamaindex_single_agent(
361363
system_prompt=primary_spec.get("agent_instruction"),
362364
)
363365

364-
return LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
366+
adapter = LlamaIndexAgentAdapter(agent, primary_spec["agent_id"])
367+
return adapter, {primary_spec["agent_id"]: adapter}
365368

366369

367370
def build_smolagents_multi_agent(
@@ -370,7 +373,7 @@ def build_smolagents_multi_agent(
370373
all_tool_adapters: Dict[str, Any],
371374
primary_spec: Dict[str, Any],
372375
specialist_specs: List[Dict[str, Any]],
373-
) -> Any:
376+
) -> tuple[Any, Dict[str, Any]]:
374377
"""Build smolagents multi-agent setup with orchestrator and specialists.
375378
376379
Args:
@@ -381,12 +384,15 @@ def build_smolagents_multi_agent(
381384
specialist_specs: List of specialist agent specifications
382385
383386
Returns:
384-
SmolAgentAdapter wrapping the orchestrator agent
387+
Tuple of (primary_adapter, all_adapters_dict) where all_adapters_dict
388+
includes the orchestrator and all specialists for trace registration.
385389
"""
386390
from smolagents import ToolCallingAgent, FinalAnswerTool
387391
from maseval.interface.agents.smolagents import SmolAgentAdapter
388392

389393
specialist_agents = []
394+
specialist_adapters_dict: Dict[str, Any] = {}
395+
390396
for agent_spec in specialist_specs:
391397
specialist_seed = agent_spec.get("seed")
392398
specialist_model = get_model(model_id, "smolagents", temperature, specialist_seed)
@@ -404,6 +410,8 @@ def build_smolagents_multi_agent(
404410
verbosity_level=0,
405411
)
406412
specialist_agents.append(specialist)
413+
# Create adapter for each specialist for trace registration
414+
specialist_adapters_dict[agent_spec["agent_id"]] = SmolAgentAdapter(specialist, agent_spec["agent_id"])
407415

408416
primary_adapters = filter_tool_adapters_by_prefix(all_tool_adapters, primary_spec["tools"])
409417
primary_tools = [adapter.tool for adapter in primary_adapters.values()]
@@ -421,7 +429,11 @@ def build_smolagents_multi_agent(
421429
verbosity_level=0,
422430
)
423431

424-
return SmolAgentAdapter(agent, primary_spec["agent_id"])
432+
primary_adapter = SmolAgentAdapter(agent, primary_spec["agent_id"])
433+
434+
# Return primary adapter and dict of all adapters (including primary) for trace registration
435+
all_adapters = {primary_spec["agent_id"]: primary_adapter, **specialist_adapters_dict}
436+
return primary_adapter, all_adapters
425437

426438

427439
def build_langgraph_multi_agent(
@@ -430,7 +442,7 @@ def build_langgraph_multi_agent(
430442
all_tool_adapters: Dict[str, Any],
431443
primary_spec: Dict[str, Any],
432444
specialist_specs: List[Dict[str, Any]],
433-
) -> Any:
445+
) -> tuple[Any, Dict[str, Any]]:
434446
"""Build langgraph multi-agent setup with orchestrator and specialists.
435447
436448
Args:
@@ -441,7 +453,8 @@ def build_langgraph_multi_agent(
441453
specialist_specs: List of specialist agent specifications
442454
443455
Returns:
444-
LangGraphAgentAdapter wrapping the multi-agent graph
456+
Tuple of (primary_adapter, all_adapters_dict). Note: LangGraph multi-agent
457+
compiles specialists into graph nodes, so only the graph is traceable.
445458
"""
446459
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
447460
from langchain_core.tools import tool as create_tool
@@ -578,7 +591,8 @@ def route_after_orchestrator(state: MultiAgentState):
578591
workflow.add_edge(agent_id, "orchestrator")
579592

580593
graph = workflow.compile()
581-
return LangGraphAgentAdapter(graph, primary_spec["agent_id"])
594+
adapter = LangGraphAgentAdapter(graph, primary_spec["agent_id"])
595+
return adapter, {primary_spec["agent_id"]: adapter}
582596

583597

584598
def build_llamaindex_multi_agent(
@@ -587,7 +601,7 @@ def build_llamaindex_multi_agent(
587601
all_tool_adapters: Dict[str, Any],
588602
primary_spec: Dict[str, Any],
589603
specialist_specs: List[Dict[str, Any]],
590-
) -> Any:
604+
) -> tuple[Any, Dict[str, Any]]:
591605
"""Build llamaindex multi-agent setup with orchestrator and specialists.
592606
593607
Args:
@@ -598,7 +612,8 @@ def build_llamaindex_multi_agent(
598612
specialist_specs: List of specialist agent specifications
599613
600614
Returns:
601-
LlamaIndexAgentAdapter wrapping the orchestrator agent
615+
Tuple of (primary_adapter, all_adapters_dict). Note: LlamaIndex multi-agent
616+
uses handoff tools, so only the orchestrator is directly traceable.
602617
"""
603618
from llama_index.core.agent.workflow.react_agent import ReActAgent
604619
from llama_index.core.tools import FunctionTool
@@ -666,7 +681,8 @@ async def run_specialist():
666681
system_prompt=primary_spec.get("agent_instruction"),
667682
)
668683

669-
return LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
684+
adapter = LlamaIndexAgentAdapter(orchestrator, primary_spec["agent_id"])
685+
return adapter, {primary_spec["agent_id"]: adapter}
670686

671687

672688
def get_agent_builder(framework: str, agent_type: str):
@@ -723,7 +739,13 @@ def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environme
723739
def setup_agents(
724740
self, agent_data: Dict[str, Any], environment: Environment, task: Task, user=None
725741
) -> tuple[List[AgentAdapter], Dict[str, AgentAdapter]]:
726-
"""Create framework-specific agent with tools from environment."""
742+
"""Create framework-specific agent with tools from environment.
743+
744+
Returns:
745+
Tuple of (agents_to_run, agents_dict):
746+
- agents_to_run: List of adapters for agents that should be executed
747+
- agents_dict: Dict of all adapters for trace registration (includes specialists)
748+
"""
727749
framework = agent_data["framework"]
728750
agent_type = agent_data["agent_type"]
729751
model_id = agent_data["model_config"]["model_id"]
@@ -736,11 +758,12 @@ def setup_agents(
736758
primary_spec = next(a for a in agents_specs if a["agent_id"] == primary_agent_id)
737759
specialist_specs = [a for a in agents_specs if a["agent_id"] != primary_agent_id]
738760

739-
# Build agent using unified interface
761+
# Build agent using unified interface; every builder returns (primary_adapter, all_adapters_dict)
740762
builder = get_agent_builder(framework, agent_type)
741-
agent_adapter = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
763+
primary_adapter, all_adapters_dict = builder(model_id, temperature, all_tool_adapters, primary_spec, specialist_specs)
742764

743-
return [agent_adapter], {primary_agent_id: agent_adapter}
765+
# Return primary adapter to run, and all adapters for trace registration
766+
return [primary_adapter], all_adapters_dict
744767

745768
def setup_evaluators(self, environment, task, agents, user) -> Sequence[Evaluator]:
746769
"""Create evaluators based on task's evaluation_data.evaluators list."""

examples/five_a_day_benchmark/tools/code_execution.py

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -73,29 +73,24 @@ def get_safe_guards() -> dict:
7373
}
7474

7575

76-
def get_safe_python_exec_environment(include_print_collector: bool = False) -> dict:
76+
def get_safe_python_exec_environment() -> dict:
7777
"""Get a complete safe execution environment for RestrictedPython.
7878
79-
Args:
80-
include_print_collector: If True, includes PrintCollector for capturing print output.
81-
If False, print goes to stdout (useful for evaluators).
79+
Always includes PrintCollector for capturing print output. After exec(),
80+
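retrieve captured output via: env.get('_print', lambda: '')(). NOTE(review): RestrictedPython appears to bind `_print` in the scope where print() is used, so output printed inside a called function may not be reachable through env - confirm.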
8281
8382
Returns:
8483
A dictionary suitable for use as globals in exec() with RestrictedPython.
8584
"""
86-
env = {
85+
from RestrictedPython.PrintCollector import PrintCollector
86+
87+
return {
8788
**safe_globals,
8889
"__builtins__": get_safe_builtins(),
8990
**get_safe_guards(),
91+
"_print_": PrintCollector,
9092
}
9193

92-
if include_print_collector:
93-
from RestrictedPython.PrintCollector import PrintCollector
94-
95-
env["_print_"] = PrintCollector
96-
97-
return env
98-
9994

10095
class CodeExecutionState:
10196
"""Shared state for code execution tools.
@@ -106,7 +101,7 @@ class CodeExecutionState:
106101
def __init__(self, test_cases: list[dict[str, Any]] | None = None):
107102
self.test_cases = test_cases or []
108103
# Get shared safe execution environment with print collector for capturing output
109-
self.safe_env = get_safe_python_exec_environment(include_print_collector=True)
104+
self.safe_env = get_safe_python_exec_environment()
110105

111106

112107
class PythonExecutorExecuteTool(BaseTool):

0 commit comments

Comments
 (0)