Skip to content

Commit ba8c986

Browse files
committed
fixed task.id bad pattern and updated changelog
1 parent 4e2a9a8 commit ba8c986

12 files changed

Lines changed: 84 additions & 242 deletions

File tree

CHANGELOG.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2020
- Handles Anthropic-specific message format conversion (system messages, tool_use/tool_result blocks) internally while accepting OpenAI-compatible input
2121
- Added `anthropic` optional dependency: `pip install maseval[anthropic]`
2222

23+
**Benchmarks**
24+
25+
- Tau2 Benchmark: Full implementation of the tau2-bench benchmark for evaluating LLM-based agents on customer service tasks across airline, retail, and telecom domains (PR: #16)
26+
- `Tau2Benchmark`, `Tau2Environment`, `Tau2User`, `Tau2Evaluator` components for framework-agnostic evaluation (PR: #16)
27+
- `DefaultAgentTau2Benchmark` using an agent setup closely resembling the original tau2-bench implementation (PR: #16)
28+
- Data loading utilities: `load_tasks()`, `ensure_data_exists()`, `configure_model_ids()` (PR: #16)
29+
- Metrics: `compute_benchmark_metrics()`, `compute_pass_at_k()`, `compute_pass_hat_k()` for tau2-style scoring (PR: #16)
30+
- Domain implementations with tool kits: `AirlineTools`, `RetailTools`, `TelecomTools` with full database simulation (PR: #16)
31+
32+
**User**
33+
34+
- `AgenticUser` class for users that can use tools during conversations (PR: #16)
35+
- Multiple stop token support: `User` now accepts `stop_tokens` (list) instead of single `stop_token`, enabling different termination reasons (PR: #16)
36+
- Stop reason tracking: `User` traces now include `stop_reason`, `max_turns`, `turns_used`, and `stopped_by_user` for detailed termination analysis (PR: #16)
37+
38+
**Simulator**
39+
40+
- `AgenticUserLLMSimulator` for LLM-based user simulation with tool use capabilities (PR: #16)
41+
42+
**Examples**
43+
44+
- Tau2 benchmark example with default agent implementation and result comparison scripts (PR: #16)
45+
2346
### Changed
2447

48+
**Benchmark**
49+
50+
- `Benchmark.agent_data` parameter is now optional (defaults to empty dict) (PR: #16)
51+
52+
**Task**
53+
54+
- `Task.id` is now `str` type instead of `UUID`. Benchmarks can provide human-readable IDs directly (e.g., `Task(id="retail_001", ...)`). A UUID string is auto-generated if no ID is provided. (PR: #16)
55+
2556
### Fixed
2657

58+
- Task reports now use `task.id` directly instead of `metadata["task_id"]` (PR: #16)
59+
2760
### Removed
2861

2962
## [0.2.0] - 2025-12-05

examples/five_a_day_benchmark/five_a_day_benchmark.ipynb

Lines changed: 4 additions & 181 deletions
Original file line numberDiff line numberDiff line change
@@ -71,136 +71,7 @@
7171
"id": "954a7569",
7272
"metadata": {},
7373
"outputs": [],
74-
"source": [
75-
"# ruff: noqa E402\n",
76-
"# Setup: Set working directory to project root for proper imports\n",
77-
"# This must happen FIRST before any other imports\n",
78-
"import os\n",
79-
"import sys\n",
80-
"from pathlib import Path\n",
81-
"import json\n",
82-
"from typing import Any, Dict, List, Sequence\n",
83-
"from rich.console import Console\n",
84-
"from rich.panel import Panel\n",
85-
"\n",
86-
"# Determine notebook directory and set working directory to project root\n",
87-
"_notebook_dir = Path(__file__).parent if \"__file__\" in dir() else Path.cwd()\n",
88-
"if _notebook_dir.name == \"five_a_day_benchmark\":\n",
89-
" _project_root = _notebook_dir.parent.parent\n",
90-
" os.chdir(_project_root)\n",
91-
" # Add project root to path so `examples.five_a_day_benchmark.*` imports work\n",
92-
" if str(_project_root) not in sys.path:\n",
93-
" sys.path.insert(0, str(_project_root))\n",
94-
" # Also add the example directory for local imports (utils, tools, evaluators)\n",
95-
" if str(_notebook_dir) not in sys.path:\n",
96-
" sys.path.insert(0, str(_notebook_dir))\n",
97-
" print(f\"Working directory set to: {os.getcwd()}\")\n",
98-
"\n",
99-
"\n",
100-
"# Utility functions from this example\n",
101-
"# - derive_seed(): Creates reproducible seeds from task_id + agent_id\n",
102-
"# - sanitize_name(): Cleans agent names for framework compatibility\n",
103-
"from utils import derive_seed, sanitize_name\n",
104-
"\n",
105-
"# Tool collection classes and helpers\n",
106-
"# - EmailToolCollection, BankingToolCollection: Pre-built tool groups\n",
107-
"# - filter_tool_adapters_by_prefix(): Selects tools by name prefix\n",
108-
"# - get_states(): Initializes tool state objects (email inboxes, bank accounts, etc.)\n",
109-
"from tools import (\n",
110-
" EmailToolCollection,\n",
111-
" BankingToolCollection,\n",
112-
" CalculatorToolCollection,\n",
113-
" CodeExecutionToolCollection,\n",
114-
" FamilyInfoToolCollection,\n",
115-
" StockPriceToolCollection,\n",
116-
" CalendarToolCollection,\n",
117-
" HotelSearchToolCollection,\n",
118-
" MCPCalendarToolCollection,\n",
119-
" filter_tool_adapters_by_prefix,\n",
120-
" get_states,\n",
121-
")\n",
122-
"\n",
123-
"# smolagents: Our chosen agent framework\n",
124-
"from smolagents import ToolCallingAgent, LiteLLMModel, FinalAnswerTool\n",
125-
"\n",
126-
"# MASEval core components\n",
127-
"from maseval import Benchmark, Environment, Task, TaskCollection, AgentAdapter, Evaluator, ModelAdapter\n",
128-
"from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
129-
"\n",
130-
"# Import evaluators module (dynamically loaded later)\n",
131-
"import evaluators\n",
132-
"\n",
133-
"\n",
134-
"def load_benchmark_data(\n",
135-
" config_type: str = \"multi\",\n",
136-
" framework: str = \"smolagents\",\n",
137-
" model_id: str = \"gemini-2.5-flash\",\n",
138-
" temperature: float = 0.7,\n",
139-
" limit: int | None = None,\n",
140-
" seed: int | None = None,\n",
141-
" task_indices: list[int] | None = None,\n",
142-
") -> tuple[TaskCollection, list[Dict[str, Any]]]:\n",
143-
" \"\"\"Load tasks and agent configurations.\n",
144-
"\n",
145-
" Args:\n",
146-
" config_type: 'single' or 'multi' agent configuration\n",
147-
" framework: Agent framework to use\n",
148-
" model_id: Model identifier\n",
149-
" temperature: Model temperature\n",
150-
" limit: Optional limit on number of tasks (None = all 5)\n",
151-
" seed: Random seed for reproducibility\n",
152-
" task_indices: Optional list of task indices to load (e.g., [0, 2, 4])\n",
153-
"\n",
154-
" Returns:\n",
155-
" Tuple of (TaskCollection, list of agent configs)\n",
156-
" \"\"\"\n",
157-
" data_dir = Path(\"examples/five_a_day_benchmark/data\")\n",
158-
"\n",
159-
" with open(data_dir / \"tasks.json\", \"r\") as f:\n",
160-
" tasks_raw = json.load(f)\n",
161-
" with open(data_dir / f\"{config_type}agent.json\", \"r\") as f:\n",
162-
" configs_raw = json.load(f)\n",
163-
"\n",
164-
" # Apply limit first\n",
165-
" if limit:\n",
166-
" tasks_raw = tasks_raw[:limit]\n",
167-
" configs_raw = configs_raw[:limit]\n",
168-
"\n",
169-
" # Then apply task_indices filter if specified\n",
170-
" if task_indices is not None:\n",
171-
" tasks_raw = [tasks_raw[i] for i in task_indices if i < len(tasks_raw)]\n",
172-
" configs_raw = [configs_raw[i] for i in task_indices if i < len(configs_raw)]\n",
173-
"\n",
174-
" tasks_data = []\n",
175-
" configs_data = []\n",
176-
"\n",
177-
" for task_dict, config in zip(tasks_raw, configs_raw):\n",
178-
" task_id = task_dict[\"metadata\"][\"task_id\"]\n",
179-
" task_dict[\"environment_data\"][\"agent_framework\"] = framework\n",
180-
"\n",
181-
" # Create Task object\n",
182-
" tasks_data.append(\n",
183-
" Task(\n",
184-
" query=task_dict[\"query\"],\n",
185-
" environment_data=task_dict[\"environment_data\"],\n",
186-
" evaluation_data=task_dict[\"evaluation_data\"],\n",
187-
" metadata=task_dict[\"metadata\"],\n",
188-
" )\n",
189-
" )\n",
190-
"\n",
191-
" # Enrich config with framework and model info\n",
192-
" config[\"framework\"] = framework\n",
193-
" config[\"model_config\"] = {\"model_id\": model_id, \"temperature\": temperature}\n",
194-
"\n",
195-
" # Derive seeds for reproducibility\n",
196-
" if seed is not None:\n",
197-
" for agent_spec in config[\"agents\"]:\n",
198-
" agent_spec[\"seed\"] = derive_seed(seed, task_id, agent_spec[\"agent_id\"])\n",
199-
"\n",
200-
" configs_data.append(config)\n",
201-
"\n",
202-
" return TaskCollection(tasks_data), configs_data"
203-
]
74+
"source": "# ruff: noqa E402\n# Setup: Set working directory to project root for proper imports\n# This must happen FIRST before any other imports\nimport os\nimport sys\nfrom pathlib import Path\nimport json\nfrom typing import Any, Dict, List, Sequence\nfrom rich.console import Console\nfrom rich.panel import Panel\n\n# Determine notebook directory and set working directory to project root\n_notebook_dir = Path(__file__).parent if \"__file__\" in dir() else Path.cwd()\nif _notebook_dir.name == \"five_a_day_benchmark\":\n _project_root = _notebook_dir.parent.parent\n os.chdir(_project_root)\n # Add project root to path so `examples.five_a_day_benchmark.*` imports work\n if str(_project_root) not in sys.path:\n sys.path.insert(0, str(_project_root))\n # Also add the example directory for local imports (utils, tools, evaluators)\n if str(_notebook_dir) not in sys.path:\n sys.path.insert(0, str(_notebook_dir))\n print(f\"Working directory set to: {os.getcwd()}\")\n\n\n# Utility functions from this example\n# - derive_seed(): Creates reproducible seeds from task_id + agent_id\n# - sanitize_name(): Cleans agent names for framework compatibility\nfrom utils import derive_seed, sanitize_name\n\n# Tool collection classes and helpers\n# - EmailToolCollection, BankingToolCollection: Pre-built tool groups\n# - filter_tool_adapters_by_prefix(): Selects tools by name prefix\n# - get_states(): Initializes tool state objects (email inboxes, bank accounts, etc.)\nfrom tools import (\n EmailToolCollection,\n BankingToolCollection,\n CalculatorToolCollection,\n CodeExecutionToolCollection,\n FamilyInfoToolCollection,\n StockPriceToolCollection,\n CalendarToolCollection,\n HotelSearchToolCollection,\n MCPCalendarToolCollection,\n filter_tool_adapters_by_prefix,\n get_states,\n)\n\n# smolagents: Our chosen agent framework\nfrom smolagents import ToolCallingAgent, LiteLLMModel, FinalAnswerTool\n\n# MASEval core components\nfrom maseval import Benchmark, Environment, Task, 
TaskCollection, AgentAdapter, Evaluator, ModelAdapter\nfrom maseval.interface.agents.smolagents import SmolAgentAdapter\n\n# Import evaluators module (dynamically loaded later)\nimport evaluators\n\n\ndef load_benchmark_data(\n config_type: str = \"multi\",\n framework: str = \"smolagents\",\n model_id: str = \"gemini-2.5-flash\",\n temperature: float = 0.7,\n limit: int | None = None,\n seed: int | None = None,\n task_indices: list[int] | None = None,\n) -> tuple[TaskCollection, list[Dict[str, Any]]]:\n \"\"\"Load tasks and agent configurations.\n\n Args:\n config_type: 'single' or 'multi' agent configuration\n framework: Agent framework to use\n model_id: Model identifier\n temperature: Model temperature\n limit: Optional limit on number of tasks (None = all 5)\n seed: Random seed for reproducibility\n task_indices: Optional list of task indices to load (e.g., [0, 2, 4])\n\n Returns:\n Tuple of (TaskCollection, list of agent configs)\n \"\"\"\n data_dir = Path(\"examples/five_a_day_benchmark/data\")\n\n with open(data_dir / \"tasks.json\", \"r\") as f:\n tasks_raw = json.load(f)\n with open(data_dir / f\"{config_type}agent.json\", \"r\") as f:\n configs_raw = json.load(f)\n\n # Apply limit first\n if limit:\n tasks_raw = tasks_raw[:limit]\n configs_raw = configs_raw[:limit]\n\n # Then apply task_indices filter if specified\n if task_indices is not None:\n tasks_raw = [tasks_raw[i] for i in task_indices if i < len(tasks_raw)]\n configs_raw = [configs_raw[i] for i in task_indices if i < len(configs_raw)]\n\n tasks_data = []\n configs_data = []\n\n for task_dict, config in zip(tasks_raw, configs_raw):\n task_id = task_dict[\"metadata\"][\"task_id\"]\n task_dict[\"environment_data\"][\"agent_framework\"] = framework\n\n # Create Task object with id from metadata\n tasks_data.append(\n Task(\n query=task_dict[\"query\"],\n id=task_id,\n environment_data=task_dict[\"environment_data\"],\n evaluation_data=task_dict[\"evaluation_data\"],\n 
metadata=task_dict[\"metadata\"],\n )\n )\n\n # Enrich config with framework and model info\n config[\"framework\"] = framework\n config[\"model_config\"] = {\"model_id\": model_id, \"temperature\": temperature}\n\n # Derive seeds for reproducibility\n if seed is not None:\n for agent_spec in config[\"agents\"]:\n agent_spec[\"seed\"] = derive_seed(seed, task_id, agent_spec[\"agent_id\"])\n\n configs_data.append(config)\n\n return TaskCollection(tasks_data), configs_data"
20475
},
20576
{
20677
"cell_type": "markdown",
@@ -558,41 +429,7 @@
558429
"id": "5fbb228f",
559430
"metadata": {},
560431
"outputs": [],
561-
"source": [
562-
"# Build the agents for task 0\n",
563-
"# Note: model_config is already set by load_benchmark_data()\n",
564-
"\n",
565-
"# Create environment from task data\n",
566-
"environment_0 = FiveADayEnvironment(\n",
567-
" {\n",
568-
" \"environment_data\": task_0.environment_data,\n",
569-
" \"query\": task_0.query,\n",
570-
" \"evaluation_data\": task_0.evaluation_data,\n",
571-
" \"metadata\": task_0.metadata,\n",
572-
" }\n",
573-
")\n",
574-
"\n",
575-
"# Build agents using the build_agents function\n",
576-
"agents_to_run, agents_to_monitor = build_agents(config_0, environment_0)\n",
577-
"\n",
578-
"print(f\"\\nBuilt Agents for Task: {task_0.metadata['task_id']}\")\n",
579-
"print(f\"{'=' * 60}\")\n",
580-
"print(f\"\\nAgents to run: {[agent.name for agent in agents_to_run]}\")\n",
581-
"print(f\"Agents to monitor: {list(agents_to_monitor.keys())}\")\n",
582-
"\n",
583-
"# Print details for each agent\n",
584-
"for agent in agents_to_run:\n",
585-
" print(f\"\\n Agent: {agent.name}\")\n",
586-
" # smolagents stores tools as a dict with string keys\n",
587-
" print(f\" Tools: {list(agent.tools.keys())}\")\n",
588-
" if hasattr(agent, \"managed_agents\") and agent.managed_agents:\n",
589-
" # managed_agents is also a dict with string keys\n",
590-
" print(f\" Managed agents: {list(agent.managed_agents.keys())}\")\n",
591-
" for agent_name, managed in agent.managed_agents.items():\n",
592-
" print(f\" - {managed.name}: {list(managed.tools.keys())}\")\n",
593-
"\n",
594-
"print(\"\\nAll agents built successfully.\")"
595-
]
432+
"source": "# Build the agents for task 0\n# Note: model_config is already set by load_benchmark_data()\n\n# Create environment from task data\nenvironment_0 = FiveADayEnvironment(\n {\n \"environment_data\": task_0.environment_data,\n \"query\": task_0.query,\n \"evaluation_data\": task_0.evaluation_data,\n \"metadata\": task_0.metadata,\n }\n)\n\n# Build agents using the build_agents function\nagents_to_run, agents_to_monitor = build_agents(config_0, environment_0)\n\nprint(f\"\\nBuilt Agents for Task: {task_0.id}\")\nprint(f\"{'=' * 60}\")\nprint(f\"\\nAgents to run: {[agent.name for agent in agents_to_run]}\")\nprint(f\"Agents to monitor: {list(agents_to_monitor.keys())}\")\n\n# Print details for each agent\nfor agent in agents_to_run:\n print(f\"\\n Agent: {agent.name}\")\n # smolagents stores tools as a dict with string keys\n print(f\" Tools: {list(agent.tools.keys())}\")\n if hasattr(agent, \"managed_agents\") and agent.managed_agents:\n # managed_agents is also a dict with string keys\n print(f\" Managed agents: {list(agent.managed_agents.keys())}\")\n for agent_name, managed in agent.managed_agents.items():\n print(f\" - {managed.name}: {list(managed.tools.keys())}\")\n\nprint(\"\\nAll agents built successfully.\")"
596433
},
597434
{
598435
"cell_type": "markdown",
@@ -707,21 +544,7 @@
707544
"id": "b04bbd0d",
708545
"metadata": {},
709546
"outputs": [],
710-
"source": [
711-
"# Reload all 5 tasks for the benchmark\n",
712-
"tasks, agent_configs = load_benchmark_data(\n",
713-
" config_type=\"multi\",\n",
714-
" framework=\"smolagents\",\n",
715-
" model_id=\"gemini-2.5-flash\",\n",
716-
" temperature=0.7,\n",
717-
" seed=42,\n",
718-
" # No task_indices = load all tasks\n",
719-
")\n",
720-
"\n",
721-
"print(f\"Loaded {len(tasks)} tasks:\")\n",
722-
"for i, task in enumerate(tasks):\n",
723-
" print(f\" {i}. {task.metadata['task_id']}: {task.metadata['description']}\")"
724-
]
547+
"source": "# Reload all 5 tasks for the benchmark\ntasks, agent_configs = load_benchmark_data(\n config_type=\"multi\",\n framework=\"smolagents\",\n model_id=\"gemini-2.5-flash\",\n temperature=0.7,\n seed=42,\n # No task_indices = load all tasks\n)\n\nprint(f\"Loaded {len(tasks)} tasks:\")\nfor i, task in enumerate(tasks):\n print(f\" {i}. {task.id}: {task.metadata['description']}\")"
725548
},
726549
{
727550
"cell_type": "markdown",
@@ -898,4 +721,4 @@
898721
},
899722
"nbformat": 4,
900723
"nbformat_minor": 5
901-
}
724+
}

examples/five_a_day_benchmark/five_a_day_benchmark.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -873,10 +873,11 @@ def load_benchmark_data(
873873
task_id = task_dict["metadata"]["task_id"]
874874
task_dict["environment_data"]["agent_framework"] = framework
875875

876-
# Create task
876+
# Create task with id from metadata
877877
tasks_data.append(
878878
Task(
879879
query=task_dict["query"],
880+
id=task_id,
880881
environment_data=task_dict["environment_data"],
881882
evaluation_data=task_dict["evaluation_data"],
882883
metadata=task_dict["metadata"],

examples/introduction/tutorial.ipynb

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -386,19 +386,7 @@
386386
"id": "f5498a8d",
387387
"metadata": {},
388388
"outputs": [],
389-
"source": [
390-
"# Create a Task instance\n",
391-
"task = Task(\n",
392-
" query=task_data[\"query\"],\n",
393-
" environment_data=task_data[\"environment_data\"],\n",
394-
" evaluation_data=task_data[\"evaluation_data\"],\n",
395-
" metadata=task_data[\"metadata\"],\n",
396-
")\n",
397-
"\n",
398-
"print(f\"Created task: {task.metadata['task_id']}\")\n",
399-
"print(f\"Complexity: {task.metadata['complexity']}\")\n",
400-
"print(f\"Skills tested: {', '.join(task.metadata['skills_tested'])}\")"
401-
]
389+
"source": "# Create a Task instance\ntask = Task(\n query=task_data[\"query\"],\n id=task_data[\"metadata\"][\"task_id\"],\n environment_data=task_data[\"environment_data\"],\n evaluation_data=task_data[\"evaluation_data\"],\n metadata=task_data[\"metadata\"],\n)\n\nprint(f\"Created task: {task.id}\")\nprint(f\"Complexity: {task.metadata['complexity']}\")\nprint(f\"Skills tested: {', '.join(task.metadata['skills_tested'])}\")"
402390
},
403391
{
404392
"cell_type": "markdown",
@@ -745,4 +733,4 @@
745733
},
746734
"nbformat": 4,
747735
"nbformat_minor": 5
748-
}
736+
}

0 commit comments

Comments
 (0)