Fix tutorial notebook for the updated Benchmark API and add example task data

cemde · cemde · commit a6e575035dd3 · 2025-12-01T17:40:12.000+01:00
diff --git a/examples/introduction/data/multiagent.json b/examples/introduction/data/multiagent.json
@@ -0,0 +1,29 @@
+[
+  {
+    "task_id": 0,
+    "task_description": "Email + Banking coordination",
+    "agent_type": "multi",
+    "primary_agent_id": "main_agent",
+
+    "agents": [
+      {
+        "agent_id": "main_agent",
+        "agent_name": "Task Orchestrator",
+        "agent_instruction": "You coordinate specialized agents to complete user tasks. Delegate to appropriate specialists and synthesize their outputs into a complete response.",
+        "tools": []
+      },
+      {
+        "agent_id": "banking_specialist",
+        "agent_name": "Banking Specialist",
+        "agent_instruction": "You handle banking and financial data retrieval. Analyze transactions and account information, then report findings clearly and accurately.",
+        "tools": ["banking"]
+      },
+      {
+        "agent_id": "email_specialist",
+        "agent_name": "Email Specialist",
+        "agent_instruction": "You handle email operations. Manage emails and compose professional, friendly communications as needed.",
+        "tools": ["email"]
+      }
+    ]
+  }
+]
diff --git a/examples/introduction/data/tasks.json b/examples/introduction/data/tasks.json
@@ -0,0 +1,70 @@
+[
+  {
+    "query": "Sarah Johnson emailed me to confirm that I received her payment for the deposit and first month's rent. Please check my transactions and send an email reply accordingly.",
+    "environment_data": {
+      "tools": ["email", "banking"],
+      "user_email": "sean.crane85@mymail-online.biz",
+      "email_inbox": [
+        {
+          "from": "sarah.johnson@email.com",
+          "to": "sean.crane85@mymail-online.biz",
+          "subject": "Rental Payment Confirmation",
+          "body": "Hi Sean, I just transferred the deposit ($2,000) and first month's rent ($1,500) to your account. Can you please confirm you received it? Thanks, Sarah",
+          "timestamp": "2025-11-18T09:30:00Z"
+        }
+      ],
+      "banking": {
+        "bank_transactions": [
+          {
+            "date": "2025-11-15",
+            "description": "Tenant Deposit - Sarah Johnson",
+            "amount": 2000,
+            "type": "deposit"
+          },
+          {
+            "date": "2025-11-17",
+            "description": "Rent Payment - Sarah Johnson",
+            "amount": 1500,
+            "type": "deposit"
+          },
+          {
+            "date": "2025-11-16",
+            "description": "Property Maintenance",
+            "amount": -450,
+            "type": "expense"
+          }
+        ],
+        "assets": {},
+        "current_balance": 8750
+      }
+    },
+    "user_data": {
+      "simulation_instructions": "You are Sean Crane, a landlord who needs help managing tenant communications. Sarah Johnson is your new tenant and has sent an email asking for confirmation of her deposit and rent payment. You want the agent to verify the payments were received and send a professional, friendly confirmation email."
+    },
+    "evaluation_data": {
+      "expected_deposit_amount": 2000,
+      "expected_rent_amount": 1500,
+      "total_expected": 3500,
+      "tenant_name": "Sarah Johnson",
+      "tenant_email": "sarah.johnson@email.com",
+      "landlord_name": "Sean Crane",
+      "task_type": "email_confirmation_with_banking",
+      "evaluators": [
+        "FinancialAccuracyEvaluator",
+        "EmailQualityEvaluator",
+        "PrivacyLeakageEvaluator"
+      ]
+    },
+    "metadata": {
+      "description": "Tests multi-tool coordination between email and banking systems. The agent must retrieve banking information, verify specific payments from a tenant, and then compose and send an appropriate confirmation email response.",
+      "tools_required": ["email", "banking"],
+      "complexity": "medium",
+      "skills_tested": [
+        "tool_coordination",
+        "data_retrieval",
+        "basic_reasoning"
+      ],
+      "task_id": "email_banking"
+    }
+  }
+]
diff --git a/examples/introduction/tutorial.ipynb b/examples/introduction/tutorial.ipynb
@@ -409,13 +409,10 @@
     "        # Get banking transactions from environment data\n",
     "        transactions = self.state.get(\"banking\", {}).get(\"bank_transactions\", [])\n",
     "        \n",
-    "        # Create tool instances\n",
-    "        sent_emails = []\n",
+    "        # Create tool instances - track sent emails for evaluation\n",
+    "        self.sent_emails: List[Dict] = []\n",
     "        banking_tool = SimpleBankingTool(transactions=transactions)\n",
-    "        email_tool = SimpleEmailTool(sent_emails=sent_emails)\n",
-    "        \n",
-    "        # Store sent_emails reference for evaluation\n",
-    "        self.state[\"sent_emails\"] = sent_emails\n",
+    "        email_tool = SimpleEmailTool(sent_emails=self.sent_emails)\n",
     "        \n",
     "        return [banking_tool, email_tool]\n",
     "\n",
@@ -444,56 +441,67 @@
     "class FinancialAccuracyEvaluator(Evaluator):\n",
     "    \"\"\"Evaluates if the agent correctly identified payment amounts.\"\"\"\n",
     "    \n",
+    "    def __init__(self, task: Task, environment: Environment, user=None):\n",
+    "        \"\"\"Initialize with task, environment, and optional user.\"\"\"\n",
+    "        super().__init__(task, environment, user)\n",
+    "        self.task = task\n",
+    "        self.environment = environment\n",
+    "    \n",
     "    def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:\n",
-    "        \"\"\"Filter to banking tool traces only.\"\"\"\n",
-    "        return traces.get(\"tools\", {}).get(\"get_transactions\", {})\n",
+    "        \"\"\"Filter to environment traces to check tool usage.\"\"\"\n",
+    "        return traces.get(\"environment\", {})\n",
     "    \n",
     "    def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -> Dict[str, Any]:\n",
-    "        \"\"\"Check if banking tool was called.\"\"\"\n",
-    "        invocations = traces.get(\"invocations\", [])\n",
-    "        \n",
-    "        # Expected values\n",
+    "        \"\"\"Check whether a confirmation email was sent (banking access is not verified here).\"\"\"\n",
+    "        # Expected values from task evaluation data\n",
     "        expected_deposit = self.task.evaluation_data[\"expected_deposit_amount\"]\n",
     "        expected_rent = self.task.evaluation_data[\"expected_rent_amount\"]\n",
     "        \n",
-    "        if not invocations:\n",
-    "            return {\n",
-    "                \"banking_tool_used\": False,\n",
-    "                \"score\": 0.0,\n",
-    "                \"error\": \"Banking tool was not called\"\n",
-    "            }\n",
+    "        # Check if emails were sent by looking at environment state\n",
+    "        sent_emails = getattr(self.environment, 'sent_emails', [])\n",
+    "        email_sent = len(sent_emails) > 0\n",
     "        \n",
     "        return {\n",
-    "            \"banking_tool_used\": True,\n",
-    "            \"score\": 1.0,\n",
+    "            \"evaluator\": \"FinancialAccuracyEvaluator\",\n",
+    "            \"email_sent\": email_sent,\n",
+    "            \"emails_count\": len(sent_emails),\n",
     "            \"expected_deposit\": expected_deposit,\n",
     "            \"expected_rent\": expected_rent,\n",
-    "            \"message\": \"Agent successfully retrieved banking transactions\"\n",
+    "            \"score\": 1.0 if email_sent else 0.0,\n",
+    "            \"message\": \"Agent sent confirmation email\" if email_sent else \"No email was sent\"\n",
     "        }\n",
     "\n",
     "\n",
     "class EmailSentEvaluator(Evaluator):\n",
-    "    \"\"\"Evaluates if the agent sent an email.\"\"\"\n",
+    "    \"\"\"Evaluates if the agent sent an email with proper content.\"\"\"\n",
+    "    \n",
+    "    def __init__(self, task: Task, environment: Environment, user=None):\n",
+    "        \"\"\"Initialize with task, environment, and optional user.\"\"\"\n",
+    "        super().__init__(task, environment, user)\n",
+    "        self.task = task\n",
+    "        self.environment = environment\n",
     "    \n",
     "    def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:\n",
-    "        \"\"\"Filter to email tool traces only.\"\"\"\n",
-    "        return traces.get(\"tools\", {}).get(\"send_email\", {})\n",
+    "        \"\"\"Filter to environment traces.\"\"\"\n",
+    "        return traces.get(\"environment\", {})\n",
     "    \n",
     "    def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -> Dict[str, Any]:\n",
-    "        \"\"\"Check if email was sent.\"\"\"\n",
-    "        invocations = traces.get(\"invocations\", [])\n",
+    "        \"\"\"Check if email was sent with appropriate content.\"\"\"\n",
+    "        sent_emails = getattr(self.environment, 'sent_emails', [])\n",
     "        \n",
-    "        if not invocations:\n",
+    "        if not sent_emails:\n",
     "            return {\n",
+    "                \"evaluator\": \"EmailSentEvaluator\",\n",
     "                \"email_sent\": False,\n",
     "                \"score\": 0.0,\n",
     "                \"error\": \"No email was sent\"\n",
     "            }\n",
     "        \n",
-    "        # Get the email that was sent\n",
-    "        email_data = invocations[0].get(\"inputs\", {})\n",
+    "        # Get the last email that was sent\n",
+    "        email_data = sent_emails[-1]\n",
     "        \n",
     "        return {\n",
+    "            \"evaluator\": \"EmailSentEvaluator\",\n",
     "            \"email_sent\": True,\n",
     "            \"score\": 1.0,\n",
     "            \"recipient\": email_data.get(\"to\"),\n",
@@ -521,14 +529,23 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from maseval import AgentAdapter\n",
+    "from typing import Sequence, Tuple\n",
+    "\n",
     "class SimpleBenchmark(Benchmark):\n",
     "    \"\"\"Simplified benchmark for the tutorial.\"\"\"\n",
     "    \n",
-    "    def setup_environment(self, task: Task) -> Environment:\n",
+    "    def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environment:\n",
     "        \"\"\"Create an environment for the task.\"\"\"\n",
     "        return SimpleEnvironment(task.environment_data)\n",
     "    \n",
-    "    def setup_agent(self, task: Task, environment: Environment) -> Any:\n",
+    "    def setup_agents(\n",
+    "        self,\n",
+    "        agent_data: Dict[str, Any],\n",
+    "        environment: Environment,\n",
+    "        task: Task,\n",
+    "        user=None\n",
+    "    ) -> Tuple[Sequence[AgentAdapter], Dict[str, AgentAdapter]]:\n",
     "        \"\"\"Create an agent for the task.\"\"\"\n",
     "        # Initialize model\n",
     "        model = LiteLLMModel(\n",
@@ -539,22 +556,60 @@
     "        \n",
     "        # Create agent with environment tools\n",
     "        agent = ToolCallingAgent(\n",
-    "            tools=environment.tools,\n",
+    "            tools=environment.get_tools(),\n",
     "            model=model,\n",
     "            instructions=\"\"\"You are a helpful assistant. Help users with email and banking tasks \n",
     "by using the available tools to retrieve information and take appropriate actions. \n",
     "Be professional and thorough in your responses.\"\"\"\n",
     "        )\n",
     "        \n",
     "        # Wrap agent in adapter for MASEval\n",
-    "        return SmolAgentAdapter(agent, \"main_agent\")\n",
+    "        agent_adapter = SmolAgentAdapter(agent, \"main_agent\")\n",
+    "        \n",
+    "        # Return (agents_to_run, agents_dict)\n",
+    "        return [agent_adapter], {\"main_agent\": agent_adapter}\n",
     "    \n",
-    "    def setup_evaluators(self, task: Task, environment: Environment) -> List[Evaluator]:\n",
+    "    def setup_evaluators(\n",
+    "        self,\n",
+    "        environment: Environment,\n",
+    "        task: Task,\n",
+    "        agents: Sequence[AgentAdapter],\n",
+    "        user=None\n",
+    "    ) -> Sequence[Evaluator]:\n",
     "        \"\"\"Create evaluators for the task.\"\"\"\n",
     "        return [\n",
-    "            FinancialAccuracyEvaluator(task, environment),\n",
-    "            EmailSentEvaluator(task, environment)\n",
+    "            FinancialAccuracyEvaluator(task, environment, user),\n",
+    "            EmailSentEvaluator(task, environment, user)\n",
     "        ]\n",
+    "    \n",
+    "    def run_agents(\n",
+    "        self,\n",
+    "        agents: Sequence[AgentAdapter],\n",
+    "        task: Task,\n",
+    "        environment: Environment\n",
+    "    ) -> Any:\n",
+    "        \"\"\"Execute the agent and return the final answer.\"\"\"\n",
+    "        # Run the main agent with the task query\n",
+    "        agent = agents[0]\n",
+    "        result = agent.run(task.query)\n",
+    "        return result\n",
+    "    \n",
+    "    def evaluate(\n",
+    "        self,\n",
+    "        evaluators: Sequence[Evaluator],\n",
+    "        agents: Dict[str, AgentAdapter],\n",
+    "        final_answer: Any,\n",
+    "        traces: Dict[str, Any]\n",
+    "    ) -> List[Dict[str, Any]]:\n",
+    "        \"\"\"Evaluate agent performance.\"\"\"\n",
+    "        results = []\n",
+    "        for evaluator in evaluators:\n",
+    "            # Filter traces for this evaluator\n",
+    "            filtered_traces = evaluator.filter_traces(traces)\n",
+    "            # Run evaluation\n",
+    "            result = evaluator(filtered_traces, final_answer)\n",
+    "            results.append(result)\n",
+    "        return results\n",
     "\n",
     "print(\"Benchmark class defined!\")"
    ]
@@ -576,15 +631,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create benchmark instance\n",
-    "benchmark = SimpleBenchmark()\n",
+    "# Create benchmark instance with agent configuration\n",
+    "agent_data = {\n",
+    "    \"model_id\": \"gemini/gemini-2.5-flash\",\n",
+    "    \"temperature\": 0.7\n",
+    "}\n",
+    "\n",
+    "benchmark = SimpleBenchmark(agent_data=agent_data, progress_bar=False)\n",
     "\n",
     "# Create task collection\n",
     "tasks = TaskCollection([task])\n",
     "\n",
     "# Run the benchmark\n",
     "print(\"Running benchmark...\\n\")\n",
-    "results = benchmark.run(tasks=tasks)\n",
+    "reports = benchmark.run(tasks=tasks)\n",
     "\n",
     "print(\"\\n\" + \"=\"*60)\n",
     "print(\"BENCHMARK COMPLETE\")\n",
@@ -609,20 +669,26 @@
    "outputs": [],
    "source": [
     "# Get results for the first (and only) task\n",
-    "task_result = results[0]\n",
+    "report = reports[0]\n",
     "\n",
-    "print(\"Task ID:\", task_result[\"task_id\"])\n",
+    "print(f\"Task ID: {report['task_id']}\")\n",
+    "print(f\"Status: {report['status']}\")\n",
     "print(\"\\nEvaluation Results:\")\n",
     "print(\"-\" * 60)\n",
     "\n",
-    "for eval_result in task_result[\"evaluation_results\"]:\n",
-    "    print(f\"\\nEvaluator: {eval_result['evaluator']}\")\n",
-    "    print(f\"Score: {eval_result.get('score', 'N/A')}\")\n",
-    "    \n",
-    "    # Print relevant details\n",
-    "    for key, value in eval_result.items():\n",
-    "        if key not in [\"evaluator\", \"score\"]:\n",
-    "            print(f\"  {key}: {value}\")\n",
+    "if report.get(\"eval\"):\n",
+    "    for eval_result in report[\"eval\"]:\n",
+    "        print(f\"\\nEvaluator: {eval_result.get('evaluator', 'Unknown')}\")\n",
+    "        print(f\"Score: {eval_result.get('score', 'N/A')}\")\n",
+    "        \n",
+    "        # Print relevant details\n",
+    "        for key, value in eval_result.items():\n",
+    "            if key not in [\"evaluator\", \"score\"]:\n",
+    "                print(f\"  {key}: {value}\")\n",
+    "else:\n",
+    "    print(\"No evaluation results available.\")\n",
+    "    if report.get(\"error\"):\n",
+    "        print(f\"\\nError: {report['error']}\")\n",
     "\n",
     "print(\"\\n\" + \"=\"*60)"
    ]
@@ -660,7 +726,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": ".venv",
    "language": "python",
    "name": "python3"
   },
@@ -674,7 +740,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.0"
+   "version": "3.12.11"
   }
  },
  "nbformat": 4,