|
409 | 409 | " # Get banking transactions from environment data\n", |
410 | 410 | " transactions = self.state.get(\"banking\", {}).get(\"bank_transactions\", [])\n", |
411 | 411 | " \n", |
412 | | - " # Create tool instances\n", |
413 | | - " sent_emails = []\n", |
| 412 | + " # Create tool instances - track sent emails for evaluation\n", |
| 413 | + " self.sent_emails: List[Dict] = []\n", |
414 | 414 | " banking_tool = SimpleBankingTool(transactions=transactions)\n", |
415 | | - " email_tool = SimpleEmailTool(sent_emails=sent_emails)\n", |
416 | | - " \n", |
417 | | - " # Store sent_emails reference for evaluation\n", |
418 | | - " self.state[\"sent_emails\"] = sent_emails\n", |
| 415 | + " email_tool = SimpleEmailTool(sent_emails=self.sent_emails)\n", |
419 | 416 | " \n", |
420 | 417 | " return [banking_tool, email_tool]\n", |
421 | 418 | "\n", |
|
444 | 441 | "class FinancialAccuracyEvaluator(Evaluator):\n", |
445 | 442 | " \"\"\"Evaluates if the agent correctly identified payment amounts.\"\"\"\n", |
446 | 443 | " \n", |
| 444 | + " def __init__(self, task: Task, environment: Environment, user=None):\n", |
| 445 | + " \"\"\"Initialize with task, environment, and optional user.\"\"\"\n", |
| 446 | + " super().__init__(task, environment, user)\n", |
| 447 | + " self.task = task\n", |
| 448 | + " self.environment = environment\n", |
| 449 | + " \n", |
447 | 450 | " def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:\n", |
448 | | - " \"\"\"Filter to banking tool traces only.\"\"\"\n", |
449 | | - " return traces.get(\"tools\", {}).get(\"get_transactions\", {})\n", |
| 451 | + " \"\"\"Filter to environment traces to check tool usage.\"\"\"\n", |
| 452 | + " return traces.get(\"environment\", {})\n", |
450 | 453 | " \n", |
451 | 454 | " def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -> Dict[str, Any]:\n", |
452 | | - " \"\"\"Check if banking tool was called.\"\"\"\n", |
453 | | - " invocations = traces.get(\"invocations\", [])\n", |
454 | | - " \n", |
455 | | - " # Expected values\n", |
| 455 | + " \"\"\"Score whether the agent sent an email; expected amounts are reported but not verified.\"\"\"\n", |
| 456 | + " # Expected values from task evaluation data\n", |
456 | 457 | " expected_deposit = self.task.evaluation_data[\"expected_deposit_amount\"]\n", |
457 | 458 | " expected_rent = self.task.evaluation_data[\"expected_rent_amount\"]\n", |
458 | 459 | " \n", |
459 | | - " if not invocations:\n", |
460 | | - " return {\n", |
461 | | - " \"banking_tool_used\": False,\n", |
462 | | - " \"score\": 0.0,\n", |
463 | | - " \"error\": \"Banking tool was not called\"\n", |
464 | | - " }\n", |
| 460 | + " # Check if emails were sent by reading the environment's sent_emails attribute\n", |
| 461 | + " sent_emails = getattr(self.environment, 'sent_emails', [])\n", |
| 462 | + " email_sent = len(sent_emails) > 0\n", |
465 | 463 | " \n", |
466 | 464 | " return {\n", |
467 | | - " \"banking_tool_used\": True,\n", |
468 | | - " \"score\": 1.0,\n", |
| 465 | + " \"evaluator\": \"FinancialAccuracyEvaluator\",\n", |
| 466 | + " \"email_sent\": email_sent,\n", |
| 467 | + " \"emails_count\": len(sent_emails),\n", |
469 | 468 | " \"expected_deposit\": expected_deposit,\n", |
470 | 469 | " \"expected_rent\": expected_rent,\n", |
471 | | - " \"message\": \"Agent successfully retrieved banking transactions\"\n", |
| 470 | + " \"score\": 1.0 if email_sent else 0.0,\n", |
| 471 | + " \"message\": \"Agent sent confirmation email\" if email_sent else \"No email was sent\"\n", |
472 | 472 | " }\n", |
473 | 473 | "\n", |
474 | 474 | "\n", |
475 | 475 | "class EmailSentEvaluator(Evaluator):\n", |
476 | | - " \"\"\"Evaluates if the agent sent an email.\"\"\"\n", |
| 476 | + " \"\"\"Evaluates if the agent sent an email with proper content.\"\"\"\n", |
| 477 | + " \n", |
| 478 | + " def __init__(self, task: Task, environment: Environment, user=None):\n", |
| 479 | + " \"\"\"Initialize with task, environment, and optional user.\"\"\"\n", |
| 480 | + " super().__init__(task, environment, user)\n", |
| 481 | + " self.task = task\n", |
| 482 | + " self.environment = environment\n", |
477 | 483 | " \n", |
478 | 484 | " def filter_traces(self, traces: Dict[str, Any]) -> Dict[str, Any]:\n", |
479 | | - " \"\"\"Filter to email tool traces only.\"\"\"\n", |
480 | | - " return traces.get(\"tools\", {}).get(\"send_email\", {})\n", |
| 485 | + " \"\"\"Filter to environment traces.\"\"\"\n", |
| 486 | + " return traces.get(\"environment\", {})\n", |
481 | 487 | " \n", |
482 | 488 | " def __call__(self, traces: Dict[str, Any], final_answer: Optional[str] = None) -> Dict[str, Any]:\n", |
483 | | - " \"\"\"Check if email was sent.\"\"\"\n", |
484 | | - " invocations = traces.get(\"invocations\", [])\n", |
| 489 | + " \"\"\"Check if email was sent with appropriate content.\"\"\"\n", |
| 490 | + " sent_emails = getattr(self.environment, 'sent_emails', [])\n", |
485 | 491 | " \n", |
486 | | - " if not invocations:\n", |
| 492 | + " if not sent_emails:\n", |
487 | 493 | " return {\n", |
| 494 | + " \"evaluator\": \"EmailSentEvaluator\",\n", |
488 | 495 | " \"email_sent\": False,\n", |
489 | 496 | " \"score\": 0.0,\n", |
490 | 497 | " \"error\": \"No email was sent\"\n", |
491 | 498 | " }\n", |
492 | 499 | " \n", |
493 | | - " # Get the email that was sent\n", |
494 | | - " email_data = invocations[0].get(\"inputs\", {})\n", |
| 500 | + " # Get the last email that was sent\n", |
| 501 | + " email_data = sent_emails[-1]\n", |
495 | 502 | " \n", |
496 | 503 | " return {\n", |
| 504 | + " \"evaluator\": \"EmailSentEvaluator\",\n", |
497 | 505 | " \"email_sent\": True,\n", |
498 | 506 | " \"score\": 1.0,\n", |
499 | 507 | " \"recipient\": email_data.get(\"to\"),\n", |
|
521 | 529 | "metadata": {}, |
522 | 530 | "outputs": [], |
523 | 531 | "source": [ |
| 532 | + "from maseval import AgentAdapter\n", |
| 533 | + "from typing import Sequence, Tuple\n", |
| 534 | + "\n", |
524 | 535 | "class SimpleBenchmark(Benchmark):\n", |
525 | 536 | " \"\"\"Simplified benchmark for the tutorial.\"\"\"\n", |
526 | 537 | " \n", |
527 | | - " def setup_environment(self, task: Task) -> Environment:\n", |
| 538 | + " def setup_environment(self, agent_data: Dict[str, Any], task: Task) -> Environment:\n", |
528 | 539 | " \"\"\"Create an environment for the task.\"\"\"\n", |
529 | 540 | " return SimpleEnvironment(task.environment_data)\n", |
530 | 541 | " \n", |
531 | | - " def setup_agent(self, task: Task, environment: Environment) -> Any:\n", |
| 542 | + " def setup_agents(\n", |
| 543 | + " self,\n", |
| 544 | + " agent_data: Dict[str, Any],\n", |
| 545 | + " environment: Environment,\n", |
| 546 | + " task: Task,\n", |
| 547 | + " user=None\n", |
| 548 | + " ) -> Tuple[Sequence[AgentAdapter], Dict[str, AgentAdapter]]:\n", |
532 | 549 | " \"\"\"Create an agent for the task.\"\"\"\n", |
533 | 550 | " # Initialize model\n", |
534 | 551 | " model = LiteLLMModel(\n", |
|
539 | 556 | " \n", |
540 | 557 | " # Create agent with environment tools\n", |
541 | 558 | " agent = ToolCallingAgent(\n", |
542 | | - " tools=environment.tools,\n", |
| 559 | + " tools=environment.get_tools(),\n", |
543 | 560 | " model=model,\n", |
544 | 561 | " instructions=\"\"\"You are a helpful assistant. Help users with email and banking tasks \n", |
545 | 562 | "by using the available tools to retrieve information and take appropriate actions. \n", |
546 | 563 | "Be professional and thorough in your responses.\"\"\"\n", |
547 | 564 | " )\n", |
548 | 565 | " \n", |
549 | 566 | " # Wrap agent in adapter for MASEval\n", |
550 | | - " return SmolAgentAdapter(agent, \"main_agent\")\n", |
| 567 | + " agent_adapter = SmolAgentAdapter(agent, \"main_agent\")\n", |
| 568 | + " \n", |
| 569 | + " # Return (agents_to_run, agents_dict)\n", |
| 570 | + " return [agent_adapter], {\"main_agent\": agent_adapter}\n", |
551 | 571 | " \n", |
552 | | - " def setup_evaluators(self, task: Task, environment: Environment) -> List[Evaluator]:\n", |
| 572 | + " def setup_evaluators(\n", |
| 573 | + " self,\n", |
| 574 | + " environment: Environment,\n", |
| 575 | + " task: Task,\n", |
| 576 | + " agents: Sequence[AgentAdapter],\n", |
| 577 | + " user=None\n", |
| 578 | + " ) -> Sequence[Evaluator]:\n", |
553 | 579 | " \"\"\"Create evaluators for the task.\"\"\"\n", |
554 | 580 | " return [\n", |
555 | | - " FinancialAccuracyEvaluator(task, environment),\n", |
556 | | - " EmailSentEvaluator(task, environment)\n", |
| 581 | + " FinancialAccuracyEvaluator(task, environment, user),\n", |
| 582 | + " EmailSentEvaluator(task, environment, user)\n", |
557 | 583 | " ]\n", |
| 584 | + " \n", |
| 585 | + " def run_agents(\n", |
| 586 | + " self,\n", |
| 587 | + " agents: Sequence[AgentAdapter],\n", |
| 588 | + " task: Task,\n", |
| 589 | + " environment: Environment\n", |
| 590 | + " ) -> Any:\n", |
| 591 | + " \"\"\"Execute the agent and return the final answer.\"\"\"\n", |
| 592 | + " # Run the main agent with the task query\n", |
| 593 | + " agent = agents[0]\n", |
| 594 | + " result = agent.run(task.query)\n", |
| 595 | + " return result\n", |
| 596 | + " \n", |
| 597 | + " def evaluate(\n", |
| 598 | + " self,\n", |
| 599 | + " evaluators: Sequence[Evaluator],\n", |
| 600 | + " agents: Dict[str, AgentAdapter],\n", |
| 601 | + " final_answer: Any,\n", |
| 602 | + " traces: Dict[str, Any]\n", |
| 603 | + " ) -> List[Dict[str, Any]]:\n", |
| 604 | + " \"\"\"Evaluate agent performance.\"\"\"\n", |
| 605 | + " results = []\n", |
| 606 | + " for evaluator in evaluators:\n", |
| 607 | + " # Filter traces for this evaluator\n", |
| 608 | + " filtered_traces = evaluator.filter_traces(traces)\n", |
| 609 | + " # Run evaluation\n", |
| 610 | + " result = evaluator(filtered_traces, final_answer)\n", |
| 611 | + " results.append(result)\n", |
| 612 | + " return results\n", |
558 | 613 | "\n", |
559 | 614 | "print(\"Benchmark class defined!\")" |
560 | 615 | ] |
|
576 | 631 | "metadata": {}, |
577 | 632 | "outputs": [], |
578 | 633 | "source": [ |
579 | | - "# Create benchmark instance\n", |
580 | | - "benchmark = SimpleBenchmark()\n", |
| 634 | + "# Create benchmark instance with agent configuration\n", |
| 635 | + "agent_data = {\n", |
| 636 | + " \"model_id\": \"gemini/gemini-2.5-flash\",\n", |
| 637 | + " \"temperature\": 0.7\n", |
| 638 | + "}\n", |
| 639 | + "\n", |
| 640 | + "benchmark = SimpleBenchmark(agent_data=agent_data, progress_bar=False)\n", |
581 | 641 | "\n", |
582 | 642 | "# Create task collection\n", |
583 | 643 | "tasks = TaskCollection([task])\n", |
584 | 644 | "\n", |
585 | 645 | "# Run the benchmark\n", |
586 | 646 | "print(\"Running benchmark...\\n\")\n", |
587 | | - "results = benchmark.run(tasks=tasks)\n", |
| 647 | + "reports = benchmark.run(tasks=tasks)\n", |
588 | 648 | "\n", |
589 | 649 | "print(\"\\n\" + \"=\"*60)\n", |
590 | 650 | "print(\"BENCHMARK COMPLETE\")\n", |
|
609 | 669 | "outputs": [], |
610 | 670 | "source": [ |
611 | 671 | "# Get results for the first (and only) task\n", |
612 | | - "task_result = results[0]\n", |
| 672 | + "report = reports[0]\n", |
613 | 673 | "\n", |
614 | | - "print(\"Task ID:\", task_result[\"task_id\"])\n", |
| 674 | + "print(f\"Task ID: {report['task_id']}\")\n", |
| 675 | + "print(f\"Status: {report['status']}\")\n", |
615 | 676 | "print(\"\\nEvaluation Results:\")\n", |
616 | 677 | "print(\"-\" * 60)\n", |
617 | 678 | "\n", |
618 | | - "for eval_result in task_result[\"evaluation_results\"]:\n", |
619 | | - " print(f\"\\nEvaluator: {eval_result['evaluator']}\")\n", |
620 | | - " print(f\"Score: {eval_result.get('score', 'N/A')}\")\n", |
621 | | - " \n", |
622 | | - " # Print relevant details\n", |
623 | | - " for key, value in eval_result.items():\n", |
624 | | - " if key not in [\"evaluator\", \"score\"]:\n", |
625 | | - " print(f\" {key}: {value}\")\n", |
| 679 | + "if report.get(\"eval\"):\n", |
| 680 | + " for eval_result in report[\"eval\"]:\n", |
| 681 | + " print(f\"\\nEvaluator: {eval_result.get('evaluator', 'Unknown')}\")\n", |
| 682 | + " print(f\"Score: {eval_result.get('score', 'N/A')}\")\n", |
| 683 | + " \n", |
| 684 | + " # Print relevant details\n", |
| 685 | + " for key, value in eval_result.items():\n", |
| 686 | + " if key not in [\"evaluator\", \"score\"]:\n", |
| 687 | + " print(f\" {key}: {value}\")\n", |
| 688 | + "else:\n", |
| 689 | + " print(\"No evaluation results available.\")\n", |
| 690 | + " if report.get(\"error\"):\n", |
| 691 | + " print(f\"\\nError: {report['error']}\")\n", |
626 | 692 | "\n", |
627 | 693 | "print(\"\\n\" + \"=\"*60)" |
628 | 694 | ] |
|
660 | 726 | ], |
661 | 727 | "metadata": { |
662 | 728 | "kernelspec": { |
663 | | - "display_name": "Python 3", |
| 729 | + "display_name": ".venv", |
664 | 730 | "language": "python", |
665 | 731 | "name": "python3" |
666 | 732 | }, |
|
674 | 740 | "name": "python", |
675 | 741 | "nbconvert_exporter": "python", |
676 | 742 | "pygments_lexer": "ipython3", |
677 | | - "version": "3.8.0" |
| 743 | + "version": "3.12.11" |
678 | 744 | } |
679 | 745 | }, |
680 | 746 | "nbformat": 4, |
|
0 commit comments