|
| 1 | +# Copyright 2025 © BeeAI a Series of LF Projects, LLC |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +import asyncio |
| 5 | +import os |
| 6 | +from collections.abc import Awaitable, Callable |
| 7 | +from pathlib import Path |
| 8 | +from typing import TypeVar |
| 9 | + |
| 10 | +import pytest |
| 11 | +from deepeval import evaluate |
| 12 | +from deepeval.dataset import EvaluationDataset, Golden |
| 13 | +from deepeval.evaluate import DisplayConfig |
| 14 | +from deepeval.metrics import BaseMetric |
| 15 | +from deepeval.test_case import LLMTestCase |
| 16 | +from deepeval.test_run.test_run import TestRunResultDisplay |
| 17 | +from rich.console import Console, Group |
| 18 | +from rich.panel import Panel |
| 19 | +from rich.table import Table |
| 20 | + |
| 21 | +from beeai_framework.agents import AnyAgent |
| 22 | + |
# Root directory under which each dataset keeps its cached test cases
# (one sub-directory per dataset name). Kept as a plain string literal: the
# original used an f-string without any placeholders.
ROOT_CACHE_DIR = "/tmp/.cache"
# Created eagerly at import time so later cache reads/writes can rely on it.
Path(ROOT_CACHE_DIR).mkdir(parents=True, exist_ok=True)
| 25 | + |
| 26 | + |
# Type variable for the concrete agent class. It is bound to AnyAgent and is
# shared by the `agent_factory` and `agent_run` parameters of `create_dataset`,
# so the agent produced by the factory is the type the runner accepts.
T = TypeVar("T", bound=AnyAgent)
| 28 | + |
| 29 | + |
async def create_dataset(
    *,
    name: str,
    agent_factory: Callable[[], T],
    agent_run: Callable[[T, LLMTestCase], Awaitable[None]],
    goldens: list[Golden],
    cache: bool | None = None,
) -> EvaluationDataset:
    """Build an evaluation dataset by running an agent over every golden.

    Args:
        name: Dataset name. Used as the cache sub-directory name and as the
            prefix of every test case name.
        agent_factory: Zero-argument callable returning a fresh agent; it is
            called once per golden.
        agent_run: Coroutine function that receives the agent and the test
            case built from the golden. It returns nothing, so it is
            presumably expected to fill in the case (e.g. ``actual_output``)
            in place.
        goldens: Goldens to convert into test cases.
        cache: Whether to reuse/persist test cases under
            ``{ROOT_CACHE_DIR}/{name}``. When ``None``, caching is enabled
            only if the ``EVAL_CACHE_DATASET`` environment variable equals
            ``"true"`` (case-insensitive).

    Returns:
        The dataset, with every test case named
        ``"{name} - <first 128 characters of the input, stripped>"``.

    Raises:
        Exception: Whatever ``agent_factory`` or ``agent_run`` raises; the
            first failure propagates out of ``asyncio.gather``.
    """
    dataset = EvaluationDataset()

    cache_dir = Path(f"{ROOT_CACHE_DIR}/{name}")
    if cache is None:
        cache = os.getenv("EVAL_CACHE_DATASET", "").lower() == "true"

    # Only treat the cache as usable when it actually holds JSON files. An
    # existing-but-empty directory used to produce an empty dataset, which in
    # turn made the evaluation "pass" with zero tests. Sorting makes the order
    # of the loaded cases deterministic across file systems.
    cached_files = sorted(cache_dir.glob("*.json")) if cache and cache_dir.is_dir() else []

    if cached_files:
        for file_path in cached_files:
            dataset.add_test_cases_from_json_file(
                file_path=str(file_path.absolute().resolve()),
                input_key_name="input",
                actual_output_key_name="actual_output",
                expected_output_key_name="expected_output",
                context_key_name="context",
                tools_called_key_name="tools_called",
                expected_tools_key_name="expected_tools",
                retrieval_context_key_name="retrieval_context",
            )
    else:

        async def process_golden(golden: Golden) -> LLMTestCase:
            """Create a test case from one golden and run a fresh agent on it."""
            agent = agent_factory()
            case = LLMTestCase(
                input=golden.input,
                expected_tools=golden.expected_tools,
                actual_output="",  # placeholder handed to agent_run
                expected_output=golden.expected_output,
                comments=golden.comments,
                context=golden.context,
                tools_called=golden.tools_called,
                retrieval_context=golden.retrieval_context,
                additional_metadata=golden.additional_metadata,
            )
            await agent_run(agent, case)
            return case

        # All goldens are processed concurrently; gather preserves input order.
        test_cases = await asyncio.gather(*(process_golden(golden) for golden in goldens))
        for test_case in test_cases:
            dataset.add_test_case(test_case)

        if cache:
            dataset.save_as(file_type="json", directory=str(cache_dir.absolute()), include_test_cases=True)

    # Names are assigned after saving/loading so cached and fresh cases are
    # labelled identically.
    for case in dataset.test_cases:
        case.name = f"{name} - {case.input[0:128].strip()}"  # type: ignore

    return dataset
| 84 | + |
| 85 | + |
def evaluate_dataset(
    dataset: EvaluationDataset, metrics: list[BaseMetric], display_mode: TestRunResultDisplay | None = None
) -> None:
    """Evaluate a dataset against the given metrics and print a rich report.

    Prints a summary table (total / passed / failed) followed by one panel
    per test result, filtered by ``display_mode``. A test counts as passed
    only when it has at least one metric result and every metric succeeded.

    Args:
        dataset: Dataset whose test cases are evaluated.
        metrics: Metrics applied to every test case.
        display_mode: Which results to print in detail. When ``None``, the
            ``EVAL_DISPLAY_MODE`` environment variable is used
            (case-insensitive); an unset or empty variable means "all".

    Raises:
        ValueError: If ``EVAL_DISPLAY_MODE`` is not a valid
            ``TestRunResultDisplay`` value.

    Calls ``pytest.fail`` (without traceback) when at least one test failed.
    """
    # Imported locally so the block does not depend on a new module-level import.
    from rich.text import Text

    def plain(value: object) -> Text:
        """Wrap dynamic content so rich never parses it as console markup."""
        return Text(str(value))

    console = Console()
    console.print("[bold green]Evaluating dataset[/bold green]")

    if display_mode is None:
        raw_mode = os.environ.get("EVAL_DISPLAY_MODE", "").strip().lower()
        display_mode = TestRunResultDisplay(raw_mode or "all")

    output = evaluate(
        test_cases=dataset.test_cases,  # type: ignore
        metrics=metrics,
        # deepeval's own printing is disabled; the report is rendered below.
        display_config=DisplayConfig(
            show_indicator=False, print_results=False, verbose_mode=False, display_option=None
        ),
    )

    # Calculate pass/fail counts
    total = len(output.test_results)
    passed = sum(
        bool(test_result.metrics_data) and all(md.success for md in (test_result.metrics_data or []))
        for test_result in output.test_results
    )
    failed = total - passed

    # Print summary table
    summary_table = Table(title="Test Results Summary", show_header=True, header_style="bold cyan")
    summary_table.add_column("Total", justify="right")
    summary_table.add_column("Passed", justify="right", style="green")
    summary_table.add_column("Failed", justify="right", style="red")
    summary_table.add_row(str(total), str(passed), str(failed))
    console.print(summary_table)

    for test_result in output.test_results:
        # Skip results that the selected display mode filters out.
        if display_mode == TestRunResultDisplay.FAILING and test_result.success:
            continue
        if display_mode == TestRunResultDisplay.PASSING and not test_result.success:
            continue

        # Info Table. Inputs and model outputs are arbitrary text: rendering
        # them as markup would swallow bracketed fragments or raise
        # MarkupError on a stray closing tag, hence `plain`.
        info_table = Table(show_header=False, box=None, pad_edge=False)
        info_table.add_row("Input", plain(test_result.input))
        info_table.add_row("Expected Output", plain(test_result.expected_output))
        info_table.add_row("Actual Output", plain(test_result.actual_output))

        # Metrics Table
        metrics_table = Table(title="Metrics", show_header=True, header_style="bold magenta")
        for column in ("Metric", "Success", "Score", "Threshold", "Reason", "Error"):
            metrics_table.add_column(column)

        for metric_data in test_result.metrics_data or []:
            metrics_table.add_row(
                plain(metric_data.name),  # names may contain brackets, e.g. "... [GEval]"
                str(metric_data.success),
                str(metric_data.score),
                str(metric_data.threshold),
                plain(metric_data.reason),
                plain(metric_data.error) if metric_data.error else "",
            )

        # Print the panel with info and metrics table. The title is a styled
        # Text rather than a markup string because the test name embeds the
        # beginning of the user input.
        console.print(
            Panel(
                Group(info_table, metrics_table),
                title=Text(str(test_result.name), style="bold blue"),
                border_style="blue",
            )
        )

    # Fail the surrounding pytest test when anything did not pass.
    if failed:
        pytest.fail(f"{failed}/{total} tests failed. See the summary table above for more details.", pytrace=False)
0 commit comments