forked from microsoft/agent-framework
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgaia_sample.py
More file actions
78 lines (62 loc) · 3.21 KB
/
Copy pathgaia_sample.py
File metadata and controls
78 lines (62 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Copyright (c) Microsoft. All rights reserved.
"""GAIA Benchmark Sample.
To run this sample, execute it from the root directory of the agent-framework repository:
cd /path/to/agent-framework
uv run python python/packages/lab/gaia/gaia_sample.py
This avoids namespace package conflicts that occur when running from within the gaia package directory.
"""
from agent_framework.azure import AzureAIAgentClient
from agent_framework.lab.gaia import GAIA, Evaluation, GAIATelemetryConfig, Prediction, Task
from azure.identity.aio import AzureCliCredential
def evaluate_task(task: Task, prediction: Prediction) -> Evaluation:
    """Evaluate the prediction for a given task.

    Uses a simple, case-insensitive substring check: the prediction counts as
    correct when its text contains the task's reference answer.

    Args:
        task: The task being scored; only ``task.answer`` is read.
        prediction: The agent output; only ``prediction.prediction`` is read.

    Returns:
        An ``Evaluation`` whose ``is_correct`` flag reflects the substring
        match and whose ``score`` is 1 for a match and 0 otherwise.
    """
    # Surrounding whitespace is dropped so a whitespace-only answer is treated
    # the same as a missing one.
    expected = (task.answer or "").strip().lower()
    # Guard against a missing prediction text instead of failing on ``.lower()``.
    actual = (prediction.prediction or "").lower()

    # An empty string is a substring of every string, so a task without a
    # reference answer must not be counted as solved.
    is_correct = bool(expected) and expected in actual
    return Evaluation(is_correct=is_correct, score=1 if is_correct else 0)
async def main() -> None:
    """Run a small GAIA benchmark pass with tracing enabled and print each result."""
    # Tracing setup: OpenTelemetry tracing is switched on and traces are also
    # exported to a local file at the given path.
    tracing_settings = GAIATelemetryConfig(
        enable_tracing=True,
        trace_to_file=True,
        file_path="gaia_benchmark_traces.jsonl",
    )

    # A single agent is created up front and shared by every task.
    async with (
        AzureCliCredential() as credential,
        AzureAIAgentClient(async_credential=credential).create_agent(
            name="GaiaAgent",
            instructions="Solve tasks to your best ability.",
        ) as agent,
    ):

        async def run_task(task: Task) -> Prediction:
            """Send one GAIA task to the shared agent and wrap its reply as a Prediction."""
            prompt_lines = [f"Task: {task.question}"]
            if task.file_name:
                # Only the attachment's name is mentioned in the prompt.
                prompt_lines.append(f"File: {task.file_name}")
            reply = await agent.run("\n".join(prompt_lines))
            return Prediction(prediction=reply.text, messages=reply.messages)

        # Benchmark runner wired to the evaluator and the tracing settings.
        benchmark = GAIA(evaluator=evaluate_task, telemetry_config=tracing_settings)

        # By default the runner looks for locally cached benchmark data and
        # checks out the latest version from HuggingFace if none is found.
        outcomes = await benchmark.run(
            run_task,
            level=1,  # 1, 2, or 3, or several levels such as [1, 2]
            max_n=5,  # upper bound on tasks run per level
            parallel=2,  # number of tasks run in parallel
            timeout=60,  # per-task timeout, in seconds
            out="gaia_results_level1.jsonl",  # optional results file, including detailed traces
        )

    # Report every task outcome on stdout.
    print("\n=== GAIA Benchmark Results ===")
    for outcome in outcomes:
        question_preview = outcome.task.question[:100]
        verdict = outcome.evaluation
        print(f"\n--- Task ID: {outcome.task_id} ---")
        print(f"Task: {question_preview}...")
        print(f"Prediction: {outcome.prediction.prediction}")
        print(f"Evaluation: Correct={verdict.is_correct}, Score={verdict.score}")
if __name__ == "__main__":
    # Script entry point: drive the async main() to completion on a new event loop.
    from asyncio import run

    run(main())