Skip to content

Commit 27e1ecf

Browse files
committed
fixed Gaia2 evaluator config
1 parent 0356b96 commit 27e1ecf

9 files changed

Lines changed: 390 additions & 18 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
- `DefaultAgentGaia2Benchmark` with ReAct-style agent for direct comparison with ARE reference implementation (PR: #26)
1717
- Tool wrapper (`AREToolWrapper`) for MASEval tracing of ARE tools with simulation time tracking (PR: #26)
1818
- Data loading utilities: `load_tasks()`, `configure_model_ids()` for loading scenarios from HuggingFace (PR: #26)
19+
- `Gaia2JudgeEngineConfig` for configuring the judge's LLM model and provider (e.g., switching from HuggingFace to OpenRouter) via `configure_model_ids(tasks, judge_engine_config=...)` (PR: #PR_NUMBER_PLACEHOLDER)
1920
- Metrics: `compute_gaia2_metrics()` for GSR (Goal Success Rate) computation by capability type (PR: #26)
2021
- Support for 5 capability dimensions: execution, search, adaptability, time, ambiguity (PR: #26)
2122
- Added `gaia2` optional dependency: `pip install maseval[gaia2]` (PR: #26)

maseval/benchmark/gaia2/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,11 @@ def get_model_adapter(self, model_id, **kwargs):
7171
wrap_are_tools,
7272
)
7373

74-
# Data loading
74+
# Data loading and configuration
7575
from maseval.benchmark.gaia2.data_loader import (
7676
load_tasks,
7777
configure_model_ids,
78+
Gaia2JudgeEngineConfig,
7879
VALID_CAPABILITIES,
7980
VALID_SPLITS,
8081
HF_DATASET_ID,
@@ -98,9 +99,10 @@ def get_model_adapter(self, model_id, **kwargs):
9899
# Tool wrapper
99100
"Gaia2GenericTool",
100101
"wrap_are_tools",
101-
# Data loading
102+
# Data loading and configuration
102103
"load_tasks",
103104
"configure_model_ids",
105+
"Gaia2JudgeEngineConfig",
104106
"VALID_CAPABILITIES",
105107
"VALID_SPLITS",
106108
"HF_DATASET_ID",

maseval/benchmark/gaia2/data_loader.py

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,72 @@
33
This module provides functions to:
44
1. Load Gaia2 scenarios from HuggingFace
55
2. Convert scenarios to MASEval Task objects
6-
3. Configure model IDs for benchmark components
6+
3. Configure model IDs and judge engine for benchmark components
77
88
Reference Paper: "GAIA-2: A Controllable Multi-Turn Conversational Benchmark for Agents"
99
Data: https://huggingface.co/datasets/meta-agents-research-environments/gaia2
1010
1111
No side effects on import. Data download/processing must be explicitly called.
1212
"""
1313

14+
from dataclasses import dataclass
1415
from typing import Any, Dict, List, Optional, Tuple, Union
1516

1617
from maseval import Task, TaskQueue
1718
from maseval.core.task import TaskProtocol
1819

1920

21+
# =============================================================================
22+
# Judge Engine Configuration
23+
# =============================================================================
24+
25+
26+
@dataclass
27+
class Gaia2JudgeEngineConfig:
28+
"""Configuration for the ARE judge's LLM engine used in semantic comparison.
29+
30+
ARE's ``GraphPerEventJudge`` uses an LLM to semantically compare tool arguments
31+
(e.g., email content, calendar event descriptions) between agent actions and oracle
32+
(expected) actions. This config controls which model and provider the judge uses.
33+
34+
Defaults match ARE's built-in defaults.
35+
36+
ARE's ``LLMEngineConfig`` only supports ``model_name``, ``provider``, and
37+
``endpoint``. Provider-specific parameters (e.g., OpenRouter's ``fallbacks``
38+
or ``route``) are not supported by ARE's engine pipeline.
39+
40+
The default ``model_name`` and ``provider`` values come from ARE ``validation/configs.py:28-29``.
41+
42+
Attributes:
43+
model_name: LLM model identifier for the judge engine.
44+
provider: LLM provider name (e.g., ``"huggingface"``, ``"openrouter"``, ``"openai"``).
45+
Passed to LiteLLM as ``custom_llm_provider``.
46+
endpoint: Optional custom API endpoint URL.
47+
48+
Example::
49+
50+
from maseval.benchmark.gaia2 import (
51+
load_tasks, configure_model_ids, Gaia2JudgeEngineConfig,
52+
)
53+
54+
tasks = load_tasks(capability="execution", limit=5)
55+
56+
# Use OpenRouter instead of HuggingFace for judge LLM
57+
configure_model_ids(
58+
tasks,
59+
judge_engine_config=Gaia2JudgeEngineConfig(
60+
provider="openrouter",
61+
),
62+
)
63+
"""
64+
65+
# ARE validation/configs.py:28
66+
model_name: str = "meta-llama/Meta-Llama-3.3-70B-Instruct"
67+
# ARE validation/configs.py:29
68+
provider: str = "huggingface"
69+
endpoint: Optional[str] = None
70+
71+
2072
# =============================================================================
2173
# Constants
2274
# =============================================================================
@@ -231,27 +283,36 @@ def configure_model_ids(
231283
tasks: Union[TaskQueue, List[Task]],
232284
*,
233285
evaluator_model_id: Optional[str] = None,
286+
judge_engine_config: Optional[Gaia2JudgeEngineConfig] = None,
234287
) -> Union[TaskQueue, List[Task]]:
235-
"""Configure model IDs for benchmark components in task data.
288+
"""Configure model IDs and judge engine for benchmark components.
236289
237-
Gaia2 uses ARE's deterministic judge by default, but can optionally
238-
use an LLM-based judge for complex assertions.
290+
Gaia2's ``GraphPerEventJudge`` uses an LLM for semantic comparison of tool
291+
arguments (email content, calendar descriptions, etc.). By default it uses
292+
ARE's built-in defaults (``meta-llama/Meta-Llama-3.3-70B-Instruct`` via
293+
HuggingFace). Pass ``judge_engine_config`` to override the model/provider.
239294
240295
Note: Unlike Tau2, Gaia2 doesn't have a user simulator (interactions
241296
happen through scheduled events), so there's no user_model_id.
242297
243298
Args:
244-
tasks: TaskQueue or list of Tasks to configure
245-
evaluator_model_id: Optional model ID for LLM-based evaluation
299+
tasks: TaskQueue or list of Tasks to configure.
300+
evaluator_model_id: Optional model ID for LLM-based evaluation.
301+
judge_engine_config: Optional judge engine configuration. Controls
302+
which LLM model and provider the ARE judge uses for semantic
303+
comparison. When ``None``, ARE's defaults are used.
246304
247305
Returns:
248-
The same collection (mutated in place for convenience)
306+
The same collection (mutated in place for convenience).
307+
308+
Example::
249309
250-
Example:
251310
>>> tasks = load_tasks(capability="execution", limit=5)
252311
>>> configure_model_ids(
253312
... tasks,
254-
... evaluator_model_id="gpt-4o", # Optional, for LLM-based judge
313+
... judge_engine_config=Gaia2JudgeEngineConfig(
314+
... provider="openrouter",
315+
... ),
255316
... )
256317
"""
257318
for task in tasks:
@@ -264,6 +325,10 @@ def configure_model_ids(
264325
)
265326
task.evaluation_data["model_id"] = evaluator_model_id
266327

328+
# Evaluation data: judge engine configuration (optional)
329+
if judge_engine_config is not None:
330+
task.evaluation_data["judge_engine_config"] = judge_engine_config
331+
267332
return tasks
268333

269334

maseval/benchmark/gaia2/environment.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(
3333
self,
3434
task_data: Dict[str, Any],
3535
callbacks: Optional[List[Any]] = None,
36+
judge_engine_config: Optional[Any] = None,
3637
):
3738
"""Initialize Gaia2 environment.
3839
@@ -42,8 +43,12 @@ def __init__(
4243
- capability: Capability type (execution, search, etc.)
4344
- universe_id: Universe identifier
4445
callbacks: Optional callbacks
46+
judge_engine_config: Optional :class:`Gaia2JudgeEngineConfig` controlling
47+
which LLM model and provider the ARE judge uses for semantic comparison.
48+
Passed explicitly by ``setup_environment()`` because it is stored in the task's ``evaluation_data``, not in ``task_data``.
4549
"""
4650
self._scenario = task_data.get("scenario")
51+
self._judge_engine_config = judge_engine_config
4752
self._are_env: Any = None
4853
self._tool_wrappers: Dict[str, Gaia2GenericTool] = {}
4954

@@ -103,9 +108,29 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
103108
# This handles: SystemApp insertion, duration setting, scenario initialization,
104109
# oracle run, soft reset, judge creation, turn initialization with trigger
105110
# conditions, and judge state initialization.
106-
# Passing GraphPerEventJudgeConfig() creates a deterministic judge (no LLM).
111+
# GraphPerEventJudge uses an LLM for semantic comparison of tool arguments
112+
# (email content, calendar descriptions, etc.) via soft checkers.
107113
# ARE scenarios/scenario_imported_from_json/utils.py:43-157
108-
judge_config = GraphPerEventJudgeConfig()
114+
if self._judge_engine_config is not None:
115+
# User provided custom judge engine config — create engine explicitly
116+
# ARE validation/configs.py:32-59
117+
from are.simulation.agents.are_simulation_agent_config import ( # type: ignore[import-not-found]
118+
LLMEngineConfig,
119+
)
120+
from are.simulation.validation.configs import create_judge_engine # type: ignore[import-not-found]
121+
122+
llm_engine_config = LLMEngineConfig(
123+
model_name=self._judge_engine_config.model_name,
124+
provider=self._judge_engine_config.provider,
125+
endpoint=self._judge_engine_config.endpoint,
126+
)
127+
engine = create_judge_engine(llm_engine_config)
128+
judge_config = GraphPerEventJudgeConfig(engine=engine)
129+
else:
130+
# Default: use ARE's built-in defaults (Llama 3.3 70B via HuggingFace)
131+
# ARE validation/configs.py:28-29, 149
132+
judge_config = GraphPerEventJudgeConfig()
133+
109134
preprocess_scenario(
110135
scenario=scenario,
111136
judge_config=judge_config,

maseval/benchmark/gaia2/evaluator.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@
2828
class Gaia2Evaluator(Evaluator):
2929
"""Evaluates Gaia2 scenarios using ARE's judge system.
3030
31-
Uses ARE's GraphPerEventJudge for deterministic evaluation based on
32-
the event DAG. Supports optional LLM-based judge for complex assertions.
31+
Uses ARE's ``GraphPerEventJudge`` which combines deterministic hard checks
32+
(exact value matching) with LLM-based soft checks (semantic comparison of
33+
content like email bodies and calendar descriptions).
3334
3435
The evaluator compares completed events in the simulation against
3536
oracle (expected) events to compute Goal Success Rate (GSR).
@@ -143,8 +144,24 @@ def __call__(
143144
# Fallback: create judge if not available on scenario
144145
from are.simulation.validation import GraphPerEventJudgeConfig, JudgeFactory # type: ignore[import-not-found]
145146

146-
judge_config = GraphPerEventJudgeConfig()
147-
judge = JudgeFactory()(judge_config)
147+
judge_engine_config = self.task.evaluation_data.get("judge_engine_config")
148+
if judge_engine_config is not None:
149+
from are.simulation.agents.are_simulation_agent_config import ( # type: ignore[import-not-found]
150+
LLMEngineConfig,
151+
)
152+
from are.simulation.validation.configs import create_judge_engine # type: ignore[import-not-found]
153+
154+
llm_engine_config = LLMEngineConfig(
155+
model_name=judge_engine_config.model_name,
156+
provider=judge_engine_config.provider,
157+
endpoint=judge_engine_config.endpoint,
158+
)
159+
engine = create_judge_engine(llm_engine_config)
160+
judge_cfg = GraphPerEventJudgeConfig(engine=engine)
161+
else:
162+
judge_cfg = GraphPerEventJudgeConfig()
163+
164+
judge = JudgeFactory()(judge_cfg)
148165
judge.initialize_state(scenario)
149166

150167
# Ensure intermediate turns are judged before final validation.

maseval/benchmark/gaia2/gaia2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,8 @@ def setup_environment(
140140
Returns:
141141
Gaia2Environment instance
142142
"""
143-
return Gaia2Environment(task_data=task.environment_data)
143+
judge_engine_config = task.evaluation_data.get("judge_engine_config")
144+
return Gaia2Environment(task_data=task.environment_data, judge_engine_config=judge_engine_config)
144145

145146
def setup_user( # type: ignore[override]
146147
self,

tests/test_benchmarks/test_gaia2/test_data_loader.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,84 @@ def test_works_with_list(self):
147147

148148
assert result is tasks
149149
assert all(t.evaluation_data.get("model_id") == "test-model" for t in tasks)
150+
151+
def test_sets_judge_engine_config(self, sample_gaia2_task_queue):
152+
"""Test configure_model_ids stores judge_engine_config in evaluation_data."""
153+
from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig, configure_model_ids
154+
155+
config = Gaia2JudgeEngineConfig(provider="openrouter")
156+
configure_model_ids(sample_gaia2_task_queue, judge_engine_config=config)
157+
158+
for task in sample_gaia2_task_queue:
159+
assert task.evaluation_data.get("judge_engine_config") is config
160+
161+
def test_judge_engine_config_none_does_not_set(self, sample_gaia2_task_queue):
162+
"""Test configure_model_ids with None judge_engine_config does not modify evaluation_data."""
163+
from maseval.benchmark.gaia2.data_loader import configure_model_ids
164+
165+
configure_model_ids(sample_gaia2_task_queue)
166+
167+
for task in sample_gaia2_task_queue:
168+
assert "judge_engine_config" not in task.evaluation_data
169+
170+
def test_both_evaluator_and_judge_config(self, sample_gaia2_task_queue):
171+
"""Test configure_model_ids sets both evaluator model_id and judge_engine_config."""
172+
from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig, configure_model_ids
173+
174+
config = Gaia2JudgeEngineConfig(model_name="gpt-4o", provider="openai")
175+
configure_model_ids(
176+
sample_gaia2_task_queue,
177+
evaluator_model_id="gpt-4o",
178+
judge_engine_config=config,
179+
)
180+
181+
for task in sample_gaia2_task_queue:
182+
assert task.evaluation_data.get("model_id") == "gpt-4o"
183+
assert task.evaluation_data.get("judge_engine_config") is config
184+
185+
186+
# =============================================================================
187+
# Test Gaia2JudgeEngineConfig
188+
# =============================================================================
189+
190+
191+
@pytest.mark.benchmark
192+
class TestGaia2JudgeEngineConfig:
193+
"""Tests for Gaia2JudgeEngineConfig dataclass."""
194+
195+
def test_default_values_match_are(self):
196+
"""Test defaults match ARE's validation/configs.py:28-29."""
197+
from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
198+
199+
config = Gaia2JudgeEngineConfig()
200+
assert config.model_name == "meta-llama/Meta-Llama-3.3-70B-Instruct"
201+
assert config.provider == "huggingface"
202+
assert config.endpoint is None
203+
204+
def test_custom_provider(self):
205+
"""Test custom provider can be set."""
206+
from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
207+
208+
config = Gaia2JudgeEngineConfig(provider="openrouter")
209+
assert config.provider == "openrouter"
210+
assert config.model_name == "meta-llama/Meta-Llama-3.3-70B-Instruct"
211+
212+
def test_custom_model_and_provider(self):
213+
"""Test custom model and provider can be set together."""
214+
from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
215+
216+
config = Gaia2JudgeEngineConfig(
217+
model_name="openai/gpt-4o",
218+
provider="openrouter",
219+
endpoint="https://openrouter.ai/api/v1",
220+
)
221+
assert config.model_name == "openai/gpt-4o"
222+
assert config.provider == "openrouter"
223+
assert config.endpoint == "https://openrouter.ai/api/v1"
224+
225+
def test_importable_from_package(self):
226+
"""Test Gaia2JudgeEngineConfig is importable from the gaia2 package."""
227+
from maseval.benchmark.gaia2 import Gaia2JudgeEngineConfig
228+
229+
config = Gaia2JudgeEngineConfig()
230+
assert config is not None

0 commit comments

Comments
 (0)