fixed Gaia2 evaluator config

cemde · cemde · commit 27e1ecfc0205 · 2026-02-13T10:00:57.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `DefaultAgentGaia2Benchmark` with ReAct-style agent for direct comparison with ARE reference implementation (PR: #26)
   - Tool wrapper (`AREToolWrapper`) for MASEval tracing of ARE tools with simulation time tracking (PR: #26)
   - Data loading utilities: `load_tasks()`, `configure_model_ids()` for loading scenarios from HuggingFace (PR: #26)
+  - `Gaia2JudgeEngineConfig` for configuring the judge's LLM model and provider (e.g., switching from HuggingFace to OpenRouter) via `configure_model_ids(tasks, judge_engine_config=...)` (PR: #PR_NUMBER_PLACEHOLDER)
   - Metrics: `compute_gaia2_metrics()` for GSR (Goal Success Rate) computation by capability type (PR: #26)
   - Support for 5 capability dimensions: execution, search, adaptability, time, ambiguity (PR: #26)
   - Added `gaia2` optional dependency: `pip install maseval[gaia2]` (PR: #26)
diff --git a/maseval/benchmark/gaia2/__init__.py b/maseval/benchmark/gaia2/__init__.py
@@ -71,10 +71,11 @@ def get_model_adapter(self, model_id, **kwargs):
     wrap_are_tools,
 )
 
-# Data loading
+# Data loading and configuration
 from maseval.benchmark.gaia2.data_loader import (
     load_tasks,
     configure_model_ids,
+    Gaia2JudgeEngineConfig,
     VALID_CAPABILITIES,
     VALID_SPLITS,
     HF_DATASET_ID,
@@ -98,9 +99,10 @@ def get_model_adapter(self, model_id, **kwargs):
     # Tool wrapper
     "Gaia2GenericTool",
     "wrap_are_tools",
-    # Data loading
+    # Data loading and configuration
     "load_tasks",
     "configure_model_ids",
+    "Gaia2JudgeEngineConfig",
     "VALID_CAPABILITIES",
     "VALID_SPLITS",
     "HF_DATASET_ID",
diff --git a/maseval/benchmark/gaia2/data_loader.py b/maseval/benchmark/gaia2/data_loader.py
@@ -3,20 +3,72 @@
 This module provides functions to:
 1. Load Gaia2 scenarios from HuggingFace
 2. Convert scenarios to MASEval Task objects
-3. Configure model IDs for benchmark components
+3. Configure model IDs and judge engine for benchmark components
 
 Reference Paper: "GAIA-2: A Controllable Multi-Turn Conversational Benchmark for Agents"
 Data: https://huggingface.co/datasets/meta-agents-research-environments/gaia2
 
 No side effects on import. Data download/processing must be explicitly called.
 """
 
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 from maseval import Task, TaskQueue
 from maseval.core.task import TaskProtocol
 
 
+# =============================================================================
+# Judge Engine Configuration
+# =============================================================================
+
+
+@dataclass
+class Gaia2JudgeEngineConfig:
+    """Configuration for the ARE judge's LLM engine used in semantic comparison.
+
+    ARE's ``GraphPerEventJudge`` uses an LLM to semantically compare tool arguments
+    (e.g., email content, calendar event descriptions) between agent actions and oracle
+    (expected) actions. This config controls which model and provider the judge uses.
+
+    Defaults match ARE's built-in defaults.
+
+    ARE's ``LLMEngineConfig`` only supports ``model_name``, ``provider``, and
+    ``endpoint``. Provider-specific parameters (e.g., OpenRouter's ``fallbacks``
+    or ``route``) are not supported by ARE's engine pipeline.
+
+    Reference for the defaults: ARE ``validation/configs.py:28-29`` (model name and provider).
+
+    Attributes:
+        model_name: LLM model identifier for the judge engine.
+        provider: LLM provider name (e.g., ``"huggingface"``, ``"openrouter"``, ``"openai"``).
+            Passed to LiteLLM as ``custom_llm_provider``.
+        endpoint: Optional custom API endpoint URL.
+
+    Example::
+
+        from maseval.benchmark.gaia2 import (
+            load_tasks, configure_model_ids, Gaia2JudgeEngineConfig,
+        )
+
+        tasks = load_tasks(capability="execution", limit=5)
+
+        # Use OpenRouter instead of HuggingFace for judge LLM
+        configure_model_ids(
+            tasks,
+            judge_engine_config=Gaia2JudgeEngineConfig(
+                provider="openrouter",
+            ),
+        )
+    """
+
+    # ARE validation/configs.py:28
+    model_name: str = "meta-llama/Meta-Llama-3.3-70B-Instruct"
+    # ARE validation/configs.py:29
+    provider: str = "huggingface"
+    endpoint: Optional[str] = None
+
+
 # =============================================================================
 # Constants
 # =============================================================================
@@ -231,27 +283,36 @@ def configure_model_ids(
     tasks: Union[TaskQueue, List[Task]],
     *,
     evaluator_model_id: Optional[str] = None,
+    judge_engine_config: Optional[Gaia2JudgeEngineConfig] = None,
 ) -> Union[TaskQueue, List[Task]]:
-    """Configure model IDs for benchmark components in task data.
+    """Configure model IDs and judge engine for benchmark components.
 
-    Gaia2 uses ARE's deterministic judge by default, but can optionally
-    use an LLM-based judge for complex assertions.
+    Gaia2's ``GraphPerEventJudge`` uses an LLM for semantic comparison of tool
+    arguments (email content, calendar descriptions, etc.). By default it uses
+    ARE's built-in defaults (``meta-llama/Meta-Llama-3.3-70B-Instruct`` via
+    HuggingFace). Pass ``judge_engine_config`` to override the model/provider.
 
     Note: Unlike Tau2, Gaia2 doesn't have a user simulator (interactions
     happen through scheduled events), so there's no user_model_id.
 
     Args:
-        tasks: TaskQueue or list of Tasks to configure
-        evaluator_model_id: Optional model ID for LLM-based evaluation
+        tasks: TaskQueue or list of Tasks to configure.
+        evaluator_model_id: Optional model ID for LLM-based evaluation, stored in ``evaluation_data["model_id"]``.
+        judge_engine_config: Optional judge engine configuration. Controls
+            which LLM model and provider the ARE judge uses for semantic
+            comparison. When ``None``, ARE's defaults are used.
 
     Returns:
-        The same collection (mutated in place for convenience)
+        The same collection (mutated in place for convenience).
+
+    Example::
 
-    Example:
         >>> tasks = load_tasks(capability="execution", limit=5)
         >>> configure_model_ids(
         ...     tasks,
-        ...     evaluator_model_id="gpt-4o",  # Optional, for LLM-based judge
+        ...     judge_engine_config=Gaia2JudgeEngineConfig(
+        ...         provider="openrouter",
+        ...     ),
         ... )
     """
     for task in tasks:
@@ -264,6 +325,10 @@ def configure_model_ids(
                 )
             task.evaluation_data["model_id"] = evaluator_model_id
 
+        # Evaluation data: judge engine configuration (optional)
+        if judge_engine_config is not None:
+            task.evaluation_data["judge_engine_config"] = judge_engine_config
+
     return tasks
 
 
diff --git a/maseval/benchmark/gaia2/environment.py b/maseval/benchmark/gaia2/environment.py
@@ -33,6 +33,7 @@ def __init__(
         self,
         task_data: Dict[str, Any],
         callbacks: Optional[List[Any]] = None,
+        judge_engine_config: Optional[Any] = None,
     ):
         """Initialize Gaia2 environment.
 
@@ -42,8 +43,12 @@ def __init__(
                 - capability: Capability type (execution, search, etc.)
                 - universe_id: Universe identifier
             callbacks: Optional callbacks
+            judge_engine_config: Optional :class:`Gaia2JudgeEngineConfig` controlling
+                which LLM model and provider the ARE judge uses for semantic comparison.
+                Supplied by ``setup_environment()``, which reads it from the task's ``evaluation_data`` (it is not part of ``task_data``).
         """
         self._scenario = task_data.get("scenario")
+        self._judge_engine_config = judge_engine_config
         self._are_env: Any = None
         self._tool_wrappers: Dict[str, Gaia2GenericTool] = {}
 
@@ -103,9 +108,29 @@ def setup_state(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
         # This handles: SystemApp insertion, duration setting, scenario initialization,
         # oracle run, soft reset, judge creation, turn initialization with trigger
         # conditions, and judge state initialization.
-        # Passing GraphPerEventJudgeConfig() creates a deterministic judge (no LLM).
+        # GraphPerEventJudge uses an LLM for semantic comparison of tool arguments
+        # (email content, calendar descriptions, etc.) via soft checkers.
         # ARE scenarios/scenario_imported_from_json/utils.py:43-157
-        judge_config = GraphPerEventJudgeConfig()
+        if self._judge_engine_config is not None:
+            # User provided custom judge engine config — create engine explicitly
+            # ARE validation/configs.py:32-59
+            from are.simulation.agents.are_simulation_agent_config import (  # type: ignore[import-not-found]
+                LLMEngineConfig,
+            )
+            from are.simulation.validation.configs import create_judge_engine  # type: ignore[import-not-found]
+
+            llm_engine_config = LLMEngineConfig(
+                model_name=self._judge_engine_config.model_name,
+                provider=self._judge_engine_config.provider,
+                endpoint=self._judge_engine_config.endpoint,
+            )
+            engine = create_judge_engine(llm_engine_config)
+            judge_config = GraphPerEventJudgeConfig(engine=engine)
+        else:
+            # Default: use ARE's built-in defaults (Llama 3.3 70B via HuggingFace)
+            # ARE validation/configs.py:28-29, 149
+            judge_config = GraphPerEventJudgeConfig()
+
         preprocess_scenario(
             scenario=scenario,
             judge_config=judge_config,
diff --git a/maseval/benchmark/gaia2/evaluator.py b/maseval/benchmark/gaia2/evaluator.py
@@ -28,8 +28,9 @@
 class Gaia2Evaluator(Evaluator):
     """Evaluates Gaia2 scenarios using ARE's judge system.
 
-    Uses ARE's GraphPerEventJudge for deterministic evaluation based on
-    the event DAG. Supports optional LLM-based judge for complex assertions.
+    Uses ARE's ``GraphPerEventJudge`` which combines deterministic hard checks
+    (exact value matching) with LLM-based soft checks (semantic comparison of
+    content like email bodies and calendar descriptions).
 
     The evaluator compares completed events in the simulation against
     oracle (expected) events to compute Goal Success Rate (GSR).
@@ -143,8 +144,24 @@ def __call__(
                 # Fallback: create judge if not available on scenario
                 from are.simulation.validation import GraphPerEventJudgeConfig, JudgeFactory  # type: ignore[import-not-found]
 
-                judge_config = GraphPerEventJudgeConfig()
-                judge = JudgeFactory()(judge_config)
+                judge_engine_config = self.task.evaluation_data.get("judge_engine_config")
+                if judge_engine_config is not None:
+                    from are.simulation.agents.are_simulation_agent_config import (  # type: ignore[import-not-found]
+                        LLMEngineConfig,
+                    )
+                    from are.simulation.validation.configs import create_judge_engine  # type: ignore[import-not-found]
+
+                    llm_engine_config = LLMEngineConfig(
+                        model_name=judge_engine_config.model_name,
+                        provider=judge_engine_config.provider,
+                        endpoint=judge_engine_config.endpoint,
+                    )
+                    engine = create_judge_engine(llm_engine_config)
+                    judge_cfg = GraphPerEventJudgeConfig(engine=engine)
+                else:
+                    judge_cfg = GraphPerEventJudgeConfig()
+
+                judge = JudgeFactory()(judge_cfg)
                 judge.initialize_state(scenario)
 
             # Ensure intermediate turns are judged before final validation.
diff --git a/maseval/benchmark/gaia2/gaia2.py b/maseval/benchmark/gaia2/gaia2.py
@@ -140,7 +140,8 @@ def setup_environment(
         Returns:
             Gaia2Environment instance
         """
-        return Gaia2Environment(task_data=task.environment_data)
+        judge_engine_config = task.evaluation_data.get("judge_engine_config")
+        return Gaia2Environment(task_data=task.environment_data, judge_engine_config=judge_engine_config)
 
     def setup_user(  # type: ignore[override]
         self,
diff --git a/tests/test_benchmarks/test_gaia2/test_data_loader.py b/tests/test_benchmarks/test_gaia2/test_data_loader.py
@@ -147,3 +147,84 @@ def test_works_with_list(self):
 
         assert result is tasks
         assert all(t.evaluation_data.get("model_id") == "test-model" for t in tasks)
+
+    def test_sets_judge_engine_config(self, sample_gaia2_task_queue):
+        """Test configure_model_ids stores judge_engine_config in evaluation_data."""
+        from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig, configure_model_ids
+
+        config = Gaia2JudgeEngineConfig(provider="openrouter")
+        configure_model_ids(sample_gaia2_task_queue, judge_engine_config=config)
+
+        for task in sample_gaia2_task_queue:
+            assert task.evaluation_data.get("judge_engine_config") is config
+
+    def test_judge_engine_config_none_does_not_set(self, sample_gaia2_task_queue):
+        """Test configure_model_ids with None judge_engine_config does not modify evaluation_data."""
+        from maseval.benchmark.gaia2.data_loader import configure_model_ids
+
+        configure_model_ids(sample_gaia2_task_queue)
+
+        for task in sample_gaia2_task_queue:
+            assert "judge_engine_config" not in task.evaluation_data
+
+    def test_both_evaluator_and_judge_config(self, sample_gaia2_task_queue):
+        """Test configure_model_ids sets both evaluator model_id and judge_engine_config."""
+        from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig, configure_model_ids
+
+        config = Gaia2JudgeEngineConfig(model_name="gpt-4o", provider="openai")
+        configure_model_ids(
+            sample_gaia2_task_queue,
+            evaluator_model_id="gpt-4o",
+            judge_engine_config=config,
+        )
+
+        for task in sample_gaia2_task_queue:
+            assert task.evaluation_data.get("model_id") == "gpt-4o"
+            assert task.evaluation_data.get("judge_engine_config") is config
+
+
+# =============================================================================
+# Test Gaia2JudgeEngineConfig
+# =============================================================================
+
+
+@pytest.mark.benchmark
+class TestGaia2JudgeEngineConfig:
+    """Tests for Gaia2JudgeEngineConfig dataclass."""
+
+    def test_default_values_match_are(self):
+        """Test defaults match ARE's validation/configs.py:28-29."""
+        from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
+
+        config = Gaia2JudgeEngineConfig()
+        assert config.model_name == "meta-llama/Meta-Llama-3.3-70B-Instruct"
+        assert config.provider == "huggingface"
+        assert config.endpoint is None
+
+    def test_custom_provider(self):
+        """Test custom provider can be set."""
+        from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
+
+        config = Gaia2JudgeEngineConfig(provider="openrouter")
+        assert config.provider == "openrouter"
+        assert config.model_name == "meta-llama/Meta-Llama-3.3-70B-Instruct"
+
+    def test_custom_model_and_provider(self):
+        """Test custom model and provider can be set together."""
+        from maseval.benchmark.gaia2.data_loader import Gaia2JudgeEngineConfig
+
+        config = Gaia2JudgeEngineConfig(
+            model_name="openai/gpt-4o",
+            provider="openrouter",
+            endpoint="https://openrouter.ai/api/v1",
+        )
+        assert config.model_name == "openai/gpt-4o"
+        assert config.provider == "openrouter"
+        assert config.endpoint == "https://openrouter.ai/api/v1"
+
+    def test_importable_from_package(self):
+        """Test Gaia2JudgeEngineConfig is importable from the gaia2 package."""
+        from maseval.benchmark.gaia2 import Gaia2JudgeEngineConfig
+
+        config = Gaia2JudgeEngineConfig()
+        assert config is not None
diff --git a/tests/test_benchmarks/test_gaia2/test_environment.py b/tests/test_benchmarks/test_gaia2/test_environment.py
diff --git a/tests/test_benchmarks/test_gaia2/test_evaluator.py b/tests/test_benchmarks/test_gaia2/test_evaluator.py