1111No side effects on import. Data download/processing must be explicitly called.
1212"""
1313
14- from pathlib import Path
1514from typing import Any , Dict , List , Optional , Tuple , Union
1615
1716from maseval import Task , TaskQueue
# Constants
# =============================================================================

# HuggingFace config names that exist on the dataset.
# Each config corresponds to a capability and contains ~160 scenarios.
# ARE simulation/types.py: CapabilityTag enum values
VALID_CAPABILITIES: Tuple[str, ...] = (
    "execution",
    "search",
    "adaptability",
    "time",
    "ambiguity",
)

VALID_SPLITS: Tuple[str, ...] = ("validation",)  # Only validation has oracle events

# ARE scenarios/config.py:20: DEFAULT_SCENARIO_TIMEOUT = 1860
DEFAULT_TIMEOUT_SECONDS = 1860.0  # 31 minutes per task (matching ARE)
DEFAULT_MAX_RETRIES = 1

# HuggingFace dataset info
HF_DATASET_ID = "meta-agents-research-environments/gaia2"
# Revision handed to ``load_dataset(revision=...)`` so that every load reads
# the same snapshot of the dataset rather than whatever is latest.
HF_DATASET_REVISION = "78ea3bdbdeec2bdcd6afa5420915d8a22f23ed99"
4644
4745
4846# =============================================================================
@@ -59,24 +57,24 @@ def load_tasks(
5957) -> TaskQueue :
6058 """Load Gaia2 tasks from HuggingFace.
6159
60+ Each HuggingFace config corresponds to a capability (execution, search,
61+ adaptability, time, ambiguity). When ``capability`` is None, all
62+ capabilities are loaded and combined.
63+
64+ GAIA2 is event-driven: the task query is delivered to agents via the
65+ notification system at runtime (first ``send_message_to_agent`` event),
66+ not as a static field. ``task.query`` is left empty.
67+
6268 Args:
63- capability: Filter by capability type (execution, search, adaptability,
64- time, ambiguity, agent2agent, noise). None loads all.
69+ capability: Filter by capability type. None loads all capabilities.
6570 split: Dataset split (currently only "validation" available)
66- limit: Maximum number of tasks to load
71+ limit: Maximum number of tasks to load (across all capabilities)
6772 timeout_seconds: Maximum execution time per task. Default 1860 (31 minutes,
6873 matching ARE's DEFAULT_SCENARIO_TIMEOUT). Set to None to disable timeout.
6974 max_retries: Maximum retry attempts. Default 1 (skip on failure).
7075
7176 Returns:
72- TaskQueue with Task objects containing:
73- - id: Unique scenario identifier
74- - query: Initial task instructions
75- - environment_data: {"scenario": BenchmarkScenario, "capability": str}
76- - evaluation_data: {"oracle_events": [...], "judge_type": str}
77- - user_data: {} # Gaia2 uses event-based simulation, not user turns
78- - metadata: {"capability": str, "universe_id": str, ...}
79- - protocol: TaskProtocol with timeout and tags
77+ TaskQueue with Task objects.
8078
8179 Raises:
8280 ValueError: If capability or split is invalid
@@ -111,136 +109,117 @@ def load_tasks(
111109 "Or: uv add --optional gaia2 meta-agents-research-environments"
112110 ) from e
113111
114- # Determine HuggingFace config name
115- config_name = capability if capability else DEFAULT_CONFIG
116-
117- # Load dataset from HuggingFace
118- # Passing `split` guarantees the return type is Dataset (not DatasetDict)
119- dataset = load_dataset (
120- HF_DATASET_ID ,
121- name = config_name ,
122- split = split ,
123- )
124- assert isinstance (dataset , Dataset )
125-
126- # Apply limit
127- if limit :
128- dataset = dataset .select (range (min (limit , len (dataset ))))
112+ # When no capability specified, load all capabilities and combine
113+ capabilities = [capability ] if capability else list (VALID_CAPABILITIES )
129114
130- # Convert to MASEval Task objects
131115 importer = JsonScenarioImporter ()
132- tasks = []
133-
134- for row in dataset :
135- # Parse scenario from JSON
136- scenario , oracle_events , _ = importer .import_from_json_to_benchmark (json_str = row ["data" ])
137-
138- task = _convert_gaia2_to_maseval (
139- row = row ,
140- scenario = scenario ,
141- oracle_events = oracle_events ,
142- timeout_seconds = timeout_seconds ,
143- max_retries = max_retries ,
144- config_capability = config_name ,
116+ tasks : List [Task ] = []
117+
118+ for cap in capabilities :
119+ # Each capability is a HuggingFace config name
120+ # Passing `split` guarantees the return type is Dataset (not DatasetDict)
121+ dataset = load_dataset (
122+ HF_DATASET_ID ,
123+ name = cap ,
124+ split = split ,
125+ revision = HF_DATASET_REVISION ,
145126 )
146- tasks .append (task )
127+ assert isinstance (dataset , Dataset )
128+
129+ for row in dataset :
130+ # Parse scenario from JSON
131+ # import_from_json_to_benchmark returns (scenario, completed_events, _)
132+ # completed_events are from previous runs, not oracle events.
133+ # Oracle events are generated at runtime by preprocess_scenario().
134+ scenario , _ , _ = importer .import_from_json_to_benchmark (json_str = row ["data" ])
135+
136+ task = _convert_gaia2_to_maseval (
137+ row = row ,
138+ scenario = scenario ,
139+ timeout_seconds = timeout_seconds ,
140+ max_retries = max_retries ,
141+ config_capability = cap ,
142+ )
143+ tasks .append (task )
144+
145+ if limit and len (tasks ) >= limit :
146+ break
147+
148+ if limit and len (tasks ) >= limit :
149+ break
147150
148151 return TaskQueue (tasks )
149152
150153
151- def _get_scenario_metadata (scenario : Any , key : str , default : Any = None ) -> Any :
152- """Safely get metadata from an ARE scenario object.
153-
154- Args:
155- scenario: ARE BenchmarkScenario object
156- key: Metadata key to retrieve
157- default: Default value if key not found
158-
159- Returns:
160- The metadata value or default
161- """
162- metadata = getattr (scenario , "metadata" , None )
163- if metadata is None :
164- return default
165- if isinstance (metadata , dict ):
166- return metadata .get (key , default )
167- # Try attribute access as fallback
168- return getattr (metadata , key , default )
169-
170-
def _convert_gaia2_to_maseval(
    row: Dict[str, Any],
    scenario: Any,
    timeout_seconds: Optional[float],
    max_retries: int,
    config_capability: str,
) -> Task:
    """Convert Gaia2 scenario to MASEval Task.

    GAIA2 is event-driven: the task query is delivered via the notification
    system at runtime (first ``send_message_to_agent`` event). There is no
    static query field on ARE scenario objects, so ``Task.query`` is set to
    an empty string.
    ARE agents/default_agent/are_simulation_main.py:79-102

    Oracle events are generated at runtime by ``preprocess_scenario()`` during
    environment setup, not at data-load time.

    Args:
        row: Raw row from HuggingFace dataset. Must provide a truthy ``id`` or
            ``scenario_id`` value, which becomes the task id.
        scenario: ARE BenchmarkScenario object
        timeout_seconds: Maximum execution time per task
        max_retries: Maximum retry attempts
        config_capability: The capability from the HuggingFace config name

    Returns:
        MASEval Task object

    Raises:
        ValueError: If the row has neither an ``id`` nor a ``scenario_id``.
    """
    # Validate first: a row without an identifier cannot become a task, so
    # fail before building any of the payload dictionaries or the protocol.
    task_id = row.get("id") or row.get("scenario_id")
    if not task_id:
        raise ValueError("HuggingFace row missing both 'id' and 'scenario_id' fields")

    # The scenario object's own id (when present) takes precedence over the
    # row fields for the metadata entry.
    scenario_id = getattr(scenario, "scenario_id", None)

    # Build environment_data
    # Duration is NOT set here — ARE's preprocess_scenario() sets it during
    # environment setup based on capability (1800s standard, 420s for Time).
    # ARE scenarios/config.py:18-19, scenarios/scenario_imported_from_json/utils.py:69-76
    environment_data: Dict[str, Any] = {
        "scenario": scenario,
        "capability": config_capability,
    }

    # Evaluation uses scenario.judge.validate() at runtime (created by
    # preprocess_scenario). No static evaluation data needed at load time.
    # ARE scenarios/scenario_imported_from_json/utils.py:110-112
    evaluation_data: Dict[str, Any] = {
        "judge_type": "graph_per_event",
    }

    metadata: Dict[str, Any] = {
        "scenario_id": scenario_id or row.get("scenario_id") or row.get("id"),
        "capability": config_capability,
    }

    protocol = TaskProtocol(
        timeout_seconds=timeout_seconds,
        max_retries=max_retries,
        tags={"capability": config_capability, "benchmark": "gaia2"},
    )

    return Task(
        id=str(task_id),
        query="",  # Event-driven: real query comes from notification system at runtime
        environment_data=environment_data,
        evaluation_data=evaluation_data,
        user_data={},  # Gaia2 uses event-based simulation, not user turns
        metadata=metadata,
        protocol=protocol,
    )
244223
245224
246225# =============================================================================
@@ -298,6 +277,6 @@ def configure_model_ids(
298277 tasks = load_tasks (limit = 5 )
299278 print (f"Loaded { len (tasks )} tasks" )
300279 for task in tasks :
301- print (f" - { task .id } : { task .query [: 50 ] } ... " )
280+ print (f" - { task .id } (capability= { task .metadata . get ( 'capability' ) } ) " )
302281 except ImportError as e :
303282 print (f"Error: { e } " )
0 commit comments