Skip to content

Commit 2384db6

Browse files
committed
removed task collection
1 parent 126a54f commit 2384db6

26 files changed

Lines changed: 804 additions & 767 deletions

docs/getting-started/quickstart.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ Once implemented, run your benchmark:
117117

118118
```python
119119
# Define your tasks
120-
tasks = TaskCollection([Task(query="...", expected="..."), ...])
120+
tasks = TaskQueue([Task(query="..."), ...])
121121

122122
# Configure your agents (e.g., model parameters, tool settings)
123123
agent_config = {"model": "gpt-4", "temperature": 0.7}

docs/reference/task.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Task
22

3-
Tasks define individual benchmark scenarios including inputs, expected outputs, and any metadata needed for evaluation. TaskCollections group related tasks together.
3+
Tasks define individual benchmark scenarios including inputs, expected outputs, and any metadata needed for evaluation. TaskQueues group related tasks together.
44

55
[:material-github: View source](https://github.com/parameterlab/maseval/blob/main/maseval/core/task.py){ .md-source-file }
66

77
::: maseval.core.task.Task
88

9-
::: maseval.core.task.TaskCollection
9+
::: maseval.core.task.TaskQueue

examples/five_a_day_benchmark/five_a_day_benchmark.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@
124124
"from smolagents import ToolCallingAgent, LiteLLMModel, FinalAnswerTool\n",
125125
"\n",
126126
"# MASEval core components\n",
127-
"from maseval import Benchmark, Environment, Task, TaskCollection, AgentAdapter, Evaluator, ModelAdapter\n",
127+
"from maseval import Benchmark, Environment, Task, TaskQueue, AgentAdapter, Evaluator, ModelAdapter\n",
128128
"from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
129129
"\n",
130130
"# Import evaluators module (dynamically loaded later)\n",
@@ -139,7 +139,7 @@
139139
" limit: int | None = None,\n",
140140
" seed: int | None = None,\n",
141141
" task_indices: list[int] | None = None,\n",
142-
") -> tuple[TaskCollection, list[Dict[str, Any]]]:\n",
142+
") -> tuple[TaskQueue, list[Dict[str, Any]]]:\n",
143143
" \"\"\"Load tasks and agent configurations.\n",
144144
"\n",
145145
" Args:\n",
@@ -152,7 +152,7 @@
152152
" task_indices: Optional list of task indices to load (e.g., [0, 2, 4])\n",
153153
"\n",
154154
" Returns:\n",
155-
" Tuple of (TaskCollection, list of agent configs)\n",
155+
" Tuple of (TaskQueue, list of agent configs)\n",
156156
" \"\"\"\n",
157157
" data_dir = Path(\"examples/five_a_day_benchmark/data\")\n",
158158
"\n",
@@ -199,7 +199,7 @@
199199
"\n",
200200
" configs_data.append(config)\n",
201201
"\n",
202-
" return TaskCollection(tasks_data), configs_data"
202+
" return TaskQueue(tasks_data), configs_data"
203203
]
204204
},
205205
{

examples/five_a_day_benchmark/five_a_day_benchmark.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
from utils import derive_seed, sanitize_name # type: ignore[unresolved-import]
2828

29-
from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection, AgentAdapter, ModelAdapter
29+
from maseval import Benchmark, Environment, Evaluator, Task, TaskQueue, AgentAdapter, ModelAdapter
3030
from maseval.core.callbacks.result_logger import FileResultLogger
3131

3232
# Import tool implementations
@@ -825,7 +825,7 @@ def load_benchmark_data(
825825
limit: Optional[int] = None,
826826
specific_task: Optional[int] = None,
827827
seed: Optional[int] = None,
828-
) -> tuple[TaskCollection, List[Dict[str, Any]]]:
828+
) -> tuple[TaskQueue, List[Dict[str, Any]]]:
829829
"""Load tasks and agent configurations with validation.
830830
831831
Args:
@@ -838,7 +838,7 @@ def load_benchmark_data(
838838
seed: Base random seed for reproducibility (None for non-deterministic)
839839
840840
Returns:
841-
Tuple of (TaskCollection, agent_configs_list)
841+
Tuple of (TaskQueue, agent_configs_list)
842842
"""
843843
if limit is not None and specific_task is not None:
844844
raise ValueError("Cannot specify both limit and specific_task")
@@ -896,7 +896,7 @@ def load_benchmark_data(
896896

897897
print(f"Loaded {len(tasks_data)} tasks and {len(configs_data)} agent configs\n")
898898

899-
return TaskCollection(tasks_data), configs_data
899+
return TaskQueue(tasks_data), configs_data
900900

901901

902902
# ============================================================================

examples/introduction/tutorial.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@
330330
"metadata": {},
331331
"outputs": [],
332332
"source": [
333-
"from maseval import Benchmark, Environment, Evaluator, Task, TaskCollection\n",
333+
"from maseval import Benchmark, Environment, Evaluator, Task, TaskQueue\n",
334334
"from maseval.interface.agents.smolagents import SmolAgentAdapter\n",
335335
"\n",
336336
"print(\"MASEval components imported successfully!\")"
@@ -634,13 +634,13 @@
634634
"metadata": {},
635635
"outputs": [],
636636
"source": [
637-
"# Create benchmark instance with agent configuration\n",
638-
"agent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n",
639-
"\n",
640-
"benchmark = SimpleBenchmark(agent_data=agent_data, progress_bar=False)\n",
641-
"\n",
642-
"# Create task collection\n",
643-
"tasks = TaskCollection([task])\n",
637+
"# Create benchmark instance with agent configuration\n",
638+
"agent_data = {\"model_id\": \"gemini/gemini-2.5-flash\", \"temperature\": 0.7}\n",
639+
"\n",
640+
"benchmark = SimpleBenchmark(agent_data=agent_data, progress_bar=False)\n",
641+
"\n",
642+
"# Create task queue\n",
643+
"tasks = TaskQueue([task])\n",
644644
"\n",
645645
"# Run the benchmark\n",
646646
"print(\"Running benchmark...\\n\")\n",

maseval/__init__.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,17 @@
88
Benchmarks sit in the `maseval.benchmark` submodule.
99
"""
1010

11-
from .core.task import Task, TaskCollection, TaskProtocol, TimeoutAction
11+
from .core.task import (
12+
Task,
13+
TaskProtocol,
14+
TimeoutAction,
15+
# Task queue classes
16+
BaseTaskQueue,
17+
TaskQueue,
18+
SequentialTaskQueue,
19+
PriorityTaskQueue,
20+
AdaptiveTaskQueue,
21+
)
1222
from .core.environment import Environment
1323
from .core.agent import AgentAdapter
1424
from .core.benchmark import Benchmark, TaskExecutionStatus
@@ -29,7 +39,6 @@
2939
from .core.tracing import TraceableMixin
3040
from .core.registry import ComponentRegistry
3141
from .core.context import TaskContext
32-
from .core.queue import TaskQueue, SequentialQueue, PriorityQueue, AdaptiveQueue
3342
from .core.exceptions import (
3443
MASEvalError,
3544
AgentError,
@@ -45,7 +54,6 @@
4554
__all__ = [
4655
# Tasks
4756
"Task",
48-
"TaskCollection",
4957
"TaskProtocol",
5058
"TimeoutAction",
5159
# Core abstractions
@@ -79,10 +87,11 @@
7987
"ComponentRegistry",
8088
"TaskContext",
8189
# Task queues
90+
"BaseTaskQueue",
8291
"TaskQueue",
83-
"SequentialQueue",
84-
"PriorityQueue",
85-
"AdaptiveQueue",
92+
"SequentialTaskQueue",
93+
"PriorityTaskQueue",
94+
"AdaptiveTaskQueue",
8695
# Exceptions and validation
8796
"MASEvalError",
8897
"AgentError",

maseval/benchmark/macs/data_loader.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from urllib.error import HTTPError, URLError
1616
from urllib.request import urlopen
1717

18-
from maseval import Task, TaskCollection
18+
from maseval import Task, TaskQueue
1919

2020

2121
# =============================================================================
@@ -422,7 +422,7 @@ def load_tasks(
422422
domain: str,
423423
data_dir: Optional[Path] = None,
424424
limit: Optional[int] = None,
425-
) -> TaskCollection:
425+
) -> TaskQueue:
426426
"""Load tasks for a MACS domain.
427427
428428
Args:
@@ -432,7 +432,7 @@ def load_tasks(
432432
limit: Maximum number of tasks to load
433433
434434
Returns:
435-
TaskCollection containing Task objects
435+
TaskQueue containing Task objects
436436
437437
Raises:
438438
ValueError: If domain is not valid
@@ -465,7 +465,7 @@ def load_tasks(
465465
)
466466
)
467467

468-
return TaskCollection(tasks)
468+
return TaskQueue(tasks)
469469

470470

471471
def load_agent_config(
@@ -503,12 +503,12 @@ def load_agent_config(
503503

504504

505505
def configure_model_ids(
506-
tasks: Union[TaskCollection, List[Task]],
506+
tasks: Union[TaskQueue, List[Task]],
507507
*,
508508
tool_model_id: Optional[str] = None,
509509
user_model_id: Optional[str] = None,
510510
evaluator_model_id: Optional[str] = None,
511-
) -> Union[TaskCollection, List[Task]]:
511+
) -> Union[TaskQueue, List[Task]]:
512512
"""Configure model IDs for benchmark components in task data.
513513
514514
This helper merges runtime model configuration into task data structures,
@@ -519,13 +519,13 @@ def configure_model_ids(
519519
task-specific overrides in the original data to take precedence.
520520
521521
Args:
522-
tasks: TaskCollection or list of Tasks to configure
522+
tasks: TaskQueue or list of Tasks to configure
523523
tool_model_id: Model ID for tool simulators (stored in environment_data)
524524
user_model_id: Model ID for user simulator (stored in user_data)
525525
evaluator_model_id: Model ID for evaluators (stored in evaluation_data)
526526
527527
Returns:
528-
The same collection (mutated in place for convenience)
528+
The same queue or list (mutated in place for convenience)
529529
530530
Example:
531531
```python

0 commit comments

Comments
 (0)