[Move DISCO queue to core]:

arubique · arubique · commit 6ad80a8d7bde · 2026-03-09T12:36:44.000+01:00
- Add InformativeSubsetQueue
- Rename AnchorPointsTaskQueue to DISCOQueue
- Make DISCOQueue a subclass of InformativeSubsetQueue
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,7 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Core**
 
-- Added `AnchorPointsTaskQueue` to `maseval.core.task` for subset-based evaluation (e.g., anchor-point selection for DISCO). Available via `from maseval import AnchorPointsTaskQueue`. (PR: #34)
+- Added `InformativeSubsetQueue` and its subclass `DISCOQueue` to `maseval.core.task` for subset-based evaluation (e.g., anchor-point selection for DISCO). Available via `from maseval import InformativeSubsetQueue, DISCOQueue`. (PR: #34)
 - Added `SeedGenerator` abstract base class and `DefaultSeedGenerator` implementation for reproducible benchmark runs via SHA-256-based seed derivation (PR: #24)
 - Added `seed` and `seed_generator` parameters to `Benchmark.__init__` for enabling reproducibility (PR: #24)
 - Added `seed_generator` parameter to all benchmark setup methods (`setup_environment`, `setup_user`, `setup_agents`, `setup_evaluators`) (PR: #24)
@@ -88,7 +88,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Benchmarks**
 
-- `MMLUBenchmark` no longer implements `setup_agents()` — consistent with other benchmarks, agent creation is left to concrete subclasses (e.g., `HuggingFaceMMLUBenchmark`). Removed silent `.get()` fallbacks for required fields (`gold`, `query`, `model_id`) so missing data surfaces errors immediately instead of failing silently. `AnchorPointsTaskQueue` moved from `maseval.benchmark.mmlu` to `maseval.core.task` and now extends `SequentialTaskQueue` instead of `AdaptiveTaskQueue`. Added `mmlu` optional extra (`pip install maseval[mmlu]`). (PR: #34)
+- `MMLUBenchmark` no longer implements `setup_agents()` — consistent with other benchmarks, agent creation is left to concrete subclasses (e.g., `HuggingFaceMMLUBenchmark`). Removed silent `.get()` fallbacks for required fields (`gold`, `query`, `model_id`) so missing data surfaces errors immediately instead of failing silently. `DISCOQueue` moved from `maseval.benchmark.mmlu` to `maseval.core.task` and now extends `InformativeSubsetQueue` (a `SequentialTaskQueue` subclass) instead of `AdaptiveTaskQueue`. Added `mmlu` optional extra (`pip install maseval[mmlu]`). (PR: #34)
 - `MACSBenchmark` and `Tau2Benchmark` benchmarks now actively use the seeding system by deriving seeds for model adapters. Seeds are passed to agents, user simulators, tool simulators, and LLM-based evaluators for reproducible runs. (PR: #26)
   - `Gaia2Benchmark`: Seeds `agents/gaia2_agent`, `evaluators/judge`
   - `MACSBenchmark`: Seeds `environment/tools/tool_{name}`, `simulators/user`, `evaluators/user_gsr`, `evaluators/system_gsr`
diff --git a/docs/benchmark/mmlu.md b/docs/benchmark/mmlu.md
@@ -10,7 +10,7 @@ The **MMLU Benchmark** evaluates language models on multiple-choice questions sp
 [MMLU](https://arxiv.org/abs/2009.03300) (Hendrycks et al., 2021) is a widely used benchmark for measuring knowledge and reasoning across diverse domains. The MASEval implementation features:
 
 - **Log-likelihood MCQ evaluation** matching lm-evaluation-harness methodology
-- **Anchor-point task selection** via `AnchorPointsTaskQueue` for DISCO-style subset evaluation
+- **Anchor-point task selection** via `DISCOQueue` for DISCO-style subset evaluation
 - **HuggingFace integration** with batched log-probability computation
 - **lm-eval compatibility** mode for exact numerical reproduction
 
@@ -88,7 +88,7 @@ tasks = load_tasks(
     anchor_points_path="/path/to/anchor_points.json",
 )
 
-# tasks is an AnchorPointsTaskQueue — only anchor tasks are evaluated
+# tasks is a DISCOQueue — only anchor tasks are evaluated
 print(f"Evaluating {len(tasks)} anchor tasks")
 ```
 
diff --git a/maseval/__init__.py b/maseval/__init__.py
@@ -16,7 +16,8 @@
     BaseTaskQueue,
     TaskQueue,
     SequentialTaskQueue,
-    AnchorPointsTaskQueue,
+    InformativeSubsetQueue,
+    DISCOQueue,
     PriorityTaskQueue,
     AdaptiveTaskQueue,
 )
@@ -94,7 +95,8 @@
     "BaseTaskQueue",
     "TaskQueue",
     "SequentialTaskQueue",
-    "AnchorPointsTaskQueue",
+    "InformativeSubsetQueue",
+    "DISCOQueue",
     "PriorityTaskQueue",
     "AdaptiveTaskQueue",
     # Model adapters
diff --git a/maseval/benchmark/mmlu/__init__.py b/maseval/benchmark/mmlu/__init__.py
@@ -7,7 +7,7 @@
         HuggingFaceMMLUBenchmark,
         load_tasks,
     )
-    from maseval import AnchorPointsTaskQueue
+    from maseval import DISCOQueue, InformativeSubsetQueue
 
     # Load tasks and anchor points
     tasks = load_tasks(
@@ -20,7 +20,7 @@
     results = benchmark.run(tasks=tasks, agent_data={"model_id": "meta-llama/Llama-2-7b-hf"})
 """
 
-from maseval import AnchorPointsTaskQueue
+from maseval import DISCOQueue, InformativeSubsetQueue
 
 from .mmlu import (
     DEFAULT_AGENT_NAME,
@@ -58,7 +58,8 @@
     "MMLUEvaluator",
     "MMLUModelAgent",
     "MMLUAgentAdapter",
-    "AnchorPointsTaskQueue",
+    "InformativeSubsetQueue",
+    "DISCOQueue",
     "load_tasks",
     "compute_benchmark_metrics",
 ]
diff --git a/maseval/benchmark/mmlu/mmlu.py b/maseval/benchmark/mmlu/mmlu.py
@@ -10,7 +10,7 @@
     from maseval.benchmark.mmlu import (
         HuggingFaceMMLUBenchmark, load_tasks,
     )
-    from maseval import AnchorPointsTaskQueue
+    from maseval import DISCOQueue
 
     # Load tasks (optionally filtered to anchor points)
     tasks = load_tasks(
@@ -39,7 +39,7 @@
 
 from maseval import (
     AgentAdapter,
-    AnchorPointsTaskQueue,
+    DISCOQueue,
     Benchmark,
     Environment,
     Evaluator,
@@ -963,13 +963,13 @@ def load_tasks(
     data_path: Union[str, Path],
     anchor_points_path: Optional[Union[str, Path]] = None,
     limit: Optional[int] = None,
-) -> Union[AnchorPointsTaskQueue, SequentialTaskQueue]:
+) -> Union[DISCOQueue, SequentialTaskQueue]:
     """Load MMLU tasks from JSON file.
 
     Args:
         data_path: Path to MMLU prompts JSON file (mmlu_prompts_examples.json format).
         anchor_points_path: Optional path to anchor points pickle file.
-            If provided, returns an AnchorPointsTaskQueue that evaluates
+            If provided, returns a DISCOQueue that evaluates
             only the anchor tasks in order.
         limit: Optional limit on number of tasks to load.
 
@@ -1024,7 +1024,7 @@ def load_tasks(
 
     # Create appropriate queue
     if anchor_points is not None:
-        return AnchorPointsTaskQueue(tasks, anchor_points)
+        return DISCOQueue(tasks, anchor_points)
     else:
         return SequentialTaskQueue(tasks)
 
diff --git a/maseval/core/task.py b/maseval/core/task.py
@@ -273,51 +273,92 @@ def __iter__(self) -> Iterator[Task]:
         return iter(self._tasks)
 
 
-class AnchorPointsTaskQueue(SequentialTaskQueue):
-    """Task queue that evaluates a specified subset of tasks in a given order.
+class InformativeSubsetQueue(SequentialTaskQueue):
+    """Evaluates an informative subset of tasks in a specified order.
 
-    Used for anchor-point-based evaluation where performance on a full dataset
-    is predicted from results on a carefully selected subset. Anchor points are
-    integer indices into the original task list. Only tasks at those indices are
-    yielded, in the order specified by ``anchor_points``.
+    Used for efficient evaluation where a carefully selected subset of tasks
+    can predict performance on the full dataset. The subset is defined by
+    ``indices`` — integer positions into the original task list. Only tasks
+    at those positions are yielded, in the order given by ``indices``.
 
-    When ``anchor_points`` is ``None``, all tasks are yielded in their original order
-    (equivalent to ``SequentialTaskQueue``).
+    The informativeness criterion (how the indices were chosen) is determined
+    by the caller or by a subclass. This base class is criterion-agnostic.
+
+    When ``indices`` is ``None``, all tasks are yielded in their original
+    order (equivalent to ``SequentialTaskQueue``).
 
     Attributes:
         _all_tasks: The complete, unfiltered task list.
-        _anchor_points: The anchor-point indices, or ``None``.
+        _indices: The subset indices, or ``None``.
 
     Example:
         ```python
         # Evaluate only tasks at indices 0, 5, 12
-        queue = AnchorPointsTaskQueue(tasks, anchor_points=[0, 5, 12])
+        queue = InformativeSubsetQueue(tasks, indices=[0, 5, 12])
 
         for task in queue:
             result = execute(task)  # Only 3 tasks
         ```
     """
 
-    def __init__(self, tasks: Iterable[Task], anchor_points: Optional[List[int]] = None) -> None:
-        """Initialize anchor-points task queue.
+    def __init__(self, tasks: Iterable[Task], indices: Optional[List[int]] = None) -> None:
+        """Initialize informative-subset task queue.
 
         Args:
             tasks: Full list of tasks (ordered by index).
-            anchor_points: Indices into ``tasks`` selecting which tasks to evaluate
+            indices: Positions into ``tasks`` selecting which tasks to evaluate
                 and in what order. If ``None``, evaluates all tasks in order.
         """
         all_tasks = list(tasks)
         self._all_tasks: List[Task] = all_tasks
-        self._anchor_points: Optional[List[int]] = anchor_points
+        self._indices: Optional[List[int]] = indices
 
-        if anchor_points is not None:
+        if indices is not None:
             task_by_index: Dict[int, Task] = {i: task for i, task in enumerate(all_tasks)}
-            filtered = [task_by_index[idx] for idx in anchor_points if idx in task_by_index]
+            filtered = [task_by_index[idx] for idx in indices if idx in task_by_index]
             super().__init__(filtered)
         else:
             super().__init__(all_tasks)
 
 
+class DISCOQueue(InformativeSubsetQueue):
+    """Diversity-based informative subset using DISCO anchor points.
+
+    Selects a diverse subset of tasks (anchor points) for evaluation. Full
+    benchmark performance is then predicted from results on this subset using
+    DISCO (DISCOvering key features for accurate prediction of LLM abilities
+    on benchmarks).
+
+    The informativeness criterion is **diversity**: anchor points are chosen
+    to maximise disagreement across models, so that a small evaluation set
+    captures the discriminative structure of the full benchmark.
+
+    Reference: `DISCO: DISCOvering key features for accurate prediction of
+    LLM abilities on benchmarks <https://arxiv.org/abs/2407.12890>`_
+
+    Example:
+        ```python
+        queue = DISCOQueue(tasks, anchor_points=[0, 5, 12])
+
+        for task in queue:
+            result = execute(task)  # Only 3 tasks
+        ```
+    """
+
+    def __init__(self, tasks: Iterable[Task], anchor_points: Optional[List[int]] = None) -> None:
+        """Initialize DISCO task queue.
+
+        Args:
+            tasks: Full list of tasks (ordered by index).
+            anchor_points: Diversity-selected indices into ``tasks``.
+                Typically loaded from a DISCO anchor-points file or
+                downloaded from a HuggingFace DISCO model repo.
+                If ``None``, evaluates all tasks in order.
+        """
+        self._anchor_points: Optional[List[int]] = anchor_points
+        super().__init__(tasks, indices=anchor_points)
+
+
 class PriorityTaskQueue(BaseTaskQueue):
     """Execute tasks ordered by priority.