[Move DISCO queue to core]:

arubique · arubique · commit b498ce7c08da · 2026-03-11T17:28:24.000+01:00
- Rename HuggingFaceMMLUBenchmark to DefaultMMLUBenchmark for consistency with other benchmarks
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Benchmarks**
 
-- MMLU Benchmark with DISCO support: Integration for evaluating language models on MMLU (Massive Multitask Language Understanding) multiple-choice questions, compatible with DISCO anchor-point methodology. Includes `MMLUBenchmark`, `HuggingFaceMMLUBenchmark`, `MMLUEnvironment`, `MMLUEvaluator`, `MMLUModelAgent`, `MMLUAgentAdapter`, `load_tasks()`, and `compute_benchmark_metrics()`. Install with `pip install maseval[mmlu]`. Optional extras: `lm-eval` (for `HuggingFaceMMLUBenchmark.precompute_all_logprobs_lmeval`), `disco` (for DISCO prediction in the example). (PR: #34)
+- MMLU Benchmark with DISCO support: Integration for evaluating language models on MMLU (Massive Multitask Language Understanding) multiple-choice questions, compatible with DISCO anchor-point methodology. Includes `MMLUBenchmark`, `DefaultMMLUBenchmark`, `MMLUEnvironment`, `MMLUEvaluator`, `MMLUModelAgent`, `MMLUAgentAdapter`, `load_tasks()`, and `compute_benchmark_metrics()`. Install with `pip install maseval[mmlu]`. Optional extras: `lm-eval` (for `DefaultMMLUBenchmark.precompute_all_logprobs_lmeval`), `disco` (for DISCO prediction in the example). (PR: #34)
 
 - CONVERSE benchmark for contextual safety evaluation in adversarial agent-to-agent conversations, including `ConverseBenchmark`, `DefaultAgentConverseBenchmark`, `ConverseEnvironment`, `ConverseExternalAgent`, `PrivacyEvaluator`, `SecurityEvaluator`, and `load_tasks()` utilities for `travel`, `real_estate`, and `insurance` domains. Benchmark source files are now downloaded on first use via `ensure_data_exists()` instead of being bundled in the package. (PR: #28)
 
@@ -88,7 +88,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 **Benchmarks**
 
-- `MMLUBenchmark` no longer implements `setup_agents()` — consistent with other benchmarks, agent creation is left to concrete subclasses (e.g., `HuggingFaceMMLUBenchmark`). Removed silent `.get()` fallbacks for required fields (`gold`, `query`, `model_id`) so missing data surfaces errors immediately instead of failing silently. `DISCOQueue` moved from `maseval.benchmark.mmlu` to `maseval.core.task` and now extends `SequentialTaskQueue` instead of `AdaptiveTaskQueue`. Added `mmlu` optional extra (`pip install maseval[mmlu]`). (PR: #34)
+- `MMLUBenchmark` no longer implements `setup_agents()` — consistent with other benchmarks, agent creation is left to concrete subclasses (e.g., `DefaultMMLUBenchmark`). Removed silent `.get()` fallbacks for required fields (`gold`, `query`, `model_id`) so missing data surfaces errors immediately instead of failing silently. `DISCOQueue` moved from `maseval.benchmark.mmlu` to `maseval.core.task` and now extends `SequentialTaskQueue` instead of `AdaptiveTaskQueue`. Added `mmlu` optional extra (`pip install maseval[mmlu]`). (PR: #34)
 - `MACSBenchmark` and `Tau2Benchmark` benchmarks now actively use the seeding system by deriving seeds for model adapters. Seeds are passed to agents, user simulators, tool simulators, and LLM-based evaluators for reproducible runs. (PR: #26)
   - `Gaia2Benchmark`: Seeds `agents/gaia2_agent`, `evaluators/judge`
   - `MACSBenchmark`: Seeds `environment/tools/tool_{name}`, `simulators/user`, `evaluators/user_gsr`, `evaluators/system_gsr`
diff --git a/docs/benchmark/mmlu.md b/docs/benchmark/mmlu.md
@@ -52,7 +52,7 @@ pip install maseval[lm-eval]
 
 ```python
 from maseval.benchmark.mmlu import (
-    HuggingFaceMMLUBenchmark,
+    DefaultMMLUBenchmark,
     load_tasks,
     compute_benchmark_metrics,
 )
@@ -61,7 +61,7 @@ from maseval.benchmark.mmlu import (
 tasks = load_tasks(data_path="/path/to/mmlu_prompts_examples.json")
 
 # Create benchmark with HuggingFace model
-benchmark = HuggingFaceMMLUBenchmark(
+benchmark = DefaultMMLUBenchmark(
     model_id="meta-llama/Llama-2-7b-hf",
     device="cuda:0",
 )
@@ -118,7 +118,7 @@ class MyMMLUBenchmark(MMLUBenchmark):
 
 ::: maseval.benchmark.mmlu.MMLUBenchmark
 
-::: maseval.benchmark.mmlu.HuggingFaceMMLUBenchmark
+::: maseval.benchmark.mmlu.DefaultMMLUBenchmark
 
 ::: maseval.benchmark.mmlu.MMLUEnvironment
 
diff --git a/examples/mmlu_benchmark/mmlu_benchmark.py b/examples/mmlu_benchmark/mmlu_benchmark.py
@@ -52,7 +52,7 @@
 # MMLU benchmark imports
 from maseval.benchmark.mmlu import (
     DEFAULT_DEVICE,
-    HuggingFaceMMLUBenchmark,
+    DefaultMMLUBenchmark,
     load_tasks,
     compute_benchmark_metrics,
 )
@@ -691,7 +691,7 @@ def main():
     )
 
     # Create benchmark
-    benchmark = HuggingFaceMMLUBenchmark(
+    benchmark = DefaultMMLUBenchmark(
         model_id=args.model_id,
         device=args.device,
         trust_remote_code=True,
diff --git a/maseval/benchmark/mmlu/__init__.py b/maseval/benchmark/mmlu/__init__.py
@@ -4,7 +4,7 @@
 
 Usage:
     from maseval.benchmark.mmlu import (
-        HuggingFaceMMLUBenchmark,
+        DefaultMMLUBenchmark,
         load_tasks,
     )
     from maseval import DISCOQueue, InformativeSubsetQueue
@@ -16,7 +16,7 @@
     )
 
     # Run benchmark
-    benchmark = HuggingFaceMMLUBenchmark(model_id="meta-llama/Llama-2-7b-hf")
+    benchmark = DefaultMMLUBenchmark(model_id="meta-llama/Llama-2-7b-hf")
     results = benchmark.run(tasks=tasks, agent_data={"model_id": "meta-llama/Llama-2-7b-hf"})
 """
 
@@ -33,7 +33,7 @@
     TARGET_DELIMITER,
     TASK_TYPE_MMLU,
     MMLUBenchmark,
-    HuggingFaceMMLUBenchmark,
+    DefaultMMLUBenchmark,
     MMLUEnvironment,
     MMLUEvaluator,
     MMLUModelAgent,
@@ -53,7 +53,7 @@
     "TARGET_DELIMITER",
     "TASK_TYPE_MMLU",
     "MMLUBenchmark",
-    "HuggingFaceMMLUBenchmark",
+    "DefaultMMLUBenchmark",
     "MMLUEnvironment",
     "MMLUEvaluator",
     "MMLUModelAgent",
diff --git a/maseval/benchmark/mmlu/mmlu.py b/maseval/benchmark/mmlu/mmlu.py
@@ -8,7 +8,7 @@
 
 Usage:
     from maseval.benchmark.mmlu import (
-        HuggingFaceMMLUBenchmark, load_tasks,
+        DefaultMMLUBenchmark, load_tasks,
     )
     from maseval import DISCOQueue
 
@@ -19,7 +19,7 @@
     )
 
     # Run with the HuggingFace concrete implementation
-    benchmark = HuggingFaceMMLUBenchmark(model_id="meta-llama/Llama-2-7b-hf")
+    benchmark = DefaultMMLUBenchmark(model_id="meta-llama/Llama-2-7b-hf")
     results = benchmark.run(tasks=tasks, agent_data={"model_id": "meta-llama/Llama-2-7b-hf"})
 """
 
@@ -342,7 +342,7 @@ class MMLUBenchmark(Benchmark):
     - ``setup_agents()`` - create agents for MCQ evaluation
     - ``get_model_adapter()`` - provide model adapters
 
-    For a ready-to-use implementation, see ``HuggingFaceMMLUBenchmark``.
+    For a ready-to-use implementation, see ``DefaultMMLUBenchmark``.
     """
 
     def __init__(
@@ -431,7 +431,7 @@ def evaluate(
         return results
 
 
-class HuggingFaceMMLUBenchmark(MMLUBenchmark):
+class DefaultMMLUBenchmark(MMLUBenchmark):
     """MMLU Benchmark using HuggingFace transformers models.
 
     This concrete implementation uses log-likelihood based MCQ evaluation