[Move DISCO queue to core]:

arubique · arubique · commit c0f81b9f71c8 · 2026-03-09T12:03:37.000+01:00
- Update dependencies
diff --git a/docs/benchmark/mmlu.md b/docs/benchmark/mmlu.md
@@ -18,16 +18,22 @@ Check out the [BENCHMARKS.md](https://github.com/parameterlab/MASEval/blob/main/
 
 ## Installation
 
-MMLU has an optional dependency extra (currently empty, as core MMLU requires no additional packages):
+Install MMLU with all dependencies needed to run the HuggingFace benchmark and example script:
 
 ```bash
 pip install maseval[mmlu]
 ```
 
-For the HuggingFace implementation, also install transformers:
+Or with uv:
 
 ```bash
-pip install maseval[mmlu,transformers]
+uv sync --extra mmlu
+```
+
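+This installs the packages declared in the `mmlu` extra — `transformers`, `numpy`, `aiohttp`, and `lm-eval` — together with their transitive dependencies (for example `huggingface_hub` via `transformers`). `torch` is not declared by the extra directly; if it is not pulled in transitively in your environment, install it separately. You can then run the example: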
+
+```bash
+python examples/mmlu_benchmark/mmlu_benchmark.py --model_id alignment-handbook/zephyr-7b-sft-full
 ```
 
 For DISCO prediction support:
diff --git a/pyproject.toml b/pyproject.toml
@@ -82,10 +82,20 @@ multiagentbench = [
 ]
 tau2 = ["docstring-parser>=0.16", "addict>=2.4.0"]
 converse = []
-mmlu = []
+# HuggingFace model + tokenizer, default dataset download; numpy for example script and anchor-point loading;
+# lm-eval for --use_lmeval_batching (exact lm-evaluation-harness reproduction); aiohttp required by lm_eval.models.api_models
+mmlu = [
+    "transformers>=4.37.0",
+    "numpy>=1.20.0",
+    "aiohttp>=3.9.0",
+    "lm-eval @ git+https://github.com/arubique/lm-evaluation-harness.git@main",
+]
 
-# LM Evaluation Harness (for HuggingFaceMMLUBenchmark.precompute_all_logprobs_lmeval)
-lm-eval = ["lm-eval @ git+https://github.com/arubique/lm-evaluation-harness.git@main"]
+# LM Evaluation Harness (same lm-eval pin as in the mmlu extra; aiohttp is required by lm_eval.models.api_models)
+lm-eval = [
+    "aiohttp>=3.9.0",
+    "lm-eval @ git+https://github.com/arubique/lm-evaluation-harness.git@main",
+]
 
 # DISCO prediction (for MMLU benchmark example)
 disco = [