[Create PR]:

arubique · arubique · commit 6c3cb22f32c9 · 2026-02-16T21:24:33.000+01:00
- Add default value to --data_path
diff --git a/examples/mmlu_benchmark/README.md b/examples/mmlu_benchmark/README.md
@@ -43,7 +43,13 @@ The benchmark expects a JSON file in the `mmlu_prompts_examples.json` format:
 
 ### Basic Evaluation
 
-Evaluate a model on all MMLU tasks:
+Evaluate a model on all MMLU tasks (uses `arubique/flattened-MMLU` by default):
+
+```bash
+python mmlu_benchmark.py --model_id "meta-llama/Llama-2-7b-hf"
+```
+
+To use a local JSON file or another Hugging Face dataset, pass `--data_path`:
 
 ```bash
 python mmlu_benchmark.py \
@@ -137,7 +143,7 @@ python mmlu_benchmark.py \
 | Argument | Description | Default |
 |----------|-------------|---------|
 | `--model_id` | HuggingFace model identifier (required) | - |
-| `--data_path` | Path to MMLU prompts JSON file (required) | - |
+| `--data_path` | Path to MMLU prompts JSON file or Hugging Face dataset repo id | `arubique/flattened-MMLU` |
 | `--anchor_points_path` | Path to anchor points pickle file | None |
 | `--output_dir` | Directory to save results | `./results` |
 | `--predictions_path` | Path to save predictions pickle (for DISCO) | None |
diff --git a/examples/mmlu_benchmark/mmlu_benchmark.py b/examples/mmlu_benchmark/mmlu_benchmark.py
@@ -14,29 +14,27 @@
         --use_full_prompt
 
 Usage:
-    # Run with default settings (evaluates on all tasks)
-    python mmlu_benchmark.py --model_id "meta-llama/Llama-2-7b-hf" --data_path /path/to/mmlu_prompts_examples.json
+    # Run with default settings (evaluates on all tasks using the arubique/flattened-MMLU dataset)
+    python mmlu_benchmark.py --model_id "meta-llama/Llama-2-7b-hf"
 
     # Run with anchor points filtering (for DISCO prediction)
     python mmlu_benchmark.py \\
         --model_id "alignment-handbook/zephyr-7b-sft-full" \\
-        --data_path /path/to/mmlu_prompts_examples.json \\
         --anchor_points_path /path/to/anchor_points_disagreement.pkl
 
     # Run with DISCO prediction (passing --disco_model_path enables it)
     python mmlu_benchmark.py \\
         --model_id "alignment-handbook/zephyr-7b-sft-full" \\
-        --data_path /path/to/mmlu_prompts_examples.json \\
         --anchor_points_path /path/to/anchor_points_disagreement.pkl \\
         --disco_model_path /path/to/fitted_weights.pkl \\
         --disco_transform_path /path/to/transform.pkl \\
         --pca 256
 
     # Run on a subset of tasks for testing
-    python mmlu_benchmark.py \\
-        --model_id "meta-llama/Llama-2-7b-hf" \\
-        --data_path /path/to/mmlu_prompts_examples.json \\
-        --limit 10
+    python mmlu_benchmark.py --model_id "meta-llama/Llama-2-7b-hf" --limit 10
+
+    # Override the data source (path to a local JSON file or a Hugging Face dataset repo id)
+    python mmlu_benchmark.py --model_id "meta-llama/Llama-2-7b-hf" --data_path /path/to/mmlu_prompts_examples.json
 """
 
 import argparse
@@ -90,7 +88,7 @@ def parse_args():
     parser.add_argument(
         "--data_path",
         type=str,
-        required=True,
+        default="arubique/flattened-MMLU",
         help="Path to MMLU prompts JSON file, or Hugging Face dataset repo id (e.g. username/mmlu-prompts-examples)",
     )