|
1 | | -"""Translate legacy --flag_name arguments into `olmo-eval beaker launch` |
2 | | -flags and delegate to scripts/submit_eval_jobs.sh. |
| 1 | +"""Submit evaluation jobs using allenai/olmo-eval-internal. |
3 | 2 |
|
4 | | -For new code, prefer calling submit_eval_jobs.sh directly with olmo-eval flags. |
5 | | -This wrapper exists to keep existing call sites working. |
| 3 | +Submits a Beaker v2 experiment that runs `olmo-eval run` against a model. The |
| 4 | +Beaker image ships with CUDA and PyTorch; olmo-eval-internal, vllm, and |
| 5 | +transformers are installed at job start via INSTALL_SCRIPT to allow testing the |
| 6 | +latest code. When `--location` is a Beaker dataset, the model is mounted at |
| 7 | +`/model`. |
| 8 | +
|
| 9 | +Example: |
| 10 | + uv run python scripts/submit_eval_jobs.py \\ |
| 11 | + --model_name qwen3_4b_base_dapo_20260422_083224 \\ |
| 12 | + --location 01KPTSPMHGEZVYCDNR0XBVJCGZ \\ |
| 13 | + --tasks aime_2025:pass_at_32 \\ |
| 14 | + --max_length 8192 \\ |
| 15 | + --cluster ai2/jupiter-cirrascale-2 ai2/saturn-cirrascale \\ |
| 16 | + --priority urgent \\ |
| 17 | + --preemptible \\ |
| 18 | + --workspace ai2/open-instruct-dev |
6 | 19 | """ |
7 | 20 |
|
8 | 21 | import argparse |
9 | | -import os |
| 22 | +import re |
| 23 | +import shlex |
10 | 24 | import subprocess |
11 | | -import sys |
12 | | -from pathlib import Path |
| 25 | +from datetime import date |
| 26 | + |
| 27 | +import yaml |
| 28 | + |
| 29 | +from open_instruct import launch_utils |
| 30 | + |
| 31 | + |
# A bare Beaker dataset id: exactly 26 uppercase alphanumerics.
BEAKER_ID_RE = re.compile(r"^[0-9A-Z]{26}$")
DEFAULT_CLUSTERS = ("ai2/jupiter",)
MAX_EXPERIMENT_NAME_LEN = 128
# Matches runs of characters that are not letters, digits, "_", "." or "-".
EXPERIMENT_NAME_SAFE_RE = re.compile(r"[^A-Za-z0-9_.-]+")

DEFAULT_OLMO_EVAL_REF = "main"
GIT_REF_SAFE_RE = re.compile(r"^[A-Za-z0-9._/-]+$")


def build_install_script(ref: str) -> str:
    """Return the shell snippet that installs olmo-eval-internal at ``ref``.

    The snippet clones allenai/olmo-eval-internal into /opt/olmo-eval-internal
    (authenticating with ``${GITHUB_TOKEN}``, which is left for the shell to
    expand), checks out ``ref``, installs the package in editable mode with the
    ``vllm`` extra, upgrades vllm and transformers, and finally changes
    directory to /workspace.

    Args:
        ref: Git branch, tag, or sha to check out.

    Returns:
        A single ``&&``-chained shell command string.

    Raises:
        ValueError: If ``ref`` contains characters outside ``[A-Za-z0-9._/-]``
            or starts with ``-``.
    """
    # fullmatch (not match) so a trailing newline cannot slip past the `$`
    # anchor. A leading "-" is rejected separately: shlex.quote does not stop
    # git from parsing such a ref as a command-line option.
    if ref.startswith("-") or not GIT_REF_SAFE_RE.fullmatch(ref):
        raise ValueError(
            f"Invalid git ref {ref!r}; expected characters [A-Za-z0-9._/-] and no leading '-'."
        )
    return (
        "set -euo pipefail && "
        "git clone "
        "https://x-access-token:${GITHUB_TOKEN}@github.com/allenai/olmo-eval-internal.git "
        "/opt/olmo-eval-internal && "
        f"cd /opt/olmo-eval-internal && git checkout {shlex.quote(ref)} && "
        "uv pip install --cache-dir /weka/oe-eval-default/olmo-eval-pypi-cache -e '.[vllm]' && "
        "uv pip install --cache-dir /weka/oe-eval-default/olmo-eval-pypi-cache "
        "--upgrade 'vllm[runai]>=0.19.0' 'transformers>=5.4.0' && "
        "cd /workspace"
    )
13 | 55 |
|
14 | 56 |
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line flags for submitting an olmo-eval Beaker job.

    Returns:
        The parsed namespace. ``--model_name`` and ``--location`` are
        required; every other flag has a default.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--model_name", type=str, required=True, help="Human-readable run name.")
    parser.add_argument(
        "--location",
        type=str,
        required=True,
        help=(
            "Model location. Accepts: a bare Beaker dataset id (26 uppercase alphanumerics), "
            "'beaker://<id>', an HF repo id (e.g. allenai/OLMo-2-1124-7B-Instruct), "
            "an absolute Weka/NFS path, or a gs:// URL."
        ),
    )
    parser.add_argument(
        "--tasks",
        type=str,
        default="aime_2025:pass_at_32",
        help="Comma-separated olmo-eval task specs. See `olmo-eval tasks`/`olmo-eval suites`.",
    )
    parser.add_argument("--num_gpus", type=int, default=1)
    # Copy the tuple into a fresh list so the module-level default is never shared.
    parser.add_argument("--cluster", nargs="+", default=list(DEFAULT_CLUSTERS))
    parser.add_argument("--priority", type=str, default="normal")
    parser.add_argument("--preemptible", action="store_true")
    parser.add_argument("--workspace", type=str, default="ai2/tulu-3-results")
    parser.add_argument("--budget", type=str, default="ai2/oe-adapt")
    parser.add_argument(
        "--beaker_image",
        type=str,
        default="ai2-tylerm/olmo-eval-cu1281-trc290-amd64",
        help="Beaker image with olmo-eval installed.",
    )
    parser.add_argument("--revision", type=str, default=None, help="HF revision (git sha/tag).")
    parser.add_argument(
        "--max_length",
        type=int,
        default=32768,
        help="Provider max_model_len. Sampling max_tokens comes from the task definition.",
    )
    parser.add_argument(
        "--sampling_max_tokens",
        type=int,
        default=None,
        help="Override per-task sampling max_tokens (applied via -o max_tokens=N after each -t).",
    )
    parser.add_argument("--experiment_name", type=str, default=None)
    parser.add_argument(
        "--olmo_eval_ref",
        type=str,
        default=DEFAULT_OLMO_EVAL_REF,
        help="Git ref (branch/tag/sha) of allenai/olmo-eval-internal to install at job start.",
    )
    # The dry-run path prints the inner command and the spec only; no beaker
    # command is ever built or printed in that mode.
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Print the inner command and the spec, but do not write or submit.",
    )
    return parser.parse_args()
39 | 112 |
|
40 | 113 |
|
41 | | -def build_launch_args(args: argparse.Namespace) -> list[str]: |
42 | | - out: list[str] = [] |
43 | | - name = args.experiment_name or args.model_name |
44 | | - if name: |
45 | | - out += ["-n", name] |
46 | | - out += ["-m", args.location] |
| 114 | +def resolve_model_mount(location: str) -> tuple[str, str | None]: |
| 115 | + """Resolve --location into (model_path_in_container, beaker_dataset_id_or_None).""" |
| 116 | + if location.startswith("beaker://"): |
| 117 | + return "/model", location[len("beaker://") :] |
| 118 | + if BEAKER_ID_RE.match(location): |
| 119 | + return "/model", location |
| 120 | + return location, None |
| 121 | + |
| 122 | + |
def build_inner_cmd(args: argparse.Namespace, model_path: str) -> list[str]:
    """Build the ``olmo-eval run`` argv that executes inside the job.

    Args:
        args: Parsed CLI namespace; reads ``tasks``, ``max_length``,
            ``revision``, ``sampling_max_tokens`` and ``num_gpus``.
        model_path: Model path or identifier passed to ``-m``.

    Returns:
        The command as a list of arguments (not shell-quoted).

    Raises:
        ValueError: If ``args.tasks`` contains no non-empty task spec.
    """
    tasks = [spec for spec in (piece.strip() for piece in args.tasks.split(",")) if spec]
    # Fail before submission instead of launching a job with nothing to evaluate.
    if not tasks:
        raise ValueError(f"No tasks given in --tasks {args.tasks!r}; expected comma-separated task specs.")

    cmd = [
        "olmo-eval",
        "run",
        "-m",
        model_path,
        "--harness",
        "default",
        "-o",
        "provider.kind=vllm_server",
        "-o",
        f"provider.max_model_len={args.max_length}",
        "-o",
        "provider.trust_remote_code=true",
    ]
    if args.revision:
        cmd += ["-o", f"provider.revision={args.revision}"]
    for task in tasks:
        cmd += ["-t", task]
        # The override is emitted directly after each -t.
        if args.sampling_max_tokens is not None:
            cmd += ["-o", f"max_tokens={args.sampling_max_tokens}"]
    cmd += ["--num-gpus", str(args.num_gpus)]
    cmd += ["--output-dir", "/results"]
    return cmd
| 150 | + |
| 151 | + |
def build_spec(args: argparse.Namespace, inner_cmd: list[str], dataset_id: str | None, experiment_name: str) -> dict:
    """Assemble the Beaker v2 experiment spec as a plain dict.

    Args:
        args: Parsed CLI namespace; reads ``cluster``, ``olmo_eval_ref``,
            ``budget``, ``beaker_image``, ``num_gpus``, ``priority`` and
            ``preemptible``.
        inner_cmd: The evaluation argv, run after the install script.
        dataset_id: Beaker dataset to mount at ``/model``, or a falsy value
            for no model mount.
        experiment_name: Used as both the experiment description and the
            task name.

    Returns:
        A single-task spec dict.

    Raises:
        ValueError: If any requested cluster is not in
            ``launch_utils.WEKA_CLUSTERS``, or if the git ref is rejected by
            ``build_install_script``.
    """
    unsupported = [name for name in args.cluster if name not in launch_utils.WEKA_CLUSTERS]
    if unsupported:
        raise ValueError(
            f"Clusters {unsupported} do not support Weka mounts required by this script. "
            f"Use one of {launch_utils.WEKA_CLUSTERS}."
        )

    # Each Weka bucket is mounted under /weka/<bucket-name>.
    mounts: list[dict] = [
        {"mountPath": f"/weka/{bucket}", "source": {"weka": bucket}}
        for bucket in ("oe-adapt-default", "oe-training-default", "oe-eval-default")
    ]
    if dataset_id:
        mounts.append({"mountPath": "/model", "source": {"beaker": dataset_id}})

    # Install first, then run the evaluation, in one bash -c invocation.
    install_step = build_install_script(args.olmo_eval_ref)
    eval_step = shlex.join(inner_cmd)
    shell_line = f"{install_step} && {eval_step}"

    env_vars = [
        {"name": "HF_TOKEN", "secret": "HF_TOKEN"},
        {"name": "OPENAI_API_KEY", "secret": "openai_api_key"},
        {"name": "GITHUB_TOKEN", "secret": "GITHUB_TOKEN"},
        {"name": "VLLM_ALLOW_LONG_MAX_MODEL_LEN", "value": "1"},
    ]

    # Key order is kept as written; the spec is serialised without sorting.
    task = {
        "name": experiment_name,
        "image": {"beaker": args.beaker_image},
        "command": ["/bin/bash", "-c"],
        "arguments": [shell_line],
        "envVars": env_vars,
        "datasets": mounts,
        "result": {"path": "/results"},
        "resources": {"gpuCount": args.num_gpus},
        "constraints": {"cluster": list(args.cluster)},
        "context": {"priority": args.priority, "preemptible": args.preemptible},
    }
    return {
        "version": "v2",
        "description": experiment_name,
        "budget": args.budget,
        "retry": {"allowedTaskRetries": 2},
        "tasks": [task],
    }
69 | 194 |
|
70 | 195 |
|
def main() -> None:
    """Parse flags, build the experiment spec, and submit it with the beaker CLI.

    With ``--dry_run`` the inner command and the YAML spec are printed and
    nothing is written or submitted. Otherwise the spec is written to the path
    returned by ``launch_utils.auto_created_spec_path`` and passed to
    ``beaker experiment create``; a non-zero exit raises
    ``subprocess.CalledProcessError``.
    """
    args = parse_args()
    launch_utils.validate_beaker_workspace(args.workspace)

    model_path, dataset_id = resolve_model_mount(args.location)
    inner_cmd = build_inner_cmd(args, model_path)

    # An explicit --experiment_name wins; otherwise derive one from the model
    # name and today's date (MMDDYYYY).
    if args.experiment_name:
        requested_name = args.experiment_name
    else:
        stamp = date.today().strftime("%m%d%Y")
        requested_name = f"olmo_eval_{args.model_name}_{stamp}"
    sanitized_name = EXPERIMENT_NAME_SAFE_RE.sub("_", requested_name)
    experiment_name = sanitized_name[:MAX_EXPERIMENT_NAME_LEN]

    spec = build_spec(args, inner_cmd, dataset_id, experiment_name)

    print("Inner command:", shlex.join(inner_cmd))

    if args.dry_run:
        print("Dry run; spec:")
        print(yaml.safe_dump(spec, default_flow_style=False, sort_keys=False))
        return

    spec_path = launch_utils.auto_created_spec_path(experiment_name)
    with open(spec_path, "w") as spec_file:
        yaml.safe_dump(spec, spec_file, default_flow_style=False, sort_keys=False)
    print("Spec written to:", spec_path)

    submit_cmd = ["beaker", "experiment", "create", spec_path, "--workspace", args.workspace]
    print("Running:", shlex.join(submit_cmd))
    subprocess.run(submit_cmd, check=True)
80 | 223 |
|
81 | 224 |
|
82 | 225 | if __name__ == "__main__": |
|
0 commit comments