Skip to content

Commit deedfbe

Browse files
Drop submit_eval_jobs.{py,sh} changes; moved to PR #1658. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent c47442b commit deedfbe

2 files changed

Lines changed: 188 additions & 77 deletions

File tree

scripts/submit_eval_jobs.py

Lines changed: 188 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,82 +1,225 @@
1-
"""Translate legacy --flag_name arguments into `olmo-eval beaker launch`
2-
flags and delegate to scripts/submit_eval_jobs.sh.
1+
"""Submit evaluation jobs using allenai/olmo-eval-internal.
32
4-
For new code, prefer calling submit_eval_jobs.sh directly with olmo-eval flags.
5-
This wrapper exists to keep existing call sites working.
3+
Submits a Beaker v2 experiment that runs `olmo-eval run` against a model. The
4+
Beaker image ships with CUDA and PyTorch; olmo-eval-internal, vllm, and
5+
transformers are installed at job start via INSTALL_SCRIPT to allow testing the
6+
latest code. When `--location` is a Beaker dataset, the model is mounted at
7+
`/model`.
8+
9+
Example:
10+
uv run python scripts/submit_eval_jobs.py \\
11+
--model_name qwen3_4b_base_dapo_20260422_083224 \\
12+
--location 01KPTSPMHGEZVYCDNR0XBVJCGZ \\
13+
--tasks aime_2025:pass_at_32 \\
14+
--max_length 8192 \\
15+
--cluster ai2/jupiter-cirrascale-2 ai2/saturn-cirrascale \\
16+
--priority urgent \\
17+
--preemptible \\
18+
--workspace ai2/open-instruct-dev
619
"""
720

821
import argparse
9-
import os
22+
import re
23+
import shlex
1024
import subprocess
11-
import sys
12-
from pathlib import Path
25+
from datetime import date
26+
27+
import yaml
28+
29+
from open_instruct import launch_utils
30+
31+
32+
BEAKER_ID_RE = re.compile(r"^[0-9A-Z]{26}$")
33+
DEFAULT_CLUSTERS = ("ai2/jupiter",)
34+
MAX_EXPERIMENT_NAME_LEN = 128
35+
EXPERIMENT_NAME_SAFE_RE = re.compile(r"[^A-Za-z0-9_.-]+")
36+
37+
DEFAULT_OLMO_EVAL_REF = "main"
38+
GIT_REF_SAFE_RE = re.compile(r"^[A-Za-z0-9._/-]+$")
39+
40+
41+
def build_install_script(ref: str) -> str:
    """Build the bash snippet that installs olmo-eval-internal at job start.

    The snippet clones allenai/olmo-eval-internal into /opt/olmo-eval-internal,
    checks out ``ref``, installs the package with its ``vllm`` extra, upgrades
    vllm and transformers, and finally changes directory to /workspace. The
    commands are chained with ``&&`` so the first failure aborts the job.

    Args:
        ref: Git ref (branch, tag, or sha) to check out.

    Returns:
        A single-line shell command string. ``${GITHUB_TOKEN}`` is left
        unexpanded so the shell running the job substitutes it.

    Raises:
        ValueError: If ``ref`` contains characters outside ``[A-Za-z0-9._/-]``
            or starts with ``-``.
    """
    # A leading "-" would make `git checkout` parse the ref as an option;
    # shlex.quote does not protect against that. fullmatch (rather than match)
    # also rejects a trailing newline, which a `$` anchor alone lets through.
    if ref.startswith("-") or not GIT_REF_SAFE_RE.fullmatch(ref):
        raise ValueError(
            f"Invalid git ref {ref!r}; expected characters [A-Za-z0-9._/-], not starting with '-'."
        )
    return (
        "set -euo pipefail && "
        "git clone "
        "https://x-access-token:${GITHUB_TOKEN}@github.com/allenai/olmo-eval-internal.git "
        "/opt/olmo-eval-internal && "
        f"cd /opt/olmo-eval-internal && git checkout {shlex.quote(ref)} && "
        "uv pip install --cache-dir /weka/oe-eval-default/olmo-eval-pypi-cache -e '.[vllm]' && "
        "uv pip install --cache-dir /weka/oe-eval-default/olmo-eval-pypi-cache "
        "--upgrade 'vllm[runai]>=0.19.0' 'transformers>=5.4.0' && "
        "cd /workspace"
    )
1355

1456

1557
def parse_args() -> argparse.Namespace:
1658
parser = argparse.ArgumentParser(description=__doc__)
17-
parser.add_argument("--model_name", type=str, default=None, help="Used as the experiment name (-n).")
59+
parser.add_argument("--model_name", type=str, required=True, help="Human-readable run name.")
1860
parser.add_argument(
1961
"--location",
2062
type=str,
2163
required=True,
22-
help="Model path (-m). Host path / HF repo / s3:// / gs://. Beaker dataset ids are not supported.",
64+
help=(
65+
"Model location. Accepts: a bare Beaker dataset id (26 uppercase alphanumerics), "
66+
"'beaker://<id>', an HF repo id (e.g. allenai/OLMo-2-1124-7B-Instruct), "
67+
"an absolute Weka/NFS path, or a gs:// URL."
68+
),
69+
)
70+
parser.add_argument(
71+
"--tasks",
72+
type=str,
73+
default="aime_2025:pass_at_32",
74+
help="Comma-separated olmo-eval task specs. See `olmo-eval tasks`/`olmo-eval suites`.",
2375
)
24-
parser.add_argument("--tasks", type=str, default="aime_2025:pass_at_32")
2576
parser.add_argument("--num_gpus", type=int, default=1)
26-
parser.add_argument("--cluster", nargs="+", default=["h100"])
77+
parser.add_argument("--cluster", nargs="+", default=list(DEFAULT_CLUSTERS))
2778
parser.add_argument("--priority", type=str, default="normal")
2879
parser.add_argument("--preemptible", action="store_true")
2980
parser.add_argument("--workspace", type=str, default="ai2/tulu-3-results")
3081
parser.add_argument("--budget", type=str, default="ai2/oe-adapt")
31-
parser.add_argument("--beaker_image", type=str, default=None)
82+
parser.add_argument(
83+
"--beaker_image",
84+
type=str,
85+
default="ai2-tylerm/olmo-eval-cu1281-trc290-amd64",
86+
help="Beaker image with olmo-eval installed.",
87+
)
3288
parser.add_argument("--revision", type=str, default=None, help="HF revision (git sha/tag).")
33-
parser.add_argument("--max_length", type=int, default=32768)
34-
parser.add_argument("--sampling_max_tokens", type=int, default=None)
89+
parser.add_argument(
90+
"--max_length",
91+
type=int,
92+
default=32768,
93+
help="Provider max_model_len. Sampling max_tokens comes from the task definition.",
94+
)
95+
parser.add_argument(
96+
"--sampling_max_tokens",
97+
type=int,
98+
default=None,
99+
help="Override per-task sampling max_tokens (applied via -o max_tokens=N after each -t).",
100+
)
35101
parser.add_argument("--experiment_name", type=str, default=None)
36-
parser.add_argument("--olmo_eval_ref", type=str, default="main")
37-
parser.add_argument("--dry_run", action="store_true")
102+
parser.add_argument(
103+
"--olmo_eval_ref",
104+
type=str,
105+
default=DEFAULT_OLMO_EVAL_REF,
106+
help="Git ref (branch/tag/sha) of allenai/olmo-eval-internal to install at job start.",
107+
)
108+
parser.add_argument(
109+
"--dry_run", action="store_true", help="Print the spec and beaker command, but do not write or submit."
110+
)
38111
return parser.parse_args()
39112

40113

41-
def build_launch_args(args: argparse.Namespace) -> list[str]:
42-
out: list[str] = []
43-
name = args.experiment_name or args.model_name
44-
if name:
45-
out += ["-n", name]
46-
out += ["-m", args.location]
114+
def resolve_model_mount(location: str) -> tuple[str, str | None]:
115+
"""Resolve --location into (model_path_in_container, beaker_dataset_id_or_None)."""
116+
if location.startswith("beaker://"):
117+
return "/model", location[len("beaker://") :]
118+
if BEAKER_ID_RE.match(location):
119+
return "/model", location
120+
return location, None
121+
122+
123+
def build_inner_cmd(args: argparse.Namespace, model_path: str) -> list[str]:
124+
cmd = [
125+
"olmo-eval",
126+
"run",
127+
"-m",
128+
model_path,
129+
"--harness",
130+
"default",
131+
"-o",
132+
"provider.kind=vllm_server",
133+
"-o",
134+
f"provider.max_model_len={args.max_length}",
135+
"-o",
136+
"provider.trust_remote_code=true",
137+
]
47138
if args.revision:
48-
out += ["-o", f"provider.revision={args.revision}"]
49-
out += ["-o", f"provider.max_model_len={args.max_length}"]
139+
cmd += ["-o", f"provider.revision={args.revision}"]
50140
for task in args.tasks.split(","):
51141
task = task.strip()
52142
if not task:
53143
continue
54-
out += ["-t", task]
144+
cmd += ["-t", task]
55145
if args.sampling_max_tokens is not None:
56-
out += ["-o", f"max_tokens={args.sampling_max_tokens}"]
57-
out += ["--gpus", str(args.num_gpus)]
58-
for cluster in args.cluster:
59-
out += ["-c", cluster]
60-
out += ["-p", args.priority]
61-
if args.preemptible:
62-
out += ["--preemptible"]
63-
out += ["-w", args.workspace, "-B", args.budget]
64-
if args.beaker_image:
65-
out += ["-I", args.beaker_image]
66-
if args.dry_run:
67-
out += ["-d"]
68-
return out
146+
cmd += ["-o", f"max_tokens={args.sampling_max_tokens}"]
147+
cmd += ["--num-gpus", str(args.num_gpus)]
148+
cmd += ["--output-dir", "/results"]
149+
return cmd
150+
151+
152+
def build_spec(args: argparse.Namespace, inner_cmd: list[str], dataset_id: str | None, experiment_name: str) -> dict:
    """Assemble the Beaker v2 experiment spec as a plain dict.

    Args:
        args: Parsed CLI arguments; reads ``cluster``, ``olmo_eval_ref``,
            ``budget``, ``beaker_image``, ``num_gpus``, ``priority`` and
            ``preemptible``.
        inner_cmd: Command (argv form) to run after the install script.
        dataset_id: Beaker dataset to mount at ``/model``; skipped when falsy.
        experiment_name: Used as both the spec description and the task name.

    Returns:
        The spec dict. Key order is significant to callers that serialize it
        without sorting keys.

    Raises:
        ValueError: If any requested cluster is not in
            ``launch_utils.WEKA_CLUSTERS``, or if ``args.olmo_eval_ref`` is
            rejected by ``build_install_script``.
    """
    # Every job mounts Weka buckets, so reject clusters that cannot provide them.
    unsupported = [name for name in args.cluster if name not in launch_utils.WEKA_CLUSTERS]
    if unsupported:
        raise ValueError(
            f"Clusters {unsupported} do not support Weka mounts required by this script. "
            f"Use one of {launch_utils.WEKA_CLUSTERS}."
        )

    mounts: list[dict] = [
        {"mountPath": f"/weka/{bucket}", "source": {"weka": bucket}}
        for bucket in ("oe-adapt-default", "oe-training-default", "oe-eval-default")
    ]
    if dataset_id:
        mounts.append({"mountPath": "/model", "source": {"beaker": dataset_id}})

    # Install step first, then the eval command, as one bash -c string.
    install_step = build_install_script(args.olmo_eval_ref)
    eval_step = shlex.join(inner_cmd)
    shell_line = f"{install_step} && {eval_step}"

    env_vars = [
        {"name": "HF_TOKEN", "secret": "HF_TOKEN"},
        {"name": "OPENAI_API_KEY", "secret": "openai_api_key"},
        {"name": "GITHUB_TOKEN", "secret": "GITHUB_TOKEN"},
        {"name": "VLLM_ALLOW_LONG_MAX_MODEL_LEN", "value": "1"},
    ]

    task = {
        "name": experiment_name,
        "image": {"beaker": args.beaker_image},
        "command": ["/bin/bash", "-c"],
        "arguments": [shell_line],
        "envVars": env_vars,
        "datasets": mounts,
        "result": {"path": "/results"},
        "resources": {"gpuCount": args.num_gpus},
        "constraints": {"cluster": list(args.cluster)},
        "context": {"priority": args.priority, "preemptible": args.preemptible},
    }

    return {
        "version": "v2",
        "description": experiment_name,
        "budget": args.budget,
        "retry": {"allowedTaskRetries": 2},
        "tasks": [task],
    }
69194

70195

71196
def main() -> None:
72197
args = parse_args()
73-
launch_args = build_launch_args(args)
74-
script = Path(__file__).resolve().parent / "submit_eval_jobs.sh"
75-
env = os.environ.copy()
76-
env["OLMO_EVAL_REF"] = args.olmo_eval_ref
77-
cmd = [str(script), *launch_args]
78-
print("Running:", " ".join(cmd))
79-
sys.exit(subprocess.run(cmd, env=env).returncode)
198+
launch_utils.validate_beaker_workspace(args.workspace)
199+
200+
model_path, dataset_id = resolve_model_mount(args.location)
201+
inner_cmd = build_inner_cmd(args, model_path)
202+
203+
today = date.today().strftime("%m%d%Y")
204+
raw_name = args.experiment_name or f"olmo_eval_{args.model_name}_{today}"
205+
experiment_name = EXPERIMENT_NAME_SAFE_RE.sub("_", raw_name)[:MAX_EXPERIMENT_NAME_LEN]
206+
spec = build_spec(args, inner_cmd, dataset_id, experiment_name)
207+
208+
print("Inner command:", shlex.join(inner_cmd))
209+
210+
if args.dry_run:
211+
print("Dry run; spec:")
212+
print(yaml.safe_dump(spec, default_flow_style=False, sort_keys=False))
213+
return
214+
215+
spec_path = launch_utils.auto_created_spec_path(experiment_name)
216+
with open(spec_path, "w") as f:
217+
yaml.safe_dump(spec, f, default_flow_style=False, sort_keys=False)
218+
print("Spec written to:", spec_path)
219+
220+
beaker_cmd = ["beaker", "experiment", "create", spec_path, "--workspace", args.workspace]
221+
print("Running:", shlex.join(beaker_cmd))
222+
subprocess.run(beaker_cmd, check=True)
80223

81224

82225
if __name__ == "__main__":

scripts/submit_eval_jobs.sh

Lines changed: 0 additions & 32 deletions
This file was deleted.

0 commit comments

Comments
 (0)