@@ -21,18 +21,43 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2121 export MODEL_NAME=" DeepSeek-R1-0528"
2222 elif [[ $MODEL_PREFIX == " gptoss" && $PRECISION == " fp4" ]]; then
2323 # The llm-d job.slurm bind-mounts $MODEL_DIR into /models inside
24- # the container, so MODEL_PATH must be an existing directory on
25- # the host (an HF id will not work without further plumbing).
26- # Stage the model out-of-band (e.g. `huggingface-cli download
27- # openai/gpt-oss-120b --local-dir /models/gpt-oss-120b`) before
28- # running this benchmark.
29- if [[ ! -d " /models/gpt-oss-120b" ]]; then
30- echo " Error: /models/gpt-oss-120b not found on this runner." >&2
31- echo " Pre-stage the model with:" >&2
32- echo " huggingface-cli download openai/gpt-oss-120b --local-dir /models/gpt-oss-120b" >&2
33- exit 1
24+ # the container, so MODEL_PATH must be an existing host
25+ # directory (an HF id alone will not work). Resolution order:
26+ # 1. /models/gpt-oss-120b (cluster-shared, staged by an admin)
27+ # 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, written
28+ # by this script - downloaded once, cached across dispatches)
29+ # The download happens inline on the runner host so users with
30+ # only gh-dispatch access can stage the model without ssh.
31+ GPTOSS_LOCAL_DIR="$HOME/inferencex-models/gpt-oss-120b" # not exported; handed explicitly to the python3 fallback below
32+ if [[ -d "/models/gpt-oss-120b" ]]; then
33+ export MODEL_PATH="/models/gpt-oss-120b"
34+ else
35+ mkdir -p "$HOME/inferencex-models"
36+ # flock serializes concurrent dispatches so a second run
37+ # waits for the first download to finish instead of racing.
38+ (
39+ flock -x 200
40+ if [[ ! -d "$GPTOSS_LOCAL_DIR" ]]; then # NOTE(review): an interrupted download leaves this directory behind, so a retry would skip staging - confirm whether a completion marker is needed
41+ echo " Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
42+ if command -v huggingface-cli >/dev/null 2>&1; then
43+ huggingface-cli download openai/gpt-oss-120b \
44+ --local-dir "$GPTOSS_LOCAL_DIR"
45+ elif python3 -c "import huggingface_hub" 2>/dev/null; then
46+ GPTOSS_LOCAL_DIR="$GPTOSS_LOCAL_DIR" python3 - <<'PY'
47+ import os
48+ from huggingface_hub import snapshot_download
49+ snapshot_download(repo_id="openai/gpt-oss-120b",
50+ local_dir=os.environ["GPTOSS_LOCAL_DIR"])
51+ PY
52+ else
53+ echo " Error: neither huggingface-cli nor python3 huggingface_hub available." >&2
54+ echo " Cannot auto-stage gpt-oss-120b on this runner." >&2
55+ exit 1 # only leaves this subshell; the `|| exit 1` after the lock redirect stops the script
56+ fi
57+ fi
58+ ) 200>"$HOME/inferencex-models/.gpt-oss-120b.download.lock" || exit 1 # abort instead of exporting a MODEL_PATH that was never staged
59+ export MODEL_PATH="$GPTOSS_LOCAL_DIR"
3460 fi
35- export MODEL_PATH=" /models/gpt-oss-120b"
3661 export MODEL_NAME=" gpt-oss-120b"
3762 else
3863 echo " Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX /$PRECISION " >&2
0 commit comments