Skip to content

Commit 94345c5

Browse files
committed
[NV] llm-d: auto-stage gpt-oss-120b under $HOME on H200 runner if /models is empty
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent 22b2864 commit 94345c5

1 file changed

Lines changed: 36 additions & 11 deletions

File tree

runners/launch_h200-dgxc-slurm.sh

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,43 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2121
export MODEL_NAME="DeepSeek-R1-0528"
2222
elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then
2323
# The llm-d job.slurm bind-mounts $MODEL_DIR into /models inside
24-
# the container, so MODEL_PATH must be an existing directory on
25-
# the host (an HF id will not work without further plumbing).
26-
# Stage the model out-of-band (e.g. `huggingface-cli download
27-
# openai/gpt-oss-120b --local-dir /models/gpt-oss-120b`) before
28-
# running this benchmark.
29-
if [[ ! -d "/models/gpt-oss-120b" ]]; then
30-
echo "Error: /models/gpt-oss-120b not found on this runner." >&2
31-
echo " Pre-stage the model with:" >&2
32-
echo " huggingface-cli download openai/gpt-oss-120b --local-dir /models/gpt-oss-120b" >&2
33-
exit 1
24+
# the container, so MODEL_PATH must be an existing host
25+
# directory (an HF id alone will not work). Resolution order:
26+
# 1. /models/gpt-oss-120b (cluster-shared, staged by an admin)
27+
# 2. $HOME/inferencex-models/gpt-oss-120b (per-runner, written
28+
# by this script - downloaded once, cached across dispatches)
29+
# The download happens inline on the runner host so users with
30+
# only gh-dispatch access can stage the model without ssh.
31+
# Resolve MODEL_PATH for gpt-oss-120b: prefer the shared /models copy,
# otherwise stage a private copy under $HOME (downloaded once and reused
# by later dispatches). Any staging failure aborts the launcher so the
# job never starts with a missing or half-downloaded model directory.
GPTOSS_LOCAL_DIR="$HOME/inferencex-models/gpt-oss-120b"
if [[ -d "/models/gpt-oss-120b" ]]; then
    export MODEL_PATH="/models/gpt-oss-120b"
else
    mkdir -p "$HOME/inferencex-models" || exit 1
    # flock serializes concurrent dispatches so a second run waits for
    # the first download to finish instead of racing. The lock is held
    # on fd 200 for the lifetime of the subshell.
    (
        flock -x 200 || exit 1
        # Re-check under the lock: another dispatch may have finished
        # staging while this one was waiting.
        if [[ ! -d "$GPTOSS_LOCAL_DIR" ]]; then
            # Download into a sibling directory and rename only on
            # success, so an interrupted download is never mistaken for
            # a complete model by the -d check on a later dispatch.
            staging_dir="${GPTOSS_LOCAL_DIR}.partial"
            echo "Staging openai/gpt-oss-120b -> $GPTOSS_LOCAL_DIR (one-time, ~60 GB)"
            if command -v huggingface-cli >/dev/null 2>&1; then
                huggingface-cli download openai/gpt-oss-120b \
                    --local-dir "$staging_dir" || exit 1
            elif python3 -c "import huggingface_hub" 2>/dev/null; then
                # The target path is handed to Python through the
                # command's environment: plain shell variables are not
                # visible in os.environ unless exported.
                GPTOSS_STAGING_DIR="$staging_dir" python3 - <<'PY' || exit 1
import os
from huggingface_hub import snapshot_download
snapshot_download(repo_id="openai/gpt-oss-120b",
                  local_dir=os.environ["GPTOSS_STAGING_DIR"])
PY
            else
                echo "Error: neither huggingface-cli nor python3 huggingface_hub available." >&2
                echo "       Cannot auto-stage gpt-oss-120b on this runner." >&2
                exit 1
            fi
            mv -- "$staging_dir" "$GPTOSS_LOCAL_DIR" || exit 1
        fi
    ) 200>"$HOME/inferencex-models/.gpt-oss-120b.download.lock" || exit 1
    # `exit` inside ( ... ) only leaves the subshell; the `|| exit 1`
    # above is what actually stops the launcher on a staging failure.
    export MODEL_PATH="$GPTOSS_LOCAL_DIR"
fi
35-
export MODEL_PATH="/models/gpt-oss-120b"
3661
export MODEL_NAME="gpt-oss-120b"
3762
else
3863
echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2

0 commit comments

Comments
 (0)