Skip to content

Commit 22b2864

Browse files
committed
[NV] llm-d: require staged /models/gpt-oss-120b on H200 runner
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent 1f1d41c commit 22b2864

1 file changed

Lines changed: 12 additions & 7 deletions

File tree

runners/launch_h200-dgxc-slurm.sh

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,14 +20,19 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
2020
export MODEL_PATH="/models/DeepSeek-R1-0528"
2121
export MODEL_NAME="DeepSeek-R1-0528"
2222
elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then
23-
# Try the cluster's pre-staged path first; fall back to the HF
24-
# id so the first run can pull the model if /models/ is empty.
25-
# Same shape as launch_b200-dgxc-slurm.sh DSv4-Pro detection.
26-
if [[ -d "/models/gpt-oss-120b" ]]; then
27-
export MODEL_PATH="/models/gpt-oss-120b"
28-
else
29-
export MODEL_PATH="openai/gpt-oss-120b"
23+
# The llm-d job.slurm bind-mounts $MODEL_DIR into /models inside
24+
# the container, so MODEL_PATH must be an existing directory on
25+
# the host (an HF id will not work without further plumbing).
26+
# Stage the model out-of-band (e.g. `huggingface-cli download
27+
# openai/gpt-oss-120b --local-dir /models/gpt-oss-120b`) before
28+
# running this benchmark.
29+
if [[ ! -d "/models/gpt-oss-120b" ]]; then
30+
echo "Error: /models/gpt-oss-120b not found on this runner." >&2
31+
echo " Pre-stage the model with:" >&2
32+
echo " huggingface-cli download openai/gpt-oss-120b --local-dir /models/gpt-oss-120b" >&2
33+
exit 1
3034
fi
35+
export MODEL_PATH="/models/gpt-oss-120b"
3136
export MODEL_NAME="gpt-oss-120b"
3237
else
3338
echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2

0 commit comments

Comments
 (0)