[NV] llm-d: require staged /models/gpt-oss-120b on H200 runner

ezrasilvera · ezrasilvera · commit 22b2864e4ace · 2026-06-05T13:56:28.000+03:00
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
@@ -20,14 +20,19 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
             export MODEL_PATH="/models/DeepSeek-R1-0528"
             export MODEL_NAME="DeepSeek-R1-0528"
         elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then
-            # Try the cluster's pre-staged path first; fall back to the HF
-            # id so the first run can pull the model if /models/ is empty.
-            # Same shape as launch_b200-dgxc-slurm.sh DSv4-Pro detection.
-            if [[ -d "/models/gpt-oss-120b" ]]; then
-                export MODEL_PATH="/models/gpt-oss-120b"
-            else
-                export MODEL_PATH="openai/gpt-oss-120b"
+            # The llm-d job.slurm bind-mounts $MODEL_DIR into /models inside
+            # the container, so MODEL_PATH must be an existing directory on
+            # the host (an HF id will not work without further plumbing).
+            # Stage the model out-of-band (e.g. `huggingface-cli download
+            # openai/gpt-oss-120b --local-dir /models/gpt-oss-120b`) before
+            # running this benchmark.
+            if [[ ! -d "/models/gpt-oss-120b" ]]; then
+                echo "Error: /models/gpt-oss-120b not found on this runner." >&2
+                echo "       Pre-stage the model with:" >&2
+                echo "         huggingface-cli download openai/gpt-oss-120b --local-dir /models/gpt-oss-120b" >&2
+                exit 1
             fi
+            export MODEL_PATH="/models/gpt-oss-120b"
             export MODEL_NAME="gpt-oss-120b"
         else
             echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2