[NV] llm-d: add gpt-oss-120b H200 1P+1D smoke test alongside DSR1

ezrasilvera · ezrasilvera · commit 1f1d41cf7d5f · 2026-06-05T13:43:08.000+03:00
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -11311,3 +11311,41 @@ dsr1-fp8-h200-llm-d-vllm-simple:
           dp-attn: true
           additional-settings:
           - "DECODE_NODES=1"
+
+# llm-d-vllm 1P+1D smoke test on H200 with gpt-oss-120b (MXFP4).
+#
+# Same topology and llm-d scaffolding as dsr1-fp8-h200-llm-d-vllm-simple,
+# but with a much smaller MoE so memory pressure is comfortable on H200.
+# Selectable independently via `--config-keys gptoss-fp4-h200-llm-d-vllm-simple`.
+gptoss-fp4-h200-llm-d-vllm-simple:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss  # launch_h200-dgxc-slurm.sh branches on MODEL_PREFIX=gptoss + PRECISION=fp4
+  runner: h200-multinode
+  precision: fp4  # also part of the wrapper file name gptoss_fp4_h200_llm-d-vllm.sh
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 4, 16, 64, 256 ]
+        prefill:  # the recipe header describes this role as 1 node, DP=8 EP=8 dp-attn
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"  # listed in the wrapper's check_env_vars
+          - "RANDOM_RANGE_RATIO=0.05"  # listed in the wrapper's check_env_vars
+          - "CONFIG_FILE=gptoss-fp4-h200-1p1d-simple.yaml"  # recipe in benchmarks/multi_node/llm-d-recipes/
+        decode:  # the recipe header describes this role as 1 node, DP=8 EP=8 dp-attn
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"  # listed in the wrapper's check_env_vars
diff --git a/benchmarks/multi_node/gptoss_fp4_h200_llm-d-vllm.sh b/benchmarks/multi_node/gptoss_fp4_h200_llm-d-vllm.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+#
+# Wrapper for the gpt-oss-120b H200 llm-d-vllm 1P+1D smoke benchmark.
+# Sibling of dsr1_fp8_h200_llm-d-vllm.sh; the wrapper itself is
+# model-agnostic (just sets topology env and calls the shared submit.sh).
+# Kept as its own file so the runner's
+#   SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_h200_llm-d-vllm.sh"
+# rule resolves gptoss/fp4 here.
+#
+# Prints whatever submit.sh wrote to stdout (the job id) and exits
+# non-zero if submit.sh printed nothing.
+
+set -euo pipefail
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+# Check every variable this script expands below up front, so a missing
+# one is caught here instead of as a bare `set -u` abort further down.
+check_env_vars \
+    GITHUB_WORKSPACE \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    MODEL_PATH \
+    MODEL_NAME \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+# SLURMD_NODENAME is not guaranteed to be set whenever SLURM_JOB_ID is
+# (e.g. in an salloc shell), so default it rather than tripping `set -u`.
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-$(hostname)}"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/llm-d" || exit 1
+
+# Exported for submit.sh; a caller-supplied TIME_LIMIT wins over the default.
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Spaces in CONC_LIST are replaced with "x" so the list is a single argument.
+JOB_ID=$(bash ./submit.sh \
+    "$PREFILL_NODES" \
+    "$DECODE_NODES" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$RANDOM_RANGE_RATIO")
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/llm-d-recipes/gptoss-fp4-h200-1p1d-simple.yaml b/benchmarks/multi_node/llm-d-recipes/gptoss-fp4-h200-1p1d-simple.yaml
@@ -0,0 +1,100 @@
+# gpt-oss-120b (MXFP4) on H200, simple 1P+1D P/D disagg.
+#
+# Smoke-test sibling of dsr1-fp8-h200-1p1d-simple.yaml. Same llm-d-vllm
+# scaffolding (EPP plugins/profiles/dataLayer + per-role extra-args), but
+# with a much smaller MoE so the H200 (141 GB HBM3e) has comfortable
+# headroom for workspace, KV cache, and cudagraph capture.
+#
+# At EP=8, gpt-oss-120b puts ~15 GB of weights on each rank vs DSR1's
+# ~84 GB, so the workspace/KV/cudagraph fights from the DSR1 recipe go
+# away and the per-role flags are dramatically simpler.
+#
+#   1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   1 decode  node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   total 2 H200 nodes / 16 GPUs.
+#
+# Selected via additional-settings: CONFIG_FILE=gptoss-fp4-h200-1p1d-simple.yaml
+# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.
+
+# ---- EPP scheduling config ----
+# Identical to dsr1-fp8-h200-1p1d-simple.yaml. Plugins, profiles, scorer
+# weights are model-agnostic - they govern routing, not engine config.
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  - name: file-disc  # the only named plugin; dataLayer.discovery.pluginRef points at it
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml
+      watchFile: false
+
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider  # the decider declared on the entry above
+  - type: prefill-filter  # no name given; schedulingProfiles refers to it by this type string
+  - type: decode-filter
+  - type: prefix-cache-scorer  # used by both the prefill and decode profiles
+  - type: queue-scorer  # used by the prefill profile only
+  - type: kv-cache-utilization-scorer  # used by the prefill profile only
+  - type: active-request-scorer  # used by the decode profile only
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill  # one filter, three weighted scorers, then the picker
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode  # one filter, two weighted scorers, then the picker
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc  # the file-discovery plugin named under plugins
+
+# ---- Per-role vLLM flags ----
+# Common flags (--enable-expert-parallel, --tensor-parallel-size,
+# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
+# server.sh. Cross-node DP coordination flags are NOT emitted because
+# LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
+#
+# kv-cache-dtype is left at default (auto). gpt-oss-120b has not been
+# validated with --kv-cache-dtype fp8 in this image; using auto avoids
+# silent KV-quantization mismatches on a smoke test.
+prefill:  # prefill-role vLLM flags only; common flags are set in server.sh
+  extra-args: >-
+    --gpu-memory-utilization 0.85
+    --max-num-batched-tokens 16384
+    --max-num-seqs 16
+    --max-model-len 16384
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # empty mapping: no per-role environment entries defined
+
+decode:  # differs from prefill in gpu-memory-utilization, max-num-batched-tokens and max-num-seqs
+  extra-args: >-
+    --gpu-memory-utilization 0.90
+    --max-num-batched-tokens 256
+    --max-num-seqs 256
+    --max-model-len 16384
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # empty mapping: no per-role environment entries defined
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "04:00:00"  # NOTE(review): the wrapper script defaults TIME_LIMIT to 08:00:00 — confirm which one applies
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
@@ -19,6 +19,16 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
             export MODEL_PATH="/models/DeepSeek-R1-0528"
             export MODEL_NAME="DeepSeek-R1-0528"
+        elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then
+            # Try the cluster's pre-staged path first; fall back to the HF
+            # id so the first run can pull the model if /models/ is empty.
+            # Same shape as launch_b200-dgxc-slurm.sh DSv4-Pro detection.
+            if [[ -d "/models/gpt-oss-120b" ]]; then
+                export MODEL_PATH="/models/gpt-oss-120b"
+            else
+                export MODEL_PATH="openai/gpt-oss-120b"
+            fi
+            export MODEL_NAME="gpt-oss-120b"
         else
             echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2
             exit 1