Skip to content

Commit 1f1d41c

Browse files
committed
[NV] llm-d: add gpt-oss-120b H200 1P+1D smoke test alongside DSR1
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
1 parent f284d7d commit 1f1d41c

4 files changed

Lines changed: 196 additions & 0 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11311,3 +11311,41 @@ dsr1-fp8-h200-llm-d-vllm-simple:
1131111311
dp-attn: true
1131211312
additional-settings:
1131311313
- "DECODE_NODES=1"
11314+
11315+
# llm-d-vllm 1P+1D smoke test on H200 with gpt-oss-120b (MXFP4).
11316+
#
11317+
# Same topology and llm-d scaffolding as dsr1-fp8-h200-llm-d-vllm-simple,
11318+
# but with a much smaller MoE, so there is comfortable memory headroom on H200.
11319+
# Selectable independently via `--config-keys gptoss-fp4-h200-llm-d-vllm-simple`.
11320+
gptoss-fp4-h200-llm-d-vllm-simple:
11321+
image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
11322+
model: openai/gpt-oss-120b
11323+
model-prefix: gptoss
11324+
runner: h200-multinode
11325+
precision: fp4
11326+
framework: llm-d-vllm
11327+
multinode: true
11328+
disagg: true
11329+
scenarios:
11330+
fixed-seq-len:
11331+
- isl: 1024
11332+
osl: 1024
11333+
search-space:
11334+
- spec-decoding: "none"
11335+
conc-list: [ 1, 4, 16, 64, 256 ]
11336+
prefill:
11337+
num-worker: 1
11338+
tp: 1
11339+
ep: 8
11340+
dp-attn: true
11341+
additional-settings:
11342+
- "PREFILL_NODES=1"
11343+
- "RANDOM_RANGE_RATIO=0.05"
11344+
- "CONFIG_FILE=gptoss-fp4-h200-1p1d-simple.yaml"
11345+
decode:
11346+
num-worker: 1
11347+
tp: 1
11348+
ep: 8
11349+
dp-attn: true
11350+
additional-settings:
11351+
- "DECODE_NODES=1"
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env bash
#
# Wrapper for the gpt-oss-120b H200 llm-d-vllm 1P+1D smoke benchmark.
# Sibling of dsr1_fp8_h200_llm-d-vllm.sh; the wrapper itself is
# model-agnostic (it only sets topology env and calls the shared submit.sh).
# Kept as its own file so the runner's
#   SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_h200_llm-d-vllm.sh"
# rule resolves gptoss/fp4 here.
#
# Required env: CONC_LIST, ISL, OSL, IMAGE, MODEL_PATH, MODEL_NAME,
#               PREFILL_NODES, DECODE_NODES, RANDOM_RANGE_RATIO,
#               GITHUB_WORKSPACE.
# Optional env: TIME_LIMIT (defaults to 08:00:00 below).
# Stdout:       the value printed by submit.sh (the submitted job id).
# Exit status:  non-zero if validation, the cd, or the submission fails.

set -euo pipefail

source "$(dirname "$0")/../benchmark_lib.sh"

# MODEL_NAME is exported further down, so validate it up front together
# with the other inputs instead of relying on `set -u` to trip later.
check_env_vars \
  CONC_LIST \
  ISL \
  OSL \
  IMAGE \
  MODEL_PATH \
  MODEL_NAME \
  PREFILL_NODES \
  DECODE_NODES \
  RANDOM_RANGE_RATIO

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  # SLURMD_NODENAME is not guaranteed to be set whenever SLURM_JOB_ID is
  # (e.g. inside an salloc shell), so fall back to the local hostname
  # rather than letting `set -u` abort on an informational message.
  # NOTE(review): this line goes to stdout, same as the job id below;
  # confirm the caller tolerates the extra line.
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-$(hostname)}"
fi

set -x

cd "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}/benchmarks/multi_node/llm-d" || exit 1

# NOTE(review): the recipe YAML for this benchmark carries its own
# slurm time_limit; confirm which of the two values submit.sh honours.
export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
export MODEL_PATH MODEL_NAME
export CONTAINER_IMAGE="$IMAGE"

# Test the command substitution explicitly: under `set -e` a bare
# assignment would exit on a submit.sh failure before any message is
# printed.
if ! JOB_ID=$(bash ./submit.sh \
  "$PREFILL_NODES" \
  "$DECODE_NODES" \
  "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
  "$RANDOM_RANGE_RATIO"); then
  echo "Failed to submit job" >&2
  exit 1
fi

# submit.sh may exit 0 without printing anything; treat that as a failure too.
if [[ -z "$JOB_ID" ]]; then
  echo "Failed to submit job" >&2
  exit 1
fi

echo "$JOB_ID"
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# gpt-oss-120b (MXFP4) on H200, simple 1P+1D P/D disagg.
2+
#
3+
# Smoke-test sibling of dsr1-fp8-h200-1p1d-simple.yaml. Same llm-d-vllm
4+
# scaffolding (EPP plugins/profiles/dataLayer + per-role extra-args), but
5+
# with a much smaller MoE so the H200 (141 GB HBM3e) has comfortable
6+
# headroom for workspace, KV cache, and cudagraph capture.
7+
#
8+
# At EP=8, gpt-oss-120b puts ~15 GB of weights on each rank vs DSR1's
9+
# ~84 GB, so the workspace/KV-cache/CUDA-graph memory contention from the
10+
# DSR1 recipe goes away and the per-role flags are dramatically simpler.
11+
#
12+
# 1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
13+
# 1 decode node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
14+
# total 2 H200 nodes / 16 GPUs.
15+
#
16+
# Selected via additional-settings: CONFIG_FILE=gptoss-fp4-h200-1p1d-simple.yaml
17+
# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.
18+
19+
# ---- EPP scheduling config ----
20+
# Identical to dsr1-fp8-h200-1p1d-simple.yaml. Plugins, profiles, scorer
21+
# weights are model-agnostic - they govern routing, not engine config.
22+
apiVersion: llm-d.ai/v1alpha1
23+
kind: EndpointPickerConfig
24+
25+
plugins:
26+
- name: file-disc
27+
type: file-discovery
28+
parameters:
29+
path: /tmp/endpoints.yaml
30+
watchFile: false
31+
32+
- type: disagg-headers-handler
33+
- type: always-disagg-pd-decider
34+
- type: disagg-profile-handler
35+
parameters:
36+
deciderPluginName: always-disagg-pd-decider
37+
- type: prefill-filter
38+
- type: decode-filter
39+
- type: prefix-cache-scorer
40+
- type: queue-scorer
41+
- type: kv-cache-utilization-scorer
42+
- type: active-request-scorer
43+
- type: max-score-picker
44+
45+
schedulingProfiles:
46+
- name: prefill
47+
plugins:
48+
- pluginRef: prefill-filter
49+
- pluginRef: prefix-cache-scorer
50+
weight: 3
51+
- pluginRef: queue-scorer
52+
weight: 2
53+
- pluginRef: kv-cache-utilization-scorer
54+
weight: 2
55+
- pluginRef: max-score-picker
56+
- name: decode
57+
plugins:
58+
- pluginRef: decode-filter
59+
- pluginRef: active-request-scorer
60+
weight: 2
61+
- pluginRef: prefix-cache-scorer
62+
weight: 3
63+
- pluginRef: max-score-picker
64+
65+
dataLayer:
66+
discovery:
67+
pluginRef: file-disc
68+
69+
# ---- Per-role vLLM flags ----
70+
# Common flags (--enable-expert-parallel, --tensor-parallel-size,
71+
# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
72+
# server.sh. Cross-node DP coordination flags are NOT emitted because
73+
# LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
74+
#
75+
# kv-cache-dtype is left at default (auto). gpt-oss-120b has not been
76+
# validated with --kv-cache-dtype fp8 in this image; using auto avoids
77+
# silent KV-quantization mismatches on a smoke test.
78+
prefill:
79+
extra-args: >-
80+
--gpu-memory-utilization 0.85
81+
--max-num-batched-tokens 16384
82+
--max-num-seqs 16
83+
--max-model-len 16384
84+
--block-size 256
85+
--no-enable-prefix-caching
86+
env: {}
87+
88+
decode:
89+
extra-args: >-
90+
--gpu-memory-utilization 0.90
91+
--max-num-batched-tokens 256
92+
--max-num-seqs 256
93+
--max-model-len 16384
94+
--block-size 256
95+
--no-enable-prefix-caching
96+
env: {}
97+
98+
# ---- SLURM resource directives ----
99+
slurm:
100+
time_limit: "04:00:00"

runners/launch_h200-dgxc-slurm.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
1919
if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
2020
export MODEL_PATH="/models/DeepSeek-R1-0528"
2121
export MODEL_NAME="DeepSeek-R1-0528"
22+
elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then
23+
# Try the cluster's pre-staged path first; fall back to the HF
24+
# id so the first run can pull the model if /models/gpt-oss-120b is missing.
25+
# Same shape as launch_b200-dgxc-slurm.sh DSv4-Pro detection.
26+
if [[ -d "/models/gpt-oss-120b" ]]; then
27+
export MODEL_PATH="/models/gpt-oss-120b"
28+
else
29+
export MODEL_PATH="openai/gpt-oss-120b"
30+
fi
31+
export MODEL_NAME="gpt-oss-120b"
2232
else
2333
echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2
2434
exit 1

0 commit comments

Comments
 (0)