Skip to content

Commit 524ca63

Browse files
committed
fix(profile): add GB200 DSV4 MTP3 profile
1 parent 3b7d8a7 commit 524ca63

3 files changed

Lines changed: 296 additions & 9 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8680,6 +8680,37 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
86808680
ep: 8
86818681
dp-attn: true
86828682

8683+
# Dedicated profile point for the DeepSeek-V4 guide's 16-chip / global batch
# 256 shape: 1 prefill DEP8 + 1 decode DEP8 on GB200, MTP3, conc=256.
#
# The entry lists a single sequence shape (8192 in / 1024 out) and a single
# concurrency value, with one prefill worker and one decode worker.
dsv4-fp4-gb200-dynamo-vllm-mtp3-profile:
  image: vllm/vllm-openai:v0.21.0-ubuntu2404
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb200
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 8192
        osl: 1024
        search-space:
          - conc-list: [256]
            spec-decoding: mtp
            # Prefill side: 1 worker x tp 8.
            prefill:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
              # KEY=VALUE strings; CONFIG_FILE names the recipe used for this
              # point. NOTE(review): the path is presumably resolved by the
              # runner launch script, which is not visible here - confirm.
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-profile-16gpu-conc256-mtp3.yaml"
            # Decode side: 1 worker x tp 8; no additional-settings are given.
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
8713+
86838714
dsv4-fp4-b300-dynamo-vllm:
86848715
image: vllm/vllm-openai:v0.20.1
86858716
model: deepseek-ai/DeepSeek-V4-Pro

.github/workflows/profile.yml

Lines changed: 109 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,17 @@ jobs:
113113
EP_SIZE: ${{ matrix.config.ep }}
114114
DP_ATTENTION: ${{ matrix.config['dp-attn'] }}
115115
CONC: ${{ matrix.config.conc }}
116+
CONC_JSON: ${{ toJson(matrix.config.conc) }}
117+
PREFILL_NUM_WORKERS: ${{ matrix.config.prefill['num-worker'] }}
118+
PREFILL_TP: ${{ matrix.config.prefill.tp }}
119+
PREFILL_EP: ${{ matrix.config.prefill.ep }}
120+
PREFILL_DP_ATTN: ${{ matrix.config.prefill['dp-attn'] }}
121+
PREFILL_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.prefill['additional-settings']) }}
122+
DECODE_NUM_WORKERS: ${{ matrix.config.decode['num-worker'] }}
123+
DECODE_TP: ${{ matrix.config.decode.tp }}
124+
DECODE_EP: ${{ matrix.config.decode.ep }}
125+
DECODE_DP_ATTN: ${{ matrix.config.decode['dp-attn'] }}
126+
DECODE_ADDITIONAL_SETTINGS_JSON: ${{ toJson(matrix.config.decode['additional-settings']) }}
116127
SPEC_DECODING: ${{ matrix.config.spec-decoding }}
117128
DISAGG: ${{ matrix.config.disagg }}
118129
MOE_DEBUG: '0'
@@ -148,7 +159,7 @@ jobs:
148159
ref: ${{ inputs.ref || github.sha }}
149160
clean: false
150161

151-
- name: Launch + Profile (single-node sglang/vllm)
162+
- name: Launch + Profile
152163
id: run
153164
env:
154165
RUNNER_NAME: ${{ runner.name }}
@@ -159,19 +170,108 @@ jobs:
159170
shell: bash
160171
run: |
161172
set -euo pipefail
162-
ep_val="${EP_SIZE:-1}"
163-
res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
173+
174+
# Print one KEY=VALUE setting per line from a JSON-encoded settings value.
#
# Arguments:
#   $1  JSON text: an array of settings, a single JSON string, "null", or "".
# Output:
#   Each setting on its own line on stdout; nothing for null/empty input.
#
# The caller reads these lines and exports them one by one.
export_additional_settings() {
  local settings_json="${1:-}"
  python3 - "$settings_json" <<'PY'
import json
import sys

raw = sys.argv[1] if len(sys.argv) > 1 else ""
# toJson() of a missing key renders as the literal text "null".
if not raw or raw == "null":
    raise SystemExit(0)

settings = json.loads(raw) or []
# A lone JSON string is one setting; iterating it directly would emit one
# line per character.
if isinstance(settings, str):
    settings = [settings]

for item in settings:
    print(item)
PY
}
187+
188+
# Print the concurrency setting as a token that is safe to embed in a file name.
#
# Reads CONC_JSON first, then CONC, from the environment. Each value is decoded
# as JSON when possible and used verbatim otherwise.
#   - a list such as [64, 256] is printed as "64x256"
#   - a scalar such as 256 is printed as "256"
#   - nothing usable in either variable prints an empty line
#
# A CONC_JSON of "null" (what toJson() yields for a missing key) is treated as
# unset so that CONC is still consulted, instead of printing "None".
normalize_conc() {
  python3 - <<'PY'
import json
import os


def decode(raw):
    """Return raw parsed as JSON, or raw itself when it is not valid JSON."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return raw


value = None
for name in ("CONC_JSON", "CONC"):
    raw = os.environ.get(name)
    if not raw:
        continue
    value = decode(raw)
    if value is not None:
        break

if value is None:
    value = []

if isinstance(value, list):
    print("x".join(str(v) for v in value))
else:
    print(str(value))
PY
}
204+
205+
if [ -n "${PREFILL_NUM_WORKERS:-}" ] && [ -n "${DECODE_NUM_WORKERS:-}" ]; then
206+
conc_val="$(normalize_conc)"
207+
res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_prefill-tp${PREFILL_TP}-ep${PREFILL_EP}-dp${PREFILL_DP_ATTN}-nw${PREFILL_NUM_WORKERS}_decode-tp${DECODE_TP}-ep${DECODE_EP}-dp${DECODE_DP_ATTN}-nw${DECODE_NUM_WORKERS}_disagg-${DISAGG}_spec-${SPEC_DECODING}_conc${conc_val}_${RUNNER_NAME}"
208+
209+
echo "IS_MULTINODE=true" >> "$GITHUB_ENV"
210+
echo "PREFILL_GPUS=$((PREFILL_NUM_WORKERS * PREFILL_TP))" >> "$GITHUB_ENV"
211+
echo "DECODE_GPUS=$((DECODE_NUM_WORKERS * DECODE_TP))" >> "$GITHUB_ENV"
212+
213+
while IFS= read -r setting; do
214+
if [ -n "$setting" ]; then
215+
export "$setting"
216+
fi
217+
done < <(export_additional_settings "${PREFILL_ADDITIONAL_SETTINGS_JSON:-null}")
218+
while IFS= read -r setting; do
219+
if [ -n "$setting" ]; then
220+
export "$setting"
221+
fi
222+
done < <(export_additional_settings "${DECODE_ADDITIONAL_SETTINGS_JSON:-null}")
223+
else
224+
ep_val="${EP_SIZE:-1}"
225+
res_name="${EXP_NAME}_${PRECISION}_${FRAMEWORK}_tp${TP}_ep${ep_val}_dpa_${DP_ATTENTION}_conc${CONC}_${RUNNER_NAME}"
226+
fi
227+
164228
export RESULT_FILENAME="${res_name}"
165229
echo "RESULT_FILENAME=${res_name}" >> "$GITHUB_ENV"
166230
167231
bash ./runners/launch_${RUNNER_NAME%%_*}.sh
168232
169233
if [ ! -f "${res_name}.json" ]; then
170-
echo "Run failed: Benchmark result ${res_name}.json not found." >&2
171-
exit 1
234+
result_candidate="$(find . -maxdepth 1 -type f -name "${res_name}_*.json" | sort | head -n1 || true)"
235+
if [ -n "$result_candidate" ] && [ -f "$result_candidate" ]; then
236+
cp "$result_candidate" "${res_name}.json"
237+
else
238+
echo "Run failed: Benchmark result ${res_name}.json not found." >&2
239+
exit 1
240+
fi
172241
fi
173242
174243
trace_path="profile_${res_name}.trace.json.gz"
244+
if [ ! -f "$trace_path" ] && [ -d LOGS/profiles ]; then
245+
trace_candidate="$(python3 - <<'PY'
from pathlib import Path

# Any *.json file qualifies (this already covers *.trace.json and
# *.pt.trace.json), as does a gzip-compressed *.trace.json.gz.
SUFFIXES = (".json", ".trace.json.gz")


def looks_like_trace(path):
    """True for regular files with a trace-like suffix that are not result/export files."""
    name = path.name
    return (
        path.is_file()
        and name.endswith(SUFFIXES)
        and not name.startswith("results_")
        and "profile_export" not in name
    )


found = [p for p in Path("LOGS/profiles").rglob("*") if looks_like_trace(p)]
if found:
    # Print the largest match by on-disk size.
    print(max(found, key=lambda p: p.stat().st_size))
PY
)"
266+
if [ -n "$trace_candidate" ] && [ -f "$trace_candidate" ]; then
267+
if [[ "$trace_candidate" == *.gz ]]; then
268+
cp "$trace_candidate" "$trace_path"
269+
else
270+
gzip -c "$trace_candidate" > "$trace_path"
271+
fi
272+
fi
273+
fi
274+
175275
if [ -f "$trace_path" ]; then
176276
echo "trace=$trace_path" >> "$GITHUB_OUTPUT"
177277
if [ "${FRAMEWORK}" = "sglang" ]; then
@@ -252,21 +352,21 @@ jobs:
252352
run: |
253353
set -euo pipefail
254354
255-
dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
355+
dest_dir="storage/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}"
256356
mkdir -p "$dest_dir"
257357
cp "$TRACE_LOCAL" "$dest_dir/trace.json.gz"
258358
259359
pushd storage >/dev/null
260360
git config user.name "github-actions"
261361
git config user.email "github-actions@github.com"
262362
git add -A
263-
git commit -m "Add profile: ${GITHUB_SHA} ${{ matrix.config['exp-name'] }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}" || echo "Nothing to commit"
363+
git commit -m "Add profile: ${GITHUB_SHA} ${RESULT_FILENAME}" || echo "Nothing to commit"
264364
git push
265365
STORAGE_SHA="$(git rev-parse HEAD)"
266366
popd >/dev/null
267367
268-
export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}/trace.json.gz"
269-
export TITLE="${{ matrix.config['exp-name'] }}_${{ matrix.config.precision }}_tp${{ matrix.config.tp }}_ep${{ matrix.config.ep || 1 }}_conc${{ matrix.config.conc }}"
368+
export RAW_URL="https://raw.githubusercontent.com/SemiAnalysisAI/InferenceX-trace-storage/${STORAGE_SHA}/profiles/${GITHUB_SHA}/${{ matrix.config.runner }}/${{ matrix.config.framework }}/${RESULT_FILENAME}/trace.json.gz"
369+
export TITLE="${RESULT_FILENAME}"
270370
271371
enc_src="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["RAW_URL"], safe=""))')"
272372
enc_title="$(python3 -c 'import os,urllib.parse; print(urllib.parse.quote(os.environ["TITLE"], safe=""))')"
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# Recipe for a disaggregated vLLM deployment of DeepSeek-V4-Pro on GB200 with
# torch profiling enabled: one prefill worker and one decode worker, 8 GPUs
# each, benchmarked at 8192 input / 1024 output tokens and concurrency 256.
name: "svf-vllm-disagg-gb200-profile-16gpu-conc256-mtp3"

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  time_limit: "8:00:00"

# 1440 attempts at a 10 second interval.
# NOTE(review): that is roughly 4 hours if every attempt waits the full
# interval - confirm against the launcher's retry loop.
health_check:
  max_attempts: 1440
  interval_seconds: 10

# 2 nodes x 4 GPUs per node = 8 GPUs for each of the two workers.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8

infra:
  etcd_nats_dedicated_node: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null
  # The prefill list is the decode list plus
  # VLLM_SPARSE_INDEXER_MAX_LOGITS_MB and VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_P2P_LEVEL: NVL
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_P2P_LEVEL: NVL
  # Both roles use tensor-parallel-size 1 with data-parallel-size 8, expert
  # parallelism, and MTP speculative decoding with 3 speculative tokens.
  vllm_config:
    # Prefill sets enforce-eager and allows 8 sequences / 16384 batched tokens.
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-hybrid-lb: true
      data-parallel-size: 8
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      enforce-eager: true
      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      # 9472 = 8192 + 1024 + 256.
      # NOTE(review): presumably isl + osl plus headroom - confirm.
      max-model-len: 9472
      max-num-seqs: 8
      max-num-batched-tokens: 16384
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      no-async-scheduling: true
      block-size: 256
      gpu-memory-utilization: 0.9
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      numa-bind: true
      tokenizer-mode: deepseek_v4
    # Decode omits enforce-eager and sets a FULL_DECODE_ONLY cudagraph mode;
    # max-num-seqs, max-cudagraph-capture-size and max-num-batched-tokens are
    # all 256, the same value as benchmark.concurrencies.
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-hybrid-lb: true
      data-parallel-size: 8
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      speculative-config: '{"method":"mtp","num_speculative_tokens":3}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 9472
      max-num-seqs: 256
      max-cudagraph-capture-size: 256
      max-num-batched-tokens: 256
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      block-size: 256
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4

# Each role has a one-step profiling window.
profiling:
  type: "torch"
  # NOTE(review): a start step of 100000 presumably keeps the prefill window
  # from ever being reached, so only decode is traced - confirm.
  prefill:
    start_step: 100000
    stop_step: 100001
  decode:
    start_step: 3
    stop_step: 4

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "256"
  req_rate: "inf"
  num_prompts_mult: 1
  num_warmup_mult: 1
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

# Pins the model revision; the image and dynamo values repeat model.container
# and dynamo.wheel from earlier in this file.
identity:
  model:
    repo: "deepseek-ai/DeepSeek-V4-Pro"
    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
  container:
    image: "vllm/vllm-openai:v0.21.0-ubuntu2404"
  frameworks:
    dynamo: "1.2.0.dev20260426"

0 commit comments

Comments
 (0)