Skip to content

Commit c040b5c

Browse files
yunzhoul-nvElnifiocquil11
authored
feat: adds more configurations for GB200 SGLang DSR1 (#335)
* bring all configs here * test for GB200 only * updates the files and git clone urls * update the prefill nodes * update 1k1k fp4 config * updates to run 1k1k fp4 only * updates the FP4 8k1k * update the model path * restore changes to full sweeps * updates the config for 1k1k fp4 * temporarily disable some concurrencies * updates the params * updates the branch * update config * temporarily disable all other configs * Revert "temporarily disable all other configs" This reverts commit ce40018. * update comments * bump the image for DSR1 * update the model-path args * model-path not permitted * switches the branch * add perf changelog * used the wrong model path here... --------- Co-authored-by: Elnifio <elnifio0519@gmail.com> Co-authored-by: Cameron Quilici <cjquilici@gmail.com>
1 parent acd4ffe commit c040b5c

6 files changed

Lines changed: 248 additions & 39 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 185 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,8 @@ gptoss-fp4-h200-vllm:
332332

333333
dsr1-fp4-gb200-dynamo-trt:
334334
image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
335-
model: deepseek-r1-fp4
335+
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
336+
model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
336337
model-prefix: dsr1
337338
runner: gb200
338339
precision: fp4
@@ -773,8 +774,10 @@ dsr1-fp4-gb200-dynamo-trt:
773774
- "DECODE_MTP_SIZE=0"
774775

775776
dsr1-fp8-gb200-dynamo-sglang:
776-
image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
777-
model: deepseek-ai/DeepSeek-R1-0528
777+
image: lmsysorg/sglang:v0.5.5.post2
778+
# model: deepseek-ai/DeepSeek-R1-0528
779+
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
780+
model: /mnt/lustre01/models/deepseek-r1-0528
778781
model-prefix: dsr1
779782
runner: gb200
780783
precision: fp8
@@ -798,6 +801,7 @@ dsr1-fp8-gb200-dynamo-sglang:
798801
additional-settings:
799802
- "PREFILL_NODES=4"
800803
- "N_ADDITIONAL_FRONTENDS=9"
804+
- "SCRIPT_MODE=1k1k-max-tpt"
801805
decode:
802806
num-worker: 1
803807
tp: 1
@@ -819,7 +823,7 @@ dsr1-fp8-gb200-dynamo-sglang:
819823
additional-settings:
820824
- "PREFILL_NODES=1"
821825
- "N_ADDITIONAL_FRONTENDS=9"
822-
- "SCRIPT_MODE=1p_4d"
826+
- "SCRIPT_MODE=1k1k-low-latency"
823827
decode:
824828
num-worker: 4
825829
tp: 1
@@ -841,6 +845,7 @@ dsr1-fp8-gb200-dynamo-sglang:
841845
additional-settings:
842846
- "PREFILL_NODES=6"
843847
- "N_ADDITIONAL_FRONTENDS=9"
848+
- "SCRIPT_MODE=1k1k-max-tpt"
844849
decode:
845850
num-worker: 1
846851
tp: 1
@@ -852,22 +857,193 @@ dsr1-fp8-gb200-dynamo-sglang:
852857
- isl: 8192
853858
osl: 1024
854859
search-space:
860+
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
861+
- spec-decoding: "none"
862+
conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ]
863+
prefill:
864+
num-worker: 1
865+
tp: 1
866+
ep: 1
867+
dp-attn: true
868+
additional-settings:
869+
- "PREFILL_NODES=1"
870+
- "N_ADDITIONAL_FRONTENDS=8"
871+
- "SCRIPT_MODE=8k1k-low-latency"
872+
decode:
873+
num-worker: 1
874+
tp: 1
875+
ep: 1
876+
dp-attn: true
877+
additional-settings:
878+
- "DECODE_NODES=1"
879+
880+
# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
881+
- spec-decoding: "none"
882+
conc-list: [ 512, 1024, 2048, 6144 ]
883+
prefill:
884+
num-worker: 5
885+
tp: 1
886+
ep: 1
887+
dp-attn: true
888+
additional-settings:
889+
- "PREFILL_NODES=10"
890+
- "N_ADDITIONAL_FRONTENDS=8"
891+
- "SCRIPT_MODE=8k1k-max-tpt"
892+
decode:
893+
num-worker: 1
894+
tp: 1
895+
ep: 1
896+
dp-attn: true
897+
additional-settings:
898+
- "DECODE_NODES=8"
899+
900+
dsr1-fp4-gb200-dynamo-sglang:
901+
image: lmsysorg/sglang:v0.5.5.post2
902+
# TODO: what is the right name?
903+
# model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
904+
# Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
905+
model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
906+
model-prefix: dsr1
907+
runner: gb200
908+
precision: fp4
909+
framework: dynamo-sglang
910+
multinode: true
911+
disagg: true
912+
seq-len-configs:
913+
- isl: 1024
914+
osl: 1024
915+
search-space:
916+
# Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
917+
- spec-decoding: "none"
918+
conc-list: [ 4, 8, 32, 64 ]
919+
prefill:
920+
num-worker: 1
921+
tp: 1
922+
ep: 1
923+
dp-attn: true
924+
additional-settings:
925+
- "PREFILL_NODES=1"
926+
- "N_ADDITIONAL_FRONTENDS=8"
927+
- "SCRIPT_MODE=1k1k-low-latency"
928+
decode:
929+
num-worker: 2
930+
tp: 1
931+
ep: 1
932+
dp-attn: true
933+
additional-settings:
934+
- "DECODE_NODES=2"
935+
936+
# Mid curve (1 prefill worker at DEP4 and 1 decode worker at DEP48)
855937
- spec-decoding: "none"
856-
conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
938+
conc-list: [ 512, 1024, 2048, 4096, 8192 ]
939+
prefill:
940+
num-worker: 4
941+
tp: 1
942+
ep: 1
943+
dp-attn: true
944+
additional-settings:
945+
- "PREFILL_NODES=4"
946+
- "N_ADDITIONAL_FRONTENDS=8"
947+
- "SCRIPT_MODE=1k1k-middle-curve"
948+
decode:
949+
num-worker: 1
950+
tp: 1
951+
ep: 1
952+
dp-attn: true
953+
additional-settings:
954+
- "DECODE_NODES=12"
955+
956+
# Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32)
957+
- spec-decoding: "none"
958+
conc-list: [ 8192, 12000, 15000 ]
959+
prefill:
960+
num-worker: 4
961+
tp: 1
962+
ep: 1
963+
dp-attn: true
964+
additional-settings:
965+
- "PREFILL_NODES=4"
966+
- "N_ADDITIONAL_FRONTENDS=8"
967+
- "SCRIPT_MODE=1k1k-max-tpt"
968+
decode:
969+
num-worker: 1
970+
tp: 1
971+
ep: 1
972+
dp-attn: true
973+
additional-settings:
974+
- "DECODE_NODES=8"
975+
- isl: 8192
976+
osl: 1024
977+
search-space:
978+
- spec-decoding: "none"
979+
conc-list: [ 4, 8, 32, 64 ]
980+
prefill:
981+
num-worker: 1
982+
tp: 1
983+
ep: 1
984+
dp-attn: false
985+
additional-settings:
986+
- "PREFILL_NODES=1"
987+
- "N_ADDITIONAL_FRONTENDS=8"
988+
- "SCRIPT_MODE=8k1k-low-latency"
989+
decode:
990+
num-worker: 4
991+
tp: 1
992+
ep: 1
993+
dp-attn: true
994+
additional-settings:
995+
- "DECODE_NODES=4"
996+
- spec-decoding: "none"
997+
conc-list: [ 512, 1024, 2048, 4096 ]
857998
prefill:
858999
num-worker: 6
859-
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
860-
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
1000+
tp: 1
1001+
ep: 1
1002+
dp-attn: false
1003+
additional-settings:
1004+
- "PREFILL_NODES=6"
1005+
- "N_ADDITIONAL_FRONTENDS=9"
1006+
- "SCRIPT_MODE=8k1k-middle-curve"
1007+
decode:
1008+
num-worker: 1
8611009
tp: 1
8621010
ep: 1
8631011
dp-attn: true
8641012
additional-settings:
865-
- "PREFILL_NODES=12"
1013+
- "DECODE_NODES=12"
1014+
- spec-decoding: "none"
1015+
conc-list: [ 1024, 2048, ]
1016+
prefill:
1017+
num-worker: 10
1018+
tp: 1
1019+
ep: 1
1020+
dp-attn: true
1021+
additional-settings:
1022+
- "PREFILL_NODES=10"
8661023
- "N_ADDITIONAL_FRONTENDS=8"
1024+
- "SCRIPT_MODE=8k1k-max-tpt"
8671025
decode:
8681026
num-worker: 1
8691027
tp: 1
8701028
ep: 1
8711029
dp-attn: true
8721030
additional-settings:
873-
- "DECODE_NODES=6"
1031+
- "DECODE_NODES=8"
1032+
- spec-decoding: "none"
1033+
conc-list: [ 8192 ]
1034+
prefill:
1035+
num-worker: 10
1036+
tp: 1
1037+
ep: 1
1038+
dp-attn: true
1039+
additional-settings:
1040+
- "PREFILL_NODES=10"
1041+
- "N_ADDITIONAL_FRONTENDS=8"
1042+
- "SCRIPT_MODE=8k1k-max-tpt"
1043+
decode:
1044+
num-worker: 1
1045+
tp: 1
1046+
ep: 1
1047+
dp-attn: true
1048+
additional-settings:
1049+
- "DECODE_NODES=8"

.github/workflows/benchmark-multinode-tmpl.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ env:
8686
EXP_NAME: ${{ inputs.exp-name }}
8787
IMAGE: ${{ inputs.image }}
8888
MODEL_PREFIX: ${{ inputs.model-prefix }}
89+
MODEL: ${{ inputs.model }}
8990
FRAMEWORK: ${{ inputs.framework }}
9091
PRECISION: ${{ inputs.precision }}
9192
ISL: ${{ inputs.isl }}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
2+
#!/bin/bash
3+
4+
set -x
5+
6+
source "$(dirname "$0")/benchmark_lib.sh"
7+
8+
check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
9+
PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
10+
DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
11+
PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME
12+
13+
# Always clone and setup Dynamo
14+
echo "Cloning Dynamo repository..."
15+
git clone https://github.com/ai-dynamo/dynamo.git
16+
cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch
17+
18+
cd "$SGL_SLURM_JOBS_PATH"
19+
20+
# Set up SGL launch script-specific environment variables
21+
export TIME_LIMIT="04:00:00"
22+
export MODEL_PATH=$MODEL_PATH
23+
export CONFIG_DIR=$CONFIG_DIR
24+
export CONTAINER_IMAGE=$IMAGE
25+
export GPU_TYPE="gb200-fp4"
26+
27+
# Launch jobs based on ISL/OSL
28+
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
29+
# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
30+
# expects the concurrencies.
31+
bash ./submit_disagg.sh $PREFILL_NODES \
32+
$PREFILL_NUM_WORKERS \
33+
$DECODE_NODES \
34+
$DECODE_NUM_WORKERS \
35+
$N_ADDITIONAL_FRONTENDS \
36+
$ISL $OSL "${CONC_LIST// /x}" inf \
37+
$GPU_TYPE \
38+
$SCRIPT_MODE

benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
1212

1313
# Always clone and setup Dynamo
1414
echo "Cloning Dynamo repository..."
15-
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
16-
git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git
17-
else
18-
git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git
19-
fi
15+
git clone https://github.com/ai-dynamo/dynamo.git
16+
cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen in this branch
2017

2118
cd "$SGL_SLURM_JOBS_PATH"
2219

@@ -25,6 +22,7 @@ export TIME_LIMIT="04:00:00"
2522
export MODEL_PATH=$MODEL_PATH
2623
export CONFIG_DIR=$CONFIG_DIR
2724
export CONTAINER_IMAGE=$IMAGE
25+
export GPU_TYPE="gb200-fp8"
2826

2927
# Launch jobs based on ISL/OSL
3028
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
@@ -36,4 +34,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \
3634
$DECODE_NUM_WORKERS \
3735
$N_ADDITIONAL_FRONTENDS \
3836
$ISL $OSL "${CONC_LIST// /x}" inf \
39-
$SCRIPT_MODE
37+
$GPU_TYPE \
38+
$SCRIPT_MODE

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,11 @@
9595
description:
9696
- "Add benchmark script for GPTOSS FP4 B200 TRT-LLM"
9797
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/256
98+
99+
- config-keys:
100+
- dsr1-fp4-gb200-dynamo-trt
101+
- dsr1-fp4-gb200-dynamo-sglang
102+
- dsr1-fp8-gb200-dynamo-sglang
103+
description:
104+
- "Add more configurations for GB200 SGLang DSR1"
105+
pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/335

runners/launch_gb200-nv.sh

Lines changed: 11 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,35 +13,22 @@ export SLURM_JOB_NAME="benchmark-dynamo.job"
1313
# For now we add conditionals to this script to use newer code for the 1k1k configs
1414

1515
### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars
16-
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
17-
# Set IMAGE based on ISL/OSL
18-
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
19-
export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh"
20-
else
21-
export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh"
22-
fi
23-
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
24-
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
16+
SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
17+
srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
2518

26-
# FIXME: Another workaround for all the different branching
27-
# THIS NEEDS TO BE STANDARDIZED ASAP
28-
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
29-
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
30-
else
31-
export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs"
32-
fi
33-
else
34-
SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
35-
srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
19+
# Update the IMAGE variable to the squash file
20+
export IMAGE=$SQUASH_FILE
3621

37-
# Update the IMAGE variable to the squash file
38-
export IMAGE=$SQUASH_FILE
22+
# MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files
23+
export MODEL_PATH=$MODEL
3924

40-
export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2"
25+
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
26+
export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
27+
export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
28+
else
4129
export SERVED_MODEL_NAME="deepseek-r1-fp4"
4230
fi
4331

44-
4532
export ISL="$ISL"
4633
export OSL="$OSL"
4734

@@ -148,4 +135,4 @@ PY
148135
done
149136
fi
150137

151-
echo "All result files processed"
138+
echo "All result files processed"

0 commit comments

Comments
 (0)