Skip to content

Commit efcb4e4

Browse files
authored
sglang: add fp8 8k1k and fp4 1k1k (#274)
* go * typo * typo... * more
1 parent f22cf47 commit efcb4e4

3 files changed

Lines changed: 137 additions & 11 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 97 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang:
798798
additional-settings:
799799
- "PREFILL_NODES=4"
800800
- "N_ADDITIONAL_FRONTENDS=9"
801+
- "SCRIPT_MODE=max-tpt"
801802
decode:
802803
num-worker: 1
803804
tp: 1
@@ -852,22 +853,112 @@ dsr1-fp8-gb200-dynamo-sglang:
852853
- isl: 8192
853854
osl: 1024
854855
search-space:
856+
# Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
855857
- spec-decoding: "none"
856-
conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
858+
conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ]
857859
prefill:
858-
num-worker: 6
859-
# tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
860-
# https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
860+
num-worker: 1
861861
tp: 1
862862
ep: 1
863863
dp-attn: true
864864
additional-settings:
865-
- "PREFILL_NODES=12"
865+
- "PREFILL_NODES=1"
866866
- "N_ADDITIONAL_FRONTENDS=8"
867867
decode:
868868
num-worker: 1
869869
tp: 1
870870
ep: 1
871871
dp-attn: true
872872
additional-settings:
873-
- "DECODE_NODES=6"
873+
- "DECODE_NODES=1"
874+
875+
# Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
876+
- spec-decoding: "none"
877+
conc-list: [ 512, 1024, 2048, 6144 ]
878+
prefill:
879+
num-worker: 5
880+
tp: 1
881+
ep: 1
882+
dp-attn: true
883+
additional-settings:
884+
- "PREFILL_NODES=2"
885+
- "N_ADDITIONAL_FRONTENDS=8"
886+
decode:
887+
num-worker: 1
888+
tp: 1
889+
ep: 1
890+
dp-attn: true
891+
additional-settings:
892+
- "DECODE_NODES=8"
893+
894+
dsr1-fp4-gb200-dynamo-sglang:
895+
# TODO: swap
896+
image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
897+
# TODO: what is the right name?
898+
model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
899+
model-prefix: dsr1
900+
runner: gb200
901+
precision: fp4
902+
framework: dynamo-sglang
903+
multinode: true
904+
disagg: true
905+
seq-len-configs:
906+
- isl: 1024
907+
osl: 1024
908+
search-space:
909+
# Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
910+
- spec-decoding: "none"
911+
conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ]
912+
prefill:
913+
num-worker: 1
914+
tp: 1
915+
ep: 1
916+
dp-attn: true
917+
additional-settings:
918+
- "PREFILL_NODES=1"
919+
- "N_ADDITIONAL_FRONTENDS=8"
920+
decode:
921+
num-worker: 2
922+
tp: 1
923+
ep: 1
924+
dp-attn: true
925+
additional-settings:
926+
- "DECODE_NODES=2"
927+
928+
# Mid curve (1 prefill worker at DEP4 and 1 decode worker at DEP48)
929+
- spec-decoding: "none"
930+
conc-list: [ 512, 1024, 2048, 4096, 8192 ]
931+
prefill:
932+
num-worker: 1
933+
tp: 1
934+
ep: 1
935+
dp-attn: true
936+
additional-settings:
937+
- "PREFILL_NODES=1"
938+
- "N_ADDITIONAL_FRONTENDS=8"
939+
decode:
940+
num-worker: 2
941+
tp: 1
942+
ep: 1
943+
dp-attn: true
944+
additional-settings:
945+
- "DECODE_NODES=12"
946+
947+
# Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32)
948+
- spec-decoding: "none"
949+
conc-list: [ 8192, 12000, 15000 ]
950+
prefill:
951+
num-worker: 1
952+
tp: 1
953+
ep: 1
954+
dp-attn: true
955+
additional-settings:
956+
- "PREFILL_NODES=1"
957+
- "N_ADDITIONAL_FRONTENDS=8"
958+
decode:
959+
num-worker: 2
960+
tp: 1
961+
ep: 1
962+
dp-attn: true
963+
additional-settings:
964+
- "DECODE_NODES=8"
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Launches the fp4 GB200 dynamo-sglang disaggregated benchmark by calling the
# submit_disagg.sh launcher found in $SGL_SLURM_JOBS_PATH.
#
# Required environment variables are validated by check_env_vars below.
# CONFIG_DIR and SCRIPT_MODE are read but not validated here; SCRIPT_MODE is
# forwarded to the launcher only when it is non-empty.
#
# NOTE: the shebang must be the very first line of the file; it is ignored
# when preceded by a blank line and the script is executed directly.

set -x

source "$(dirname "$0")/benchmark_lib.sh"

check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
    PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
    DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
    PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME

# Always clone and set up Dynamo.
# The clone is intentionally not fatal: it fails harmlessly when a checkout
# already exists, and the cd below catches a genuinely missing tree.
echo "Cloning Dynamo repository..."
git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git

# Stop here if the launcher directory is missing; otherwise ./submit_disagg.sh
# would be resolved relative to whatever directory we happen to be in.
cd "$SGL_SLURM_JOBS_PATH" || {
    echo "ERROR: cannot cd to SGL_SLURM_JOBS_PATH: $SGL_SLURM_JOBS_PATH" >&2
    exit 1
}

# Set up SGL launch script-specific environment variables
export TIME_LIMIT="04:00:00"
export MODEL_PATH="$MODEL_PATH"
export CONFIG_DIR="$CONFIG_DIR"
export CONTAINER_IMAGE="$IMAGE"
export GPU_TYPE="gb200-fp4"

# Launch jobs based on ISL/OSL
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
# expects the concurrencies.
# Arguments are quoted so each stays a single positional parameter.
# SCRIPT_MODE is optional: the ${VAR:+...} form omits the argument entirely
# when it is unset or empty, as the unquoted expansion did.
bash ./submit_disagg.sh "$PREFILL_NODES" \
    "$PREFILL_NUM_WORKERS" \
    "$DECODE_NODES" \
    "$DECODE_NUM_WORKERS" \
    "$N_ADDITIONAL_FRONTENDS" \
    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
    "$GPU_TYPE" \
    ${SCRIPT_MODE:+"$SCRIPT_MODE"}

benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
1212

1313
# Always clone and set up Dynamo
1414
echo "Cloning Dynamo repository..."
15-
if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
16-
git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git
17-
else
18-
git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git
19-
fi
15+
git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git
2016

2117
cd "$SGL_SLURM_JOBS_PATH"
2218

@@ -25,6 +21,7 @@ export TIME_LIMIT="04:00:00"
2521
export MODEL_PATH=$MODEL_PATH
2622
export CONFIG_DIR=$CONFIG_DIR
2723
export CONTAINER_IMAGE=$IMAGE
24+
export GPU_TYPE="gb200-fp8"
2825

2926
# Launch jobs based on ISL/OSL
3027
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
@@ -36,4 +33,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \
3633
$DECODE_NUM_WORKERS \
3734
$N_ADDITIONAL_FRONTENDS \
3835
$ISL $OSL "${CONC_LIST// /x}" inf \
36+
$GPU_TYPE \
3937
$SCRIPT_MODE

0 commit comments

Comments
 (0)