feat: adds more configurations for GB200 SGLang DSR1 (#335)

yunzhoul-nv · Elnifio · cquil11 · web-flow · commit c040b5cf23ce · 2025-12-19T01:29:11.000Z
* bring all configs here * test for GB200 only * updates the files and git clone urls * update the prefill nodes * update 1k1k fp4 config * updates to run 1k1k fp4 only * updates the FP4 8k1k * update the model path * restore changes to full sweeps * updates the config for 1k1k fp4 * temporarily disable some concurrencies * updates the params * updates the branch * update config * temporarily disable all other configs * Revert "temporarily disable all other configs" This reverts commit ce40018. * update comments * bump the image for DSR1 * update the model-path args * model-path not permitted * switches the branch * add perf changelog * used the wrong model path here... --------- Co-authored-by: Elnifio <elnifio0519@gmail.com> Co-authored-by: Cameron Quilici <cjquilici@gmail.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -332,7 +332,8 @@ gptoss-fp4-h200-vllm:
 
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3
-  model: deepseek-r1-fp4
+  # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
+  model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
   model-prefix: dsr1
   runner: gb200
   precision: fp4
@@ -773,8 +774,10 @@ dsr1-fp4-gb200-dynamo-trt:
         - "DECODE_MTP_SIZE=0"
 
 dsr1-fp8-gb200-dynamo-sglang:
-  image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1
-  model: deepseek-ai/DeepSeek-R1-0528
+  image: lmsysorg/sglang:v0.5.5.post2
+  # model: deepseek-ai/DeepSeek-R1-0528
+  # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
+  model: /mnt/lustre01/models/deepseek-r1-0528
   model-prefix: dsr1
   runner: gb200
   precision: fp8
@@ -798,6 +801,7 @@ dsr1-fp8-gb200-dynamo-sglang:
         additional-settings:
         - "PREFILL_NODES=4"
         - "N_ADDITIONAL_FRONTENDS=9"
+        - "SCRIPT_MODE=1k1k-max-tpt"
       decode:
         num-worker: 1
         tp: 1
@@ -819,7 +823,7 @@ dsr1-fp8-gb200-dynamo-sglang:
         additional-settings:
         - "PREFILL_NODES=1"
         - "N_ADDITIONAL_FRONTENDS=9"
-        - "SCRIPT_MODE=1p_4d"
+        - "SCRIPT_MODE=1k1k-low-latency"
       decode:
         num-worker: 4
         tp: 1
@@ -841,6 +845,7 @@ dsr1-fp8-gb200-dynamo-sglang:
         additional-settings:
         - "PREFILL_NODES=6"
         - "N_ADDITIONAL_FRONTENDS=9"
+        - "SCRIPT_MODE=1k1k-max-tpt"
       decode:
         num-worker: 1
         tp: 1
@@ -852,22 +857,193 @@ dsr1-fp8-gb200-dynamo-sglang:
   - isl: 8192
     osl: 1024
     search-space:
+    # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4)
+    - spec-decoding: "none"
+      conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=8k1k-low-latency"
+      decode:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+
+    # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32)
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048, 6144 ]
+      prefill:
+        num-worker: 5
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=10"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=8k1k-max-tpt"
+      decode:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=8"
+
+dsr1-fp4-gb200-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.5.post2
+  # TODO: what is the right name?
+  # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2
+  # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading
+  model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2
+  model-prefix: dsr1
+  runner: gb200
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4)
+    - spec-decoding: "none"
+      conc-list: [ 4, 8, 32, 64 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=1k1k-low-latency"
+      decode:
+        num-worker: 2
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+
+    # Mid curve (4 prefill workers each at DEP4 and 1 decode worker at DEP48)
     - spec-decoding: "none"
-      conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ]
+      conc-list: [ 512, 1024, 2048, 4096, 8192 ]
+      prefill:
+        num-worker: 4
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=1k1k-middle-curve"
+      decode:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=12"
+
+    # Top of curve (4 prefill workers each at DEP4 and 1 decode worker at DEP32)
+    - spec-decoding: "none"
+      conc-list: [ 8192, 12000, 15000 ]
+      prefill:
+        num-worker: 4
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=1k1k-max-tpt"
+      decode:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=8"
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - spec-decoding: "none"
+      conc-list: [ 4, 8, 32, 64 ]
+      prefill:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=8k1k-low-latency"
+      decode:
+        num-worker: 4
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=4"
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048, 4096 ]
       prefill:
         num-worker: 6
-        # tp, ep, and dp-attn do nothing because they are hardcoded in the following file:
-        # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
+        tp: 1
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=6"
+        - "N_ADDITIONAL_FRONTENDS=9"
+        - "SCRIPT_MODE=8k1k-middle-curve"
+      decode:
+        num-worker: 1
         tp: 1
         ep: 1
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=12"
+        - "DECODE_NODES=12"
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048, ]
+      prefill:
+        num-worker: 10
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=10"
         - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=8k1k-max-tpt"
       decode:
         num-worker: 1
         tp: 1
         ep: 1
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=6"
+        - "DECODE_NODES=8"
+    - spec-decoding: "none"
+      conc-list: [ 8192 ]
+      prefill:
+        num-worker: 10
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=10"
+        - "N_ADDITIONAL_FRONTENDS=8"
+        - "SCRIPT_MODE=8k1k-max-tpt"
+      decode:
+        num-worker: 1
+        tp: 1
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=8"
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -86,6 +86,7 @@ env:
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}
+  MODEL: ${{ inputs.model }}
   FRAMEWORK: ${{ inputs.framework }}
   PRECISION: ${{ inputs.precision }}
   ISL: ${{ inputs.isl }}
diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh
@@ -0,0 +1,38 @@
+
+#!/bin/bash
+
+set -x
+
+source "$(dirname "$0")/benchmark_lib.sh"
+
+check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
+    PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \
+    PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME
+
+# Always clone and setup Dynamo
+echo "Cloning Dynamo repository..."
+git clone https://github.com/ai-dynamo/dynamo.git
+cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen at this commit
+
+cd "$SGL_SLURM_JOBS_PATH"
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="04:00:00"
+export MODEL_PATH=$MODEL_PATH
+export CONFIG_DIR=$CONFIG_DIR
+export CONTAINER_IMAGE=$IMAGE
+export GPU_TYPE="gb200-fp4"
+
+# Launch jobs based on ISL/OSL
+# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
+# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
+# expects the concurrencies.
+bash ./submit_disagg.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $N_ADDITIONAL_FRONTENDS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    $GPU_TYPE \
+    $SCRIPT_MODE
diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh
@@ -12,11 +12,8 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \
 
 # Always clone and setup Dynamo
 echo "Cloning Dynamo repository..."
-if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
-    git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git
-else
-    git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git
-fi
+git clone https://github.com/ai-dynamo/dynamo.git
+cd dynamo && git checkout b7107d008392eded64c23a7540fb99bca46b4c91 && cd .. # All configs are frozen at this commit
 
 cd "$SGL_SLURM_JOBS_PATH"
 
@@ -25,6 +22,7 @@ export TIME_LIMIT="04:00:00"
 export MODEL_PATH=$MODEL_PATH
 export CONFIG_DIR=$CONFIG_DIR
 export CONTAINER_IMAGE=$IMAGE
+export GPU_TYPE="gb200-fp8"
 
 # Launch jobs based on ISL/OSL
 # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
@@ -36,4 +34,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \
     $DECODE_NUM_WORKERS \
     $N_ADDITIONAL_FRONTENDS \
     $ISL $OSL "${CONC_LIST// /x}" inf \
-    $SCRIPT_MODE
+    $GPU_TYPE \
+    $SCRIPT_MODE
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -95,3 +95,11 @@
   description:
     - "Add benchmark script for GPTOSS FP4 B200 TRT-LLM"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/256
+
+- config-keys:
+    - dsr1-fp4-gb200-dynamo-trt
+    - dsr1-fp4-gb200-dynamo-sglang
+    - dsr1-fp8-gb200-dynamo-sglang
+  description:
+    - "Add more configurations for GB200 SGLang DSR1"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/335
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
@@ -13,35 +13,22 @@ export SLURM_JOB_NAME="benchmark-dynamo.job"
 # For now we add conditionals to this script to use newer code for the 1k1k configs
 
 ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars
-if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
-    # Set IMAGE based on ISL/OSL
-    if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
-        export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh"
-    else
-        export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh"
-    fi
-    export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528"
-    export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
+SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
 
-    # FIXME: Another workaround for all the different branching
-    # THIS NEEDS TO BE STANDARDIZED ASAP
-    if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
-        export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
-    else
-        export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs"
-    fi
-else
-    SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-    srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
+# Update the IMAGE variable to the squash file
+export IMAGE=$SQUASH_FILE
 
-    # Update the IMAGE variable to the squash file
-    export IMAGE=$SQUASH_FILE
+# MODEL comes from the `model` field in `nvidia-master.yaml` (or another config yaml file)
+export MODEL_PATH=$MODEL
 
-    export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2"
+if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
+    export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k"
+    export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs"
+else
     export SERVED_MODEL_NAME="deepseek-r1-fp4"
 fi
 
-
 export ISL="$ISL"
 export OSL="$OSL"
 
@@ -148,4 +135,4 @@ PY
     done
 fi
 
-echo "All result files processed"
+echo "All result files processed"