Tune H100 Qwen SGLang Pareto recipe

anish-shanbhag · anish-shanbhag · commit c3b92eb8cf79 · 2026-05-28T17:40:17.000-07:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9208,11 +9208,15 @@ qwen3.5-fp8-h100-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
+      - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
+      - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
 
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh
@@ -1,17 +1,15 @@
 #!/usr/bin/env bash
 
 # Qwen-3.5-397B-A17B FP8 on H100 via sglang.
-# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
-# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
-# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
-# chunked-prefill-size from 16384 → 8192 to leave more headroom.
-# Sweep tops out at conc=32 instead of 64 for the same reason.
+# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
+# and TP8/EP8 with DP attention at conc 128-256.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+PARALLEL_ARGS=(--tp "$TP")
+if [ "${EP_SIZE}" -gt 1 ]; then
+    PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
+fi
+
+SCHEDULER_RECV_INTERVAL=
+if [ "${DP_ATTENTION}" != "true" ]; then
+    case "$CONC" in
+      1|2|4)
+        SCHEDULER_RECV_INTERVAL=2
+        ;;
+      8)
+        SCHEDULER_RECV_INTERVAL=60
+        ;;
+      16)
+        SCHEDULER_RECV_INTERVAL=30
+        ;;
+      32)
+        SCHEDULER_RECV_INTERVAL=1200
+        ;;
+      64)
+        SCHEDULER_RECV_INTERVAL=600
+        ;;
+      *)
+        echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
+        exit 1
+        ;;
+    esac
+fi
+
+SCHEDULER_ARGS=()
+if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
+    SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
+fi
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
+fi
+
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
+echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"
 
 start_gpu_monitor
 
@@ -44,15 +82,14 @@ python3 -m sglang.launch_server \
   --model "$MODEL" \
   --host 0.0.0.0 \
   --port "$PORT" \
-  --tp "$TP" \
-  --expert-parallel-size "$EP_SIZE" \
+  "${PARALLEL_ARGS[@]}" \
   --reasoning-parser qwen3 \
   --tool-call-parser qwen3_coder \
   --enable-flashinfer-allreduce-fusion \
-  --max-running-requests 64 \
-  --chunked-prefill-size 8192 \
+  --max-running-requests 256 \
+  --chunked-prefill-size 16384 \
   --decode-log-interval 1 \
-  --mem-fraction-static 0.75 \
+  --mem-fraction-static 0.8 \
   --cuda-graph-max-bs "$CONC" \
   --context-length "$MAX_SEQ_LEN" \
   --kv-cache-dtype fp8_e4m3 \
@@ -62,7 +99,9 @@ python3 -m sglang.launch_server \
   --tokenizer-worker-num 6 \
   --mamba-ssm-dtype bfloat16 \
   --disable-radix-cache \
+  --enable-symm-mem \
   --trust-remote-code \
+  "${SCHEDULER_ARGS[@]}" \
   > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3200,3 +3200,12 @@
     - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523, 1P1D TP8/EP1, dp-attn false, conc [8..512]"
     - "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579
+
+- config-keys:
+    - qwen3.5-fp8-h100-sglang
+  description:
+    - "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
+    - "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
+    - "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP-attention conc 1-4/8/16/32/64"
+    - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544