Skip to content

Commit f65d6b4

Browse files
kedarpotdar-nv, hshrivastava-droid, Ankur-singh
authored
Qwen3.5 FP8 H200 SGLang (#855)
add Qwen3.5-h200-sglang config --------- Co-authored-by: hshrivastava-droid <hshrivastava@nvidia.com> Co-authored-by: Ankur Singh <ankusingh@nvidia.com>
1 parent efd49e5 commit f65d6b4

3 files changed

Lines changed: 112 additions & 1 deletion

File tree

.github/configs/nvidia-master.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1974,6 +1974,28 @@ dsr1-fp8-h200-sglang:
19741974
search-space:
19751975
- { tp: 8, conc-start: 4, conc-end: 64 }
19761976

1977+
qwen3.5-fp8-h200-sglang:
1978+
image: lmsysorg/sglang:v0.5.9-cu129-amd64
1979+
model: Qwen/Qwen3.5-397B-A17B-FP8
1980+
model-prefix: qwen3.5
1981+
runner: h200
1982+
precision: fp8
1983+
framework: sglang
1984+
multinode: false
1985+
seq-len-configs:
1986+
- isl: 1024
1987+
osl: 1024
1988+
search-space:
1989+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
1990+
- isl: 1024
1991+
osl: 8192
1992+
search-space:
1993+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
1994+
- isl: 8192
1995+
osl: 1024
1996+
search-space:
1997+
- { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
1998+
19771999
dsr1-fp8-h200-trt:
19782000
image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2
19792001
model: deepseek-ai/DeepSeek-R1-0528
benchmarks/single_node/qwen3.5_fp8_h200.sh

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env bash
2+
3+
# Load the shared benchmark helper library that sits one directory above this script.
# The helper commands called later in this script (check_env_vars, wait_for_server_ready,
# run_benchmark_serving, run_eval, ...) are presumably defined there -- confirm against the library.
# Abort if it cannot be loaded: `source` only returns non-zero on failure, so without this
# guard the script would keep going and launch the server with no helpers available.
source "$(dirname "$0")/../benchmark_lib.sh" || { echo "ERROR: failed to source $(dirname "$0")/../benchmark_lib.sh" >&2; exit 1; }
4+
5+
check_env_vars \
6+
MODEL \
7+
TP \
8+
CONC \
9+
ISL \
10+
OSL \
11+
RANDOM_RANGE_RATIO \
12+
RESULT_FILENAME \
13+
EP_SIZE
14+
15+
if [[ -n "$SLURM_JOB_ID" ]]; then
16+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
17+
fi
18+
19+
nvidia-smi
20+
21+
hf download "$MODEL"
22+
23+
SERVER_LOG=/workspace/server.log
24+
PORT=${PORT:-8888}
25+
MAX_SEQ_LEN=$((ISL + OSL + 20))
26+
27+
echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
28+
29+
set -x
30+
python3 -m sglang.launch_server \
31+
--model "$MODEL" \
32+
--host 0.0.0.0 \
33+
--port "$PORT" \
34+
--tp "$TP" \
35+
--expert-parallel-size "$EP_SIZE" \
36+
--reasoning-parser qwen3 \
37+
--tool-call-parser qwen3_coder \
38+
--enable-flashinfer-allreduce-fusion \
39+
--max-running-requests 128 \
40+
--chunked-prefill-size 16384 \
41+
--decode-log-interval 1 \
42+
--mem-fraction-static 0.8 \
43+
--cuda-graph-max-bs "$CONC" \
44+
--context-length "$MAX_SEQ_LEN" \
45+
--kv-cache-dtype fp8_e4m3 \
46+
--quantization fp8 \
47+
--attention-backend flashinfer \
48+
--stream-interval 50 \
49+
--tokenizer-worker-num 6 \
50+
--mamba-ssm-dtype bfloat16 \
51+
--disable-radix-cache \
52+
--trust-remote-code \
53+
> "$SERVER_LOG" 2>&1 &
54+
55+
SERVER_PID=$!
56+
57+
# Wait for server to be ready
58+
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
59+
60+
pip install -q datasets pandas
61+
62+
run_benchmark_serving \
63+
--model "$MODEL" \
64+
--port "$PORT" \
65+
--backend vllm \
66+
--input-len "$ISL" \
67+
--output-len "$OSL" \
68+
--random-range-ratio "$RANDOM_RANGE_RATIO" \
69+
--num-prompts "$((CONC * 10))" \
70+
--max-concurrency "$CONC" \
71+
--result-filename "$RESULT_FILENAME" \
72+
--result-dir /workspace/
73+
74+
# After throughput, run evaluation only if RUN_EVAL is true
75+
if [ "${RUN_EVAL}" = "true" ]; then
76+
# Quote CONC (as every other expansion in this script does) so it is always passed as exactly one argument.
run_eval --framework lm-eval --port "$PORT" --concurrent-requests "$CONC"
77+
append_lm_eval_summary
78+
fi
79+
set +x

perf-changelog.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@
834834
- "TP=8, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
835835
- "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html"
836836
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839
837-
837+
838838
- config-keys:
839839
- dsr1-fp8-mi355x-sglang-disagg
840840
- dsr1-fp8-mi355x-sglang-disagg-mtp
@@ -865,3 +865,13 @@
865865
- "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner"
866866
- "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1"
867867
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804
868+
869+
- config-keys:
870+
- qwen3.5-fp8-h200-sglang
871+
description:
872+
- "Add Qwen 3.5 FP8 H200 SGLang configuration"
873+
- "Model: Qwen/Qwen3.5-397B-A17B-FP8, runner: h200, image: lmsysorg/sglang:v0.5.9-cu129-amd64"
874+
- "Benchmark script: benchmarks/single_node/qwen3.5_fp8_h200.sh"
875+
- "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8"
876+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855
877+

0 commit comments

Comments
 (0)