Skip to content

Commit c5ff8da

Browse files
authored
[AMD][MI35X] 0526 DSV4 (#1568)
1 parent ec05272 commit c5ff8da

3 files changed

Lines changed: 13 additions & 4 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,7 +1813,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
18131813
# image tag, so bumping sglang is just an image tag bump here. Sweeps
18141814
# DP-attention on/off and EP=8.
18151815
dsv4-fp4-mi355x-sglang:
1816-
image: rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4
1816+
image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
18171817
model: deepseek-ai/DeepSeek-V4-Pro
18181818
model-prefix: dsv4
18191819
runner: mi355x
@@ -1825,12 +1825,12 @@ dsv4-fp4-mi355x-sglang:
18251825
- isl: 1024
18261826
osl: 1024
18271827
search-space:
1828-
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
1828+
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
18291829
- { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 32 }
18301830
- isl: 8192
18311831
osl: 1024
18321832
search-space:
1833-
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
1833+
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
18341834
- { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 }
18351835

18361836
# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm

benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ export SGLANG_FORCE_TRITON_MOE_FP8=0
7777
export SGLANG_HACK_FLASHMLA_BACKEND=triton
7878
export SGLANG_OPT_USE_TILELANG_INDEXER=true
7979
export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
80-
export AITER_BF16_FP8_MOE_BOUND=1
80+
export AITER_BF16_FP8_MOE_BOUND=0
8181
export SGLANG_OPT_FUSE_WQA_WKV=true
8282
export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true
8383
export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0
@@ -116,6 +116,8 @@ python3 -m sglang.launch_server \
116116
--disable-radix-cache \
117117
--attention-backend compressed \
118118
--max-running-requests ${CONC} \
119+
--mem-fraction-static 0.90 \
120+
--swa-full-tokens-ratio 0.15 \
119121
--page-size 256 \
120122
--context-length $MAX_MODEL_LEN \
121123
--chunked-prefill-size 8192 \

perf-changelog.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3146,6 +3146,13 @@
31463146
- "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion"
31473147
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354
31483148

3149+
- config-keys:
3150+
- dsv4-fp4-mi355x-sglang
3151+
description:
3152+
- "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4"
3153+
- "Add --mem-fraction-static 0.90 and --swa-full-tokens-ratio 0.15 launch args to avoid the KV cache pool filling up at high concurrency"
3154+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568
3155+
31493156
- config-keys:
31503157
- qwen3.5-fp8-h200-sglang
31513158
- dsr1-fp8-mi355x-sglang

0 commit comments

Comments
 (0)