Skip to content

Commit 142c925

Browse files
committed
Merge remote-tracking branch 'origin/main' into sweep-canary-gate
2 parents d87804d + bb00055 commit 142c925

18 files changed

Lines changed: 2984 additions & 1165 deletions

.github/configs/amd-master.yaml

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,6 +1350,115 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
13501350
- "DECODE_NODES=1"
13511351
- "DECODE_MTP_SIZE=2"
13521352

1353+
kimik2.5-fp4-mi355x-vllm-disagg:
1354+
image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
1355+
model: amd/Kimi-K2.5-MXFP4
1356+
model-prefix: kimik2.5
1357+
runner: mi355x-disagg
1358+
precision: fp4
1359+
framework: vllm-disagg
1360+
multinode: true
1361+
disagg: true
1362+
scenarios:
1363+
fixed-seq-len:
1364+
- isl: 1024
1365+
osl: 1024
1366+
search-space:
1367+
# 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
1368+
- spec-decoding: "none"
1369+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
1370+
prefill:
1371+
num-worker: 1
1372+
tp: 8
1373+
ep: 1
1374+
dp-attn: false
1375+
additional-settings:
1376+
- "PREFILL_NODES=1"
1377+
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
1378+
decode:
1379+
num-worker: 2
1380+
tp: 8
1381+
ep: 8
1382+
dp-attn: false
1383+
additional-settings:
1384+
- "DECODE_NODES=2"
1385+
1386+
- isl: 8192
1387+
osl: 1024
1388+
search-space:
1389+
- spec-decoding: "none"
1390+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
1391+
prefill:
1392+
num-worker: 1
1393+
tp: 8
1394+
ep: 1
1395+
dp-attn: false
1396+
additional-settings:
1397+
- "PREFILL_NODES=1"
1398+
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
1399+
decode:
1400+
num-worker: 2
1401+
tp: 8
1402+
ep: 8
1403+
dp-attn: false
1404+
additional-settings:
1405+
- "DECODE_NODES=2"
1406+
1407+
minimaxm2.5-fp8-mi355x-vllm-disagg:
1408+
image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
1409+
model: MiniMaxAI/MiniMax-M2.5
1410+
model-prefix: minimaxm2.5
1411+
runner: mi355x-disagg
1412+
precision: fp8
1413+
framework: vllm-disagg
1414+
multinode: true
1415+
disagg: true
1416+
scenarios:
1417+
fixed-seq-len:
1418+
- isl: 1024
1419+
osl: 1024
1420+
search-space:
1421+
# 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
1422+
# Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
1423+
# TP8 shards to 192 which is not divisible by FP8 block_n=128.
1424+
- spec-decoding: "none"
1425+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
1426+
prefill:
1427+
num-worker: 1
1428+
tp: 8
1429+
ep: 8
1430+
dp-attn: false
1431+
additional-settings:
1432+
- "PREFILL_NODES=1"
1433+
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
1434+
decode:
1435+
num-worker: 2
1436+
tp: 8
1437+
ep: 8
1438+
dp-attn: false
1439+
additional-settings:
1440+
- "DECODE_NODES=2"
1441+
1442+
- isl: 8192
1443+
osl: 1024
1444+
search-space:
1445+
- spec-decoding: "none"
1446+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
1447+
prefill:
1448+
num-worker: 1
1449+
tp: 8
1450+
ep: 8
1451+
dp-attn: false
1452+
additional-settings:
1453+
- "PREFILL_NODES=1"
1454+
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
1455+
decode:
1456+
num-worker: 2
1457+
tp: 8
1458+
ep: 8
1459+
dp-attn: false
1460+
additional-settings:
1461+
- "DECODE_NODES=2"
13531462

13541463
dsr1-fp4-mi355x-sglang-disagg:
13551464
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519

benchmarks/benchmark_lib.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ run_benchmark_serving() {
210210
local dsv4=false
211211
local trust_remote_code=false
212212
local server_pid=""
213+
local tokenizer=""
213214

214215
while [[ $# -gt 0 ]]; do
215216
case $1 in
@@ -278,6 +279,10 @@ run_benchmark_serving() {
278279
server_pid="$2"
279280
shift 2
280281
;;
282+
--tokenizer)
283+
tokenizer="$2"
284+
shift 2
285+
;;
281286
*)
282287
echo "Unknown parameter: $1"
283288
return 1
@@ -385,6 +390,10 @@ run_benchmark_serving() {
385390
benchmark_cmd+=(--trust-remote-code)
386391
fi
387392

393+
if [[ -n "$tokenizer" ]]; then
394+
benchmark_cmd+=(--tokenizer "$tokenizer")
395+
fi
396+
388397
# Run benchmark with optional server monitoring
389398
set -x
390399
if [[ -n "$server_pid" ]]; then
Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,108 @@
11
#!/bin/bash
2+
# Dual-Engine Disaggregated Benchmark Runner
3+
#
4+
# ENGINE=sglang-disagg (default): SGLang benchmark
5+
# ENGINE=vllm-disagg: vLLM benchmark
6+
#
7+
# Produces JSON result files via benchmark_serving.py so that the CI pipeline
8+
# can collect and process results.
9+
#
10+
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
11+
# <model_dir> <model_name> <log_path> <isl> <osl> \
12+
# <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
13+
14+
ENGINE="${ENGINE:-sglang-disagg}"
215

316
n_prefill=$1
417
n_decode=$2
518
prefill_gpus=$3
619
decode_gpus=$4
720
model_path=$5
821
model_name=$6
9-
MODEL_PATH="${model_path}/${model_name}"
22+
MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
23+
# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
24+
if [[ "$ENGINE" == "vllm-disagg" ]]; then
25+
BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}"
26+
else
27+
BENCH_MODEL="${MODEL_PATH}"
28+
fi
1029
log_path=$7
1130

1231
chosen_isl=${8:-1024}
1332
chosen_osl=${9:-1024}
1433
concurrency_list=${10:-"512x1"}
15-
chosen_req_rate=${11:-1}
34+
if [[ "$ENGINE" == "vllm-disagg" ]]; then
35+
chosen_req_rate=${11:-inf}
36+
else
37+
chosen_req_rate=${11:-1}
38+
fi
1639
random_range_ratio=${12:-0.8}
1740
num_prompts_multiplier=${13:-10}
1841

1942
IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
2043

21-
echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
44+
ROUTER_PORT="${ROUTER_PORT:-30000}"
2245

23-
head_node="localhost"
24-
head_port="30000"
46+
export TRANSFORMERS_VERBOSITY=error
47+
export TOKENIZERS_PARALLELISM=false
2548

49+
echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
2650

27-
profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
28-
mkdir -p $profile_folder
51+
profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
52+
mkdir -p "$profile_folder"
2953

3054
source "$(dirname "$0")/../../benchmark_lib.sh"
3155

32-
# Repo root inside the container (3 levels up from this script's directory)
3356
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
3457

35-
for max_concurrency in ${chosen_concurrencies[@]}; do
58+
for max_concurrency in "${chosen_concurrencies[@]}"; do
3659

3760
export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
3861

62+
num_prompts=$(( max_concurrency * num_prompts_multiplier ))
63+
if [[ "$num_prompts" -lt 16 ]]; then
64+
num_prompts=16
65+
fi
66+
3967
echo "profile_folder: $profile_folder"
4068
echo "max_concurrency: $max_concurrency"
4169
echo "chosen_req_rate: $chosen_req_rate"
4270
echo "MODEL_PATH: $MODEL_PATH"
43-
echo "head_port: $head_port"
71+
echo "ROUTER_PORT: $ROUTER_PORT"
4472
echo "chosen_isl: $chosen_isl"
4573
echo "chosen_osl: $chosen_osl"
74+
echo "num_prompts: $num_prompts"
4675
echo "export_file: $export_file"
4776

77+
# Engine-specific extra flags
78+
extra_flags=""
79+
if [[ "$ENGINE" == "vllm-disagg" ]]; then
80+
extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
81+
else
82+
if [ "$IS_MTP" = "true" ]; then
83+
extra_flags="--use-chat-template"
84+
fi
85+
fi
86+
4887
run_benchmark_serving \
4988
--bench-serving-dir "$REPO_ROOT" \
50-
--model ${MODEL_PATH} \
51-
--port ${head_port} \
89+
--model "$BENCH_MODEL" \
90+
--port "$ROUTER_PORT" \
5291
--backend openai \
53-
--input-len ${chosen_isl} \
54-
--output-len ${chosen_osl} \
55-
--random-range-ratio ${random_range_ratio} \
56-
--num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
92+
--input-len "$chosen_isl" \
93+
--output-len "$chosen_osl" \
94+
--random-range-ratio "$random_range_ratio" \
95+
--num-prompts "$num_prompts" \
5796
--max-concurrency "$max_concurrency" \
5897
--result-filename "$export_file" \
5998
--result-dir /workspace/ \
60-
$( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
99+
$extra_flags
61100

62101
echo "-----------------------------------------"
102+
103+
# vLLM: cooldown between rounds for idle KV block reaper
104+
if [[ "$ENGINE" == "vllm-disagg" ]]; then
105+
echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
106+
sleep 10
107+
fi
63108
done

0 commit comments

Comments
 (0)