#!/bin/bash
# Dual-Engine Disaggregated Benchmark Runner
#
# ENGINE=sglang-disagg (default): SGLang benchmark
# ENGINE=vllm-disagg:             vLLM benchmark
#
# Produces JSON result files via benchmark_serving.py so that the CI pipeline
# can collect and process results.
#
# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
#                      <model_path> <model_name> <log_path> [isl] [osl] \
#                      [concurrency_list] [req_rate] [random_range_ratio] \
#                      [num_prompts_multiplier]
#
# Positional arguments:
#   prefill_gpus, decode_gpus  integers; their sum is embedded in result names
#   model_path, model_name     joined as <model_path>/<model_name> unless the
#                              MODEL_PATH environment variable is already set
#   log_path                   directory under which the profile folder is made
#   isl / osl                  input / output length (default 1024 each)
#   concurrency_list           'x'-separated integers (default "512x1")
#   req_rate                   default "inf" for vllm-disagg, 1 otherwise
#   random_range_ratio         default 0.8
#   num_prompts_multiplier     integer, default 10; each round requests
#                              max(16, concurrency * multiplier) prompts
#
# Environment:
#   ENGINE       "sglang-disagg" (default) or "vllm-disagg"
#   MODEL_PATH   overrides <model_path>/<model_name>
#   MODEL_NAME   vllm-disagg only: model name passed to the benchmark
#                (falls back to MODEL_PATH)
#   ROUTER_PORT  port the benchmark targets (default 30000)
#   IS_MTP       "true" adds --use-chat-template (non-vllm engines only)

ENGINE="${ENGINE:-sglang-disagg}"

# n_prefill / n_decode are not referenced again in this script; they are kept
# so the positional-argument layout stays unchanged for callers.
# NOTE(review): possibly read by the sourced benchmark_lib.sh - confirm.
# shellcheck disable=SC2034
n_prefill=$1
# shellcheck disable=SC2034
n_decode=$2
prefill_gpus=$3
decode_gpus=$4
model_path=$5
model_name=$6
MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
if [[ "$ENGINE" == "vllm-disagg" ]]; then
    BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}"
else
    BENCH_MODEL="${MODEL_PATH}"
fi
log_path=$7

chosen_isl=${8:-1024}
chosen_osl=${9:-1024}
concurrency_list=${10:-"512x1"}
if [[ "$ENGINE" == "vllm-disagg" ]]; then
    chosen_req_rate=${11:-inf}
else
    chosen_req_rate=${11:-1}
fi
random_range_ratio=${12:-0.8}
num_prompts_multiplier=${13:-10}

# Succeeds only when $1 is a plain base-10 non-negative integer. Leading zeros
# are rejected because bash arithmetic would read them as octal.
_bench_is_uint() {
    local uint_re='^(0|[1-9][0-9]*)$'
    [[ "$1" =~ $uint_re ]]
}

# Aborts the script (exit 2) when value $2, labelled $1, is not an integer.
# Everything checked here is later fed to $(( )), where an empty or
# non-numeric string is either a syntax error or is evaluated as an expression.
_bench_require_uint() {
    local label=$1 value=$2
    if ! _bench_is_uint "$value"; then
        echo "ERROR: ${label} must be a non-negative integer, got '${value}'" >&2
        exit 2
    fi
}

_bench_require_uint "prefill_gpus" "$prefill_gpus"
_bench_require_uint "decode_gpus" "$decode_gpus"
_bench_require_uint "num_prompts_multiplier" "$num_prompts_multiplier"

# Split the 'x'-separated concurrency list into an array.
IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"

ROUTER_PORT="${ROUTER_PORT:-30000}"

export TRANSFORMERS_VERBOSITY=error
export TOKENIZERS_PARALLELISM=false

echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"

profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
mkdir -p "$profile_folder"

script_dir="$(dirname "$0")"
benchmark_lib="${script_dir}/../../benchmark_lib.sh"

# run_benchmark_serving comes from this library; without it every round would
# fail with "command not found", so stop early with a clear message instead.
if [[ ! -r "$benchmark_lib" ]]; then
    echo "ERROR: cannot read benchmark library: ${benchmark_lib}" >&2
    exit 1
fi
# shellcheck source=/dev/null
source "$benchmark_lib"

# Three directory levels above this script.
REPO_ROOT="$(cd "${script_dir}/../../.." && pwd)"

# Loop-invariant: total GPU count embedded in every result file name.
total_gpus=$(( prefill_gpus + decode_gpus ))

for max_concurrency in "${chosen_concurrencies[@]}"; do

    # A malformed entry (e.g. the empty field produced by "x512") would make
    # the arithmetic below fail and abort the remaining rounds; skip it.
    if ! _bench_is_uint "$max_concurrency"; then
        echo "WARNING: skipping invalid concurrency '${max_concurrency}' (expected a non-negative integer)" >&2
        continue
    fi

    export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}"

    # Request at least 16 prompts even for very small concurrency values.
    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
    if (( num_prompts < 16 )); then
        num_prompts=16
    fi

    echo "profile_folder: $profile_folder"
    echo "max_concurrency: $max_concurrency"
    echo "chosen_req_rate: $chosen_req_rate"
    echo "MODEL_PATH: $MODEL_PATH"
    echo "ROUTER_PORT: $ROUTER_PORT"
    echo "chosen_isl: $chosen_isl"
    echo "chosen_osl: $chosen_osl"
    echo "num_prompts: $num_prompts"
    echo "export_file: $export_file"

    # Engine-specific extra flags. An array keeps each flag (and a MODEL_PATH
    # containing whitespace) as exactly one argument.
    extra_flags=()
    if [[ "$ENGINE" == "vllm-disagg" ]]; then
        extra_flags=(--trust-remote-code --tokenizer "$MODEL_PATH")
    elif [[ "${IS_MTP:-}" == "true" ]]; then
        extra_flags=(--use-chat-template)
    fi

    # The ${arr[@]+...} form expands to nothing for an empty array, even if a
    # sourced file enabled `set -u` on an older bash.
    run_benchmark_serving \
        --bench-serving-dir "$REPO_ROOT" \
        --model "$BENCH_MODEL" \
        --port "$ROUTER_PORT" \
        --backend openai \
        --input-len "$chosen_isl" \
        --output-len "$chosen_osl" \
        --random-range-ratio "$random_range_ratio" \
        --num-prompts "$num_prompts" \
        --max-concurrency "$max_concurrency" \
        --result-filename "$export_file" \
        --result-dir /workspace/ \
        ${extra_flags[@]+"${extra_flags[@]}"}

    echo "-----------------------------------------"

    # vLLM: cooldown between rounds for idle KV block reaper
    if [[ "$ENGINE" == "vllm-disagg" ]]; then
        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
        sleep 10
    fi
done