SemiAnalysisAI
diff --git a/‎.github/configs/amd-master.yaml‎
Lines changed: 109 additions & 0 deletions b/‎.github/configs/amd-master.yaml‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎benchmarks/benchmark_lib.sh‎
Lines changed: 9 additions & 0 deletions b/‎benchmarks/benchmark_lib.sh‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/amd_utils/bench.sh‎
Lines changed: 62 additions & 17 deletions b/‎benchmarks/multi_node/amd_utils/bench.sh‎
Lines changed: 62 additions & 17 deletions
@@ -1350,6 +1350,115 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=2"
 
+kimik2.5-fp4-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x-disagg
+  precision: fp4
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total 
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+minimaxm2.5-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P2D: 1 prefill node (co-located with proxy) + 2 decode nodes = 3 nodes total
+      # Prefill also needs EP=8: MiniMax M2.5 expert intermediate_size=1536,
+      # TP8 shards to 192 which is not divisible by FP8 block_n=128.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
 
@@ -210,6 +210,7 @@ run_benchmark_serving() {
     local dsv4=false
     local trust_remote_code=false
     local server_pid=""
+    local tokenizer=""
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -278,6 +279,10 @@ run_benchmark_serving() {
                 server_pid="$2"
                 shift 2
                 ;;
+            --tokenizer)
+                tokenizer="$2"
+                shift 2
+                ;;
             *)
                 echo "Unknown parameter: $1"
                 return 1
@@ -385,6 +390,10 @@ run_benchmark_serving() {
         benchmark_cmd+=(--trust-remote-code)
     fi
 
+    if [[ -n "$tokenizer" ]]; then
+        benchmark_cmd+=(--tokenizer "$tokenizer")
+    fi
+
     # Run benchmark with optional server monitoring
     set -x
     if [[ -n "$server_pid" ]]; then
 
@@ -1,63 +1,108 @@
 #!/bin/bash
+# Dual-Engine Disaggregated Benchmark Runner
+#
+# ENGINE=sglang (default): SGLang benchmark
+# ENGINE=vllm:             vLLM benchmark
+#
+# Produces JSON result files via benchmark_serving.py so that the CI pipeline
+# can collect and process results.
+#
+# Usage: bash bench.sh <n_prefill> <n_decode> <prefill_gpus> <decode_gpus> \
+#            <model_dir> <model_name> <log_path> <isl> <osl> \
+#            <concurrency_list> <req_rate> <random_range_ratio> <num_prompts_multiplier>
+
+ENGINE="${ENGINE:-sglang-disagg}"
 
 n_prefill=$1
 n_decode=$2
 prefill_gpus=$3
 decode_gpus=$4
 model_path=$5
 model_name=$6
-MODEL_PATH="${model_path}/${model_name}"
+MODEL_PATH="${MODEL_PATH:-${model_path}/${model_name}}"
+# vllm-disagg uses --served-model-name MODEL_NAME; sglang defaults to MODEL_PATH
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    BENCH_MODEL="${MODEL_NAME:-${MODEL_PATH}}"
+else
+    BENCH_MODEL="${MODEL_PATH}"
+fi
 log_path=$7
 
 chosen_isl=${8:-1024}
 chosen_osl=${9:-1024}
 concurrency_list=${10:-"512x1"}
-chosen_req_rate=${11:-1}
+if [[ "$ENGINE" == "vllm-disagg" ]]; then
+    chosen_req_rate=${11:-inf}
+else
+    chosen_req_rate=${11:-1}
+fi
 random_range_ratio=${12:-0.8}
 num_prompts_multiplier=${13:-10}
 
 IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
 
-echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
+ROUTER_PORT="${ROUTER_PORT:-30000}"
 
-head_node="localhost"
-head_port="30000"
+export TRANSFORMERS_VERBOSITY=error
+export TOKENIZERS_PARALLELISM=false
 
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[0]}; ${chosen_req_rate}"
 
-profile_folder="${log_path}/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
-mkdir -p $profile_folder
+profile_folder="${log_path}/${ENGINE}_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p "$profile_folder"
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-# Repo root inside the container (3 levels up from this script's directory)
 REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
 
-for max_concurrency in ${chosen_concurrencies[@]}; do
+for max_concurrency in "${chosen_concurrencies[@]}"; do
 
     export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"
 
+    num_prompts=$(( max_concurrency * num_prompts_multiplier ))
+    if [[ "$num_prompts" -lt 16 ]]; then
+        num_prompts=16
+    fi
+
     echo "profile_folder: $profile_folder"
     echo "max_concurrency: $max_concurrency"
     echo "chosen_req_rate: $chosen_req_rate"
     echo "MODEL_PATH: $MODEL_PATH"
-    echo "head_port: $head_port"
+    echo "ROUTER_PORT: $ROUTER_PORT"
     echo "chosen_isl: $chosen_isl"
     echo "chosen_osl: $chosen_osl"
+    echo "num_prompts: $num_prompts"
     echo "export_file: $export_file"
 
+    # Engine-specific extra flags
+    extra_flags=""
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        extra_flags="--trust-remote-code --tokenizer $MODEL_PATH"
+    else
+        if [ "$IS_MTP" = "true" ]; then
+            extra_flags="--use-chat-template"
+        fi
+    fi
+
     run_benchmark_serving \
         --bench-serving-dir "$REPO_ROOT" \
-        --model  ${MODEL_PATH} \
-        --port ${head_port} \
+        --model "$BENCH_MODEL" \
+        --port "$ROUTER_PORT" \
         --backend openai \
-        --input-len ${chosen_isl} \
-        --output-len ${chosen_osl} \
-        --random-range-ratio ${random_range_ratio} \
-        --num-prompts $(( $max_concurrency * $num_prompts_multiplier )) \
+        --input-len "$chosen_isl" \
+        --output-len "$chosen_osl" \
+        --random-range-ratio "$random_range_ratio" \
+        --num-prompts "$num_prompts" \
         --max-concurrency "$max_concurrency" \
         --result-filename "$export_file" \
         --result-dir /workspace/ \
-        $( [ "$IS_MTP" = "true" ] && echo "--use-chat-template" )
+        $extra_flags
 
     echo "-----------------------------------------"
+
+    # vLLM: cooldown between rounds for idle KV block reaper
+    if [[ "$ENGINE" == "vllm-disagg" ]]; then
+        echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
+        sleep 10
+    fi
 done