[AMD] Add Qwen3.5 FP8 MI355X SGLang disaggregated benchmark (#1570)

ChangLiu0709 · cliu1004@amd.com · chunfangamd · web-flow · commit 7ae613caf4ee · 2026-05-28T00:46:30.000-04:00
* Add Qwen3.5 FP8 MI355X SGLang disaggregated benchmark (PR-1). Introduce CI config, model server flags, multinode launch script, and amd_utils plumbing (sudo auto-detect, optional disagg decode TP/DP flags) for qwen3.5-fp8-mi355x-sglang-disagg smoke sweeps on MI355X. Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: ChangLiu0709 <cliu1004@amd.com> * Add perf-changelog entry for qwen3.5-fp8-mi355x-sglang-disagg. Required when adding new amd-master.yaml benchmark configs (PR #1570). Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: ChangLiu0709 <cliu1004@amd.com> * Fix misleading 8k1k comment in qwen3.5-fp8-mi355x-sglang-disagg config. The search-space uses 1P+1D TP8/EP1 with dp-attn, not TP2/EP2 from the aggregated qwen3.5-fp8-mi355x-sglang recipe. Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: ChangLiu0709 <cliu1004@amd.com> * qwen3.5-fp8-mi355x-sglang-disagg: 8k1k row uses dp-attn=false With --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, so num_shared_slots stays at the global value (1) and the (num_experts - num_shared_slots) % moe_ep_size assertion in fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). Track upstream sglang for a fix; flip back to dp-attn=true once MoRI is added to is_deepep_class_backend() or shared-slot accounting is reconciled. Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: ChangLiu0709 <cliu1004@amd.com> * fix: add FRAMEWORK to check_env_vars in qwen3.5 sglang-disagg script Matches sister sglang-disagg scripts (dsr1_fp8, dsr1_fp4) and the GLM-5 disagg launch script. submit.sh requires FRAMEWORK; surfacing the missing-var failure at the top of the launch script gives a cleaner error than letting it fail deep inside submit.sh. Addresses Cursor Bugbot review comment on PR #1570. 
Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: ChangLiu0709 <cliu1004@amd.com> --------- Co-authored-by: cliu1004@amd.com <cliu1004@amd.com@mia1-p01-g18.mia.tensorwave.lan> Co-authored-by: chunfangamd <chun.fang@amd.com> Co-authored-by: Cursor <cursoragent@cursor.com> Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
@@ -367,6 +367,70 @@ qwen3.5-fp8-mi355x-atom-mtp:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
+qwen3.5-fp8-mi355x-sglang-disagg:
+  image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1P+1D TP8/EP1 sweep; mirrors the TP8/EP1 low-concurrency row of the aggregated qwen3.5-fp8-mi355x-sglang config.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"  # PREFILL_NODES is required by the launch script's check_env_vars
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"  # DECODE_NODES is required by the launch script's check_env_vars
+          - "DECODE_MTP_SIZE=0"  # presumably keeps MTP off on decode, consistent with spec-decoding "none" - TODO confirm
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1P+1D TP8/EP1 low-concurrency sweep.
+      # dp-attn intentionally false (matches the 1k1k row): with
+      # --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes
+      # moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI,
+      # so num_shared_slots stays at the global value (1) and the
+      # (num_experts - num_shared_slots) % moe_ep_size assertion in
+      # fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared).
+      # Track upstream sglang for a fix; flip back to dp-attn=true once
+      # MoRI is added to is_deepep_class_backend() or shared-slot
+      # accounting is reconciled.
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
+
 qwen3.5-fp4-mi355x-sglang:
   image: lmsysorg/sglang:v0.5.12-rocm720-mi35x
   model: amd/Qwen3.5-397B-A17B-MXFP4
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
@@ -161,6 +161,37 @@ DeepSeek-R1-0528:
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
 
+Qwen3.5-397B-A17B-FP8:  # presumably looked up by the basename of the benchmark's model path - TODO confirm
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1"
+  mtp_flags: ""  # empty: no speculative-decoding flags are defined for this model
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head"  # presumably applied only when dp-attention is on - TODO confirm
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"  # expression string, not a number; presumably evaluated by the launcher - TODO confirm
+      cuda_graph_bs: "1 2 3"  # explicit batch-size list; every other section here uses cuda_graph_bs_range
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"  # expression string, not a number; presumably evaluated by the launcher - TODO confirm
+      cuda_graph_bs_range: "1-160"
+    ep_only:  # defined for decode only; prefill has just dp and no_dp
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
diff --git a/benchmarks/multi_node/qwen3.5_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/qwen3.5_fp8_mi355x_sglang-disagg.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Launcher for the Qwen3.5 FP8 MI355X SGLang disaggregated benchmark: validates
+# the sweep variables, derives the EP/DP toggles, and calls amd_utils/submit.sh.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-}"
+fi
+
+set -x
+
+# Use upstreamed multi_node scripts (no external clone needed)
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+# Set up SGL launch script-specific environment variables
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH="$MODEL_PATH"
+export MODEL_NAME="${MODEL_NAME:-}"  # not validated above; exported empty when unset
+export CONTAINER_IMAGE="$IMAGE"
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then  # EP size 1 (or unset) leaves expert parallelism off
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables DP
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then  # EP size 1 (or unset) leaves expert parallelism off
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables DP
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+
+# Submit the job. CONC_LIST is passed with spaces replaced by 'x' because the
+# underlying launch script expects an 'x'-delimited concurrency list.
+# ${NODE_LIST:-} stays unquoted on purpose: when it is unset or empty it must
+# expand to no argument at all rather than an empty positional argument.
+if ! JOB_ID=$(bash ./submit.sh "$PREFILL_NODES" \
+    "$PREFILL_NUM_WORKERS" \
+    "$DECODE_NODES" \
+    "$DECODE_NUM_WORKERS" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
+    "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
+    "$PREFILL_TP" "$DECODE_TP" \
+    "$RANDOM_RANGE_RATIO" \
+    ${NODE_LIST:-}); then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3171,3 +3171,13 @@
   description:
     - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558
+
+- config-keys:
+    - qwen3.5-fp8-mi355x-sglang-disagg
+  description:
+    - "Add Qwen3.5-397B-A17B-FP8 MI355X SGLang disaggregated prefill-decode benchmark"
+    - "Image: lmsysorg/sglang-rocm:v0.5.11-rocm700-mi35x-20260511"
+    - "1P+1D TP8/EP1 smoke sweep for 1k1k and 8k1k (conc 8-512); MoRI transfer backend"
+    - "Add models.yaml server flags and multinode launch script qwen3.5_fp8_mi355x_sglang-disagg.sh"
+    - "8k1k row uses dp-attn=false (matches 1k1k): with --enable-dp-attention + --moe-a2a-backend mori, sglang auto-promotes moe_ep_size=tp_size=8, but is_deepep_class_backend() excludes MoRI, so num_shared_slots stays at the global value (1) and the (num_experts - num_shared_slots) % moe_ep_size assertion in fused_moe_triton/layer.py fires for Qwen3.5 (512 routed + 1 shared). Track upstream sglang; flip back to dp-attn=true once MoRI is added to is_deepep_class_backend() or shared-slot accounting is reconciled."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1570
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -54,6 +54,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     # Ensure root-owned files are cleaned up even on early exit to prevent
     # EACCES errors when the next GH Actions job checks out on this runner.
     # Always preserve slurm logs as CI artifacts for debugging.
+    # KEEP_LOGS=1 disables the trap entirely (local-debug knob): logs and root-owned files are left in place.
     cleanup_and_save_logs() {
         if [[ -n "${GITHUB_ACTIONS:-}" && -n "${JOB_ID:-}" ]]; then
             local art_dir="$GITHUB_WORKSPACE/benchmark_artifacts"
@@ -69,7 +70,11 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         fi
         sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
     }
-    trap cleanup_and_save_logs EXIT
+    if [[ "${KEEP_LOGS:-0}" == "1" ]]; then  # unset or any value other than "1" keeps the cleanup trap
+        trap '' EXIT  # empty action: cleanup_and_save_logs does not run at exit, so $BENCHMARK_LOGS_DIR is not removed
+    else
+        trap cleanup_and_save_logs EXIT
+    fi
 
     SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_mi355x_${FRAMEWORK}.sh"
     if [[ "$FRAMEWORK" == "sglang-disagg" ]] || [[ "$FRAMEWORK" == "vllm-disagg" ]]; then