SemiAnalysisAI
diff --git a/‎.github/configs/amd-master.yaml‎
Lines changed: 55 additions & 0 deletions b/‎.github/configs/amd-master.yaml‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/amd_utils/env.sh‎
Lines changed: 7 additions & 0 deletions b/‎benchmarks/multi_node/amd_utils/env.sh‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/amd_utils/job.slurm‎
Lines changed: 25 additions & 0 deletions b/‎benchmarks/multi_node/amd_utils/job.slurm‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/amd_utils/models.yaml‎
Lines changed: 31 additions & 0 deletions b/‎benchmarks/multi_node/amd_utils/models.yaml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/amd_utils/patches/README.md‎
Lines changed: 87 additions & 0 deletions b/‎benchmarks/multi_node/amd_utils/patches/README.md‎
Lines changed: 87 additions & 0 deletions
@@ -553,6 +553,61 @@ glm5-fp8-mi355x-sglang-mtp:
       - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
       - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
 
+glm5-fp8-mi355x-sglang-disagg:  # GLM-5 FP8 SGLang sweep with separate prefill and decode workers (disagg: true, multinode: true)
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523  # NOTE: job.slurm auto-mounts patches/mori_conn.py when DOCKER_IMAGE_NAME contains "v0.5.12.post1"
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1 prefill + 1 decode worker (1P+1D), TP8/EP1 each; CI smoke sweep (aligned with glm5-fp8-mi355x-sglang conc range)
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"  # presumably keeps MTP off on decode, consistent with spec-decoding "none" -- TODO confirm
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Same 1P+1D TP8/EP1 CI smoke sweep as the isl 1024 entry, only isl differs; dp-attn false (NSA / MoRI path)
+      - spec-decoding: "none"
+        conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"  # presumably keeps MTP off on decode, consistent with spec-decoding "none" -- TODO confirm
+
 glm5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: zai-org/GLM-5-FP8
 
@@ -140,6 +140,13 @@ else
     export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
     export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
+    # GLM-5 overrides, applied only when MODEL_NAME is exactly "GLM-5-FP8": GLM-5 uses NSA (not MLA), so fused-decode-MLA is disabled; the other exports set quick-reduce quantization and fast loading.
+    if [[ "$MODEL_NAME" == "GLM-5-FP8" ]]; then
+        export SGLANG_ROCM_FUSED_DECODE_MLA=0  # fused-decode-MLA off (GLM-5 is NSA, not MLA)
+        export ROCM_QUICK_REDUCE_QUANTIZATION=INT4  # presumably selects INT4 for ROCm quick-reduce -- TODO confirm against sglang docs
+        export SAFETENSORS_FAST_GPU=1  # the "fast loading" setting
+    fi
+
     # Disable allocating memory in one pass
     export MORI_SHMEM_MODE=ISOLATION
 
 
@@ -55,6 +55,30 @@ echo "Runfile set: $RUN_FILE"
 # $(pwd) is amd_utils/ (the sbatch submit dir); go up 3 levels to reach the repo root.
 export DI_REPO_DIR=$(cd "$(pwd)/../../.." && pwd)
 
+# ── In-tree sglang patches: auto-apply for known-affected images ──────
+# sglang v0.5.12.post1 ships a known-broken MoRI PD-disaggregation
+# backend that crashes hybrid-attention models (GLM-5, Qwen3.5-MoE,
+# anything with state_types: List[StateType]) at startup. We carry an
+# in-tree overlay of mori/conn.py that fixes the wire format + the
+# legacy state_type fallback (see patches/README.md for the bug
+# analysis and patch detail).
+#
+# Auto-applied when the image tag contains "v0.5.12.post1", unless the
+# caller sets MORI_CONN_PATCH=skip. The overlay is appended to
+# ${EXTRA_DOCKER_MOUNTS:-} so callers can still inject other mounts.
+# Dedup guard avoids double-mounting if EXTRA_DOCKER_MOUNTS already
+# contains the target path (docker rejects duplicate destinations).
+#
+# If the image is affected but the overlay file is missing from the
+# checkout, nothing is mounted and a warning goes to stderr, so the
+# later startup crash can be traced back to the missing patch instead
+# of the overlay being skipped silently.
+_MORI_PATCH_FILE="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/mori_conn.py"
+_MORI_PATCH_TARGET="/sgl-workspace/sglang/python/sglang/srt/disaggregation/mori/conn.py"
+if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \
+   && [[ "${DOCKER_IMAGE_NAME:-}" == *"v0.5.12.post1"* ]] \
+   && [[ "${EXTRA_DOCKER_MOUNTS:-}" != *"$_MORI_PATCH_TARGET"* ]]; then
+    if [[ -f "$_MORI_PATCH_FILE" ]]; then
+        EXTRA_DOCKER_MOUNTS="${EXTRA_DOCKER_MOUNTS:-} -v ${_MORI_PATCH_FILE}:${_MORI_PATCH_TARGET}:ro"
+        export EXTRA_DOCKER_MOUNTS
+        echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}"
+    else
+        # Affected image, patch not skipped, but nothing to mount: say so.
+        echo "[job.slurm] WARNING: MoRI conn.py overlay not found at ${_MORI_PATCH_FILE}; running ${DOCKER_IMAGE_NAME:-} unpatched (set MORI_CONN_PATCH=skip to silence)" >&2
+    fi
+fi
+
 xP="${xP:-1}"
 yD="${yD:-1}"
 
@@ -465,6 +489,7 @@ fi
     -v /tmp:/run_logs \
     -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
     -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
+    ${EXTRA_DOCKER_MOUNTS:-} \
     ${DOCKER_ENV_COMMON[*]} \
     ${DOCKER_ENV_ENGINE[*]} \
     --name \"$DOCKER_CONT_NAME\" \
 
@@ -192,6 +192,37 @@ Qwen3.5-397B-A17B-FP8:
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
 
+GLM-5-FP8:  # same string env.sh compares MODEL_NAME against for its GLM-5 overrides; presumably the lookup key for this profile -- TODO confirm
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}'"
+  mtp_flags: ""  # empty: no MTP-specific flags are defined for this model
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"  # presumably appended only for dp-attention runs -- TODO confirm against the launcher
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:  # presumably selected when dp-attention is enabled -- TODO confirm
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"  # a string expression, not a number; presumably evaluated by the launcher -- TODO confirm
+      cuda_graph_bs: "1 2 3"  # explicit list here, unlike the cuda_graph_bs_range values used by the other variants
+    no_dp:  # presumably selected when dp-attention is disabled -- TODO confirm
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:  # presumably selected when dp-attention is enabled -- TODO confirm
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"  # a string expression, not a number; presumably evaluated by the launcher -- TODO confirm
+      cuda_graph_bs_range: "1-160"
+    ep_only:  # decode-only variant; selection rule is not visible here -- TODO confirm
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:  # presumably selected when dp-attention is disabled -- TODO confirm
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
 
@@ -0,0 +1,87 @@
+# In-tree sglang patches for the MoRI PD-disagg path
+
+This directory carries small Python overlays that get bind-mounted over
+the upstream sglang source inside the docker container at runtime.
+They are needed because some sglang releases ship known bugs in the
+MoRI disaggregation backend that block our benchmark + accuracy
+configs.
+
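+The mount is wired through the `EXTRA_DOCKER_MOUNTS` env var that
+`job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after the
+existing `-v` block). `job.slurm` appends the `mori_conn.py` overlay
+to this variable automatically when the image name contains
+`v0.5.12.post1`, unless the caller sets `MORI_CONN_PATCH=skip`. The
+local-test driver scripts under `scripts/sglang_disagg/` pre-set this
+env var to the path of the relevant overlay; CI runners that need the
+patch on an image that is not auto-detected can do the same.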
+
+## `mori_conn.py`
+
+Overlays
+`/sgl-workspace/sglang/python/sglang/srt/disaggregation/mori/conn.py`.
+
+Source: forked from the file shipped in
+`lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523`
+(sglang [v0.5.12.post1](https://github.com/sgl-project/sglang/tree/v0.5.12.post1)).
+Four logical edits, all confined to `MoriKVReceiver.send_state`,
+`MoriKVReceiver._register_kv_args`, and
+`MoriKVReceiver._send_swa_dsa_state`:
+
+1. **Sender flatten** — handle the framework's nested
+   `state_item_lens: List[List[int]]` instead of crashing in the
+   naked `struct.pack("I", item_len)` (the legacy `List[int]`
+   assumption). Idempotent for legacy flat callers.
+2. **`state_type` legacy fallback** — when the legacy singular
+   `kv_args.state_type` is `'none'` but `state_mem_descs` is non-empty,
+   read `kv_args.state_types[0]` (the modern plural API that Mooncake
+   and NIXL already use). Routes `MAMBA → _send_mamba_state` and
+   `DSA/SWA → _send_swa_dsa_state` correctly.
+3. **Consumer normalization** — flatten `state_item_lens` and
+   `state_dim_per_tensor` to flat `List[int]` once at the entry of
+   `send_state`, so the existing per-tensor index arithmetic
+   (`state_item_lens[i]`) and length checks
+   (`len(state_item_lens) == len(state_mem_descs)`) keep working.
+4. **DSA index rank+length normalization** — inside
+   `_send_swa_dsa_state`, before the `group_concurrent_contiguous`
+   call, ravel both `src_state_indices` and `dst_state_indices` to 1-D
+   and re-truncate to common length. Upstream's existing truncation
+   only slices the outer axis, leaving 2-D `(1, N)` arrays unchanged
+   and triggering an `np.diff` broadcasting error
+   (`shapes (1,12) (0,)`) for GLM-5 (single-DSA-component) prefill
+   traffic. See
+   `scripts/sglang_disagg/docs_glm5/01-bug-analysis.md` for the full
+   write-up.
+
+Verified passing GSM8K = 0.978 ± 0.004 on Qwen3.5-397B-A17B-FP8 1P+1D
+TP=8 dp-attn=false (matches and slightly exceeds upstream
+[PR #22665](https://github.com/sgl-project/sglang/pull/22665)'s
+reported 0.970 GSM8K on the bf16 baseline). GLM-5 (DSA) verification
+in progress under
+`scripts/sglang_disagg/docs_glm5/02-fix-and-verification.md`.
+
+This is a stop-gap. The proper upstream fix is to migrate MoRI to the
+plural `state_types: List[StateType]` API (full design + diff in
+`scripts/sglang_disagg/docs/03-upstream-pr-proposal.md`).
+
+## How to enable
+
+```bash
+export EXTRA_DOCKER_MOUNTS="-v $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/mori_conn.py:/sgl-workspace/sglang/python/sglang/srt/disaggregation/mori/conn.py:ro"
+```
+
+`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm`
+already mounts into the container at `/workspace`.
+
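+When this env var is unset (CI default for runs that don't need the
+patch) and `job.slurm` does not auto-apply the overlay (for example,
+the image name does not contain `v0.5.12.post1`, or
+`MORI_CONN_PATCH=skip` is set), `${EXTRA_DOCKER_MOUNTS:-}` expands to
+the empty string and container behavior is byte-identical to the
+unpatched path.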
+
+## When to use which patch
+
+| Image / version | Need `mori_conn.py` overlay? |
+|---|---|
+| `lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523` (used by `glm5-fp8-mi355x-sglang-disagg`) | yes (Qwen3.5-MoE-FP8, GLM-5, any hybrid model on this image); auto-applied by `job.slurm` |
+| `lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-*` (used by `dsr1-fp4-*-disagg`) | not validated; same code path likely affected — try with the overlay if you hit the same `struct.error` |
+| `rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-*` (used by `dsr1-fp8-*-disagg` and any `glm5-*-disagg` configs still pinned to this image) | predates [PR #22665](https://github.com/sgl-project/sglang/pull/22665); different code paths; **do not** apply this overlay |
+
+When upstream merges the proper fix (see
+`scripts/sglang_disagg/docs/03-upstream-pr-proposal.md`) and that
+fix lands in a published image, retire this overlay together with its
+auto-apply block in `job.slurm`. The `EXTRA_DOCKER_MOUNTS` knob can
+stay, as it is still useful for future patches.