SemiAnalysisAI
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
Lines changed: 63 additions & 70 deletions
@@ -239,10 +239,6 @@ qwen3.5-fp8-mi355x-sglang:
       search-space:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
 qwen3.5-fp8-mi355x-sglang-mtp:
   image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
@@ -331,6 +327,27 @@ qwen3.5-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
+qwen3.5-fp4-mi355x-atom:  # fp4 Qwen3.5 entry using the atom framework on the mi355x runner
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: amd/Qwen3.5-397B-A17B-MXFP4
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false  # presumably single-node only - confirm against the config consumer
+  scenarios:
+    fixed-seq-len:  # two scenarios below; both list the identical search space
+    - isl: 1024  # isl/osl presumably input/output sequence length - TODO confirm
+      osl: 1024
+      search-space:  # tp presumably tensor-parallel size, conc-* the concurrency sweep bounds - TODO confirm
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }  # tp: 4 stops at a much lower conc-end than tp: 2
+    - isl: 8192  # larger isl, same osl and same search space as the 1024/1024 scenario
+      osl: 1024
+      search-space:
+      - { tp: 2, conc-start: 4, conc-end: 256 }
+      - { tp: 4, conc-start: 4, conc-end: 16 }
+
 qwen3.5-fp8-mi300x-sglang:
   image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -382,11 +399,13 @@ glm5-fp8-mi355x-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
 
 glm5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -401,10 +420,12 @@ glm5-fp8-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
       - { tp: 8, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
+      - { tp: 4, conc-start: 4, conc-end: 256 }
       - { tp: 8, conc-start: 4, conc-end: 256 }
 
 glm5.1-fp4-mi355x-sglang:
@@ -427,11 +448,6 @@ glm5.1-fp4-mi355x-sglang:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -510,11 +526,7 @@ kimik2.5-int4-mi300x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-fp4-mi355x-vllm:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.18.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x
@@ -533,18 +545,6 @@ kimik2.5-fp4-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
-    # MI355X has 288 GB HBM per GPU (vs MI300X/MI325X smaller, comparable to
-    # B300). Extend the conc sweep upward to probe where the KV cliff sits
-    # with the larger HBM envelope. Restrict to tp=8 for this sweep to halve
-    # job count while still covering the main parallelism config.
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
 
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
@@ -568,12 +568,7 @@ kimik2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
 
 minimaxm2.5-fp8-mi355x-vllm:
-  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
-  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
-  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
-  # cpu-offload sweep points to use the same offload path as the NVIDIA
-  # agentic-coding configs.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.19.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x
@@ -594,14 +589,6 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 }
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -624,6 +611,31 @@ minimaxm2.5-fp8-mi355x-atom:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
+minimaxm2.5-fp4-mi355x-atom:  # fp4 MiniMax-M2.5 entry using the atom framework on the mi355x runner
+  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
+  model: amd/MiniMax-M2.5-MXFP4  # same model id as the minimaxm2.5-fp4-mi355x-vllm entry
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp4
+  framework: atom
+  multinode: false  # presumably single-node only - confirm against the config consumer
+  scenarios:
+    fixed-seq-len:  # two scenarios below; both list the identical search space
+    - isl: 1024  # isl/osl presumably input/output sequence length - TODO confirm
+      osl: 1024
+      search-space:  # tp presumably tensor-parallel size, conc-* the concurrency sweep bounds - TODO confirm
+      - { tp: 1, conc-start: 4, conc-end: 1024 }
+      - { tp: 2, conc-start: 4, conc-end: 1024 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }  # conc-end shrinks as tp grows: 1024, 1024, 128, 16
+      - { tp: 8, conc-start: 4, conc-end: 16 }
+    - isl: 8192  # larger isl, same osl and same search space as the 1024/1024 scenario
+      osl: 1024
+      search-space:
+      - { tp: 1, conc-start: 4, conc-end: 1024 }
+      - { tp: 2, conc-start: 4, conc-end: 1024 }
+      - { tp: 4, conc-start: 4, conc-end: 128 }
+      - { tp: 8, conc-start: 4, conc-end: 16 }
+
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.19.1
   model: amd/MiniMax-M2.5-MXFP4
@@ -648,8 +660,7 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.16.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -668,18 +679,9 @@ minimaxm2.5-fp8-mi300x-vllm:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
-    agentic-coding:
-    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
-    # KV cliff ~52. Compute saturates first.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 minimaxm2.5-fp8-mi325x-vllm:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.18.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi325x
@@ -698,15 +700,6 @@ minimaxm2.5-fp8-mi325x-vllm:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
-    agentic-coding:
-    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
-    # similar HBM profile). Compute saturates first; cpu-offload window
-    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
-    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 gptoss-fp4-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.17.0
@@ -1643,13 +1636,13 @@ dsv4-fp8-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 1 }
 
-  # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-  # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
-  # hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on
-  # gfx950. Image is the standard atom0.1.2.post MI355X base (matching
-  # qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by
-  # benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep
-  # will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land.
+# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
+# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
+# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
+# the AITER sparse-attention kernel / multi-request path lands upstream.
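+# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
+# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
+# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
+# NOTE(review): the `image:` below is rocm/atom-dev:nightly_202605130853, which
+# does not look like the atom0.1.2.post base described here - confirm and update.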
 dsv4-fp4-mi355x-atom:
   image: rocm/atom-dev:nightly_202605130853
   model: deepseek-ai/DeepSeek-V4-Pro