SemiAnalysisAI
diff --git a/‎.github/configs/amd-master.yaml‎
Lines changed: 70 additions & 63 deletions b/‎.github/configs/amd-master.yaml‎
Lines changed: 70 additions & 63 deletions
@@ -239,6 +239,10 @@ qwen3.5-fp8-mi355x-sglang:
       search-space:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
 qwen3.5-fp8-mi355x-sglang-mtp:
   image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
@@ -327,27 +331,6 @@ qwen3.5-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
-qwen3.5-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: amd/Qwen3.5-397B-A17B-MXFP4
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 16 }
-
 qwen3.5-fp8-mi300x-sglang:
   image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -399,13 +382,11 @@ glm5-fp8-mi355x-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
 glm5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -420,12 +401,10 @@ glm5-fp8-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
       - { tp: 8, conc-start: 4, conc-end: 256 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 256 }
       - { tp: 8, conc-start: 4, conc-end: 256 }
 
 glm5.1-fp4-mi355x-sglang:
@@ -448,6 +427,11 @@ glm5.1-fp4-mi355x-sglang:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -526,7 +510,11 @@ kimik2.5-int4-mi300x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.18.0
+  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
+  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
+  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
+  # includes all subsequent ROCm offload work.
+  image: vllm/vllm-openai-rocm:v0.21.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x
@@ -545,6 +533,18 @@ kimik2.5-fp4-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
+    # MI355X has 288 GB HBM per GPU (more than MI300X/MI325X; comparable to
+    # B300). Extend the conc sweep upward to probe where the KV cliff sits
+    # with the larger HBM envelope. Restrict to tp=8 for this sweep to halve
+    # job count while still covering the main parallelism config.
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
 
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
@@ -568,7 +568,12 @@ kimik2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
 
 minimaxm2.5-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.19.0
+  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
+  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
+  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
+  # cpu-offload sweep points to use the same offload path as the NVIDIA
+  # agentic-coding configs.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x
@@ -589,6 +594,14 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 }
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first; cpu offload likely won't help, but worth confirming.
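+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    # NOTE(review): this config's image comment says the nightly enables
+    # SimpleCPUOffloadConnector on ROCm; confirm which connector is used.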
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -611,31 +624,6 @@ minimaxm2.5-fp8-mi355x-atom:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
-minimaxm2.5-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
-  model: amd/MiniMax-M2.5-MXFP4
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp4
-  framework: atom
-  multinode: false
-  scenarios:
-    fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 1024 }
-      - { tp: 2, conc-start: 4, conc-end: 1024 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 1024 }
-      - { tp: 2, conc-start: 4, conc-end: 1024 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
-
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.19.1
   model: amd/MiniMax-M2.5-MXFP4
@@ -660,7 +648,8 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -679,9 +668,18 @@ minimaxm2.5-fp8-mi300x-vllm:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
+    agentic-coding:
+    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
+    # KV cliff ~52. Compute saturates first.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 minimaxm2.5-fp8-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.18.0
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi325x
@@ -700,6 +698,15 @@ minimaxm2.5-fp8-mi325x-vllm:
       search-space:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
+    agentic-coding:
+    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
+    # similar HBM profile). Compute saturates first; cpu-offload window
+    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
+    # nightly. Mirror MI300X conc grid for cross-SKU (MI300X vs MI325X) comparability.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 gptoss-fp4-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.17.0
@@ -1636,13 +1643,13 @@ dsv4-fp8-mi355x-vllm:
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 1 }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
+# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
+# PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
+# hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on
+# gfx950. Image is the standard atom0.1.2.post MI355X base (matching
+# qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by
+# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep
+# will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land.
 dsv4-fp4-mi355x-atom:
   image: rocm/atom-dev:nightly_202605130853
   model: deepseek-ai/DeepSeek-V4-Pro