From 31b4fbe4ff6f60642106f3fab63df1e050487712 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:38:15 +0900
Subject: [PATCH 01/29] [AMD] dsv4-fp4-mi355x-atom: enable DPA TBO at high
 concurrency, update image to atom0.1.4

- Enable --enable-tbo for ISL=1024/OSL=1024 at CONC>=1024 and ISL=8192/OSL=1024 at CONC>=256
- Update image to atom0.1.4_20260612
- Update ISL=8192 search-space to start at conc=4 and use DPA from conc=128

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 19 +++++++++----------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     | 11 +++++++++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 70a79a273..77e4f0040 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2257,15 +2257,8 @@ dsv4-fp4-mi355x-vllm-mtp:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
-# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
-# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
-# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
-# the AITER sparse-attention kernel / multi-request path lands upstream.
-# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
-# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
-# the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
+  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2277,13 +2270,19 @@ dsv4-fp4-mi355x-atom:
     - isl: 1024
       osl: 1024
       search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
       - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 512 }
+        # conc4-64, TP8
+        # conc128, DPA
+        # conc256-1024, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index e485dc9a6..4f4545824 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -25,8 +25,15 @@ PARALLEL_ARGS=(-tp "$TP") #TP
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
-    else #DP+TP
-        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+    else #DPA+TP
+        #DPA+TP+TBO
+        if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        else
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        fi
     fi
 fi 
 

From c566e28e05a85be0c06bf531c6d6d92548f3aebf Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:40:22 +0900
Subject: [PATCH 02/29] [AMD] perf-changelog: dsv4-fp4-mi355x-atom DPA TBO +
 image atom0.1.4

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c0642188b..e68d5d3e0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3600,3 +3600,11 @@
     - "MI355x DSR1-FP4: Include TP4 configurations for 8k1k"
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
+
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+  pr-link: 

From 7e1aa060dbbc69f072ac51a5c43e475b4014da01 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:41:35 +0900
Subject: [PATCH 03/29] [AMD] perf-changelog: add PR link #1717

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e68d5d3e0..f236a6d60 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3605,6 +3605,6 @@
     - dsv4-fp4-mi355x-atom
   description:
     - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
-    - "Enable --enable-tbo (Token-Bucket Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
+    - "Enable --enable-tbo (Two Batch Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
-  pr-link: 
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

From 65e0fa328cd93161c5e736cc9f2fe7f8f22aed16 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 21:59:22 +0900
Subject: [PATCH 04/29] [AMD] dsv4_fp4_mi355x_atom.sh: disable prefix caching

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 4f4545824..369b72281 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -53,6 +53,7 @@ python3 -m atom.entrypoints.openai_server \
     --kv_cache_dtype fp8 \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
+    --no-enable_prefix_caching \
     > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 3f3560b7ef5b39461065bad07da780e063438313 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Fri, 12 Jun 2026 22:17:00 +0900
Subject: [PATCH 05/29] [AMD] dsv4-fp4-mi355x-atom: add max-model-len, eval
 context, extend conc range

- Pass --max-model-len to server using SERVE_MAX_MODEL_LEN
- Add EVAL_ONLY path: compute eval context length via compute_eval_context_length
- Extend conc-end to 8192 (isl=1024) and 4096 (isl=8192) in amd-master.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                      |  8 ++++----
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh            | 12 ++++++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 77e4f0040..977f0ef2a 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2272,17 +2272,17 @@ dsv4-fp4-mi355x-atom:
       search-space:
         # conc4-64, TP8
         # conc128-512, DPA
-        # conc1024, DPA TBO
+        # conc1024-8192, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
-        # conc256-1024, DPA TBO
+        # conc256-4096, DPA TBO
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 1024 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 369b72281..cfd4354b8 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -37,6 +37,15 @@ if [ "$DP_ATTENTION" = "true" ]; then
     fi
 fi 
 
+BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
+    export EVAL_MAX_MODEL_LEN
+    SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
+else
+    SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
+fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
 
@@ -44,8 +53,6 @@ set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
-# TODO: add --no-enable_chunked_prefill, when dsv4 prefix caching is supported 
-#https://github.com/ROCm/ATOM/commit/7df93a181da4d3c3250c2441c7d5e2745a03d0cd#diff-61b1ba0b8b74523530d2d5cdc739d4f3a23a43bedf69015a5235844d46e9373bL1127
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
@@ -54,6 +61,7 @@ python3 -m atom.entrypoints.openai_server \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
     --no-enable_prefix_caching \
+    --max-model-len "$SERVE_MAX_MODEL_LEN" \
     > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From c3b32890500af732e024b08051195bd8d47398e3 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Sat, 13 Jun 2026 22:07:18 +0900
Subject: [PATCH 06/29] [AMD] dsv4-fp4-mi355x-atom: narrow eval to single
 conc=1024 point, disable max-model-len

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 27 ++++++++++++-------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     |  2 +-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 977f0ef2a..6835d9abc 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2267,22 +2267,29 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128-512, DPA
-        # conc1024-8192, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
+    #- isl: 1024
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128-512, DPA
+    #    # conc1024-8192, DPA TBO
+    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
+    #- isl: 8192
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128, DPA
+    #    # conc256-4096, DPA TBO
+    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-4096, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 1024, conc-end: 1024 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index cfd4354b8..03b9ff0a0 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -61,8 +61,8 @@ python3 -m atom.entrypoints.openai_server \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
     --no-enable_prefix_caching \
-    --max-model-len "$SERVE_MAX_MODEL_LEN" \
     > $SERVER_LOG 2>&1 &
+    #--max-model-len "$SERVE_MAX_MODEL_LEN" \
 
 SERVER_PID=$!
 

From 7ffa976e1868cf5086bbd8f8b70f47d5512d32c3 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Sat, 13 Jun 2026 22:10:47 +0900
Subject: [PATCH 07/29] [AMD] dsv4_fp4_mi355x_atom.sh: add
 cudagraph-capture-sizes and max-num-seqs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 03b9ff0a0..898dac45e 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,6 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
+CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
@@ -29,6 +30,7 @@ if [ "$DP_ATTENTION" = "true" ]; then
         #DPA+TP+TBO
         if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
             PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024]'
         elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
             PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
         else
@@ -61,6 +63,8 @@ python3 -m atom.entrypoints.openai_server \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
     --no-enable_prefix_caching \
+    --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
+    --max-num-seqs ${CONC} \
     > $SERVER_LOG 2>&1 &
     #--max-model-len "$SERVE_MAX_MODEL_LEN" \
 

From f2677b2e5ef2fc9f9bcd1661c74568f71b2cab5d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 23:36:12 +0900
Subject: [PATCH 08/29] [AMD] dsv4-fp4-mi355x-atom: bump to nightly image,
 expand search space, enable max-model-len

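- Switch image to rocm/atom-dev:nightly_202606141623
- Restore the isl=1024 rows and the full isl=8192 rows in the search space
- Pass --max-model-len "$SERVE_MAX_MODEL_LEN" to the server again
- Drop --max-num-seqs and the extended CUDAGRAPH_SIZES override in the
  ISL=1024/OSL=1024, CONC>=1024 branch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>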
---
 .github/configs/amd-master.yaml               | 29 +++++++------------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     |  4 +--
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 6835d9abc..03972371a 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2258,7 +2258,7 @@ dsv4-fp4-mi355x-vllm-mtp:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612
+  image: rocm/atom-dev:nightly_202606141623
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2267,29 +2267,22 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    #- isl: 1024
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128-512, DPA
-    #    # conc1024-8192, DPA TBO
-    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
-    #- isl: 8192
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128, DPA
-    #    # conc256-4096, DPA TBO
-    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024-8192, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-4096, DPA TBO
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 1024, conc-end: 1024 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 898dac45e..54ea18836 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -30,7 +30,6 @@ if [ "$DP_ATTENTION" = "true" ]; then
         #DPA+TP+TBO
         if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
             PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
-            CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256,512,1024]'
         elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
             PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
         else
@@ -63,10 +62,9 @@ python3 -m atom.entrypoints.openai_server \
     --trust-remote-code \
     --gpu-memory-utilization 0.85 \
     --no-enable_prefix_caching \
+    --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
-    --max-num-seqs ${CONC} \
     > $SERVER_LOG 2>&1 &
-    #--max-model-len "$SERVE_MAX_MODEL_LEN" \
 
 SERVER_PID=$!
 

From f5f0d666a66044d54ecc243ed64ee4c265ad992d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Mon, 15 Jun 2026 23:48:34 +0900
Subject: [PATCH 09/29] [AMD] set GPU_MAX_HW_QUEUES=5 in
 dsv4_fp4_mi355x_atom.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 54ea18836..f643f8fb6 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -54,6 +54,7 @@ set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
+export GPU_MAX_HW_QUEUES=5
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \

From dc5b239d027138cc9cab494ea757c0fcd8dc39b7 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 01:05:13 +0900
Subject: [PATCH 10/29] [AMD] dsv4-fp4-mi355x-atom: disable TBO, add TP4 rows
 for isl=8192, cap conc ranges

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                   |  6 ++++--
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh         | 15 ++++++++-------
 perf-changelog.yaml                               |  1 -
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 03972371a..39255c81e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2274,7 +2274,7 @@ dsv4-fp4-mi355x-atom:
         # conc128-512, DPA
         # conc1024-8192, DPA TBO
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 8192 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
@@ -2282,7 +2282,9 @@ dsv4-fp4-mi355x-atom:
         # conc128, DPA
         # conc256-4096, DPA TBO
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 4096 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+      - { tp: 4, ep: 1, dp-attn: true, conc-start: 128, conc-end: 128 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index f643f8fb6..9702e6afc 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -28,13 +28,14 @@ if [ "$DP_ATTENTION" = "true" ]; then
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
     else #DPA+TP
         #DPA+TP+TBO
-        if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
-            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
-        elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
-            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
-        else
-            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
-        fi
+        #if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
+        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        #elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
+        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+        #else
+        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        #fi
+        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
     fi
 fi 
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f236a6d60..791b419e8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3605,6 +3605,5 @@
     - dsv4-fp4-mi355x-atom
   description:
     - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
-    - "Enable --enable-tbo (Two Batch Overlap) on top of DPA+TP8 at high concurrency: ISL=1024/OSL=1024 at CONC>=1024, ISL=8192/OSL=1024 at CONC>=256"
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

From 9e1805287ef7b998d6b6dbfc1f87713fde406d5b Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 13:01:23 +0900
Subject: [PATCH 11/29] [AMD] dsv4_fp4_mi355x_atom.sh: quote SERVER_LOG
 variable

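Besides quoting "$SERVER_LOG" in the server redirect, this commit also:
- Adds OPT_ARGS with --hf-overrides (use_index_cache, index_topk_freq=4)
  and passes it to the server command
- Narrows the amd-master.yaml search space to a single isl=8192 TP8
  conc=16 point; the previous rows are kept as comments

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>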
---
 .github/configs/amd-master.yaml               | 31 ++++++++++++-------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     |  5 ++-
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 3e1cc7f58..5a3b36666 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2267,24 +2267,31 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128-512, DPA
-        # conc1024-8192, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 1024
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128-512, DPA
+    #    # conc1024-8192, DPA TBO
+    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 8192
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128, DPA
+    #    # conc256-4096, DPA TBO
+    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    #  - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+    #  - { tp: 4, ep: 1, dp-attn: true, conc-start: 128, conc-end: 128 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-4096, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
-      - { tp: 4, ep: 1, dp-attn: true, conc-start: 128, conc-end: 128 }
+      - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 9702e6afc..a0d39218f 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -56,6 +56,8 @@ export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
 export GPU_MAX_HW_QUEUES=5
+OPT_ARGS=(--hf-overrides '{\"use_index_cache\": true, \"index_topk_freq\": 4}')
+
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \
     --server-port $PORT \
@@ -66,7 +68,8 @@ python3 -m atom.entrypoints.openai_server \
     --no-enable_prefix_caching \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
-    > $SERVER_LOG 2>&1 &
+    "${OPT_ARGS[@]}" \
+    > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

From c1812edb0050d143aba802824296c689d1cc0cfa Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 13:02:47 +0900
Subject: [PATCH 12/29] [AMD] dsv4_fp4_mi355x_atom.sh: comment out dense
 cudagraph sizes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index a0d39218f..939fb6ab2 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,7 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
-CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
+#CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )

From 28bdc6a99410da0ea7eb178a99272d9cdf89f9e3 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 14:27:20 +0900
Subject: [PATCH 13/29] [AMD] dsv4_fp4_mi355x_atom.sh: fix --hf-overrides JSON
 escaping

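- Remove the backslash escapes inside the single-quoted --hf-overrides JSON
- Also uncomment the dense CUDAGRAPH_SIZES list

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>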
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 939fb6ab2..93eec3e76 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,7 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
-#CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
+CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
@@ -56,7 +56,7 @@ export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
 export GPU_MAX_HW_QUEUES=5
-OPT_ARGS=(--hf-overrides '{\"use_index_cache\": true, \"index_topk_freq\": 4}')
+OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
 
 python3 -m atom.entrypoints.openai_server \
     --model $MODEL \

From b36218e0264946c2e8ce25ad9e9299ae0520549f Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 14:28:25 +0900
Subject: [PATCH 14/29] [AMD] dsv4_fp4_mi355x_atom.sh: comment out dense
 cudagraph sizes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 93eec3e76..0fedfa82d 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,7 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
-CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
+#CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )

From fa47caf52aa7e6534d809c824b32288e3e7a4d7c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 15:56:36 +0900
Subject: [PATCH 15/29] [AMD] dsv4-fp4-mi355x-atom: expand search space,
 restore isl=1024 rows

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 32 +++++++------------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     |  1 -
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5a3b36666..c5ffcff7c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2267,31 +2267,23 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    #- isl: 1024
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128-512, DPA
-    #    # conc1024-8192, DPA TBO
-    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
-    #- isl: 8192
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128, DPA
-    #    # conc256-4096, DPA TBO
-    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-    #  - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
-    #  - { tp: 4, ep: 1, dp-attn: true, conc-start: 128, conc-end: 128 }
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024-8192, DPA TBO, (skip)
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
-        # conc256-4096, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 16, conc-end: 16 }
+        # conc256-4096, DPA TBO, (skip)
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 0fedfa82d..d44f65d69 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,7 +22,6 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
-#CUDAGRAPH_SIZES='[1,2,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,64,68,72,76,80,84,88,92,96,100,104,108,112,116,120,124,128,132,136,140,144,148,152,156,160,164,168,172,176,180,184,188,192,196,200,204,208,212,216,220,224,228,232,236,240,244,248,252,256]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )

From af82c273f870181d125b93bc8172beb0db8061de Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 15:58:27 +0900
Subject: [PATCH 16/29] [AMD] perf-changelog: update dsv4-fp4-mi355x-atom image
 and search-space description

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2523daf59..ecad9d617 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3615,8 +3615,8 @@
 - config-keys:
     - dsv4-fp4-mi355x-atom
   description:
-    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
-    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+    - "Update image to rocm/atom-dev:nightly_202606141623"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-2048 (previously conc=1-64 and DPA conc=64-512)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
   
 - config-keys:

From 1300012bf1ecd01a1075e41f4c134b18dff0dd08 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 19:51:02 +0900
Subject: [PATCH 17/29] [AMD] dsv4_fp4_mi355x_atom.sh: restore sparse cudagraph
 capture sizes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index d44f65d69..9a4282e67 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -22,6 +22,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 SERVER_LOG=/workspace/server.log
 
 PARALLEL_ARGS=(-tp "$TP") #TP
+CUDAGRAPH_SIZES='[1, 2, 4, 8, 16, 32, 48, 64, 128, 256, 512]'
 if [ "$DP_ATTENTION" = "true" ]; then
     if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )

From f56f8777f82883807c17a4fde0d5bdf7d1127c96 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 19:53:31 +0900
Subject: [PATCH 18/29] [AMD] perf-changelog: revert dsv4-fp4-mi355x-atom
 image/search-space, remove stale entries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 25 ++-----------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ecad9d617..ec24c0007 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3615,8 +3615,8 @@
 - config-keys:
     - dsv4-fp4-mi355x-atom
   description:
-    - "Update image to rocm/atom-dev:nightly_202606141623"
-    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-2048 (previously conc=1-64 and DPA conc=64-512)"
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
   
 - config-keys:
@@ -3838,13 +3838,6 @@
     - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). DEP rows unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
 
-- config-keys:
-    - dsv4-fp4-mi355x-sglang
-  description:
-    - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
-    - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
-  
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt
     - dsv4-fp4-gb300-dynamo-trt-mtp
@@ -3856,17 +3849,3 @@
     - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026"
     - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689
-
-- config-keys:
-    - minimaxm3-fp8-b200-vllm
-  description:
-    - "Align MiniMax-M3 B200 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 B200 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048."
-    - "Add TP4+EP4 coverage for MiniMax-M3 B200: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1779
-
-- config-keys:
-    - minimaxm3-fp8-b200-vllm
-  description:
-    - "Align MiniMax-M3 B200 vLLM fixed-sequence serving with MiniMax-M2.5 FP8 B200 settings by setting VLLM_FLOAT32_MATMUL_PRECISION=high and restoring max cudagraph capture size 2048."
-    - "Add TP4+EP4 coverage for MiniMax-M3 B200: DP-attention rows for 1k1k/8k1k and the missing non-DP-attention row for 8k1k."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1779

From a4828cbd7fd19913a40fddeb988564427ca22bd1 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Tue, 16 Jun 2026 19:58:01 +0900
Subject: [PATCH 19/29] [AMD] perf-changelog: add dsv4-fp4-mi355x-sglang entry
 for PR #1762

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 811e6e6af..0d30ff635 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3838,6 +3838,13 @@
     - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). DEP rows unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
 
+- config-keys:
+    - dsv4-fp4-mi355x-sglang
+  description:
+    - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
+    - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
+
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt
     - dsv4-fp4-gb300-dynamo-trt-mtp

From 19b8757e2560fdfe843e72f99fd08141502f46c1 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 17 Jun 2026 13:26:51 +0900
Subject: [PATCH 20/29] update dsv4-fp4-mi355x-atom: bump image, enable TBO
 conditionally, raise gpu-memory-utilization 0.85->0.9, drop --max-model-len

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 29 ++++++++++++-------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     | 24 +++++++--------
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index fc317d224..c92346c14 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2262,7 +2262,7 @@ dsv4-fp4-mi355x-vllm-mtp:
       - { tp: 8, conc-start: 4, conc-end: 512, spec-decoding: mtp }
 
 dsv4-fp4-mi355x-atom:
-  image: rocm/atom-dev:nightly_202606141623
+  image: rocm/atom-dev:nightly_202606161823
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -2271,22 +2271,29 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128-512, DPA
-        # conc1024-8192, DPA TBO, (skip)
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 1024
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128-512, DPA
+    #    # conc1024-8192, DPA TBO, (skip)
+    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 8192
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128, DPA
+    #    # conc256-4096, DPA TBO, (skip)
+    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
+    #  - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-4096, DPA TBO, (skip)
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
       - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 9a4282e67..7a2514c6b 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -28,13 +28,15 @@ if [ "$DP_ATTENTION" = "true" ]; then
         PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
     else #DPA+TP
         #DPA+TP+TBO
-        #if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
-        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
-        #elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
-        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
-        #else
-        #    PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
-        #fi
+        if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 1024 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            export GPU_MAX_HW_QUEUES=5
+        elif [ "$ISL" -eq 8192 ] && [ "$OSL" -eq 1024 ] && [ "$CONC" -ge 256 ]; then
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention --enable-tbo)
+            export GPU_MAX_HW_QUEUES=5
+        else
+            PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
+        fi
         PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
     fi
 fi 
@@ -44,9 +46,6 @@ BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
     export EVAL_MAX_MODEL_LEN
-    SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
-else
-    SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
 fi
 # Start GPU monitoring (power, temperature, clocks every second)
 start_gpu_monitor
@@ -55,7 +54,7 @@ set -x
 export ATOM_DISABLE_MMAP=true
 export AITER_BF16_FP8_MOE_BOUND=0
 export ATOM_MOE_GU_ITLV=1
-export GPU_MAX_HW_QUEUES=5
+MEM_FRAC_STATIC=0.9
 OPT_ARGS=(--hf-overrides '{"use_index_cache": true, "index_topk_freq": 4}')
 
 python3 -m atom.entrypoints.openai_server \
@@ -64,9 +63,8 @@ python3 -m atom.entrypoints.openai_server \
     "${PARALLEL_ARGS[@]}" \
     --kv_cache_dtype fp8 \
     --trust-remote-code \
-    --gpu-memory-utilization 0.85 \
+    --gpu-memory-utilization $MEM_FRAC_STATIC \
     --no-enable_prefix_caching \
-    --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --cudagraph-capture-sizes "${CUDAGRAPH_SIZES}" \
     "${OPT_ARGS[@]}" \
     > "$SERVER_LOG" 2>&1 &

From 03aaa6b6078a5b07742165195e490448b838b12c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 17 Jun 2026 14:53:24 +0900
Subject: [PATCH 21/29] expand dsv4-fp4-mi355x-atom search space: restore
 ISL1024 scenarios, add TP4/TP8 conc lists for ISL8192

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c92346c14..9120ebdba 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2271,29 +2271,22 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    #- isl: 1024
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128-512, DPA
-    #    # conc1024-8192, DPA TBO, (skip)
-    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
-    #- isl: 8192
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128, DPA
-    #    # conc256-4096, DPA TBO, (skip)
-    #  - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-    #  - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024-2048, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
-        # conc256-4096, DPA TBO, (skip)
+        # conc256-2048, DPA TBO
+      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
       - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:

From 421313c2d6590126f8b0973930f37c496d6b9f40 Mon Sep 17 00:00:00 2001
From: seungrokj <144636725+seungrokj@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:54:49 +0900
Subject: [PATCH 22/29] perf-changelog: add blank line after
 dsv4-fp4-mi355x-sglang entry (whitespace only)

---
 perf-changelog.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 38756c91c..f55774a60 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3844,6 +3844,7 @@
     - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
+
   
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt

From ae77233689f3d0668c8d7b107896214b60298c22 Mon Sep 17 00:00:00 2001
From: seungrokj <144636725+seungrokj@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:55:43 +0900
Subject: [PATCH 23/29] perf-changelog: move extra blank line from after to
 before dsv4-fp4-mi355x-sglang entry (whitespace only)

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f55774a60..3851bedfe 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3838,6 +3838,7 @@
     - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). DEP rows unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
 
+
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description:
@@ -3845,7 +3846,6 @@
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
 
-  
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt
     - dsv4-fp4-gb300-dynamo-trt-mtp

From a8f6bd0662c70c86c9fb92cd8bad2082b5c5c389 Mon Sep 17 00:00:00 2001
From: seungrokj <144636725+seungrokj@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:56:25 +0900
Subject: [PATCH 24/29] perf-changelog: whitespace-only change to blank line
 after PR #1761 entry

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3851bedfe..dcfe6174e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3837,7 +3837,7 @@
   description:
     - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). DEP rows unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
-
+ 
 
 - config-keys:
     - dsv4-fp4-mi355x-sglang

From 5fbd0689ea69fa1f1f2d7e6f85d16b6ab187eb2a Mon Sep 17 00:00:00 2001
From: seungrokj <144636725+seungrokj@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:57:52 +0900
Subject: [PATCH 25/29] perf-changelog: whitespace-only blank-line changes
 after PR #1761 and PR #1762 entries

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index dcfe6174e..3cc5510b3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3837,7 +3837,6 @@
   description:
     - "Extend MiniMax-M3 MXFP8 H100/H200 non-MTP sweeps to concurrency 1 on the latency rows (H100: TP8; H200: TP4 and TP8) and add full TEP coverage from conc 1 to 256 (H100: TP8+EP8; H200: TP4+EP4 and TP8+EP8, incl. a new TP4+EP4 row for 8k1k). H200 TP8+EP8 upper bound moves 512->256 (high concurrency stays covered by the TP8+EP8 dp-attn DEP rows). DEP rows unchanged"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1761
- 
 
 - config-keys:
     - dsv4-fp4-mi355x-sglang
@@ -3878,6 +3877,7 @@
     - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
+ 
 
 - config-keys:
     - kimik2.5-int4-mi355x-vllm

From d080faa169d32075bb9db7f91033bf0cde81e894 Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 17 Jun 2026 15:00:20 +0900
Subject: [PATCH 26/29] update perf-changelog: move dsv4-fp4-mi355x-atom entry
 to end (also changes file mode to 100755)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 perf-changelog.yaml

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
old mode 100644
new mode 100755
index 3cc5510b3..b4b5be948
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3612,13 +3612,6 @@
     - "Expand the TP sweep (included TP=4) for 8k/1k configuration for conc=4 to 64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1692
 
-- config-keys:
-    - dsv4-fp4-mi355x-atom
-  description:
-    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
-    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
-  
 - config-keys:
     - qwen3.5-fp4-mi355x-sglang
     - qwen3.5-fp4-mi355x-sglang-mtp
@@ -3844,6 +3837,7 @@
     - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
+  
 
 - config-keys:
     - dsv4-fp4-gb300-dynamo-trt
@@ -3877,7 +3871,6 @@
     - "Switch fixed-seq-len search space from TP8 to TP4 for both isl=1024 and isl=8192 scenarios"
     - "Expand isl=8192 coverage: add TP4 dp-attn sweep (conc 32–2048) and TP4 TP-only sweep (conc 1–32)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
- 
 
 - config-keys:
     - kimik2.5-int4-mi355x-vllm
@@ -3916,3 +3909,10 @@
   description:
     - "Use the Marlin MoE backend for MiniMax-M3 B200/B300 TP-only vLLM configurations by adding --moe-backend marlin when expert parallelism is disabled."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1809
+
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717

From 91f6277f1218c5a360f8303b78654cc35585ae0c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 17 Jun 2026 17:21:05 +0900
Subject: [PATCH 27/29] narrow dsv4-fp4-mi355x-atom to DPA conc=256-2048
 ISL8192, fix TBO branch override

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 31 ++++++++++++-------
 .../fixed_seq_len/dsv4_fp4_mi355x_atom.sh     |  1 -
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9120ebdba..cd9aa6874 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2271,23 +2271,32 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        # conc4-64, TP8
-        # conc128-512, DPA
-        # conc1024-2048, DPA TBO
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 1024
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128-512, DPA
+    #    # conc1024-2048, DPA TBO
+    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
+    #- isl: 8192
+    #  osl: 1024
+    #  search-space:
+    #    # conc4-64, TP8
+    #    # conc128, DPA
+    #    # conc256-2048, DPA TBO
+    #  - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+    #  - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-2048, DPA TBO
-      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
-      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+      #- { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      #- { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 256, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3
diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
index 7a2514c6b..43f84d996 100644
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_atom.sh
@@ -37,7 +37,6 @@ if [ "$DP_ATTENTION" = "true" ]; then
         else
             PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
         fi
-        PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
     fi
 fi 
 

From 4364ef96418804c8b89dc6f97730067bbd21827a Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 17 Jun 2026 17:40:47 +0900
Subject: [PATCH 28/29] restore full dsv4-fp4-mi355x-atom search space: ISL1024
 + ISL8192 TP4/TP8/DPA

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index cd9aa6874..9120ebdba 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2271,32 +2271,23 @@ dsv4-fp4-mi355x-atom:
   multinode: false
   scenarios:
     fixed-seq-len:
-    #- isl: 1024
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128-512, DPA
-    #    # conc1024-2048, DPA TBO
-    #  - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
-    #- isl: 8192
-    #  osl: 1024
-    #  search-space:
-    #    # conc4-64, TP8
-    #    # conc128, DPA
-    #    # conc256-2048, DPA TBO
-    #  - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
-    #  - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-    #  - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # conc4-64, TP8
+        # conc128-512, DPA
+        # conc1024-2048, DPA TBO
+      - { tp: 8, ep: 1, conc-start: 1, conc-end: 64 }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 64, conc-end: 2048 }
     - isl: 8192
       osl: 1024
       search-space:
         # conc4-64, TP8
         # conc128, DPA
         # conc256-2048, DPA TBO
-      #- { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
-      #- { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, ep: 1, dp-attn: true, conc-start: 256, conc-end: 2048 }
+      - { tp: 4, ep: 1, conc-list: [8, 16, 32, 64] }
+      - { tp: 8, ep: 1, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 1, dp-attn: true, conc-start: 128, conc-end: 2048 }
 
 dsv4-fp4-mi355x-atom-mtp:
   image: rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3

From 52f977942454246d2eeaad8782a63ba8e126046d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:04:28 +0800
Subject: [PATCH 29/29] chore: retrigger dsv4 atom benchmark sweep

---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7a150d908..ed995b364 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3935,3 +3935,11 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+  
+- config-keys:
+    - dsv4-fp4-mi355x-atom
+  description:
+    - "Update image to rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.4_20260612"
+    - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
+    - "Update Applied TBO on high concurrencies"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717