@@ -239,10 +239,6 @@ qwen3.5-fp8-mi355x-sglang:
239239 search-space :
240240 - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
241241 - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
242- agentic-coding :
243- - duration : 1800
244- search-space :
245- - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
246242
247243qwen3.5-fp8-mi355x-sglang-mtp :
248244 image : lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
@@ -331,6 +327,27 @@ qwen3.5-fp4-mi355x-sglang:
331327 - { tp: 2, conc-start: 4, conc-end: 256 }
332328 - { tp: 4, conc-start: 4, conc-end: 16 }
333329
330+ qwen3.5-fp4-mi355x-atom :
331+ image : rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
332+ model : amd/Qwen3.5-397B-A17B-MXFP4
333+ model-prefix : qwen3.5
334+ runner : mi355x
335+ precision : fp4
336+ framework : atom
337+ multinode : false
338+ scenarios :
339+ fixed-seq-len :
340+ - isl : 1024
341+ osl : 1024
342+ search-space :
343+ - { tp: 2, conc-start: 4, conc-end: 256 }
344+ - { tp: 4, conc-start: 4, conc-end: 16 }
345+ - isl : 8192
346+ osl : 1024
347+ search-space :
348+ - { tp: 2, conc-start: 4, conc-end: 256 }
349+ - { tp: 4, conc-start: 4, conc-end: 16 }
350+
334351qwen3.5-fp8-mi300x-sglang :
335352 image : lmsysorg/sglang:v0.5.10-rocm720-mi30x
336353 model : Qwen/Qwen3.5-397B-A17B-FP8
@@ -382,11 +399,13 @@ glm5-fp8-mi355x-sglang-mtp:
382399 - isl : 1024
383400 osl : 1024
384401 search-space :
385- - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
402+ - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
403+ - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
386404 - isl : 8192
387405 osl : 1024
388406 search-space :
389- - { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
407+ - { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
408+ - { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
390409
391410glm5-fp8-mi355x-atom :
392411 image : rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -401,10 +420,12 @@ glm5-fp8-mi355x-atom:
401420 - isl : 1024
402421 osl : 1024
403422 search-space :
423+ - { tp: 4, conc-start: 4, conc-end: 256 }
404424 - { tp: 8, conc-start: 4, conc-end: 256 }
405425 - isl : 8192
406426 osl : 1024
407427 search-space :
428+ - { tp: 4, conc-start: 4, conc-end: 256 }
408429 - { tp: 8, conc-start: 4, conc-end: 256 }
409430
410431glm5.1-fp4-mi355x-sglang :
@@ -427,11 +448,6 @@ glm5.1-fp4-mi355x-sglang:
427448 search-space :
428449 - { tp: 2, conc-start: 4, conc-end: 256 }
429450 - { tp: 4, conc-start: 4, conc-end: 16 }
430- agentic-coding :
431- - duration : 1800
432- search-space :
433- # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
434- - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
435451
436452glm5.1-fp4-mi355x-atom :
437453 image : rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -510,11 +526,7 @@ kimik2.5-int4-mi300x-vllm:
510526 - { tp: 8, conc-start: 4, conc-end: 64 }
511527
512528kimik2.5-fp4-mi355x-vllm :
513- # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
514- # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
515- # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
516- # includes all subsequent ROCm offload work.
517- image : vllm/vllm-openai-rocm:v0.21.0
529+ image : vllm/vllm-openai-rocm:v0.18.0
518530 model : amd/Kimi-K2.5-MXFP4
519531 model-prefix : kimik2.5
520532 runner : mi355x
@@ -533,18 +545,6 @@ kimik2.5-fp4-mi355x-vllm:
533545 search-space :
534546 - { tp: 8, conc-start: 4, conc-end: 64 }
535547 - { tp: 4, conc-start: 4, conc-end: 64 }
536- # MI355X has 288 GB HBM per GPU (vs MI300X/MI325X smaller, comparable to
537- # B300). Extend the conc sweep upward to probe where the KV cliff sits
538- # with the larger HBM envelope. Restrict to tp=8 for this sweep to halve
539- # job count while still covering the main parallelism config.
540- agentic-coding :
541- - duration : 1800
542- search-space :
543- - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
544- # CPU offload only above the KV cliff. Lower concurrencies fit
545- # entirely on-GPU, so paying the offload-path overhead there would
546- # just slow them down without measuring anything new.
547- - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] }
548548
549549kimik2.5-fp4-mi355x-atom :
550550 image : rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
@@ -568,12 +568,7 @@ kimik2.5-fp4-mi355x-atom:
568568 - { tp: 4, conc-start: 4, conc-end: 128 }
569569
570570minimaxm2.5-fp8-mi355x-vllm :
571- # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
572- # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
573- # which enables SimpleCPUOffloadConnector on ROCm. Required for the
574- # cpu-offload sweep points to use the same offload path as the NVIDIA
575- # agentic-coding configs.
576- image : vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
571+ image : vllm/vllm-openai-rocm:v0.19.0
577572 model : MiniMaxAI/MiniMax-M2.5
578573 model-prefix : minimaxm2.5
579574 runner : mi355x
@@ -594,14 +589,6 @@ minimaxm2.5-fp8-mi355x-vllm:
594589 - { tp: 2, ep: 2, conc-start: 2, conc-end: 256 }
595590 - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
596591 - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
597- agentic-coding :
598- # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
599- # Compute saturates first; cpu offload likely won't help, but worth confirming.
600- # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
601- - duration : 1800
602- search-space :
603- - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
604- - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] }
605592
606593minimaxm2.5-fp8-mi355x-atom :
607594 image : rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -624,6 +611,31 @@ minimaxm2.5-fp8-mi355x-atom:
624611 - { tp: 2, conc-start: 4, conc-end: 256 }
625612 - { tp: 4, conc-start: 4, conc-end: 256 }
626613
614+ minimaxm2.5-fp4-mi355x-atom :
615+ image : rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
616+ model : amd/MiniMax-M2.5-MXFP4
617+ model-prefix : minimaxm2.5
618+ runner : mi355x
619+ precision : fp4
620+ framework : atom
621+ multinode : false
622+ scenarios :
623+ fixed-seq-len :
624+ - isl : 1024
625+ osl : 1024
626+ search-space :
627+ - { tp: 1, conc-start: 4, conc-end: 1024 }
628+ - { tp: 2, conc-start: 4, conc-end: 1024 }
629+ - { tp: 4, conc-start: 4, conc-end: 128 }
630+ - { tp: 8, conc-start: 4, conc-end: 16 }
631+ - isl : 8192
632+ osl : 1024
633+ search-space :
634+ - { tp: 1, conc-start: 4, conc-end: 1024 }
635+ - { tp: 2, conc-start: 4, conc-end: 1024 }
636+ - { tp: 4, conc-start: 4, conc-end: 128 }
637+ - { tp: 8, conc-start: 4, conc-end: 16 }
638+
627639minimaxm2.5-fp4-mi355x-vllm :
628640 image : vllm/vllm-openai-rocm:v0.19.1
629641 model : amd/MiniMax-M2.5-MXFP4
@@ -648,8 +660,7 @@ minimaxm2.5-fp4-mi355x-vllm:
648660 - { tp: 4, conc-start: 4, conc-end: 64 }
649661
650662minimaxm2.5-fp8-mi300x-vllm :
651- # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
652- image : vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
663+ image : vllm/vllm-openai-rocm:v0.16.0
653664 model : MiniMaxAI/MiniMax-M2.5
654665 model-prefix : minimaxm2.5
655666 runner : mi300x
@@ -668,18 +679,9 @@ minimaxm2.5-fp8-mi300x-vllm:
668679 search-space :
669680 - { tp: 2, conc-start: 4, conc-end: 64 }
670681 - { tp: 4, conc-start: 4, conc-end: 64 }
671- agentic-coding :
672- # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
673- # KV cliff ~52. Compute saturates first.
674- # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
675- - duration : 1800
676- search-space :
677- - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
678- - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }
679682
680683minimaxm2.5-fp8-mi325x-vllm :
681- # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
682- image : vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
684+ image : vllm/vllm-openai-rocm:v0.18.0
683685 model : MiniMaxAI/MiniMax-M2.5
684686 model-prefix : minimaxm2.5
685687 runner : mi325x
@@ -698,15 +700,6 @@ minimaxm2.5-fp8-mi325x-vllm:
698700 search-space :
699701 - { tp: 2, conc-start: 4, conc-end: 64 }
700702 - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
701- agentic-coding :
702- # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
703- # similar HBM profile). Compute saturates first; cpu-offload window
704- # exercises the SimpleCPUOffloadConnector path enabled by the rocm
705- # nightly. Mirror MI300X conc grid for cross-vendor comparability.
706- - duration : 1800
707- search-space :
708- - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
709- - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }
710703
711704gptoss-fp4-mi300x-vllm :
712705 image : vllm/vllm-openai-rocm:v0.17.0
@@ -1643,13 +1636,13 @@ dsv4-fp8-mi355x-vllm:
16431636 search-space :
16441637 - { tp: 8, conc-start: 1, conc-end: 1 }
16451638
1646- # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
1647- # PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
1648- # hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE =1 required on
1649- # gfx950. Image is the standard atom0.1.2.post MI355X base (matching
1650- # qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by
1651- # benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep
1652- # will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land .
1639+ # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
1640+ # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
1641+ # that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
1642+ # the AITER sparse-attention kernel / multi-request path lands upstream.
1643+ # --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
1644+ # the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
1645+ # the DSv4 PR is overlaid at runtime by dsv4_fp4_mi355x_atom.sh at a pinned SHA.
16531646dsv4-fp4-mi355x-atom :
16541647 image : rocm/atom-dev:nightly_202605130853
16551648 model : deepseek-ai/DeepSeek-V4-Pro
0 commit comments