Skip to content

Commit 801621f

Browse files
Revert "Migrate agentic-coding benchmarks to aiperf v0.2 (#1391)" (#1392)
This reverts commit 370a162.
1 parent 370a162 commit 801621f

47 files changed

Lines changed: 513 additions & 3752 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/configs/amd-master.yaml

Lines changed: 63 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -239,10 +239,6 @@ qwen3.5-fp8-mi355x-sglang:
239239
search-space:
240240
- { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
241241
- { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
242-
agentic-coding:
243-
- duration: 1800
244-
search-space:
245-
- { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
246242

247243
qwen3.5-fp8-mi355x-sglang-mtp:
248244
image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
@@ -331,6 +327,27 @@ qwen3.5-fp4-mi355x-sglang:
331327
- { tp: 2, conc-start: 4, conc-end: 256 }
332328
- { tp: 4, conc-start: 4, conc-end: 16 }
333329

330+
qwen3.5-fp4-mi355x-atom:
331+
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
332+
model: amd/Qwen3.5-397B-A17B-MXFP4
333+
model-prefix: qwen3.5
334+
runner: mi355x
335+
precision: fp4
336+
framework: atom
337+
multinode: false
338+
scenarios:
339+
fixed-seq-len:
340+
- isl: 1024
341+
osl: 1024
342+
search-space:
343+
- { tp: 2, conc-start: 4, conc-end: 256 }
344+
- { tp: 4, conc-start: 4, conc-end: 16 }
345+
- isl: 8192
346+
osl: 1024
347+
search-space:
348+
- { tp: 2, conc-start: 4, conc-end: 256 }
349+
- { tp: 4, conc-start: 4, conc-end: 16 }
350+
334351
qwen3.5-fp8-mi300x-sglang:
335352
image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
336353
model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -382,11 +399,13 @@ glm5-fp8-mi355x-sglang-mtp:
382399
- isl: 1024
383400
osl: 1024
384401
search-space:
385-
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
402+
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
403+
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
386404
- isl: 8192
387405
osl: 1024
388406
search-space:
389-
- { tp: 8, conc-start: 4, conc-end: 64, spec-decoding: mtp }
407+
- { tp: 4, conc-start: 4, conc-end: 128, spec-decoding: mtp }
408+
- { tp: 8, conc-start: 4, conc-end: 8, spec-decoding: mtp }
390409

391410
glm5-fp8-mi355x-atom:
392411
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -401,10 +420,12 @@ glm5-fp8-mi355x-atom:
401420
- isl: 1024
402421
osl: 1024
403422
search-space:
423+
- { tp: 4, conc-start: 4, conc-end: 256 }
404424
- { tp: 8, conc-start: 4, conc-end: 256 }
405425
- isl: 8192
406426
osl: 1024
407427
search-space:
428+
- { tp: 4, conc-start: 4, conc-end: 256 }
408429
- { tp: 8, conc-start: 4, conc-end: 256 }
409430

410431
glm5.1-fp4-mi355x-sglang:
@@ -427,11 +448,6 @@ glm5.1-fp4-mi355x-sglang:
427448
search-space:
428449
- { tp: 2, conc-start: 4, conc-end: 256 }
429450
- { tp: 4, conc-start: 4, conc-end: 16 }
430-
agentic-coding:
431-
- duration: 1800
432-
search-space:
433-
# sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
434-
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
435451

436452
glm5.1-fp4-mi355x-atom:
437453
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -510,11 +526,7 @@ kimik2.5-int4-mi300x-vllm:
510526
- { tp: 8, conc-start: 4, conc-end: 64 }
511527

512528
kimik2.5-fp4-mi355x-vllm:
513-
# v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
514-
# (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
515-
# cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
516-
# includes all subsequent ROCm offload work.
517-
image: vllm/vllm-openai-rocm:v0.21.0
529+
image: vllm/vllm-openai-rocm:v0.18.0
518530
model: amd/Kimi-K2.5-MXFP4
519531
model-prefix: kimik2.5
520532
runner: mi355x
@@ -533,18 +545,6 @@ kimik2.5-fp4-mi355x-vllm:
533545
search-space:
534546
- { tp: 8, conc-start: 4, conc-end: 64 }
535547
- { tp: 4, conc-start: 4, conc-end: 64 }
536-
# MI355X has 288 GB HBM per GPU (vs MI300X/MI325X smaller, comparable to
537-
# B300). Extend the conc sweep upward to probe where the KV cliff sits
538-
# with the larger HBM envelope. Restrict to tp=8 for this sweep to halve
539-
# job count while still covering the main parallelism config.
540-
agentic-coding:
541-
- duration: 1800
542-
search-space:
543-
- { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
544-
# CPU offload only above the KV cliff. Lower concurrencies fit
545-
# entirely on-GPU, so paying the offload-path overhead there would
546-
# just slow them down without measuring anything new.
547-
- { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] }
548548

549549
kimik2.5-fp4-mi355x-atom:
550550
image: rocm/atom:rocm7.2.1-ubuntu24.04-pytorch2.9.1-atom0.1.2
@@ -568,12 +568,7 @@ kimik2.5-fp4-mi355x-atom:
568568
- { tp: 4, conc-start: 4, conc-end: 128 }
569569

570570
minimaxm2.5-fp8-mi355x-vllm:
571-
# Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
572-
# add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
573-
# which enables SimpleCPUOffloadConnector on ROCm. Required for the
574-
# cpu-offload sweep points to use the same offload path as the NVIDIA
575-
# agentic-coding configs.
576-
image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
571+
image: vllm/vllm-openai-rocm:v0.19.0
577572
model: MiniMaxAI/MiniMax-M2.5
578573
model-prefix: minimaxm2.5
579574
runner: mi355x
@@ -594,14 +589,6 @@ minimaxm2.5-fp8-mi355x-vllm:
594589
- { tp: 2, ep: 2, conc-start: 2, conc-end: 256 }
595590
- { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
596591
- { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
597-
agentic-coding:
598-
# MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
599-
# Compute saturates first; cpu offload likely won't help, but worth confirming.
600-
# AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
601-
- duration: 1800
602-
search-space:
603-
- { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
604-
- { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] }
605592

606593
minimaxm2.5-fp8-mi355x-atom:
607594
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
@@ -624,6 +611,31 @@ minimaxm2.5-fp8-mi355x-atom:
624611
- { tp: 2, conc-start: 4, conc-end: 256 }
625612
- { tp: 4, conc-start: 4, conc-end: 256 }
626613

614+
minimaxm2.5-fp4-mi355x-atom:
615+
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
616+
model: amd/MiniMax-M2.5-MXFP4
617+
model-prefix: minimaxm2.5
618+
runner: mi355x
619+
precision: fp4
620+
framework: atom
621+
multinode: false
622+
scenarios:
623+
fixed-seq-len:
624+
- isl: 1024
625+
osl: 1024
626+
search-space:
627+
- { tp: 1, conc-start: 4, conc-end: 1024 }
628+
- { tp: 2, conc-start: 4, conc-end: 1024 }
629+
- { tp: 4, conc-start: 4, conc-end: 128 }
630+
- { tp: 8, conc-start: 4, conc-end: 16 }
631+
- isl: 8192
632+
osl: 1024
633+
search-space:
634+
- { tp: 1, conc-start: 4, conc-end: 1024 }
635+
- { tp: 2, conc-start: 4, conc-end: 1024 }
636+
- { tp: 4, conc-start: 4, conc-end: 128 }
637+
- { tp: 8, conc-start: 4, conc-end: 16 }
638+
627639
minimaxm2.5-fp4-mi355x-vllm:
628640
image: vllm/vllm-openai-rocm:v0.19.1
629641
model: amd/MiniMax-M2.5-MXFP4
@@ -648,8 +660,7 @@ minimaxm2.5-fp4-mi355x-vllm:
648660
- { tp: 4, conc-start: 4, conc-end: 64 }
649661

650662
minimaxm2.5-fp8-mi300x-vllm:
651-
# Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
652-
image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
663+
image: vllm/vllm-openai-rocm:v0.16.0
653664
model: MiniMaxAI/MiniMax-M2.5
654665
model-prefix: minimaxm2.5
655666
runner: mi300x
@@ -668,18 +679,9 @@ minimaxm2.5-fp8-mi300x-vllm:
668679
search-space:
669680
- { tp: 2, conc-start: 4, conc-end: 64 }
670681
- { tp: 4, conc-start: 4, conc-end: 64 }
671-
agentic-coding:
672-
# MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
673-
# KV cliff ~52. Compute saturates first.
674-
# AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
675-
- duration: 1800
676-
search-space:
677-
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
678-
- { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }
679682

680683
minimaxm2.5-fp8-mi325x-vllm:
681-
# Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
682-
image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
684+
image: vllm/vllm-openai-rocm:v0.18.0
683685
model: MiniMaxAI/MiniMax-M2.5
684686
model-prefix: minimaxm2.5
685687
runner: mi325x
@@ -698,15 +700,6 @@ minimaxm2.5-fp8-mi325x-vllm:
698700
search-space:
699701
- { tp: 2, conc-start: 4, conc-end: 64 }
700702
- { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
701-
agentic-coding:
702-
# MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
703-
# similar HBM profile). Compute saturates first; cpu-offload window
704-
# exercises the SimpleCPUOffloadConnector path enabled by the rocm
705-
# nightly. Mirror MI300X conc grid for cross-vendor comparability.
706-
- duration: 1800
707-
search-space:
708-
- { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
709-
- { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] }
710703

711704
gptoss-fp4-mi300x-vllm:
712705
image: vllm/vllm-openai-rocm:v0.17.0
@@ -1643,13 +1636,13 @@ dsv4-fp8-mi355x-vllm:
16431636
search-space:
16441637
- { tp: 8, conc-start: 1, conc-end: 1 }
16451638

1646-
# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
1647-
# PR1 of the ATOM DSv4 series — single-sequence only (kv_cache[:1,...]
1648-
# hardcode), --enforce-eager required, ATOM_USE_TRITON_MOE=1 required on
1649-
# gfx950. Image is the standard atom0.1.2.post MI355X base (matching
1650-
# qwen3.5-fp8-mi355x-atom); the DSv4 PR is overlaid at runtime by
1651-
# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA. Sweep
1652-
# will expand once ATOM PR3 (multi-request) and PR4 (CUDAGraph) land.
1639+
# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
1640+
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
1641+
# that OOM once warmup/prefill batches multiple requests; keep CONC=1 until
1642+
# the AITER sparse-attention kernel / multi-request path lands upstream.
1643+
# --enforce-eager and ATOM_USE_TRITON_MOE=1 are required on gfx950. Image is
1644+
# the standard atom0.1.2.post MI355X base (matching qwen3.5-fp8-mi355x-atom);
1645+
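# the DSv4 PR is overlaid at runtime by
# benchmarks/single_node/dsv4_fp4_mi355x_atom.sh at a pinned SHA.
# NOTE(review): the image pinned below is rocm/atom-dev:nightly_202605130853,
# not an atom0.1.2.post tag — confirm the image description above is current.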
16531646
dsv4-fp4-mi355x-atom:
16541647
image: rocm/atom-dev:nightly_202605130853
16551648
model: deepseek-ai/DeepSeek-V4-Pro

0 commit comments

Comments
 (0)