diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index fb3966ce6..a50d37eab 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. -qwen3.5-fp8-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. 
-glm5.1-fp4-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - glm5.1-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/GLM-5.1-MXFP4 @@ -744,7 +705,7 @@ glm5.1-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi355x @@ -763,7 +724,7 @@ kimik2.5-int4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -782,7 +743,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi300x @@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' -kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. 
v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. - - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } - kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: amd/Kimi-K2.5-MXFP4 @@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } -# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi355x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] - # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), - # which enables SimpleCPUOffloadConnector on ROCm. Required for the - # cpu-offload sweep points to use the same offload path as the NVIDIA - # agentic-coding configs. 
- image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). - # Compute saturates first; cpu offload likely won't help, but worth confirming. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } - minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: MiniMaxAI/MiniMax-M2.5 @@ -994,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi300x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. 
- image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); - # KV cliff ~52. Compute saturates first. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -1058,32 +937,8 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi325x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, - # similar HBM profile). Compute saturates first; cpu-offload window - # exercises the SimpleCPUOffloadConnector path enabled by the rocm - # nightly. Mirror MI300X conc grid for cross-vendor comparability. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -1524,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x-disagg @@ -1578,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg @@ -1971,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2082,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 128 ] @@ -2140,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. 
dsv4-fp4-mi355x-sglang: image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2201,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp: - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -2408,44 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). 
-# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ - -qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } - -dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4] } - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } - dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2674,20 +2465,145 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" +qwen3.5-fp8-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +glm5.1-fp4-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + 
multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +kimik2.5-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + # CPU offload only above the KV cliff. Lower concurrencies fit + # entirely on-GPU, so paying the offload-path overhead there would + # just slow them down without measuring anything new. + - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } + # TP=4 probe: half-node layout doubles per-GPU weight footprint + # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to + # cliff-region concurrencies on both offload modes so we can directly + # compare TP=4 vs TP=8 at the same conc points. + - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + +minimaxm2.5-fp8-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). 
+ - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } + +minimaxm2.5-fp8-mi300x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); + # KV cliff ~52. Compute saturates first. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +minimaxm2.5-fp8-mi325x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, + # similar HBM profile). Compute saturates first; cpu-offload window + # exercises the SimpleCPUOffloadConnector path enabled by the rocm + # nightly. Mirror MI300X conc grid for cross-vendor comparability. 
+ - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.22.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. - -# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# Image is identical to the base entry (rocm/sgl-dev DSv4 build). -# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware -# comparability. Offload sweep is none-only (SGLang has no equivalent of -# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). 
dsv4-fp4-mi355x-sglang-agentic: image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2702,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic: search-space: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. 
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..7a15b43a3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - agentic-coding: - - duration: 300 - search-space: - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - dsr1-fp8-b200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: deepseek-ai/DeepSeek-R1-0528 @@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200-dsv4' -> 'b200-dgxc' -dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). - # Re-add when investigating regressions in offload=none. 
- - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -1845,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp: # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp4-b200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2105,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml -# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads -# as `bmk_agentic_*`). Re-enable once that workflow is aligned. -# qwen3.5-bf16-b200-sglang-agentic: -# image: lmsysorg/sglang:v0.5.12-cu130 -# model: Qwen/Qwen3.5-397B-A17B -# model-prefix: qwen3.5 -# runner: b200 -# precision: bf16 -# framework: sglang -# multinode: false -# scenarios: -# agentic-coding: -# - duration: 1800 -# search-space: -# - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2143,25 +2085,6 @@ qwen3.5-fp8-b200-sglang: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } -# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. 
-qwen3.5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:nightly-dev-20260422-de962f32 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2245,26 +2168,6 @@ glm5-fp8-b200-sglang-mtp: # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8 # B200 SGLang recipe as-is until B300-specific tuning is available. -# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. 
-glm5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:v0.5.12-cu130 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } - glm5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -2411,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - qwen3.5-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2553,39 +2455,8 @@ kimik2.5-int4-b200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-int4-b200-vllm-agentic: - # Bumped from v0.19.1 — that release tripped a bug in - # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') - # during warmup `profile_run` on the agentic-coding path - # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the - # flashinfer fix. 
- image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-int4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b300 @@ -2624,29 +2495,6 @@ kimik2.5-int4-h200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'h200' -> 'h200-dgxc' -kimik2.5-int4-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with - # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's - # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) - # don't have that mount and would re-materialize 65 GB to /tmp every job. 
- runner: h200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } - - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } - kimik2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 @@ -2668,40 +2516,8 @@ kimik2.5-fp4-b200-vllm: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. 
- image: vllm/vllm-openai:v0.20.2 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -2763,34 +2579,6 @@ dsr1-fp8-b300-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp } -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' -kimik2.5-fp4-b300-vllm-agentic: - # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM - # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the - # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted - # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the - # INT4 B300 sister already uses successfully. 
- image: vllm/vllm-openai:v0.20.0-cu130 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14 model: deepseek-ai/DeepSeek-R1-0528 @@ -2880,7 +2668,7 @@ dsr1-fp8-h200-sglang-mtp: # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache # flag is omitted. Max-model-len is pinned at 800k per the recipe. dsv4-fp8-h200-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2904,7 +2692,7 @@ dsv4-fp8-h200-vllm: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp8-h200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2924,31 +2712,6 @@ dsv4-fp8-h200-vllm-mtp: - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). -# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper -# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. -# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' -dsv4-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:deepseekv4-cu129 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } - -# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image -# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds -# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. - dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro @@ -3024,30 +2787,6 @@ dsv4-fp4-b300-vllm: - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. -dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs. Re-add when investigating regressions in offload=none. 
- - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } - dsv4-fp4-b300-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -3095,7 +2834,7 @@ dsv4-fp4-b300-trt-mtp: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp } dsv4-fp4-b300-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -4284,27 +4023,6 @@ gptoss-fp4-b200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 4 } -# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' -gptoss-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - minimaxm2.5-fp8-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -4330,35 +4048,8 @@ minimaxm2.5-fp8-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -# - runner: 'b200' -> 'b200-dgxc' -minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b200-dgxc - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). - # Push none past the KV cliff (96, 128) to make the no-offload throughput - # collapse visible; cpu range overlaps fully for same-conc comparison. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } - - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. - minimaxm2.5-fp8-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -4381,31 +4072,6 @@ minimaxm2.5-fp8-b300-vllm: - { tp: 2, conc-start: 64, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 8 } -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b300 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). - # Push none past the KV cliff (96, 128, 192) so the no-offload throughput - # collapse is visible; cpu range overlaps fully so each high-conc point - # has a same-conc no-offload counterpart for direct comparison. - # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff - # observed in v6 cpu data right past conc=96. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - minimaxm2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -4438,31 +4104,8 @@ minimaxm2.5-fp4-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.0-cu130 - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
- minimaxm2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b300 @@ -4489,7 +4132,7 @@ minimaxm2.5-fp4-b300-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -4530,29 +4173,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } -# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h100 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). - # Best cpu-offload demo SKU — 4-conc-point window between cliffs. - # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } - dsr1-fp8-h100-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -4757,28 +4377,6 @@ minimaxm2.5-fp8-h200-vllm: search-space: - { tp: 4, conc-start: 1, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). 
Metadata is -# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). - # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } - - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } - dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: nvidia/DeepSeek-R1-0528-NVFP4-v2 @@ -8267,7 +7865,7 @@ kimik2.5-fp4-gb200-dynamo-trt: dp-attn: true kimik2.5-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.18.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: gb200 @@ -8369,7 +7967,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-b200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-multinode @@ -8425,7 +8023,7 @@ dsv4-fp4-b200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8525,7 +8123,7 @@ dsv4-fp4-gb200-dynamo-vllm: # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm. 
dsv4-fp4-gb200-dynamo-vllm-mtp2: - image: vllm/vllm-openai:v0.20.1-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8605,7 +8203,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: dp-attn: true dsv4-fp4-b300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -8661,7 +8259,7 @@ dsv4-fp4-b300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-nv @@ -8856,7 +8454,7 @@ glm5-fp8-b200-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 - runner: b200-dgxc + runner: b200 precision: fp8 framework: dynamo-sglang multinode: true @@ -9202,27 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true - -kimik2.5-int4-h100-vllm: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: h100 - precision: int4 - framework: vllm - multinode: false - scenarios: - # New entry, agentic-coding only: this PR intentionally does NOT add - # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the - # fixed-seq-len test surface identical to origin/main. - # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives - # early. Sweep saturates conc=20 to keep total HBM headroom. 
- agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } - - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } - qwen3.5-fp8-h100-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9681,12 +9258,340 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ +dsv4-fp4-b200-vllm-agentic: + # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention + # prefix-cache retention when deciding which hybrid-KV blocks to store. + image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40] } + - { tp: 8, offloading: cpu, conc-list: [40, 48, 52, 64, 72] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } + +dsv4-fp4-b200-sglang-agentic-hicache: + image: lmsysorg/sglang:v0.5.12.post1-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40] } + - { tp: 8, offloading: hicache, conc-list: [40, 48, 52, 64, 72, 84, 100, 128, 196, 
256, 512] } + +qwen3.5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:nightly-dev-20260422-de962f32 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: b200 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +glm5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:v0.5.12-cu130 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: b200 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } + +kimik2.5-int4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.22.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: int4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. + +kimik2.5-int4-h200-vllm-agentic: + image: vllm/vllm-openai:v0.22.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with + # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's + # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) + # don't have that mount and would re-materialize 65 GB to /tmp every job. 
+ runner: h200-dgxc + precision: int4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } + - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } + +kimik2.5-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.22.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +kimik2.5-fp4-b300-vllm-agentic: + # Rationale for the earlier v0.20.0-cu130 pin (the image is now v0.22.0): + # v0.20.2 (cu129) lacked the flashinfer kernels for B300's reported SM + # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the + # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 was the Blackwell build with + # the full sm_10x/sm_11x/sm_12x kernel set, as used by the INT4 B300 sister. 
+ image: vllm/vllm-openai:v0.22.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + +dsv4-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:v0.22.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } + +# NOTE(review): stray comment, it belongs to dsv4-fp8-h200-vllm-mtp rather than +# the agentic entries around it. MTP variant of dsv4-fp8-h200-vllm: adds +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. + +dsv4-fp4-b300-vllm-agentic: + # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention + # prefix-cache retention when deciding which hybrid-KV blocks to store. + image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # TEMPORARY: run only MooncakeStore CPU-offload scenarios while + # diagnosing the native/SimpleCPU offload failures. 
+ - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + +dsv4-fp4-b300-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + +gptoss-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.22.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + +minimaxm2.5-fp8-b200-vllm-agentic: + image: vllm/vllm-openai:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + 
runner: b200-dgxc + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). + # Push none past the KV cliff (96, 128) to make the no-offload throughput + # collapse visible; cpu range overlaps fully for same-conc comparison. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } + - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. + +minimaxm2.5-fp8-b300-vllm-agentic: + image: vllm/vllm-openai:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: b300 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). + # Push none past the KV cliff (96, 128, 192) so the no-offload throughput + # collapse is visible; cpu range overlaps fully so each high-conc point + # has a same-conc no-offload counterpart for direct comparison. + # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff + # observed in v6 cpu data right past conc=96. 
+ - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + +minimaxm2.5-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.22.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +minimaxm2.5-fp8-h100-vllm-agentic: + image: vllm/vllm-openai:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h100 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). + # Best cpu-offload demo SKU — 4-conc-point window between cliffs. + # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } + +minimaxm2.5-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:v0.22.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). + # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. 
+ - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } + - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } + +kimik2.5-int4-h100-vllm: + image: vllm/vllm-openai:v0.22.0 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: h100 + precision: int4 + framework: vllm + multinode: false + scenarios: + # New entry, agentic-coding only: this PR intentionally does NOT add + # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the + # fixed-seq-len test surface identical to origin/main. + # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives + # early. Sweep saturates conc=20 to keep total HBM headroom. + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } + - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } qwen3.5-fp8-b300-sglang-agentic-hicache: image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd @@ -9704,7 +9609,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache: - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } kimik2.5-fp4-b200-vllm-agentic-lmcache: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -9724,16 +9629,17 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons -# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to -# origin/main so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape -# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. 
-# - additional-settings.CONFIG_FILE: points at the new agentic recipe under -# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh -# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC -# branch). Local-overlay pattern mirrors the existing 8k1k overlay. dsv4-fp4-gb300-dynamo-vllm-agentic: + # Pinned to the R30-validated stack (vllm v0.21.0-ubuntu2404 + ai-dynamo + # wheel 1.2.0.dev20260426). The repo-wide bump to v0.22.0 (76aedd65) broke + # this config silently: the agentic recipes' `model.container` field must + # match this image string for srtctl's containers-map lookup to resolve to + # the squash file the launcher imports — on mismatch srtctl passes the + # recipe string verbatim to pyxis, which re-pulls from Docker Hub on every + # node and ignores the imported squash. Bump this together with + # `model.container` in benchmarks/multi_node/srt-slurm-recipes/vllm/ + # deepseek-v4/agentic/*.yaml once v0.22.x + the dynamo wheel is validated + # on GB300 disagg. image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 @@ -9823,6 +9729,8 @@ dsv4-fp4-gb300-dynamo-vllm-agentic: # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe # applies to both clusters with no duplication. dsv4-fp4-gb300-cw-dynamo-vllm-agentic: + # Image pinned to match the agentic recipes' `model.container` — see the + # comment on dsv4-fp4-gb300-dynamo-vllm-agentic. image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 @@ -9881,16 +9789,62 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: ep: 8 dp-attn: true -# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below; -# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main -# so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster). 
-# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130). -# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with- -# subagents corpus. hicache arm capped at conc 16 since high-conc + hicache -# tends to flake on first runs and conc 16 covers the cliff. The bench script -# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant. +# GB200 sibling of the gb300 agentic configs. Unlike gb300, the topology is +# TEP8 prefill + TP8 decode (NOT the fixed-seq-len DEP8/DEP8 megamoe family): +# DSv4's hybrid KV needs ~20 GiB per data-parallel rank to admit one +# 256k-token request, but GB200's 186 GB HBM leaves only ~8.8 GiB free after +# FP4 weights — TP shards the KV 8-ways so it fits. See the recipe header. +# No high-throughput conc-4096 tier yet: a single TP8 decode worker caps at +# max-num-seqs 512, and DEP decode (which scales seqs) hits the KV wall +# above; revisit with fp4 indexer cache or multi-worker TP8 decode. +# Image matches the recipes' `model.container` (v0.21.0-ubuntu2404 — the +# gb300-validated agentic stack; v0.20.0's NIXL connector breaks TP8<->TP8 +# transfers, see the recipe header); the two must stay in lockstep — see +# dsv4-fp4-gb300-dynamo-vllm-agentic. +dsv4-fp4-gb200-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # Low-latency: 1p1d (TEP=8 / TP=8) at conc 32. 5 nodes incl. infra. + - spec-decoding: none + conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: same 1p1d shape at conc 192. 
+ - spec-decoding: none + conc-list: [192] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + qwen3.5-fp8-h100-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9905,3 +9859,33 @@ qwen3.5-fp8-h100-sglang-agentic: search-space: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + +# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only. +dsr1-fp4-b200-dynamo-trt-agentic: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: b200-multinode + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 300 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..d46c75a5c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -171,10 +171,17 @@ jobs: - name: Slurm cleanup (pre-run) run: &slurm-cleanup | if command -v squeue >/dev/null 2>&1; then - echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + # Clean both the bare runner name and the "ifx-" prefixed form. 
+ # launch_gb200-nv.sh names jobs ifx- to dodge a foreign + # runner fleet on watchtower that scancels by the bare name + # across users (see the comment there). squeue is filtered to + # our user so the wait loop can't hang on a same-named foreign + # job we have no permission to cancel. + echo "[Slurm] Cleaning up jobs named: ${{ runner.name }}, ifx-${{ runner.name }} ..." scancel --name="${{ runner.name }}" || true - while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do - squeue --name="${{ runner.name }}" + scancel --name="ifx-${{ runner.name }}" || true + while [ -n "$(squeue --user="$USER" --name='${{ runner.name }},ifx-${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --user="$USER" --name="${{ runner.name }},ifx-${{ runner.name }}" sleep 5 done fi @@ -218,6 +225,16 @@ jobs: elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then if [ -f "${RESULT_FILENAME}.json" ]; then echo "Found agentic result file: ${RESULT_FILENAME}.json" + # Existence is not enough: process_agentic_result.py writes the + # aggregate even when aiperf recorded zero valid requests (e.g. + # the server 500'd every request — gb200 R8 went green on an + # all-null result this way). Require at least one successful + # request. + ok=$(python3 -c "import json,sys; d=json.load(open('${RESULT_FILENAME}.json')); print(int(bool(d.get('num_requests_successful'))))" 2>/dev/null || echo 0) + if [ "$ok" != "1" ]; then + echo "Run failed: ${RESULT_FILENAME}.json has zero successful requests." >&2 + exit 1 + fi else echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." 
>&2 exit 1 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2148def36..46f305fe8 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -239,7 +239,10 @@ jobs: name: agentic_${{ env.RESULT_FILENAME }} path: | results/server.log + results/router.log results/lmcache_server.log + results/mooncake_master.log + results/mooncake_config.json results/benchmark.log results/config.yaml results/lmcache_command.txt @@ -279,7 +282,10 @@ jobs: name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} path: | ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }} ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }} if-no-files-found: ignore - name: Upload GPU metrics diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e3080b4bf..3fd56e7e4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -899,6 +899,7 @@ run_eval() { INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}" AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}" +AIPERF_FAILED_REQUEST_THRESHOLD=0.10 agentic_pip_install() { local pip_install=(python3 -m pip install) @@ -924,8 +925,21 @@ resolve_trace_source() { # public-dataset loader names allowed by the inferencex-agentx-mvp # scenario. Used by recipes whose servers have non-default context # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the - # unfiltered 052726 corpus and switches to the 256k-capped variant). 
- local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}" + # unfiltered corpus and switches to the 256k-capped variant), or + # by recipes that want to pin an older corpus generation. + # + # Default (no override): the 060826 v6 corpus, selected by model family. + # DSv4 (full context) rides the unfiltered base corpus; every non-DSv4 + # recipe defaults to the 256k-capped variant because those servers run at + # max_model_len ~256k and would reject >256k requests. Any recipe can still + # pin a specific corpus via WEKA_LOADER_OVERRIDE. + local default_loader + if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then + default_loader="semianalysis_cc_traces_weka_with_subagents_060826" + else + default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k" + fi + local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset case "$loader" in semianalysis_cc_traces_weka_with_subagents) @@ -934,13 +948,31 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060226) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226" + ;; + semianalysis_cc_traces_weka_with_subagents_060226_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k" + ;; + semianalysis_cc_traces_weka_with_subagents_060526) + dataset="semianalysisai/cc-traces-weka-with-subagents-060526" + ;; + semianalysis_cc_traces_weka_with_subagents_060526_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k" + ;; + semianalysis_cc_traces_weka_with_subagents_060826) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826" + ;; + semianalysis_cc_traces_weka_with_subagents_060826_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. 
Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2 exit 1 ;; esac TRACE_SOURCE_FLAG="--public-dataset $loader" - echo "Loading traces via aiperf public-dataset: $loader ($dataset)" + echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]" # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used # for model weights) so subsequent runs read from cache instead of # re-downloading every job. @@ -1017,7 +1049,7 @@ build_replay_cmd() { # transient low-rate failures from killing long sweeps while still # catching malformed payloads or server crashes before they get aggregated # as benchmarkable data. - REPLAY_CMD+=" --failed-request-threshold 0.10" + REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD" # Sample each trajectory's warmup start position uniformly from # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream). # Avoids starting trajectories right at turn 0 where the KV cache is @@ -1031,6 +1063,14 @@ build_replay_cmd() { # CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server # usage is authoritative). REPLAY_CMD+=" --use-server-token-count" + # Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries + # freezes its metric schema on the first DCGM scrape, then KeyErrors when + # an optional field (xid_errors, power_violation, encoder_utilization) + # first appears mid-run. 
We don't consume the gpu_telemetry artifact in + # downstream processing, and the server-metrics path (Prometheus /metrics + # from vLLM) is unaffected by this flag and still gives us KV usage, + # prefix cache hit rate, etc. + REPLAY_CMD+=" --no-gpu-telemetry" # aiperf's dataset manager (separate from the inference parser) loads # the model's tokenizer for trace-prompt tokenization regardless of # --use-server-token-count. Models like kimi (amd/Kimi-K2.5-MXFP4, @@ -1070,8 +1110,9 @@ build_replay_cmd() { write_agentic_result_json() { # Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json - # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing - # retry-based existence check is the single success gate. + # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that + # this file exists; run_agentic_replay_and_write_outputs separately rejects + # aggregates whose request error rate exceeds the configured limit. local result_dir="$1" RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \ python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py" @@ -1085,6 +1126,7 @@ write_agentic_result_json() { run_agentic_replay_and_write_outputs() { local result_dir="$1" local replay_rc + local validation_rc echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt" @@ -1100,8 +1142,20 @@ run_agentic_replay_and_write_outputs() { python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true + set +e + python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \ + "$result_dir/aiperf_artifacts" \ + --failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD" + validation_rc=$? 
+ set -e + if [ "$replay_rc" -ne 0 ]; then echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2 return "$replay_rc" fi + + if [ "$validation_rc" -ne 0 ]; then + echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2 + return "$validation_rc" + fi } diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml new file mode 100644 index 000000000..8587b5aae --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -0,0 +1,205 @@ +name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic" + +# Agentic-coding recipe for GB200: 1 prefill (TEP=8) + 1 decode (TP=8), +# 16 GPUs across 4 GB200 nodes + 1 dedicated NATS/etcd infra node. +# +# Why TEP/TP instead of the fixed-seq-len DEP8/DEP8 family +# (disagg-gb200-mid-curve-megamoe.yaml): with data-parallel ranks each rank +# holds the FULL KV of its sequences, and DSv4's hybrid KV needs 19.82 GiB +# per rank just to admit one 256k-token request — but only ~8.8 GiB is free +# on a 186 GB GB200 GPU after FP4 weights + MegaMOE buffers (engine init +# died in _check_enough_kv_cache_memory; R4 jobs 18598/18600). Tensor +# parallelism shards the KV 8-ways (~2.5 GiB/GPU at 256k), which fits with +# room for concurrent sequences. Worker flag sets mirror the validated +# gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d +# agentic decode): no data-parallel, no deep_gemm_mega_moe. 
+# +# Container is v0.21.0-ubuntu2404 (the gb300-validated agentic stack), NOT +# the v0.20.0 the gb200 fixed-seq family pins: v0.20.0's NIXL connector +# breaks on TP8<->TP8 transfers — the decode worker's first get_finished() +# poll dies with KeyError on the remote (prefill) engine_id in +# transfer_topo.get_engine_info() because the prefill engine never +# registers in the decode's engine map (R6, both shards, identical +# tracebacks). The fixed-seq DEP8/DEP8 family never hits this path +# (per-rank TP=1 transfer topology). v0.21.0 + the same ai-dynamo wheel +# ran green NIXL transfers on gb300 agentic (R30 + manual 8137). +# +# Standard agentic deltas (see the gb300 agentic recipes): +# - benchmark.type custom -> agentic_srt.sh +# - prefix caching ON (no no-enable-prefix-caching) +# - max-model-len 262144 + 060826 256k-capped corpus (GB200 cannot serve +# the full 1M DSv4 context, mirroring the minimaxm2.5 agentic configs) +# - infra.nats_max_payload_mb 32 (long agentic prompts exceed NATS' 1 MiB) +# - srun_options.container-remap-root (apt-get git in agentic_srt.sh) + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default + # rejects long agentic prompts; 32 MiB gives ~10x headroom over the + # largest observed payload. 
+ nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + # Static engine_id (one per worker, distinct between prefill/decode): + # the TP8 workers span 2 GB200 nodes, which srtctl launches as two + # processes (--node-rank 0 + --node-rank 1 --headless). Without a + # pinned engine_id each process generates its own random NIXL UUID, so + # ranks 0-3 and ranks 4-7 of the SAME worker register under different + # engine ids and the consumer's handshake dies with "Remote NIXL agent + # engine ID mismatch" on the first transfer (R7, both shards). + # Single-node-per-worker topologies (all gb300 recipes) never hit this. 
+ kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "11111111-1111-4111-8111-111111111111"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enable-ep-weight-filter: true + enforce-eager: true + max-model-len: 262144 + max-num-seqs: 16 + # 16384 batched tokens + util 0.90 (the fixed-seq megamoe recipes use + # 32768 + 0.95, tuned for 9k contexts): at 256k contexts the first + # long prefill's activation spike (sparse indexer logits, mhc fused + # kernels) needs ~2 GiB of runtime headroom that 0.95 doesn't leave — + # R5 job 18603 died with "CUDA out of memory. Tried to allocate + # 1.98 GiB ... 1.53 GiB free" on the first scheduled request. Matches + # the green gb300 agentic prefill (0.9 / 16384). + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + # See prefill: static engine_id shared by both node processes of this + # 2-node TP8 worker (distinct from the prefill worker's id). 
+ kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "22222222-2222-4222-8222-222222222222"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-ep-weight-filter: true + max-model-len: 262144 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + all2all-backend: "flashinfer_nvlink_one_sided" + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +# cpus-per-task=72: one full GB200 NUMA socket (144 cores split 2 x 72) per +# task. Critical for the *infra step* (etcd + nats), which srtctl spawns +# without --gres — on watchtower the per-GPU CPU default (CpusPerTres=gpu:35) +# doesn't apply to GPU-less steps, so etcd lands with 1 CPU, falls behind on +# lease keep-alives, and worker registrations silently expire mid-run: R8's +# decode worker logged "Keep-alive lease expired" 11 min after going healthy +# and the frontend 500'd every benchmark request with "Instance not found". +# Same failure mode and fix as the gb300 agentic recipes (their R12). +sbatch_directives: + cpus-per-task: "72" + +srun_options: + # See gb300 agentic recipes: pyxis may map the calling user to a non-root + # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get + # install git works. No-op when the container user is already root. 
+ container-remap-root: "" + +benchmark: + type: custom + command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh + env: + INFMAX_CONTAINER_WORKSPACE: /infmax-workspace + RESULT_DIR: /logs/agentic + PORT: "8000" + IS_MULTINODE: "true" + # Container-side path of the aiperf mmap dataset cache; the host-side + # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts. + # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files + # per dataset on every run. + AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" + # The server runs at max-model-len 262144 (see header comment) — replay + # the 256k-capped corpus and tell aiperf to filter inputs to the served + # window, mirroring the minimaxm2.5 agentic configs. + WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k" + MAX_MODEL_LEN: "262144" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml index fb7b9fd97..2caf202a6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml @@ -175,3 +175,7 @@ benchmark: # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files # per dataset on every run. AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. 
+ HF_HUB_CACHE: "/hf_hub_cache" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml index bb8fc6df8..98e25c450 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml @@ -174,3 +174,7 @@ benchmark: # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files # per dataset on every run. AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index f9955adc7..16dc3bfd5 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# download (MODEL_PATH=$MODEL). Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -33,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path $MODEL \ +--model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index ff76b768d..3b2561fe2 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# download (MODEL_PATH=$MODEL). Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi amd-smi || true @@ -34,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh new file mode 100755 index 000000000..b159f9022 --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on B200 using SGLang. +# +# OFFLOADING values: +# none - SGLang GPU KV cache with RadixAttention prefix caching. +# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + +# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under +# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the +# agentic tooling and results against the actual repository mount so the image +# can keep its /workspace install and GitHub Actions can collect the outputs. +if [[ ! 
-d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then + export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" +fi +if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" +fi + +source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +nvidia-smi + +resolve_trace_source + +# Keep AIPerf's Transformers-main dependency from replacing the older +# Transformers build pinned by the B200-specialized SGLang image. The server +# always launches with the image's original interpreter; AIPerf and result +# processing use the isolated environment when InferenceX is mounted at /ix. +SGLANG_PYTHON="$(command -v python3)" +if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" + "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" + export PATH="$AGENTIC_VENV/bin:$PATH" +fi +install_agentic_deps + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +if [ "$DP_ATTENTION" = "true" ]; then + echo "Error: current SGLang nightly self-collides on internal IPC ports during single-node DP-attention startup; use pure TP until upstream fixes PortArgs initialization." 
>&2 + exit 1 +fi + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + ;; + hicache) + # DeepSeek V4 HiCache currently rejects --hicache-size and supports + # capacity control only through a host/device token-capacity ratio. + # DSv4 allocates several physical host sub-pools for each logical host + # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) + # while model loading/page cache is still resident and the OS kills a + # rank, so leave transient startup headroom with ratio=2. B200 has a + # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a + # substantially larger useful CPU tier while staying within its node + # budget. + # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at + # C48/C64. Ratio=8 still cannot retain the C64 session working set long + # enough to produce host hits. Ratio=16 provides roughly 21M logical + # host tokens while remaining below the B300 node's host budget. + if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +PARALLEL_ARGS=(--tp "$TP") +METRICS_ARGS=(--enable-metrics) +CHUNKED_PREFILL_SIZE=8192 +PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + 
--disable-flashinfer-autotune +) + +MODEL_ARGS=() +# The B200-specialized image deadlocks immediately after weight loading when +# forced through the B300 compressed-attention/page-size overrides. +MEM_FRACTION_STATIC=0.90 + +PER_ENGINE_MAX_RUNNING=$CONC +[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export PYTHONNOUSERSITE=1 +export TORCH_CUDA_ARCH_LIST=10.0 +# Agentic warmup dispatches hundreds of large prompts at once. SGLang's +# tokenizer process can leave request bytes unacknowledged for longer than +# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, +# causing Linux to abort otherwise-live localhost connections. Keep the +# six-hour request timeout unchanged, but allow up to 15 minutes for TCP +# progress before declaring the connection dead. +export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + +SGLANG_CMD=( + "$SGLANG_PYTHON" -m sglang.launch_server + --model-path "$MODEL_PATH" + --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + "${PARALLEL_ARGS[@]}" + --mem-fraction-static "$MEM_FRACTION_STATIC" + --swa-full-tokens-ratio 0.1 + --max-running-requests "$PER_ENGINE_MAX_RUNNING" + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --context-length "$MAX_MODEL_LEN" + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" + --tool-call-parser deepseekv4 + --reasoning-parser deepseek-v4 + --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" + --watchdog-timeout 1800 + "${MODEL_ARGS[@]}" + "${METRICS_ARGS[@]}" + "${CACHE_ARGS[@]}" +) + +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a 
"$RESULT_DIR/sglang_command.txt" + +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" + +echo "Starting SGLang server for B200..." +"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +capture_cache_metrics() { + { + echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" + curl -fsS "http://localhost:$PORT/metrics" 2>/dev/null \ + | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ + || true + echo "============================================================" + } >> "$SERVER_LOG" +} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then + capture_cache_metrics + trap capture_cache_metrics EXIT +fi + +build_replay_cmd "$RESULT_DIR" +REPLAY_CMD+=" --server-metrics http://localhost:$PORT/metrics" +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 108347479..514c6df8c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -13,18 +13,17 @@ set -x # experts EP-sharded across DP ranks (per the vLLM blog recipe). # Highest aggregate throughput at large CONC. # -# Image is vllm/vllm-openai:v0.20.0-cu130. block_size=256, kv-cache-dtype=fp8, -# FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph capture with -# custom_ops=all (per the vLLM blog recipe at https://vllm.ai/blog/deepseek-v4). +# Image is configured in nvidia-master.yaml. 
block_size=256, +# kv-cache-dtype=fp8, FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph +# capture with custom_ops=all (per the vLLM blog recipe at +# https://vllm.ai/blog/deepseek-v4). # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR # # OFFLOADING values: -# none - vLLM GPU KV only, with DSv4 hybrid KV manager enabled. -# cpu - vLLM native OffloadingConnector, with hybrid KV manager enabled. -# lmcache-mp - Temporarily disabled for DSv4. LMCache PR #3261 must merge -# first so LMCacheMPConnector can support HMA block-id tuples. +# none - vLLM GPU KV only. +# cpu - MooncakeStoreConnector with a shared 2.5 TB host-memory KV tier. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -38,157 +37,130 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps +# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings, +# which fails DSV4 indexer compilation with mlir_global_dtors(..., data). +# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved. +agentic_pip_install --quiet --force-reinstall --no-deps \ + 'nvidia-cutlass-dsl-libs-cu13==4.5.2' + +# vllm-project/router expands the one HTTP backend into one logical worker per +# DP rank and sends X-data-parallel-rank on forwarded requests. 
aiperf's +# X-Correlation-ID is stable for every turn of a conversation; alias it to the +# router's preferred X-Session-ID header. +USE_VLLM_ROUTER=false +VLLM_BACKEND_PORT="$PORT" +if [ "$DP_ATTENTION" = "true" ]; then + USE_VLLM_ROUTER=true + VLLM_BACKEND_PORT=$((PORT + 1)) + VLLM_ROUTER_VERSION=0.1.14 + VLLM_ROUTER_POLICY=consistent_hash + VLLM_ROUTER_METRICS_PORT=$((PORT + 10000)) + export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 + agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" +fi + # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while +# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's +# store mask. 32k matches the trace-replay tuning validated for this workload. +export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" -LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +ROUTER_LOG="$RESULT_DIR/router.log" +MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() -HYBRID_KV_ARGS=(--no-disable-hybrid-kv-cache-manager) -LMCACHE_PID="" - -cleanup_lmcache_server() { - if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then - kill "$LMCACHE_PID" 2>/dev/null || true - wait "$LMCACHE_PID" 2>/dev/null || true - fi -} - -trap cleanup_lmcache_server EXIT - -wait_for_lmcache_ready() { - { set +x; } 2>/dev/null - local attempts="${LMCACHE_READY_ATTEMPTS:-120}" - local tail_pid="" - - while [ ! -f "$LMCACHE_LOG" ]; do - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before creating log file. Exiting." >&2 - exit 1 - fi - sleep 1 - done - - tail -f -n +1 "$LMCACHE_LOG" & - tail_pid=$! 
- - for ((i = 1; i <= attempts; i++)); do - if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - return 0 - fi - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before becoming healthy. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 - fi - sleep 1 - done - - echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 -} case "$OFFLOADING" in none) ;; cpu) - # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~1.2 TB total native - # CPU offload pool across the engine(s); previously 2.8 TB but every - # DP-attn worker stalled for 4+ min during pinned-CPU-tensor allocation - # and the shm_broadcast watchdog killed them (run 26246044726). 150 GB - # per worker (1.2 TB / 8) completes the alloc within the 60 s window. + # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough + # headroom for model workers and the runtime. # - # Native --kv-offloading-size becomes OffloadingConnector's - # cpu_bytes_to_use. For DP-attn there are $TP independent DP engines, - # so pre-divide to keep aggregate host commit near TOTAL_CPU_DRAM_GB. - # For pure TP, vLLM treats the size as the total across TP ranks. - TOTAL_CPU_DRAM_GB=1200 - if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) - else - PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB + # Embedded mode contributes one segment per GPU rank to a shared + # distributed store, so pre-divide the aggregate host-memory budget. 
+ TOTAL_CPU_DRAM_GB=2500 + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) + + MOONCAKE_VERSION=0.3.11.post1 + agentic_pip_install --quiet --no-cache-dir --no-deps \ + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null + + MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" + cat > "$MOONCAKE_CONFIG_PATH" < "$MOONCAKE_MASTER_LOG" 2>&1 & + MOONCAKE_MASTER_PID=$! + sleep 2 + if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then + echo "Mooncake master died during startup." >&2 + cat "$MOONCAKE_MASTER_LOG" >&2 + exit 1 fi unset VLLM_USE_SIMPLE_KV_OFFLOAD - OFFLOAD_ARGS=( - --kv-offloading-backend native - --kv-offloading-size "$PER_ENGINE_GB" - ) - ;; - lmcache-mp) - { set +x; } 2>/dev/null - # LMCacheMPConnector needs HMA support before it can run DSv4 with the - # hybrid KV manager. Re-enable this path after - # https://github.com/LMCache/LMCache/pull/3261 is merged. - echo "Error: OFFLOADING=lmcache-mp is disabled for DSv4 until LMCache PR #3261 adds HMA support." >&2 - exit 1 - - # LMCache docs recommend MP mode for production: start an external - # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For - # vLLM >= 0.20, prefer the LMCache-shipped connector module because it - # tracks the latest server protocol ahead of vLLM's vendored copy. - # - # Important DSv4 caveat: LMCacheMPConnector currently only accepts the - # non-hybrid KV block layout. The connector raises if vLLM returns the - # hybrid block-id tuple used by the CSA/HCA hybrid KV manager. This - # mode therefore disables the hybrid manager; `none` and `cpu` keep it - # enabled for the normal B200 DSv4 path. 
- agentic_pip_install --quiet --no-cache-dir lmcache - python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null - - TOTAL_CPU_DRAM_GB=2800 - LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" - LMCACHE_PORT="${LMCACHE_PORT:-5555}" - LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" - LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-200}" - LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" - LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" - - echo "Starting LMCache MP server..." - LMCACHE_CMD=( - lmcache server - --host "$LMCACHE_HOST" - --port "$LMCACHE_PORT" - --http-host "$LMCACHE_HOST" - --http-port "$LMCACHE_HTTP_PORT" - --l1-size-gb "$LMCACHE_L1_SIZE_GB" - --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" - --chunk-size "$LMCACHE_CHUNK_SIZE" - --max-workers "$LMCACHE_MAX_WORKERS" - --eviction-policy LRU - ) - printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" - printf '\n' >> "$RESULT_DIR/lmcache_command.txt" - "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & - LMCACHE_PID=$! 
- echo "LMCache server PID: $LMCACHE_PID" - wait_for_lmcache_ready - - HYBRID_KV_ARGS=(--disable-hybrid-kv-cache-manager) OFFLOAD_ARGS=( --kv-transfer-config - "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}' ) ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache-mp)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 exit 1 ;; esac @@ -221,9 +193,9 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 - --port "$PORT" + --port "$VLLM_BACKEND_PORT" --trust-remote-code --kv-cache-dtype fp8 --block-size 256 @@ -236,7 +208,7 @@ VLLM_CMD=( --enable-auto-tool-choice --reasoning-parser deepseek_v4 --enable-prefix-caching - "${HYBRID_KV_ARGS[@]}" + --no-disable-hybrid-kv-cache-manager --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" "${OFFLOAD_ARGS[@]}" @@ -247,7 +219,24 @@ printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" SERVER_PID=$! echo "Server PID: $SERVER_PID" -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_VLLM_ROUTER" = "true" ]; then + echo "Starting native vLLM router on port $PORT for $TP DP ranks..." 
+ vllm-router \ + --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \ + --policy "$VLLM_ROUTER_POLICY" \ + --intra-node-data-parallel-size "$TP" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! + echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh new file mode 100755 index 000000000..dcc41f688 --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on B300 using SGLang. +# +# OFFLOADING values: +# none - SGLang GPU KV cache with RadixAttention prefix caching. +# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + +# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under +# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the +# agentic tooling and results against the actual repository mount so the image +# can keep its /workspace install and GitHub Actions can collect the outputs. +if [[ ! 
-d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then + export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" +fi +if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" +fi + +source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +nvidia-smi + +resolve_trace_source + +# Keep AIPerf's Transformers-main dependency from replacing the older +# Transformers build pinned by the B200-specialized SGLang image. The server +# always launches with the image's original interpreter; AIPerf and result +# processing use the isolated environment when InferenceX is mounted at /ix. +SGLANG_PYTHON="$(command -v python3)" +if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" + "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" + export PATH="$AGENTIC_VENV/bin:$PATH" +fi +install_agentic_deps + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + ;; + hicache) + # DeepSeek V4 HiCache currently rejects --hicache-size and supports + # capacity control only through a host/device token-capacity ratio. + # DSv4 allocates several physical host sub-pools for each logical host + # token. 
On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) + # while model loading/page cache is still resident and the OS kills a + # rank, so leave transient startup headroom with ratio=2. B200 has a + # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a + # substantially larger useful CPU tier while staying within its node + # budget. + # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at + # C48/C64. Ratio=8 still cannot retain the C64 session working set long + # enough to produce host hits. Ratio=16 provides roughly 21M logical + # host tokens while remaining below the B300 node's host budget. + if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=2 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +USE_SGLANG_ROUTER=false +SGLANG_BACKEND_PORT="$PORT" +ROUTER_LOG="$RESULT_DIR/router.log" +if [ "$DP_ATTENTION" = "true" ]; then + USE_SGLANG_ROUTER=true + SGLANG_BACKEND_PORT=$((PORT + 1)) + SGLANG_ROUTER_METRICS_PORT=$((PORT + 10000)) +fi + +PARALLEL_ARGS=(--tp "$TP") +METRICS_ARGS=(--enable-metrics) +MEM_FRACTION_STATIC=0.88 +CHUNKED_PREFILL_SIZE=8192 +if [ "$DP_ATTENTION" = "true" ]; then + PARALLEL_ARGS+=( + --dp "$TP" + --enable-dp-attention + --dist-init-addr 
"127.0.0.1:$((PORT + 2000))" + --ep-size "$EP_SIZE" + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.88 + CHUNKED_PREFILL_SIZE=16384 +else + PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + ) +fi + +MODEL_ARGS=( + --attention-backend compressed + --page-size 256 + --disable-shared-experts-fusion +) + +MAX_RUNNING_REQUESTS=$CONC +CUDA_GRAPH_MAX_BS=$CONC +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export PYTHONNOUSERSITE=1 +export TORCH_CUDA_ARCH_LIST=10.0 +# Agentic warmup dispatches hundreds of large prompts at once. SGLang's +# tokenizer process can leave request bytes unacknowledged for longer than +# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, +# causing Linux to abort otherwise-live localhost connections. Keep the +# six-hour request timeout unchanged, but allow up to 15 minutes for TCP +# progress before declaring the connection dead. 
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +SGLANG_CMD=( + "$SGLANG_PYTHON" -m sglang.launch_server + --model-path "$MODEL_PATH" + --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$SGLANG_BACKEND_PORT" + --trust-remote-code + "${PARALLEL_ARGS[@]}" + --mem-fraction-static "$MEM_FRACTION_STATIC" + --swa-full-tokens-ratio 0.1 + --max-running-requests "$MAX_RUNNING_REQUESTS" + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --context-length "$MAX_MODEL_LEN" + --allow-auto-truncate + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" + --tool-call-parser deepseekv4 + --reasoning-parser deepseek-v4 + --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" + --watchdog-timeout 1800 + "${MODEL_ARGS[@]}" + "${METRICS_ARGS[@]}" + "${CACHE_ARGS[@]}" +) + +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" + +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" + +echo "Starting SGLang server for B300..." +"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +capture_cache_metrics() { + { + echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" + curl -fsS "http://localhost:$SGLANG_BACKEND_PORT/metrics" 2>/dev/null \ + | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ + || true + echo "============================================================" + } >> "$SERVER_LOG" +} + +wait_for_server_ready --port "$SGLANG_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_SGLANG_ROUTER" = "true" ]; then + echo "Starting SGLang router on port $PORT for $TP DP ranks..." + "$SGLANG_PYTHON" -m sglang_router.launch_router \ + --worker-urls "http://localhost:$SGLANG_BACKEND_PORT" \ + --policy manual \ + --assignment-mode min_load \ + --request-id-headers x-correlation-id \ + --dp-aware \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$SGLANG_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! 
+ echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi + +if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then + capture_cache_metrics + trap capture_cache_metrics EXIT +fi + +build_replay_cmd "$RESULT_DIR" +if [ "$DP_ATTENTION" = "true" ]; then + REPLAY_CMD+=" --server-metrics http://localhost:$SGLANG_BACKEND_PORT/metrics" +fi +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index f6748a5f8..7fc30b60b 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -24,62 +24,140 @@ source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then +if ! declare -p MAX_MODEL_LEN >/dev/null 2>&1; then + MAX_MODEL_LEN=1000000 +elif [[ -z "$MAX_MODEL_LEN" || "$MAX_MODEL_LEN" = "0" ]]; then MAX_MODEL_LEN=1000000 fi -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +if declare -p SLURM_JOB_ID >/dev/null 2>&1 && [ -n "$SLURM_JOB_ID" ]; then + SLURM_NODE=unknown + if declare -p SLURMD_NODENAME >/dev/null 2>&1 && [ -n "$SLURMD_NODENAME" ]; then + SLURM_NODE="$SLURMD_NODENAME" + fi + echo "JOB $SLURM_JOB_ID running on $SLURM_NODE" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if declare -p MODEL_PATH >/dev/null 2>&1 && [ -n "$MODEL_PATH" ]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps +# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings, +# which fails DSV4 indexer compilation with mlir_global_dtors(..., data). +# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved. +agentic_pip_install --quiet --force-reinstall --no-deps \ + 'nvidia-cutlass-dsl-libs-cu13==4.5.2' + +# vllm-project/router expands the one HTTP backend into one logical worker per +# DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's +# X-Correlation-ID is stable for every turn of a conversation; alias it to the +# router's preferred X-Session-ID header. This also keeps affinity correct when +# testing older wheels that prioritize per-request X-Request-ID. +USE_VLLM_ROUTER=false +VLLM_BACKEND_PORT="$PORT" +if [ "$DP_ATTENTION" = "true" ]; then + USE_VLLM_ROUTER=true + VLLM_BACKEND_PORT=$((PORT + 1)) + VLLM_ROUTER_VERSION=0.1.14 + VLLM_ROUTER_POLICY=consistent_hash + VLLM_ROUTER_METRICS_PORT=$((PORT + 10000)) + export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 + agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" +fi + # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while +# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's +# store mask. 32k matches the trace-replay tuning validated for this workload. 
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +ROUTER_LOG="$RESULT_DIR/router.log" +MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() case "$OFFLOADING" in none) ;; cpu) - # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~2.2 TB total host - # CPU pool across the engine(s). + # Leave enough host-memory headroom for model workers and the runtime. + # Use the 2.5 TB host-memory budget across all GPU ranks. # - # SimpleCPUOffloadConnector divides cpu_bytes_to_use by - # parallel_config.world_size (= TP*PP, NOT including DP — see - # vllm/config/parallel.py docstring). So: - # - DP-attn=true → each of $TP DP engines has world_size=1 in - # its parallel_config; the connector does no internal divide, - # and each engine torch.zeros + pin_tensor allocates the full - # --kv_offloading_size value. Pre-divide by $TP here so the - # aggregate host commit ≈ TOTAL_CPU_DRAM_GB. - # - DP-attn=false → single engine with world_size=TP. Pass the - # full TOTAL_CPU_DRAM_GB; the connector's internal divide - # yields TOTAL/TP per rank, and TP-shared mmap (PR #37206) - # keeps the aggregate at TOTAL. - TOTAL_CPU_DRAM_GB=2200 - if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) - else - PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB + # Mooncake embedded mode contributes one global segment per GPU rank to + # a shared distributed store. Pre-divide the aggregate host budget + # across those rank-contributed segments. 
+ TOTAL_CPU_DRAM_GB=2500 + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) + + MOONCAKE_VERSION=0.3.11.post1 + agentic_pip_install --quiet --no-cache-dir --no-deps \ + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null + + MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" + cat > "$MOONCAKE_CONFIG_PATH" < "$MOONCAKE_MASTER_LOG" 2>&1 & + MOONCAKE_MASTER_PID=$! + sleep 2 + if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then + echo "Mooncake master died during startup." >&2 + cat "$MOONCAKE_MASTER_LOG" >&2 + exit 1 fi - PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) - # Use --kv-transfer-config JSON to also pass lazy_offload=true. Eager - # mode (default) hits an AssertionError in - # vllm/v1/core/kv_cache_utils.py:269 popleft_n at low/mid CONC; lazy - # mode defers the store path and clears low/mid CONC at 80-100%. - # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob. 
- export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" + + unset VLLM_USE_SIMPLE_KV_OFFLOAD + OFFLOAD_ARGS=( + --kv-transfer-config + '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}' + ) ;; *) echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 @@ -113,9 +191,9 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve "$MODEL" \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ ---port "$PORT" \ +--port "$VLLM_BACKEND_PORT" \ --trust-remote-code \ --kv-cache-dtype fp8 \ --block-size 256 \ @@ -131,11 +209,28 @@ vllm serve "$MODEL" \ --no-disable-hybrid-kv-cache-manager \ --max-model-len "$MAX_MODEL_LEN" \ --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +"${OFFLOAD_ARGS[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_VLLM_ROUTER" = "true" ]; then + echo "Starting native vLLM router on port $PORT for $TP DP ranks..." + vllm-router \ + --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \ + --policy "$VLLM_ROUTER_POLICY" \ + --intra-node-data-parallel-size "$TP" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! 
+ echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 99aec25fe..029c8ea7f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -135,7 +145,7 @@ fi echo "Starting sglang server..." python3 -m sglang.launch_server \ - --model-path "$MODEL" \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index 0a0177983..799c2bf26 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -40,7 +50,7 @@ export PYTHONNOUSERSITE=1 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is # used for GPU allocation by the runner and as the DP size. -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 500b456f5..3b85a31cd 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -42,7 +52,7 @@ echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 259c19586..b3597cf52 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -39,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path="$MODEL_PATH" --served-model-name="$MODEL" \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh deleted file mode 100755 index 6e921db58..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on B200 using vLLM.
-# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 - -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh deleted file mode 100755 index 557986b0d..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H100 using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -async-scheduling: true -max-cudagraph-capture-size: 2048 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 - -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh deleted file mode 100755 index 1592a8d5c..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H200 using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -async-scheduling: true -max-cudagraph-capture-size: 2048 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 - -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh deleted file mode 100755 index eb1883ff1..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI300X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -rocm-smi -amd-smi || true - -# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -export AMDGCN_USE_BUFFER_OPS=0 -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export PYTHONNOUSERSITE=1 - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." - -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---attention-backend ROCM_AITER_UNIFIED_ATTN \ --cc.pass_config.fuse_rope_kvcache=True \ --cc.use_inductor_graph_partition=True \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.85 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs $CONC \ ---block-size=64 \ ---kv-cache-dtype fp8 \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh deleted file mode 100755 index 99e29c819..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI325X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -rocm-smi - -# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -export AMDGCN_USE_BUFFER_OPS=0 -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export PYTHONNOUSERSITE=1 - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." - -vllm serve $MODEL \ ---host 0.0.0.0 \ ---port $PORT \ ---attention-backend ROCM_AITER_UNIFIED_ATTN \ --cc.pass_config.fuse_rope_kvcache=True \ --cc.use_inductor_graph_partition=True \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.85 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs $CONC \ ---block-size=64 \ ---kv-cache-dtype fp8 \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index ad0b4495a..34b45c9ec 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -178,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8cebe4f20..9667003e1 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -85,7 +95,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index fd0ce3677..139b12256 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -786,7 +796,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index 697d3fa45..5685f098c 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -45,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index 2fd3b381c..cb6c67f4b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -45,7 +55,7 @@ echo "Starting vllm server..."
export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 97929e43e..1bfa0c33b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -56,7 +66,7 @@ echo "Starting vllm server..." export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 38ef72b56..f9b769636 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -58,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ $PARALLEL_ARGS \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index 4ce131cba..d07c3af69 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index 9f2d83a0b..906ae7408 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index d21690da6..c35afe33a 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -58,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index ed59991cb..5b4782646 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -58,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 260bbdc68..512eb0e6c 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi # local absolute path: nothing to fetch + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -32,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -64,7 +74,7 @@ echo "Starting vllm server..." export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index edac27a45..5e5a9f9a3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE location. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ !
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -32,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -61,7 +71,7 @@ echo "Starting vllm server..." export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 39dd63293..8e15e7850 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -32,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps @@ -65,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py new file mode 100755 index 000000000..5c061606f --- /dev/null +++ b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Temporarily bound MooncakeStoreConnector transfer batches. + +Mooncake's TCP connection pool grows without a concurrency ceiling. Large +DeepSeek-V4 requests therefore create enough simultaneous per-layer transfers +to exhaust the node's TCP ports. This patch preserves the same keys and buffer +lists but submits them in smaller sequential batches. 
+""" + +import argparse +from pathlib import Path + + +HELPER_ANCHOR = '''def _rotate_list(values: list[_T], offset: int) -> list[_T]: + return values[offset:] + values[:offset] +''' + +HELPER = ''' + +_INFERENCEX_MOONCAKE_BATCH_PATCH = True + + +def _run_mooncake_transfer_batches(fn, keys, addrs, sizes, *args): + max_keys = int(os.getenv("INFERENCEX_MOONCAKE_MAX_TRANSFER_BATCH_KEYS", "0")) + if max_keys <= 0 or len(keys) <= max_keys: + return fn(keys, addrs, sizes, *args) + + results = [] + for start in range(0, len(keys), max_keys): + end = start + max_keys + results.extend(fn(keys[start:end], addrs[start:end], sizes[start:end], *args)) + return results +''' + +PUT_CALL = '''res = self.store.batch_put_from_multi_buffers( + keys, + addrs, + sizes, + self.replicate_config, + )''' + +PATCHED_PUT_CALL = '''res = _run_mooncake_transfer_batches( + self.store.batch_put_from_multi_buffers, + keys, + addrs, + sizes, + self.replicate_config, + )''' + +GET_CALL = '''res = self.store.batch_get_into_multi_buffers( + batch_keys, batch_addrs, batch_sizes + )''' + +PATCHED_GET_CALL = '''res = _run_mooncake_transfer_batches( + self.store.batch_get_into_multi_buffers, + batch_keys, + batch_addrs, + batch_sizes, + )''' + + +def patch_worker(worker_path: Path) -> None: + source = worker_path.read_text() + if "_INFERENCEX_MOONCAKE_BATCH_PATCH = True" in source: + print(f"Mooncake transfer batching already patched: {worker_path}") + return + + replacements = ( + (HELPER_ANCHOR, HELPER_ANCHOR + HELPER), + (PUT_CALL, PATCHED_PUT_CALL), + (GET_CALL, PATCHED_GET_CALL), + ) + for old, new in replacements: + count = source.count(old) + if count != 1: + raise RuntimeError( + f"Expected exactly one patch target in {worker_path}, found {count}: " + f"{old.splitlines()[0]}" + ) + source = source.replace(old, new, 1) + + worker_path.write_text(source) + print(f"Patched Mooncake transfer batching: {worker_path}") + + +def main() -> None: + parser = argparse.ArgumentParser() + 
parser.add_argument("--worker-path", type=Path) + args = parser.parse_args() + + worker_path = args.worker_path + if worker_path is None: + import vllm + + worker_path = Path(vllm.__file__).parent / ( + "distributed/kv_transfer/kv_connector/v1/mooncake/store/worker.py" + ) + patch_worker(worker_path) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index 4ba87976b..d06d82ec8 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 3432af5c9..ad49b2b67 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 9d9c1d7d5..4f9b12659 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -85,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 95f0397a0..b280fff8b 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- @@ -98,7 +108,7 @@ fi { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index aef9650ca..ff901b674 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -36,7 +46,7 @@ export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index 5427d0d31..cdded8860 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to HF_HUB_CACHE. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true @@ -110,7 +120,7 @@ export PYTHONNOUSERSITE=1 SGLANG_CMD=( python3 -m sglang.launch_server --attention-backend triton - --model-path "$MODEL" + --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host=0.0.0.0 --port "$PORT" --tensor-parallel-size "$TP" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..c4111ef0d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3395,3 +3395,10 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627 + +- config-keys: + - dsv4-fp4-b200-sglang-agentic-hicache + - dsv4-fp4-b300-sglang-agentic-hicache + description: + - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading; use the B200-specific DeepSeek-V4 Blackwell image and the June 9 nightly on B300" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1640 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index bb3bf9ed1..2187617ae 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -364,9 +364,35 @@ else # and gpu-15 names no longer exist. gpu-2 currently has 10 fully-idle GPU # nodes (all of gpu-2-[0-9]); gpu-1 has 2 drained (gpu-1-4, gpu-1-8). We # land on gpu-2 to avoid drained nodes and skip the per-node excludes. - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + SALLOC_MEMORY_ARGS=() + if [[ "${OFFLOADING:-none}" != "none" ]]; then + # Host KV tiers (vLLM Mooncake cpu offload, SGLang HiCache) allocate + # multi-TB pinned host pools. 
Without an explicit request, Slurm caps + # this exclusive job at 2 TB and OOM-kills it even though the B200 + # node has about 4 TB of physical RAM. + SALLOC_MEMORY_ARGS=(--mem=0) + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + # DSv4 is also staged on the compute nodes' local RAID. Loading the 806 GB + # checkpoint independently from Lustre on every TP rank leaves the loader + # threads blocked in Lustre I/O for hours. Select the local copy only after + # Slurm assigns a node, and retain the shared-Lustre path as a fallback for + # nodes whose local staging is incomplete. + if [[ "$MODEL_PREFIX" == "dsv4" && "$PRECISION" == "fp4" && "$FRAMEWORK" == "sglang" ]]; then + LOCAL_MODEL_PATH=/raid/models/DeepSeek-V4-Pro-NVFP4 + if srun --jobid="$JOB_ID" bash -c \ + 'test -f "$1/config.json" && test -f "$1/model.safetensors.index.json" && test "$(find "$1" -maxdepth 1 -name "model-*.safetensors" | wc -l)" -eq 64' \ + _ "$LOCAL_MODEL_PATH"; then + export MODEL_PATH="$LOCAL_MODEL_PATH" + export MODEL="$MODEL_PATH" + echo "Using node-local DSv4 checkpoint: $MODEL_PATH" + else + echo "Node-local DSv4 checkpoint unavailable; using shared checkpoint: $MODEL_PATH" + fi + fi + # Use flock to serialize concurrent imports to the same squash file # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes srun --jobid=$JOB_ID bash -c " diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cc..1616ed490 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -379,7 +379,14 @@ else fi ) - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + SALLOC_MEMORY_ARGS=() + if [[ "${OFFLOADING:-none}" != "none" ]]; then + # Host KV tiers 
(vLLM Mooncake cpu offload, SGLang HiCache) allocate + # multi-TB pinned host pools. Give them the full memory allocation of + # the exclusive node instead of Slurm's implicit 2 TB default. + SALLOC_MEMORY_ARGS=(--mem=0) + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \ @@ -387,6 +394,7 @@ else --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \ --no-container-mount-home \ + --container-remap-root \ --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash "$BENCH_SCRIPT" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dada98bd6..18f286965 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -148,8 +148,21 @@ fi # TODO(CJQ): make first class upon srt-slurm upstream refactor if [[ "$IS_AGENTIC" == "1" ]]; then - git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" + # Agentic multi-node uses the same pinned cquil11/srt-slurm-nv commit as + # launch_gb300-nv.sh — everything the agentic recipes need is there: + # - BenchmarkType.CUSTOM + benchmark.command + benchmark.env + # (the hook that hands off to benchmarks/multi_node/agentic_srt.sh) + # - DynamoConfig.wheel (recipes pin the ai-dynamo wheel) + # - srtctl apply --no-preflight (model path /mnt/numa1 is compute-node + # local NVMe, invisible to the login-node runner) + # - benchmark_stage srun_options propagation (container-remap-root + # must reach the agentic_srt.sh srun) + git clone https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" + git checkout 6e34b8b83229634d732e41a4e2d6595f46ef60b5 + 
mkdir -p recipes/vllm/deepseek-v4/agentic + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" \ + recipes/vllm/deepseek-v4/agentic elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -200,6 +213,24 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" + +# Agentic runs bind-mount two persistent caches into every worker container +# (Lustre, shared across nodes): aiperf's content-addressed dataset mmap +# cache (~65 GB per corpus, re-tokenized from scratch without it) and the +# HF hub cache holding the trace dataset download. The container-side paths +# are referenced by the agentic recipes' benchmark.env +# (AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache, HF_HUB_CACHE=/hf_hub_cache). +DEFAULT_MOUNTS_BLOCK="" +if [[ "$IS_AGENTIC" == "1" ]]; then + AIPERF_MMAP_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/ai-perf-cache" + HF_HUB_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/hf-hub-cache" + mkdir -p "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH" + chmod 777 "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH" 2>/dev/null || true + DEFAULT_MOUNTS_BLOCK="default_mounts: + ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache + ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache" +fi + echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml <. +# On watchtower the whole batch partition (blue-cn01-18) is a single NVL72 +# rack, so segment contiguity buys nothing for MNNVL — but it DOES make +# jobs unschedulable when the partition is fragmented: Slurm backfills a +# non-contiguous node set, fails segment placement at start, and the job +# dies with "CANCELLED Reason=Resources" at RunTime=0 (hit by the first +# gb200 agentic run, job 18582). Mirror launch_gb300-nv.sh and disable. 
+use_segment_sbatch_directive: false +${DEFAULT_MOUNTS_BLOCK} EOF echo "Generated srtslurm.yaml:" @@ -237,13 +277,42 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." -# Override the job name in the config file with the runner name -sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" +# Override the job name with the runner name, prefixed "ifx-": another +# runner fleet on watchtower (user slurm-shared, uid 1010, with Slurm +# operator rights) names ITS jobs after the same runner names (gb200-nv_N) +# and its pre-job cleanup scancels by job name across users — it killed our +# jobs 18593 and 18599 mid-startup (CANCELLED by 1010). The distinct prefix +# keeps their --name match away from our jobs; the workflow's own pre-run +# cleanup scancels both the bare and ifx- prefixed names. +# +# NOTE the sed alone is not enough: srtctl's get_job_name() (cli/submit.py) +# prefers the RUNNER_NAME env var over the recipe name, so the prefixed +# RUNNER_NAME must be passed to `srtctl apply` itself (R4 job 18599 proved +# the recipe-name route gets ignored on CI runners). +sed -i "s/^name:.*/name: \"ifx-${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" +SRTCTL_RUNNER_NAME="ifx-${RUNNER_NAME}" + +# Don't leak the login-node venv to the compute-node orchestrator. sbatch's +# default --export=ALL propagates VIRTUAL_ENV (set by `source +# .venv/bin/activate` above) into job_script_minimal.j2, whose +# `uv run` step then tries to inspect the *active* venv — and dies with +# "Broken symlink at .venv/bin/python3" because the login-node interpreter +# path doesn't exist on compute nodes (gb200 agentic R2, job 18587). +# srtctl itself still resolves through PATH (.venv/bin is on it). +unset VIRTUAL_ENV + +# --no-preflight is only safe on the agentic path, where the recipe resolves +# model.path to /mnt/numa1 (compute-node-only NVMe) that the login-node +# runner can't see. Fixed-seq-len recipes keep enforcement on. 
+PREFLIGHT_FLAG="" +if [[ "$IS_AGENTIC" == "1" ]]; then + PREFLIGHT_FLAG="--no-preflight" +fi if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) + SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) fi echo "$SRTCTL_OUTPUT" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 6a5c50e38..7a7a66afa 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -88,6 +88,12 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility # write to it. export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/ai-perf-cache" +# Persistent HF hub cache for the agentic trace datasets — see the +# launch_gb300-nv.sh comment. Mounted at /hf_hub_cache; agentic recipes set +# HF_HUB_CACHE=/hf_hub_cache in benchmark.env. +export HF_HUB_CACHE_HOST_PATH="/mnt/vast/hf-hub-cache" +mkdir -p "$HF_HUB_CACHE_HOST_PATH" + NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). 
@@ -221,6 +227,7 @@ srtctl_root: "${SRTCTL_ROOT}" default_mounts: ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache + ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache model_paths: dspro: "${MODEL_PATH}" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index b47e103fd..e4597302f 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -17,6 +17,14 @@ export ENROOT_ROOTFS_WRITABLE=1 # write to it. export AIPERF_MMAP_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/ai-perf-cache" +# Persistent HF hub cache for the agentic trace datasets — mounted into +# worker containers at /hf_hub_cache; the agentic recipes set +# HF_HUB_CACHE=/hf_hub_cache in benchmark.env. Without it the workflow-level +# HF_HUB_CACHE (/mnt/hf_hub_cache) doesn't exist on these nodes and every +# run re-downloads the corpus into the ephemeral container overlay. +export HF_HUB_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/hf-hub-cache" +mkdir -p "$HF_HUB_CACHE_HOST_PATH" + export MODEL_PATH=$MODEL if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then @@ -189,6 +197,7 @@ srtctl_root: "${SRTCTL_ROOT}" # re-tokenized + re-written every job. default_mounts: "${AIPERF_MMAP_CACHE_HOST_PATH}": "/aiperf_mmap_cache" + "${HF_HUB_CACHE_HOST_PATH}": "/hf_hub_cache" # Model path aliases model_paths: diff --git a/utils/aiperf b/utils/aiperf index 062a5de92..ff2b646c0 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9 +Subproject commit ff2b646c0425aff9307a0e73161b23d77003a357 diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py index 3c4015ce6..90f1aaca9 100644 --- a/utils/process_agentic_result.py +++ b/utils/process_agentic_result.py @@ -37,7 +37,6 @@ # Trace metadata lookup: conversation_id (= trace id) -> per-turn dict with # ``hash_ids`` and ``output_length``. Built lazily from the HF dataset cache. 
_TRACE_METADATA_CACHE: dict[str, list[dict]] | None = None -_HF_DATASET = "semianalysisai/cc-traces-weka-with-subagents-051926" # ---- helpers --------------------------------------------------------------- @@ -118,10 +117,17 @@ def load_server_metrics(path: Path) -> dict: def _hf_traces_dir() -> Path | None: """Locate the HuggingFace cache directory for the weka traces dataset. - Returns the directory containing per-trace JSON files, or None if the - dataset isn't present locally. Mirrors the layout + Returns the directory containing per-trace JSON files, or None if no + weka dataset is present locally. Mirrors the layout huggingface_hub.snapshot_download() produces: ``$HF_HUB_CACHE/datasets--<org>--<name>/snapshots/<rev>/``. + + The bench script supports several corpus revisions + (cc-traces-weka-with-subagents-052726, ...-060226, ...-060226-256k, etc.) + and may switch between them per-recipe via WEKA_LOADER_OVERRIDE. Rather + than hardcode a single dataset name, scan all ``datasets--semianalysisai + --cc-traces-weka*`` directories in the cache and pick the most-recently- + modified snapshot that contains usable trace files. """ hub_cache = os.environ.get("HF_HUB_CACHE") or os.environ.get("HUGGINGFACE_HUB_CACHE") if hub_cache: @@ -130,17 +136,23 @@ def _hf_traces_dir() -> Path | None: home = os.environ.get("HF_HOME") cache_root = Path(home) / "hub" if home else Path.home() / ".cache" / "huggingface" / "hub" - org, name = _HF_DATASET.split("/", 1) - snapshots = cache_root / f"datasets--{org}--{name}" / "snapshots" - if not snapshots.is_dir(): + if not cache_root.is_dir(): return None - candidates = sorted(snapshots.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True) + + # Collect every weka-corpus snapshot dir across all matching dataset + # entries, sorted newest first. 
+ snapshots: list[Path] = [] + for dataset_dir in cache_root.glob("datasets--semianalysisai--cc-traces-weka*"): + snap_root = dataset_dir / "snapshots" + if not snap_root.is_dir(): + continue + snapshots.extend(p for p in snap_root.iterdir() if p.is_dir()) + snapshots.sort(key=lambda p: p.stat().st_mtime, reverse=True) + # Prefer the snapshot that contains usable trace files. The published HF # dataset ships a single ``traces.jsonl`` (one trace per line); older / # local mirrors may use per-trace ``*.json`` files instead. Accept either. - for c in candidates: - if not c.is_dir(): - continue + for c in snapshots: if any(c.glob("*.jsonl")) or any(c.glob("*.json")): return c return None @@ -382,16 +394,36 @@ def compute_throughput_stats(records: list[dict], aggregate: dict) -> dict: def compute_cache_stats(records: list[dict], server_metrics: dict) -> dict: - """Cache-hit metrics: theoretical (from trace metadata) + actual (server).""" + """Cache-hit metrics: theoretical (from trace metadata) + actual (server). + + Server-metric coverage depends on the engine + KV connector combination, + so several fields are structurally null for some configs. The matrix: + + | engine + connector | populated server fields | + |----------------------------------------------|------------------------------------| + | vLLM, no connector | server_gpu_cache_hit_rate, | + | | gpu_kv_cache_usage_pct | + | vLLM + SimpleCPUOffloadConnector | same as above (the CPU tier | + | | extends the local LRU; reloads are | + | | counted as prefix_cache_hits — no | + | | separate vllm:cpu_prefix_cache_* | + | | counter exists) | + | vLLM + LMCacheMPConnector (kv_role=kv_both) | server_external_cache_hit_rate. 
| + | | server_gpu_cache_hit_rate goes to | + | | ~0 because delay_cache_blocks=True | + | | suppresses local hash registration | + | SGLang | not yet wired | + """ result: dict = { "theoretical_cache_hit_rate": None, "server_gpu_cache_hit_rate": None, - "server_cpu_cache_hit_rate": None, + "server_external_cache_hit_rate": None, + "gpu_kv_cache_usage_pct": None, + "cpu_kv_cache_usage_pct": None, "kv_offload_bytes_gpu_to_cpu": None, "kv_offload_bytes_cpu_to_gpu": None, "kv_offload_time_gpu_to_cpu": None, "kv_offload_time_cpu_to_gpu": None, - "cpu_kv_cache_usage_pct": None, "total_prompt_tokens": None, "total_generation_tokens": None, "total_requests_completed": None, @@ -476,15 +508,30 @@ def _final_value(metric_name: str) -> float | None: return agg return None + # Local GPU prefix cache (every vLLM config emits these). Note: with + # LMCacheMPConnector + kv_role=kv_both, the scheduler sets + # delay_cache_blocks=True on every load and these hits stay at ~0 even + # when overall cache efficiency is high — read server_external_*. hits = _final_value("vllm:prefix_cache_hits") queries = _final_value("vllm:prefix_cache_queries") if hits is not None and queries and queries > 0: result["server_gpu_cache_hit_rate"] = hits / queries - cpu_hits = _final_value("vllm:cpu_prefix_cache_hits") - cpu_queries = _final_value("vllm:cpu_prefix_cache_queries") - if cpu_hits is not None and cpu_queries and cpu_queries > 0: - result["server_cpu_cache_hit_rate"] = cpu_hits / cpu_queries + # External KV connector (LMCacheMPConnector and similar). Only populated + # when the connector implements get_num_new_matched_tokens; absent for + # SimpleCPUOffloadConnector and for pure-vLLM (no connector) runs. 
+ ext_hits = _final_value("vllm:external_prefix_cache_hits") + ext_queries = _final_value("vllm:external_prefix_cache_queries") + if ext_hits is not None and ext_queries and ext_queries > 0: + result["server_external_cache_hit_rate"] = ext_hits / ext_queries + + # GPU KV pool fill ratio gauge. vLLM emits vllm:kv_cache_usage_perc on V1 + # and vllm:gpu_cache_usage_perc on V0 (kept for older deployments). + kv_usage = _final_value("vllm:kv_cache_usage_perc") + if kv_usage is None: + kv_usage = _final_value("vllm:gpu_cache_usage_perc") + if kv_usage is not None: + result["gpu_kv_cache_usage_pct"] = kv_usage for src_key, dst_key in ( ("vllm:kv_offload_bytes_gpu_to_cpu", "kv_offload_bytes_gpu_to_cpu"), @@ -679,6 +726,13 @@ def main() -> int: ) if agg.get("server_gpu_cache_hit_rate") is not None: print(f" GPU cache hit rate: {agg['server_gpu_cache_hit_rate']:.1%}") + if agg.get("server_external_cache_hit_rate") is not None: + print( + f" External cache hit rate: " + f"{agg['server_external_cache_hit_rate']:.1%}" + ) + if agg.get("gpu_kv_cache_usage_pct") is not None: + print(f" GPU KV cache usage: {agg['gpu_kv_cache_usage_pct']:.1%}") if agg.get("response_cache_hit_rate") is not None: print(f" Response cache hit rate: {agg['response_cache_hit_rate']:.1%}") if agg.get("theoretical_cache_hit_rate") is not None: diff --git a/utils/test_validate_agentic_result.py b/utils/test_validate_agentic_result.py new file mode 100644 index 000000000..f21bfa069 --- /dev/null +++ b/utils/test_validate_agentic_result.py @@ -0,0 +1,73 @@ +"""Tests for the agentic aiperf result validity gate.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from validate_agentic_result import validate_result + + +def _write_aggregate(tmp_path: Path, aggregate: dict, *, per_run: bool = False) -> Path: + artifact_dir = tmp_path / "aiperf_artifacts" + output_dir = artifact_dir / "run_0" if per_run else artifact_dir + output_dir.mkdir(parents=True) + with open(output_dir / 
"profile_export_aiperf.json", "w") as f: + json.dump(aggregate, f) + return artifact_dir + + +def test_passes_when_request_error_rate_is_within_limit(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + { + "request_count": {"avg": 90}, + "error_request_count": {"avg": 10}, + "completed_request_count": {"avg": 100}, + }, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_fails_when_request_error_rate_exceeds_limit(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + { + "request_count": {"avg": 2}, + "error_request_count": {"avg": 65}, + "completed_request_count": {"avg": 67}, + }, + ) + + errors = validate_result(artifact_dir, 0.10) + assert errors == [ + "aiperf request error rate exceeded the benchmark limit: " + "65/67 = 97.015% > 10.000%" + ] + + +def test_treats_missing_error_count_as_zero(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + {"request_count": {"avg": 12}}, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_supports_per_run_artifact_layout(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + {"request_count": {"avg": 12}}, + per_run=True, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_fails_when_aggregate_is_missing(tmp_path: Path): + errors = validate_result(tmp_path / "aiperf_artifacts", 0.10) + + assert len(errors) == 1 + assert errors[0].endswith("profile_export_aiperf.json not found") diff --git a/utils/validate_agentic_result.py b/utils/validate_agentic_result.py new file mode 100644 index 000000000..e54691059 --- /dev/null +++ b/utils/validate_agentic_result.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""Validate whether an aiperf agentic replay produced benchmarkable results.""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from pathlib import Path +from typing import Any + + +def _resolve_aggregate_path(artifact_dir: Path) -> Path: + """Find aiperf's aggregate JSON in 
the direct or per-run artifact layout.""" + direct = artifact_dir / "profile_export_aiperf.json" + if direct.is_file(): + return direct + + if artifact_dir.is_dir(): + for child in sorted(artifact_dir.iterdir()): + candidate = child / "profile_export_aiperf.json" + if child.is_dir() and candidate.is_file(): + return candidate + + return direct + + +def _metric_avg(aggregate: dict[str, Any], name: str) -> float | None: + """Read an aggregate metric's numeric average, if present.""" + metric = aggregate.get(name) + if metric is None: + return None + if not isinstance(metric, dict): + raise ValueError(f"{name} must be an object") + + value = metric.get("avg") + if value is None: + return None + if not isinstance(value, int | float) or isinstance(value, bool): + raise ValueError(f"{name}.avg must be numeric") + + value = float(value) + if not math.isfinite(value) or value < 0: + raise ValueError(f"{name}.avg must be a finite non-negative number") + return value + + +def validate_result(artifact_dir: Path, failed_request_threshold: float) -> list[str]: + """Return validation errors for an aiperf artifact directory.""" + aggregate_path = _resolve_aggregate_path(artifact_dir) + if not aggregate_path.is_file(): + return [f"{aggregate_path} not found"] + + try: + with open(aggregate_path) as f: + aggregate = json.load(f) + if not isinstance(aggregate, dict): + return [f"{aggregate_path} must contain a JSON object"] + + successes = _metric_avg(aggregate, "request_count") + errors = _metric_avg(aggregate, "error_request_count") or 0.0 + completed = _metric_avg(aggregate, "completed_request_count") + except (OSError, json.JSONDecodeError, ValueError) as exc: + return [f"failed to read {aggregate_path}: {exc}"] + + if successes is None: + return ["request_count.avg is missing"] + if completed is None: + completed = successes + errors + if completed <= 0: + return ["aiperf completed zero requests"] + + error_rate = errors / completed + if error_rate > failed_request_threshold: + 
return [ + "aiperf request error rate exceeded the benchmark limit: " + f"{errors:g}/{completed:g} = {error_rate:.3%} > " + f"{failed_request_threshold:.3%}" + ] + + print( + "Validated aiperf request error rate: " + f"{errors:g}/{completed:g} = {error_rate:.3%} <= " + f"{failed_request_threshold:.3%}" + ) + return [] + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("artifact_dir", type=Path) + parser.add_argument( + "--failed-request-threshold", + type=float, + required=True, + help="Maximum accepted error fraction, inclusive", + ) + args = parser.parse_args() + + if not 0 <= args.failed_request_threshold <= 1: + parser.error("--failed-request-threshold must be between 0 and 1") + + errors = validate_result(args.artifact_dir, args.failed_request_threshold) + for error in errors: + print(f"ERROR: {error}", file=sys.stderr) + return 1 if errors else 0 + + +if __name__ == "__main__": + sys.exit(main())