From f632aa42c2872eecaa0089d119e6f1fea1a5c2ec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 12:21:55 -0500 Subject: [PATCH 001/132] agentic(trace-source): default non-DSv4 to v6 (060226) corpus resolve_trace_source() now picks a model-prefix-aware default: MODEL_PREFIX == dsv4 -> semianalysis_cc_traces_weka_with_subagents (052726, the v5 baseline, unchanged for continuity with prior DSv4 published runs) everything else -> semianalysis_cc_traces_weka_with_subagents_060226 (060226, newer v6 corpus with fresher CC recording windows) WEKA_LOADER_OVERRIDE still wins. Allowed values widened from the two 052726 loaders to all four: semianalysis_cc_traces_weka_with_subagents (052726) semianalysis_cc_traces_weka_with_subagents_256k (052726-256k) semianalysis_cc_traces_weka_with_subagents_060226 (060226) semianalysis_cc_traces_weka_with_subagents_060226_256k (060226-256k) Bumps utils/aiperf submodule to de3ad1c1, which registers the two 060226 plugin entries those new loader names resolve through. The pre-cache log line now also includes MODEL_PREFIX so it's obvious in CI which default fired. Signed-off-by: Cam Quilici --- benchmarks/benchmark_lib.sh | 25 +++++++++++++++++++++---- utils/aiperf | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e3080b4bf..e062b42f1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -924,8 +924,19 @@ resolve_trace_source() { # public-dataset loader names allowed by the inferencex-agentx-mvp # scenario. Used by recipes whose servers have non-default context # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the - # unfiltered 052726 corpus and switches to the 256k-capped variant). 
- local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}" + # unfiltered corpus and switches to the 256k-capped variant), or + # by recipes that want to pin a specific corpus generation rather + # than ride the model-prefix-aware default below. + # + # Default (no override) is model-prefix-aware: + # DSv4 recipes -> 052726 (v5 corpus, the original baseline) + # everything else -> 060226 (v6 corpus, newer CC versions) + # DSv4 stays on 052726 for continuity with prior published baselines. + local default_loader="semianalysis_cc_traces_weka_with_subagents_060226" + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + default_loader="semianalysis_cc_traces_weka_with_subagents" + fi + local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset case "$loader" in semianalysis_cc_traces_weka_with_subagents) @@ -934,13 +945,19 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060226) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226" + ;; + semianalysis_cc_traces_weka_with_subagents_060226_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. 
Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2 exit 1 ;; esac TRACE_SOURCE_FLAG="--public-dataset $loader" - echo "Loading traces via aiperf public-dataset: $loader ($dataset)" + echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]" # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used # for model weights) so subsequent runs read from cache instead of # re-downloading every job. diff --git a/utils/aiperf b/utils/aiperf index 062a5de92..de3ad1c18 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9 +Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1 From 5544a448d594a5ff3b8b83a25d714a8635adc3b7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:16:10 -0500 Subject: [PATCH 002/132] configs(master): consolidate agentic recipes at end + split combined dsr1-trt entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganizes both master YAMLs so all pure-agentic (agentic-coding-only) recipes sit at the bottom of the file behind an "# Agentic configs" divider, separated from fixed-seq-len / synthetic / prefix-share entries above. No functional change to any non-agentic recipe. nvidia-master.yaml: splits dsr1-fp4-b200-dynamo-trt — which previously mixed fixed-seq-len + agentic-coding in one entry — into the original entry (fixed-seq-len only) plus a new sibling dsr1-fp4-b200-dynamo-trt-agentic carrying the agentic-coding scenario. 22 pure-agentic entries moved. amd-master.yaml: no split needed (no combined entries); 9 pure-agentic entries moved to the end. 
Verified via deep YAML parse: nvidia adds 1 key (the split sibling) and modifies the source key's scenarios from [agentic-coding, fixed-seq-len] to [fixed-seq-len]; amd has 0 keys added/removed/modified. All other entries are byte-equal after round-trip. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 358 ++++++------ .github/configs/nvidia-master.yaml | 893 +++++++++++++++-------------- 2 files changed, 637 insertions(+), 614 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index fb3966ce6..0495ebf16 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. -qwen3.5-fp8-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). 
Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. -glm5.1-fp4-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - glm5.1-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/GLM-5.1-MXFP4 @@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' -kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. 
Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. - - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } - kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: amd/Kimi-K2.5-MXFP4 @@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } -# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi355x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] - # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), - # which enables SimpleCPUOffloadConnector on ROCm. Required for the - # cpu-offload sweep points to use the same offload path as the NVIDIA - # agentic-coding configs. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). - # Compute saturates first; cpu offload likely won't help, but worth confirming. 
- # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } - minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: MiniMaxAI/MiniMax-M2.5 @@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi300x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); - # KV cliff ~52. Compute saturates first. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -1058,30 +937,6 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). 
Reasons below; -# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi325x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, - # similar HBM profile). Compute saturates first; cpu-offload window - # exercises the SimpleCPUOffloadConnector path enabled by the rocm - # nightly. Mirror MI300X conc grid for cross-vendor comparability. - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 model: openai/gpt-oss-120b @@ -2415,37 +2270,6 @@ glm5-fp8-mi325x-sglang-mtp: # brought in here. 
# ============================================================================ -qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } - -dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4] } - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } - dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2679,6 +2503,188 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the # image tag, so bumping sglang is just an image tag bump here. Sweeps # DP-attention on/off and EP=8. +# ============================================================================= +# Agentic configs +# ----------------------------------------------------------------------------- +# All entries below run the agentic-coding scenario (Weka trace replay). +# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. +# ============================================================================= + +# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). 
Metadata is +# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. +qwen3.5-fp8-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. +glm5.1-fp4-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' +kimik2.5-fp4-mi355x-vllm-agentic: + # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin + # (51f22dcf...) 
which was carrying the SimpleCPUOffloadConnector ROCm + # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and + # includes all subsequent ROCm offload work. + image: vllm/vllm-openai-rocm:v0.21.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + # CPU offload only above the KV cliff. Lower concurrencies fit + # entirely on-GPU, so paying the offload-path overhead there would + # just slow them down without measuring anything new. + - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } + # TP=4 probe: half-node layout doubles per-GPU weight footprint + # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to + # cliff-region concurrencies on both offload modes so we can directly + # compare TP=4 vs TP=8 at the same conc points. + - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + +# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi355x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] + # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), + # which enables SimpleCPUOffloadConnector on ROCm. Required for the + # cpu-offload sweep points to use the same offload path as the NVIDIA + # agentic-coding configs. 
+ image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } + +# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi300x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. + image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); + # KV cliff ~52. Compute saturates first. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi325x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. + image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, + # similar HBM profile). Compute saturates first; cpu-offload window + # exercises the SimpleCPUOffloadConnector path enabled by the rocm + # nightly. Mirror MI300X conc grid for cross-vendor comparability. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } # Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). 
Reasons below; # the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..04764831c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - agentic-coding: - - duration: 300 - search-space: - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - dsr1-fp8-b200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: deepseek-ai/DeepSeek-R1-0528 @@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200-dsv4' -> 'b200-dgxc' -dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). - # Re-add when investigating regressions in offload=none. 
- - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -2143,25 +2102,6 @@ qwen3.5-fp8-b200-sglang: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } -# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. -qwen3.5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:nightly-dev-20260422-de962f32 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2245,26 +2185,6 @@ glm5-fp8-b200-sglang-mtp: # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8 # B200 SGLang recipe as-is until B300-specific tuning is available. -# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. 
-glm5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:v0.5.12-cu130 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } - glm5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -2553,37 +2473,6 @@ kimik2.5-int4-b200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-int4-b200-vllm-agentic: - # Bumped from v0.19.1 — that release tripped a bug in - # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') - # during warmup `profile_run` on the agentic-coding path - # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the - # flashinfer fix. 
- image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-int4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: moonshotai/Kimi-K2.5 @@ -2624,29 +2513,6 @@ kimik2.5-int4-h200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'h200' -> 'h200-dgxc' -kimik2.5-int4-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with - # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's - # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) - # don't have that mount and would re-materialize 65 GB to /tmp every job. 
- runner: h200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } - - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } - kimik2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 @@ -2668,38 +2534,6 @@ kimik2.5-fp4-b200-vllm: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. 
- image: vllm/vllm-openai:v0.20.2 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-fp4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: nvidia/Kimi-K2.5-NVFP4 @@ -2763,34 +2597,6 @@ dsr1-fp8-b300-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp } -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' -kimik2.5-fp4-b300-vllm-agentic: - # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM - # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the - # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted - # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the - # INT4 B300 sister already uses successfully. 
- image: vllm/vllm-openai:v0.20.0-cu130 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14 model: deepseek-ai/DeepSeek-R1-0528 @@ -2924,31 +2730,6 @@ dsv4-fp8-h200-vllm-mtp: - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). -# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper -# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. -# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' -dsv4-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:deepseekv4-cu129 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } - -# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image -# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds -# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
- dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro @@ -3024,30 +2805,6 @@ dsv4-fp4-b300-vllm: - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. -dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs. Re-add when investigating regressions in offload=none. - - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } - dsv4-fp4-b300-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -4284,31 +4041,10 @@ gptoss-fp4-b200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 4 } -# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' -gptoss-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - -minimaxm2.5-fp8-b200-vllm: - image: vllm/vllm-openai:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 +minimaxm2.5-fp8-b200-vllm: + image: vllm/vllm-openai:v0.22.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 runner: b200 precision: fp8 framework: vllm @@ -4330,33 +4066,6 @@ minimaxm2.5-fp8-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -# - runner: 'b200' -> 'b200-dgxc' -minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b200-dgxc - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). - # Push none past the KV cliff (96, 128) to make the no-offload throughput - # collapse visible; cpu range overlaps fully for same-conc comparison. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } - - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. - minimaxm2.5-fp8-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: MiniMaxAI/MiniMax-M2.5 @@ -4381,31 +4090,6 @@ minimaxm2.5-fp8-b300-vllm: - { tp: 2, conc-start: 64, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 8 } -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b300 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). - # Push none past the KV cliff (96, 128, 192) so the no-offload throughput - # collapse is visible; cpu range overlaps fully so each high-conc point - # has a same-conc no-offload counterpart for direct comparison. - # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff - # observed in v6 cpu data right past conc=96. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - minimaxm2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -4438,29 +4122,6 @@ minimaxm2.5-fp4-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.0-cu130 - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - minimaxm2.5-fp4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -4530,29 +4191,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } -# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). 
Metadata is -# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h100 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). - # Best cpu-offload demo SKU — 4-conc-point window between cliffs. - # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } - dsr1-fp8-h100-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -4757,28 +4395,6 @@ minimaxm2.5-fp8-h200-vllm: search-space: - { tp: 4, conc-start: 1, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). - # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } - - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } - dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: nvidia/DeepSeek-R1-0528-NVFP4-v2 @@ -9203,26 +8819,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: dp-attn: true -kimik2.5-int4-h100-vllm: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: h100 - precision: int4 - framework: vllm - multinode: false - scenarios: - # New entry, agentic-coding only: this PR intentionally does NOT add - # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the - # fixed-seq-len test surface identical to origin/main. - # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives - # early. Sweep saturates conc=20 to keep total HBM headroom. - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } - - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } - qwen3.5-fp8-h100-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9687,12 +9283,45 @@ glm5-fp8-gb300-dynamo-sglang: # to preserve main behavior; PR-branch modifications to those recipes are NOT # brought in here. # ============================================================================ +# ============================================================================= +# Agentic configs +# ----------------------------------------------------------------------------- +# All entries below run the agentic-coding scenario (Weka trace replay). +# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. 
+# ============================================================================= -qwen3.5-fp8-b300-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd +# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - runner: 'b200-dsv4' -> 'b200-dgxc' +dsv4-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.20.0-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # cpu offload only this iteration — none entries already validated in + # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). + # Re-add when investigating regressions in offload=none. + - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + +# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. 
+qwen3.5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:nightly-dev-20260422-de962f32 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 - runner: b300 + runner: b200 precision: fp8 framework: sglang multinode: false @@ -9700,46 +9329,404 @@ qwen3.5-fp8-b300-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -kimik2.5-fp4-b200-vllm-agentic-lmcache: - image: vllm/vllm-openai:v0.21.0 - model: nvidia/Kimi-K2.5-NVFP4 +# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. +glm5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:v0.5.12-cu130 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: b200 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - runner: 'b200' -> 'b200-dgxc' +kimik2.5-int4-b200-vllm-agentic: + # Bumped from v0.19.1 — that release tripped a bug in + # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') + # during warmup `profile_run` on the agentic-coding path + # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the + # flashinfer fix. + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b200-dgxc - precision: fp4 + precision: int4 framework: vllm multinode: false scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: lmcache, conc-list: [16, 24, 32, 36] } - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html # does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons -# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to -# origin/main so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape -# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. -# - additional-settings.CONFIG_FILE: points at the new agentic recipe under -# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh -# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC -# branch). 
Local-overlay pattern mirrors the existing 8k1k overlay. -dsv4-fp4-gb300-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - # gb300-nv (not generic gb300) — the generic label is shared by both NV - # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards. - # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml +# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - runner: 'h200' -> 'h200-dgxc' +kimik2.5-int4-h200-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with + # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's + # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) + # don't have that mount and would re-materialize 65 GB to /tmp every job. + runner: h200-dgxc + precision: int4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } + - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' +# - runner: 'b200' -> 'b200-dgxc' +kimik2.5-fp4-b200-vllm-agentic: + # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that + # cleared the agentic-coding warmup crash on max_model_len=131072 + + # prefix caching. + image: vllm/vllm-openai:v0.20.2 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' +# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' +# - model-prefix: 'dsr1' -> 'kimik2.5' +# - precision: 'fp8' -> 'fp4' +# - framework: 'sglang' -> 'vllm' +kimik2.5-fp4-b300-vllm-agentic: + # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM + # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the + # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted + # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the + # INT4 B300 sister already uses successfully. 
+ image: vllm/vllm-openai:v0.20.0-cu130 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). +# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper +# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. +# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; +# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' +dsv4-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } + +# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image +# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. + +# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. 
+dsv4-fp4-b300-vllm-agentic: + image: vllm/vllm-openai:v0.20.0-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # cpu offload only this iteration — none entries already validated in + # earlier runs. Re-add when investigating regressions in offload=none. + - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + +# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' +gptoss-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + +# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' +# - runner: 'b200' -> 'b200-dgxc' +minimaxm2.5-fp8-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: b200-dgxc + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). + # Push none past the KV cliff (96, 128) to make the no-offload throughput + # collapse visible; cpu range overlaps fully for same-conc comparison. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } + - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' +minimaxm2.5-fp8-b300-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: b300 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). + # Push none past the KV cliff (96, 128, 192) so the no-offload throughput + # collapse is visible; cpu range overlaps fully so each high-conc point + # has a same-conc no-offload counterpart for direct comparison. + # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff + # observed in v6 cpu data right past conc=96. 
+ - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + +# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.0-cu130 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp8-h100-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h100 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). 
+ # Best cpu-offload demo SKU — 4-conc-point window between cliffs. + # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } + +# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). + # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } + - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } + +kimik2.5-int4-h100-vllm: + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: h100 + precision: int4 + framework: vllm + multinode: false + scenarios: + # New entry, agentic-coding only: this PR intentionally does NOT add + # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the + # fixed-seq-len test surface identical to origin/main. + # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives + # early. Sweep saturates conc=20 to keep total HBM headroom. 
+ agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } + - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } + +qwen3.5-fp8-b300-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: b300 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +kimik2.5-fp4-b200-vllm-agentic-lmcache: + image: vllm/vllm-openai:v0.21.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + - { tp: 8, ep: 1, offloading: lmcache, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons +# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to +# origin/main so its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape +# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. 
+# - additional-settings.CONFIG_FILE: points at the new agentic recipe under +# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh +# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC +# branch). Local-overlay pattern mirrors the existing 8k1k overlay. +dsv4-fp4-gb300-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + # gb300-nv (not generic gb300) — the generic label is shared by both NV + # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards. + # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml # + actual runner label listings). Pins agentic to the NVIDIA cluster # for initial validation. Drop -nv suffix to widen later. runner: gb300-nv @@ -9905,3 +9892,33 @@ qwen3.5-fp8-h100-sglang-agentic: search-space: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + +# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only. 
+dsr1-fp4-b200-dynamo-trt-agentic: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: b200-multinode + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 300 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false From 76aedd65780ddaabfb2cb0d630081a42e6cb72ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:20:16 -0500 Subject: [PATCH 003/132] configs(master): bump all vllm images to v0.22.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps every non-comment `image:` line in both master configs to the unsuffixed v0.22.0 tag: - vllm/vllm-openai:* -> vllm/vllm-openai:v0.22.0 - vllm/vllm-openai-rocm:* -> vllm/vllm-openai-rocm:v0.22.0 Covers all prior variants: v0.17–v0.21 numbered releases, the -cu130 / -ubuntu2404 / deepseekv4-cu129 build-variant tags, and the nightly- ROCm pins (which were holding DSv4 ROCm support that has since landed in the tagged release). Comment-line tag references in the agentic divergence change-log blocks are intentionally untouched so their "X -> Y" history reads correctly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 24 +++++------ .github/configs/nvidia-master.yaml | 64 +++++++++++++++--------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0495ebf16..ee4276a26 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -705,7 +705,7 @@ glm5.1-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi355x @@ -724,7 +724,7 @@ kimik2.5-int4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -743,7 +743,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi300x @@ -896,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -938,7 +938,7 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -1379,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 
runner: mi355x-disagg @@ -1433,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg @@ -2558,7 +2558,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x @@ -2591,7 +2591,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: # which enables SimpleCPUOffloadConnector on ROCm. Required for the # cpu-offload sweep points to use the same offload path as the NVIDIA # agentic-coding configs. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -2614,7 +2614,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: # - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi300x-vllm-agentic: # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -2637,7 +2637,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: # - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi325x-vllm-agentic: # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. 
- image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -2671,7 +2671,7 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 04764831c..d7791fa11 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1804,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp: # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp4-b200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2474,7 +2474,7 @@ kimik2.5-int4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b300 @@ -2535,7 +2535,7 @@ kimik2.5-fp4-b200-vllm: - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } kimik2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -2686,7 +2686,7 @@ dsr1-fp8-h200-sglang-mtp: # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache # flag is omitted. Max-model-len is pinned at 800k per the recipe. 
dsv4-fp8-h200-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2710,7 +2710,7 @@ dsv4-fp8-h200-vllm: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp8-h200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2852,7 +2852,7 @@ dsv4-fp4-b300-trt-mtp: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp } dsv4-fp4-b300-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -4067,7 +4067,7 @@ minimaxm2.5-fp8-b200-vllm: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. minimaxm2.5-fp8-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -4123,7 +4123,7 @@ minimaxm2.5-fp4-b200-vllm: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
minimaxm2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b300 @@ -4150,7 +4150,7 @@ minimaxm2.5-fp4-b300-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -7883,7 +7883,7 @@ kimik2.5-fp4-gb200-dynamo-trt: dp-attn: true kimik2.5-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.18.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: gb200 @@ -7985,7 +7985,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-b200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-multinode @@ -8041,7 +8041,7 @@ dsv4-fp4-b200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8141,7 +8141,7 @@ dsv4-fp4-gb200-dynamo-vllm: # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm. 
dsv4-fp4-gb200-dynamo-vllm-mtp2: - image: vllm/vllm-openai:v0.20.1-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8221,7 +8221,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: dp-attn: true dsv4-fp4-b300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -8277,7 +8277,7 @@ dsv4-fp4-b300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-nv @@ -9295,7 +9295,7 @@ glm5-fp8-gb300-dynamo-sglang: # its fixed-seq-len sweep is unaffected. # - runner: 'b200-dsv4' -> 'b200-dgxc' dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -9364,7 +9364,7 @@ kimik2.5-int4-b200-vllm-agentic: # during warmup `profile_run` on the agentic-coding path # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the # flashinfer fix. - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b200-dgxc @@ -9387,7 +9387,7 @@ kimik2.5-int4-b200-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - runner: 'h200' -> 'h200-dgxc' kimik2.5-int4-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with @@ -9417,7 +9417,7 @@ kimik2.5-fp4-b200-vllm-agentic: # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that # cleared the agentic-coding warmup crash on max_model_len=131072 + # prefix caching. 
- image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -9451,7 +9451,7 @@ kimik2.5-fp4-b300-vllm-agentic: # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the # INT4 B300 sister already uses successfully. - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -9473,7 +9473,7 @@ kimik2.5-fp4-b300-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' dsv4-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:deepseekv4-cu129 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -9496,7 +9496,7 @@ dsv4-fp8-h200-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -9519,7 +9519,7 @@ dsv4-fp4-b300-vllm-agentic: # its fixed-seq-len sweep is unaffected. 
# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' gptoss-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -9541,7 +9541,7 @@ gptoss-fp4-b200-vllm-agentic: # - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' # - runner: 'b200' -> 'b200-dgxc' minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b200-dgxc @@ -9567,7 +9567,7 @@ minimaxm2.5-fp8-b200-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -9593,7 +9593,7 @@ minimaxm2.5-fp8-b300-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b200 @@ -9616,7 +9616,7 @@ minimaxm2.5-fp4-b200-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -9639,7 +9639,7 @@ minimaxm2.5-fp8-h100-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. 
minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -9656,7 +9656,7 @@ minimaxm2.5-fp8-h200-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } kimik2.5-int4-h100-vllm: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: h100 @@ -9691,7 +9691,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache: - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } kimik2.5-fp4-b200-vllm-agentic-lmcache: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -9721,7 +9721,7 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC # branch). Local-overlay pattern mirrors the existing 8k1k overlay. dsv4-fp4-gb300-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 # gb300-nv (not generic gb300) — the generic label is shared by both NV @@ -9810,7 +9810,7 @@ dsv4-fp4-gb300-dynamo-vllm-agentic: # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe # applies to both clusters with no duplication. dsv4-fp4-gb300-cw-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw From 6dede7b24c94f68a74acd537c552950ef74531af Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:46:25 -0500 Subject: [PATCH 004/132] configs(master): strip stale narrative comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes ~240 lines of slop comments that no longer earn their keep: - "Diverged from X (agentic-coding sibling)..." 
rationale blocks (24 occurrences) — the sibling split is now durable and the "preserved on main" framing isn't meaningful on a branch - "Net-new agentic recipes from chore/agentx-v0.3" PR-context headers - "agentic-coding sibling — temporarily disabled" + the entire commented-out qwen3.5-bf16-b200-sglang-agentic placeholder block - Orphan boundary comments ("# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720..." / "# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm nightly...") that were stranded by prior entry moves - Inline image-bump rationale that's now stale ("# Bumped from v0.19.1...", "# Same image as the INT4 sibling: v0.20.x...", "# Nightly carrying vllm-project/vllm@20cac26b...", "# v0.21.0 (released 2026-05-14)...") since everything is on v0.22.0 Verified via YAML deep-equal: 0 keys added/removed/modified in either file — purely comment removal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 110 ----------------------- .github/configs/nvidia-master.yaml | 140 ----------------------------- 2 files changed, 250 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ee4276a26..7f1c8192d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1826,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -1937,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 128 ] @@ -1995,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -# DSv4-Pro FP4 on MI355X via SGLang. 
Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. dsv4-fp4-mi355x-sglang: image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2056,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp: - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. 
dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -2263,13 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ - dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2498,23 +2465,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. -# ============================================================================= -# Agentic configs -# ----------------------------------------------------------------------------- -# All entries below run the agentic-coding scenario (Weka trace replay). -# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. -# ============================================================================= - -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. 
qwen3.5-fp8-mi355x-sglang-agentic: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2529,11 +2479,6 @@ qwen3.5-fp8-mi355x-sglang-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. glm5.1-fp4-mi355x-sglang-agentic: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 model: amd/GLM-5.1-MXFP4 @@ -2549,15 +2494,7 @@ glm5.1-fp4-mi355x-sglang-agentic: # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 @@ -2581,16 +2518,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } -# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). 
Reasons below; -# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi355x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] - # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), - # which enables SimpleCPUOffloadConnector on ROCm. Required for the - # cpu-offload sweep points to use the same offload path as the NVIDIA - # agentic-coding configs. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2608,12 +2536,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } -# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi300x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2631,12 +2554,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } -# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi325x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2686,14 +2604,6 @@ dsv4-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } -# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# Image is identical to the base entry (rocm/sgl-dev DSv4 build). -# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware -# comparability. Offload sweep is none-only (SGLang has no equivalent of -# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). dsv4-fp4-mi355x-sglang-agentic: image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2708,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic: search-space: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. 
-# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d7791fa11..77c5d17ce 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2064,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml -# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads -# as `bmk_agentic_*`). Re-enable once that workflow is aligned. 
-# qwen3.5-bf16-b200-sglang-agentic: -# image: lmsysorg/sglang:v0.5.12-cu130 -# model: Qwen/Qwen3.5-397B-A17B -# model-prefix: qwen3.5 -# runner: b200 -# precision: bf16 -# framework: sglang -# multinode: false -# scenarios: -# agentic-coding: -# - duration: 1800 -# search-space: -# - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2331,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - qwen3.5-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -8818,7 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true - qwen3.5-fp8-h100-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9277,23 +9258,6 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ -# ============================================================================= -# Agentic configs -# ----------------------------------------------------------------------------- -# All entries below run the agentic-coding scenario (Weka trace replay). -# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. -# ============================================================================= - -# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). 
Reasons below; -# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200-dsv4' -> 'b200-dgxc' dsv4-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9312,11 +9276,6 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } -# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. qwen3.5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:nightly-dev-20260422-de962f32 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9331,11 +9290,6 @@ qwen3.5-fp8-b200-sglang-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. 
glm5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -9351,19 +9305,7 @@ glm5-fp8-b200-sglang-agentic: # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200' -> 'b200-dgxc' kimik2.5-int4-b200-vllm-agentic: - # Bumped from v0.19.1 — that release tripped a bug in - # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') - # during warmup `profile_run` on the agentic-coding path - # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the - # flashinfer fix. image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 @@ -9382,10 +9324,6 @@ kimik2.5-int4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - runner: 'h200' -> 'h200-dgxc' kimik2.5-int4-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 @@ -9405,18 +9343,7 @@ kimik2.5-int4-h200-vllm-agentic: - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' -# - runner: 'b200' -> 'b200-dgxc' kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 @@ -9437,14 +9364,6 @@ kimik2.5-fp4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' kimik2.5-fp4-b300-vllm-agentic: # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the @@ -9465,13 +9384,6 @@ kimik2.5-fp4-b300-vllm-agentic: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). -# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper -# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. -# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' dsv4-fp8-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9490,11 +9402,6 @@ dsv4-fp8-h200-vllm-agentic: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. 
dsv4-fp4-b300-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9514,10 +9421,6 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } -# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b @@ -9535,11 +9438,6 @@ gptoss-fp4-b200-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } -# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -# - runner: 'b200' -> 'b200-dgxc' minimaxm2.5-fp8-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9562,10 +9460,6 @@ minimaxm2.5-fp8-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' minimaxm2.5-fp8-b300-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9587,11 +9481,6 @@ minimaxm2.5-fp8-b300-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } -# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -9610,11 +9499,6 @@ minimaxm2.5-fp4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h100-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9633,11 +9517,6 @@ minimaxm2.5-fp8-h100-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } -# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). 
Metadata is -# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9711,15 +9590,6 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons -# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to -# origin/main so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape -# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. -# - additional-settings.CONFIG_FILE: points at the new agentic recipe under -# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh -# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC -# branch). Local-overlay pattern mirrors the existing 8k1k overlay. dsv4-fp4-gb300-dynamo-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9868,16 +9738,6 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: ep: 8 dp-attn: true -# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below; -# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main -# so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster). -# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130). -# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with- -# subagents corpus. 
hicache arm capped at conc 16 since high-conc + hicache -# tends to flake on first runs and conc 16 covers the cliff. The bench script -# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant. qwen3.5-fp8-h100-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 From 32572755524d98283c5339350a049fd7c6aad43d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 14:42:39 -0500 Subject: [PATCH 005/132] chore(aiperf): bump submodule for 060226 loader allowlist fix Picks up SemiAnalysisAI/aiperf@47e6e206, which adds the 060226 and 060226_256k loader names to the inferencex-agentx-mvp scenario's require_loader allowlist. Without this bump, dispatching any non-DSv4 agentic run on this branch fails preflight because benchmark_lib.sh now defaults the loader to the 060226 corpus. Co-Authored-By: Claude Opus 4.7 (1M context) --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index de3ad1c18..47e6e2060 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1 +Subproject commit 47e6e206001a85a3cc4c6212a1e0425f045bbcb3 From 321fd445c301c5c52901b8f37e295ee38a10f39f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 14:58:48 -0500 Subject: [PATCH 006/132] (testing) b300 dsv4 simple offloading --- .github/configs/nvidia-master.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 77c5d17ce..5b0792d08 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9403,7 +9403,9 @@ dsv4-fp8-h200-vllm-agentic: # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + # image: vllm/vllm-openai:v0.22.0 + # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f + image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 32839349559d13a51537879b32dd05e8f60e0661 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 15:22:03 -0500 Subject: [PATCH 007/132] runners(b300-nv): remap container UID to root for apt-get install Same root cause as 967c50ca (h200-dgxc-slurm fix): vllm/vllm-openai images ship as non-root, and on b300-nv the pyxis/enroot config does NOT implicitly remap the calling user to UID 0 inside the container. benchmark_lib.sh::install_agentic_deps runs apt-get install -y git, which fails with "dpkg: error: requested operation requires superuser privilege" (see run 26844610474 / dsv4 b300 simple offloading). Adding --container-remap-root to the srun line matches b200-dgxc and h200-dgxc-slurm behavior; benchmark_lib.sh stays untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cc..cb4a634c3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -387,6 +387,7 @@ else --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \ --no-container-mount-home \ + --container-remap-root \ --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash "$BENCH_SCRIPT" From 360bcf089130808b0f2a3a249dfdd38e19772c1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 15:45:01 -0500 Subject: [PATCH 008/132] benchmarks(agentic): skip hf download when MODEL_PATH is pre-staged Replaces the simple unguarded download in every agentic recipe: - if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi with the same MODEL_PATH-aware logic that the fixed-seq-len B300 recipes already use: if [[ -n "${MODEL_PATH:-}" ]]; then if [[ ! -d "$MODEL_PATH" || empty ]]; then hf download "$MODEL" --local-dir "$MODEL_PATH" fi else hf download "$MODEL" export MODEL_PATH="$MODEL" fi Effect: on clusters where launch_*.sh exports MODEL_PATH pointing at a pre-staged on-node copy (e.g. b300-nv sets it to /scratch/models/), the agentic recipe now correctly short-circuits the hf-download instead of re-pulling 700 GB of DSv4-Pro into $HOME/.cache/huggingface every run. Touches 33 scripts; same edit in each.
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsr1_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 12 +++++++++++- .../single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp8_h200.sh | 12 +++++++++++- benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/glm5_fp8_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_h200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh | 12 +++++++++++- .../single_node/agentic/kimik2.5_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_h200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp4_b200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_b200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_b300.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_h100.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_h200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 12 +++++++++++- 
benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh | 12 +++++++++++- .../single_node/agentic/qwen3.5_fp8_b300_sglang.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh | 12 +++++++++++- .../single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 12 +++++++++++- 33 files changed, 363 insertions(+), 33 deletions(-) diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index f9955adc7..23cf71e7d 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index ff76b768d..c67fc7ebf 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi amd-smi || true diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 108347479..7bc18ce22 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -38,7 +38,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index f6748a5f8..7a130673d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -32,7 +32,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 99aec25fe..ab2897d88 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index 0a0177983..c1e2f50b3 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 500b456f5..5987a789e 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 259c19586..3d601193f 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index 6e921db58..ec8c4c9f8 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index 557986b0d..443bc8bcc 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index 1592a8d5c..7a93c71c5 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index eb1883ff1..8ca6d805c 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi amd-smi || true diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 99e29c819..6e41756a0 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi # If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index ad0b4495a..e5c87b14a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8cebe4f20..8ab9672af 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index fd0ce3677..734f63766 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index 697d3fa45..ab91c99c5 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index 2fd3b381c..fa867d976 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 97929e43e..08549e93a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 38ef72b56..195b285c6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index 4ce131cba..af7c7a216 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index 9f2d83a0b..d3ea641ef 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index d21690da6..48f2ab388 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. 
+if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index ed59991cb..15e5798c6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 260bbdc68..add2a8fa0 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index edac27a45..57746eef6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 39dd63293..eac820aa0 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index 4ba87976b..ee40e1855 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 3432af5c9..4d39f2c81 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 9d9c1d7d5..d926288ae 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 95f0397a0..9db72e569 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index aef9650ca..a78ee87b9 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index 5427d0d31..f5e2d2e6f 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true From 57d4adb4fb5fbebc478f628c522a0a49cec9e072 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 16:23:34 -0500 Subject: [PATCH 009/132] benchmarks(agentic): launch server from MODEL_PATH, not the HF id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to 360bcf08. That commit made the agentic recipes skip hf-download when MODEL_PATH was already pre-staged — but the recipes still invoked the server with the HF id ("vllm serve \$MODEL" / "--model-path \$MODEL"), so the engine looked up the HF cache (now empty, because we just skipped the download) and tried to download from scratch itself. With the model not in cache, vllm/sglang would deadlock in the auto-download path rather than fall through to a clean error. This commit aligns every agentic recipe with the fixed-seq-len B300 pattern verbatim: vllm serve "$MODEL_PATH" --served-model-name "$MODEL" python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" Net effect: server loads weights directly from /scratch/models// (or wherever the launch script staged the model) and reports the HF id as the served-model-name for downstream tooling. Touches all 33 agentic scripts. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsr1_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp8_h200.sh | 2 +- benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/glm5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_h100.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_h200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_b200.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_h100.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_h200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh | 2 +- 
benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index 23cf71e7d..16dc3bfd5 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -43,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path $MODEL \ +--model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index c67fc7ebf..3b2561fe2 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -44,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 7bc18ce22..e80008f71 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -231,7 +231,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --trust-remote-code diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 7a130673d..88f4b38f5 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -123,7 +123,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export 
VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve "$MODEL" \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index ab2897d88..029c8ea7f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -145,7 +145,7 @@ fi echo "Starting sglang server..." python3 -m sglang.launch_server \ - --model-path "$MODEL" \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index c1e2f50b3..799c2bf26 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -50,7 +50,7 @@ export PYTHONNOUSERSITE=1 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is # used for GPU allocation by the runner and as the DP size. -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 5987a789e..3b85a31cd 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -52,7 +52,7 @@ echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 3d601193f..b3597cf52 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -49,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index ec8c4c9f8..80d70e724 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -63,7 +63,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index 443bc8bcc..13e32d315 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index 7a93c71c5..e0d967246 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ 
b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index 8ca6d805c..ff597c9a4 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -75,7 +75,7 @@ esac echo "Starting vllm server..." -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --attention-backend ROCM_AITER_UNIFIED_ATTN \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 6e41756a0..1f8c29351 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -74,7 +74,7 @@ esac echo "Starting vllm server..." 
-vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --attention-backend ROCM_AITER_UNIFIED_ATTN \ diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index e5c87b14a..34b45c9ec 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -188,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8ab9672af..9667003e1 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -95,7 +95,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 734f63766..139b12256 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -796,7 +796,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index ab91c99c5..5685f098c 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -55,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 
export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index fa867d976..cb6c67f4b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -55,7 +55,7 @@ echo "Starting vllm server..." export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 08549e93a..1bfa0c33b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -66,7 +66,7 @@ echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 195b285c6..b4a63eff3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -68,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ $PARALLEL_ARGS \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index af7c7a216..0724aba5b 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index d3ea641ef..c291a2ceb 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index 48f2ab388..516bc4696 100755 --- 
a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -68,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index 15e5798c6..e6343b8ba 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -68,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index add2a8fa0..8988316d3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -74,7 +74,7 @@ echo "Starting vllm server..." export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 57746eef6..caa70de63 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -71,7 +71,7 @@ echo "Starting vllm server..." 
export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index eac820aa0..cd114fe96 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -75,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index ee40e1855..d06d82ec8 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 4d39f2c81..ad49b2b67 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 
d926288ae..4f9b12659 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -95,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 9db72e569..b280fff8b 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -108,7 +108,7 @@ fi { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index a78ee87b9..ff901b674 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -46,7 +46,7 @@ export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index f5e2d2e6f..cdded8860 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -120,7 +120,7 @@ export PYTHONNOUSERSITE=1 SGLANG_CMD=( python3 -m sglang.launch_server --attention-backend triton - --model-path "$MODEL" + --model-path "$MODEL_PATH" --served-model-name "$MODEL" 
--host=0.0.0.0 --port "$PORT" --tensor-parallel-size "$TP" From 1bccc5cacf281a1221dac8f0558248f220786311 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 17:40:48 -0500 Subject: [PATCH 010/132] benchmarks(dsv4-b300): enable VLLM_PREFIX_CACHE_RETENTION_INTERVAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The custom cquil/vllm-openai image integrates vllm-project/vllm#43447, which fixes the DSv4 sliding-window prefix-cache eviction issue. But the fix is opt-in via VLLM_PREFIX_CACHE_RETENTION_INTERVAL — without setting it, vllm falls back to the legacy cache-every-segment path that this PR was written to repair, so the trace-replay cache hit rate stays near 0% even though the patched code is loaded. Sets the env var to 32768 (32k tokens), matching the value the PR author validated to take cache hit rate from 0% -> 74% on a comparable agentic trace-replay benchmark. On stock vllm images that don't carry the patch, the env var is simply ignored — safe to land. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++++ benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5b0792d08..4d7785c2a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9422,6 +9422,10 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + - { tp: 4, offloading: none, conc-list: [16, 32, 64] } + - { tp: 8, offloading: none, conc-list: [16, 32, 64] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [128, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 88f4b38f5..837345423 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -52,6 +52,13 @@ install_agentic_deps # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient +# sliding-window allocations don't evict useful prefix entries. 32k matches +# the trace-replay tuning the PR author validated (0% -> 74% hit rate). +# Requires the custom image (cquil/vllm-openai:*-7ead0a0f...) that carries +# the patch; on stock images the env var is ignored. 
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" From 094610734aa2e88a4ccbfad503002d7810bc8f8f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 23:04:36 -0500 Subject: [PATCH 011/132] configs(dsv4-b300-vllm-agentic): bump cquil image to 6c529f30 for retention-interval env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 7ead0a0 only carried the "Prepend uncached blocks in SWA free()" hunk of PR vllm-project/vllm#43447 — it did NOT modify vllm/envs.py to register the VLLM_PREFIX_CACHE_RETENTION_INTERVAL env var. That registration didn't land until commit 7c909f8 in the PR, and 6c529f30 is the latest merge of main into the PR branch. Effect: the export in dsv4_fp4_b300_vllm.sh (1bccc5ca) finally takes effect — vllm stops logging "Unknown vLLM environment variable detected" and actually activates the SWA prefix-cache retention path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4d7785c2a..380c799e1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9404,8 +9404,8 @@ dsv4-fp8-h200-vllm-agentic: dsv4-fp4-b300-vllm-agentic: # image: vllm/vllm-openai:v0.22.0 - # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f - image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f + # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045 + image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 38c365c77bf0cd1214ea6d1b81f7f7ed2c56b750 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 23:09:58 -0500 Subject: [PATCH 012/132] benchmarks(dsv4-b300-vllm): override trace loader to 060226 (v6) DSv4 recipes inherit the benchmark_lib carveout that defaults to the 052726 corpus for backward-compat with prior published baselines. This recipe is opting out to ride the v6 060226 corpus that all non-DSv4 recipes already use, exercising the newer CC versions / longer-tail trace mix. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 837345423..fdb7a49b6 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -46,6 +46,8 @@ fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# Opt this recipe out of the DSv4 052726 default; use the v6 corpus. 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 resolve_trace_source install_agentic_deps From ee8d74391ba7674ba77f67c1e764fc200be1956d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:33:00 +0900 Subject: [PATCH 013/132] [AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Qwen hicache config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 35 ++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7f1c8192d..134af929a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,6 +872,21 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } +minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/MiniMax-M2.5-MXFP4 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2518,6 +2533,16 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } +kimik2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: 
MiniMaxAI/MiniMax-M2.5 @@ -2574,19 +2599,15 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From 616f4db634e13e57b3244853dd317bd3f8a5bd1c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:40:55 +0900 Subject: [PATCH 014/132] [AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scripts Co-Authored-By: Claude Sonnet 4.6 --- .../agentic/kimik2.5_fp4_mi355x.sh | 674 ++---------------- .../agentic/minimaxm2.5_fp4_mi355x.sh | 256 +++++++ .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 ++- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 152 ---- 4 files changed, 397 insertions(+), 797 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh delete mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 139b12256..d05b27253 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,15 +14,11 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB 
RESULT_DIR DURATION EP_SIZE - -# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. -# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this -# script we need the concrete value so AgentX filters prompt+max_tokens against -# the same limit vLLM enforces. -if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then - MAX_MODEL_LEN=262144 -fi +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -33,557 +29,22 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -write_lmcache_rocm_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_rocm_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def 
_retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_rocm_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On ROCm Kimi TP4, - # vLLM reaches KV-cache registration only after that 2.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. - self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic ROCm patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_rocm_demand_patch = True - - def 
_patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_rocm_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) - - def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module - - builtins.__import__ = _agentic_rocm_import - _maybe_patch_lazy_memory_allocator() - 
_maybe_patch_l1_memory_manager() - -if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": - import torch - import lmcache.non_cuda_equivalents as lmc - - if not hasattr(lmc, "multi_layer_block_kv_transfer"): - _DTYPE_BY_NAME = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - - def _dtype_from_env() -> torch.dtype: - name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") - try: - return _DTYPE_BY_NAME[name] - except KeyError as exc: - raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc - - def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - block_stride = shape_desc.block_stride_elems or ( - shape_desc.bs * shape_desc.nh * shape_desc.hs - ) - base = lmc._tensor_from_ptr( - ptr, - (shape_desc.nb * block_stride,), - dtype, - device, - ) - return torch.as_strided( - base, - (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), - (block_stride, shape_desc.nh * shape_desc.hs, 1), - ) - - def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - return lmc._tensor_from_ptr( - ptr, - (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), - dtype, - device, - ) - - def multi_layer_block_kv_transfer( - group_kv_pointers, - tmp_buffer_ptrs, - block_ids, - paged_memory_device, - direction, - shape_desc, - lmcache_chunk_size, - gpu_kv_format, - skip_blocks=0, - ) -> None: - # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with - # shape [num_blocks, block_size, hidden_size]. LMCache's Python - # fallback has no block-transfer entrypoint yet, so implement the - # same gather/scatter contract with torch indexing on ROCm. 
- if shape_desc.kv_size != 1: - raise NotImplementedError( - "ROCm LMCache MP block fallback currently supports MLA KV caches only" - ) - - dtype = _dtype_from_env() - device = ( - paged_memory_device - if isinstance(paged_memory_device, torch.device) - else torch.device(paged_memory_device) - ) - num_layers = int(group_kv_pointers.numel()) - blocks_per_chunk = lmcache_chunk_size // shape_desc.bs - direction_name = getattr(direction, "name", str(direction)) - - for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): - start = chunk_idx * blocks_per_chunk - end = start + blocks_per_chunk - chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) - - dest_slot_offset = 0 - if skip_blocks and chunk_idx == 0: - chunk_blocks = chunk_blocks[int(skip_blocks):] - dest_slot_offset = int(skip_blocks) * shape_desc.bs - if chunk_blocks.numel() == 0: - continue - - num_slots = int(chunk_blocks.numel()) * shape_desc.bs - tmp = _tmp_view( - int(tmp_ptr), - shape_desc, - num_layers, - lmcache_chunk_size, - dtype, - device, - ) - - for layer_idx in range(num_layers): - paged = _paged_view( - int(group_kv_pointers[layer_idx].item()), - shape_desc, - dtype, - device, - ) - tmp_slice = tmp[ - 0, - layer_idx, - dest_slot_offset : dest_slot_offset + num_slots, - :, - ] - if direction_name == "D2H": - gathered = paged.index_select(0, chunk_blocks).reshape( - num_slots, shape_desc.nh * shape_desc.hs - ) - tmp_slice.copy_(gathered) - elif direction_name == "H2D": - src = tmp_slice.reshape( - int(chunk_blocks.numel()), - shape_desc.bs, - shape_desc.nh * shape_desc.hs, - ) - paged.index_copy_(0, chunk_blocks, src) - else: - raise ValueError(f"Unsupported transfer direction: {direction}") - - lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer - -# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- -if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": - import chunked_connector_patch # noqa: F401 - -# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- -import scheduler_assertion_patch # noqa: F401 -PY -} - -write_chunked_connector_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/chunked_connector_patch.py" <<'PY' -""" -Monkey-patch for LMCacheMPConnector to add chunked KV loading. - -Fixes GPU block exhaustion deadlock at high concurrency by capping -the number of external tokens reported AND retrieved per scheduling step. - -Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this -module from sitecustomize.py before LMCache is loaded. -""" - -import logging -import os -import sys -import builtins - -logger = logging.getLogger("chunked_lmcache_patch") - -_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) - -# Per-request chunk tracking (module-level, survives across calls) -_chunk_state: dict[str, dict] = {} - - -def _apply_patch(): - """Patch LMCacheMPConnector in-place.""" - mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") - if mod is None: - return - cls = getattr(mod, "LMCacheMPConnector", None) - if cls is None or getattr(cls, "_chunked_patch_applied", False): - return - - LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) - _orig_get_matched = cls.get_num_new_matched_tokens - _orig_get_finished = cls.get_finished - - def _get_blocks_per_chunk(self): - block_size = getattr(self, "block_size", 1) - return max(1, _MAX_TOKENS // block_size) - - def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): - full_match = _orig_get_matched(self, request, num_computed_tokens) - if full_match <= 0 or _MAX_TOKENS <= 0: - return full_match - - req_id = request.request_id - block_size = getattr(self, "block_size", 1) - blocks_per_chunk = _get_blocks_per_chunk(self) - full_match_blocks = full_match // block_size - - state = _chunk_state.get(req_id) - if state is None or state.get("num_computed_at_start") != num_computed_tokens: - state = 
{ - "full_match_blocks": full_match_blocks, - "chunk_end_blocks": 0, - "num_computed_at_start": num_computed_tokens, - "lookup_done": False, - } - _chunk_state[req_id] = state - - if state["lookup_done"]: - return 0 - - remaining = state["full_match_blocks"] - state["chunk_end_blocks"] - if remaining <= 0: - state["lookup_done"] = True - return 0 - - this_chunk = min(remaining, blocks_per_chunk) - state["chunk_end_blocks"] += this_chunk - if state["chunk_end_blocks"] >= state["full_match_blocks"]: - state["lookup_done"] = True - - capped = this_chunk * block_size - if capped < full_match: - logger.debug( - "Chunked LMCache: req %s capped %d -> %d tokens " - "(chunk %d/%d blocks)", - req_id, full_match, capped, this_chunk, full_match_blocks, - ) - - # Cap the tracker's hit blocks to match what we report - tracker = getattr(request, "kv_transfer_params", None) - if tracker is not None: - orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) - if orig_hits > this_chunk: - tracker.num_lmcache_hit_blocks = this_chunk - - return capped - - def _patched_get_finished(self, scheduler_output): - result = _orig_get_finished(self, scheduler_output) - # Clean up chunk state for finished requests. - # vLLM passes scheduler_output as a set of request-ID strings - # (not a SchedulerOutput object), so iterate directly when it - # is a set/frozenset; fall back to the attribute path for - # forward compatibility. 
- if isinstance(scheduler_output, (set, frozenset)): - finished = scheduler_output - else: - finished = getattr(scheduler_output, "finished_req_ids", []) - for req in finished: - _chunk_state.pop(req, None) - return result - - cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens - cls.get_finished = _patched_get_finished - cls._chunked_patch_applied = True - logger.info( - "Chunked LMCache connector patch applied " - "(max_tokens_per_load=%d)", _MAX_TOKENS, - ) - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "lmcache.integration.vllm.lmcache_mp_connector" - or ( - name.startswith("lmcache") - and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -write_scheduler_assertion_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' -""" -Patch vLLM scheduler to handle stale finished_recving gracefully. - -The assertion at scheduler.py crashes when a KV transfer reports -"finished recving" but the request is already in RUNNING state. -This happens when transfers complete asynchronously and the scheduler -has already moved the request forward. - -Fix: Instead of asserting, log a warning and skip. 
-""" - -import logging -import sys -import builtins - -logger = logging.getLogger("scheduler_assertion_patch") - - -def _apply_patch(): - """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" - sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") - if sched_mod is None: - return - req_mod = sys.modules.get("vllm.v1.request") - if req_mod is None: - return - Scheduler = getattr(sched_mod, "Scheduler", None) - RequestStatus = getattr(req_mod, "RequestStatus", None) - if Scheduler is None or RequestStatus is None: - return - if getattr(Scheduler, "_kv_xfer_patch_applied", False): - return - - _orig_update = Scheduler._update_from_kv_xfer_finished - - def _patched_update(self, kv_connector_output): - if self.connector is not None: - self.connector.update_connector_output(kv_connector_output) - for req_id in kv_connector_output.finished_recving or (): - if req_id not in self.requests: - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.warning( - "Stale finished_recving for req %s in status %s; skipping.", - req_id, req.status.name, - ) - for req_id in kv_connector_output.finished_sending or (): - if req_id not in self.requests: - continue - self._free_blocks(self.requests[req_id]) - - Scheduler._update_from_kv_xfer_finished = _patched_update - Scheduler._kv_xfer_patch_applied = True - logger.info("Scheduler KV transfer assertion patch applied") - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "vllm.v1.core.sched.scheduler" - or ( - name.startswith("vllm") - and "vllm.v1.core.sched.scheduler" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -# Workaround for MEC FW <177 RCCL memory 
reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -591,6 +52,8 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -648,7 +111,9 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -659,7 +124,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -667,74 +132,20 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - agentic_pip_install --quiet --no-cache-dir lmcache - # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - # during Kimi fused-MoE model inspection it imports nixl_ep whenever - # that module is importable, even when this run is not using EP/NIXL - # kernels. 
The CUDA extension then fails immediately on AMD nodes with - # "ImportError: libcuda.so.1". - # - # LMCache MP also uses CuPy stream APIs while registering vLLM's KV - # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - # with cudaErrorInsufficientDriver when LMCache touches the stream. Use - # the ROCm 7 CuPy wheel so the same API dispatches through HIP. - python3 -m pip uninstall -y \ - nixl nixl-cu12 nixl-cu13 nixl_ep \ - >/dev/null 2>&1 || true - python3 -m pip uninstall -y \ - cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - >/dev/null 2>&1 || true - agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - python3 - <<'PY' -import importlib.util -import sys - -spec = importlib.util.find_spec("nixl_ep") -if spec is not None: - locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) - print( - "Error: nixl_ep is still importable after LMCache install; " - "this ROCm Kimi run would import a CUDA-only nixl_ep module. " - f"location={locations}", - file=sys.stderr, - ) - sys.exit(1) - -try: - from cupy_backends.cuda.api import runtime as cupy_runtime -except Exception as exc: - print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) - sys.exit(1) - -if not getattr(cupy_runtime, "is_hip", False): - print( - "Error: CuPy is still using the CUDA backend after installing " - "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", - file=sys.stderr, - ) - sys.exit(1) -PY - LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" - write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" - export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 - export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 - # Cap external KV tokens loaded per scheduling step to prevent GPU - # block exhaustion deadlock at high concurrency (c>=32). 
Default - # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to - # disable chunking (only safe at low concurrency). - export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" - export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -742,7 +153,7 @@ PY # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -750,10 +161,11 @@ PY # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. 
- LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -786,6 +198,7 @@ PY *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac +# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -794,17 +207,34 @@ fi echo "Starting vllm server..." export PYTHONNOUSERSITE=1 +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL_PATH" --served-model-name "$MODEL" + vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code - --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -821,4 +251,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file 
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh new file mode 100755 index 000000000..f36fc59e9 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -0,0 +1,256 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility for vLLM 0.14+ +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + +case "$OFFLOADING" in + none) ;; + cpu) + unset VLLM_USE_SIMPLE_KV_OFFLOAD + # MI355X nodes have ~2.7 TiB of host DRAM available for offload; + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). + + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. 
+ + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + set -x + echo "Starting LMCache MP server..." 
+ LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) + ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; +esac + +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +echo "Starting vllm server..." 
+export PYTHONNOUSERSITE=1 + +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install -q amd-quark + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.95 + --kv-cache-dtype fp8 \ + --block-size=32 + --trust-remote-code + --attention-backend "ROCM_AITER_FA" + --max-num-seqs "$CONC" + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index ff901b674..656e924dc 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -2,51 +2,117 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. 
+# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# ---- Start SGLang server ---------------------------------------------------- +# ---- Cache / offload config ------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per + # TP rank (one hierarchical KV, one hierarchical Mamba), so the + # node-total DRAM budget divides by TP and the host-pool count. + TOTAL_CPU_DRAM_GB=3000 + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which + # requires page_size=1. Keep the safer direct/layer_first copy path; + # kernel/page_first faults on first prefill in this mode on ROCm. 
+ HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. + WARMUP_ARGS=(--skip-server-warmup) + # Don't force ROCm graph capture at every high concurrency point; conc=16 + # is the highest known-good capture size for this model/server path. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ + --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ @@ -56,10 +122,10 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --max-prefill-tokens 32768 \ - --scheduler-recv-interval 30 \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --mem-fraction-static 0.8 \ - --context-length $MAX_MODEL_LEN \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -69,4 +135,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh deleted file mode 100755 index cdded8860..000000000 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. -# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# OFFLOADING values: -# none - SGLang GPU KV only with radix cache disabled. -# hicache - SGLang HiCache with local CPU hierarchical cache. 
- -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE - -SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid - # GDN/Mamba path allocates two HiCache host pools per TP rank: one for - # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB - # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per - # host pool, not 250 GB. Keep overrides for one-off tuning. 
- TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" - HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" - HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on - # MI355X, which requires page_size=1. The kernel/page_first HiCache - # transfer path faults on first prefill in this mode on ROCm, so keep - # the default on the safer direct/layer_first copy path. These remain - # env-overridable for future SGLang/ROCm fixes. - HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - # SGLang --hicache-size is per rank per host pool, while the workflow - # input is a node-total DRAM budget. Divide by TP and the number of - # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. - HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" - if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then - HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" - fi - if [ "$HICACHE_SIZE_GB" -lt 1 ]; then - echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 - exit 1 - fi - echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" - CACHE_ARGS=( - --page-size "$HICACHE_PAGE_SIZE" - --enable-hierarchical-cache - --hicache-size "$HICACHE_SIZE_GB" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - ) - # HiCache startup reaches API readiness, but SGLang's internal warmup - # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf - # own benchmark traffic instead of blocking server readiness on it. - WARMUP_ARGS=(--skip-server-warmup) - # Keep request concurrency as the swept variable, but do not force - # HiCache runs to capture ROCm graphs at every high concurrency point. - # The conc=32 HiCache job crashed after startup readiness, before any - # aiperf traffic, while conc=16 is the highest known-good capture size - # for this model/server path. Requests above the capture size can still - # run; they just do not require a larger captured graph at startup. - HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" - if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then - CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" - fi - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - -echo "Starting SGLang server..." -export PYTHONNOUSERSITE=1 - -{ set +x; } 2>/dev/null -SGLANG_CMD=( - python3 -m sglang.launch_server - --attention-backend triton - --model-path "$MODEL_PATH" --served-model-name "$MODEL" - --host=0.0.0.0 - --port "$PORT" - --tensor-parallel-size "$TP" - --ep-size "$EP_SIZE" - --trust-remote-code - --tokenizer-worker-num 6 - --enable-aiter-allreduce-fusion - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - --max-running-requests "$CONC" - --max-prefill-tokens 32768 - --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" - --mem-fraction-static 0.8 - --context-length "$MAX_MODEL_LEN" - --enable-metrics - "${CACHE_ARGS[@]}" - "${WARMUP_ARGS[@]}" -) -printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" -printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" -"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" From 574d8914475ba7d5f8cc0ec9d17ea79aca03e95d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:46:29 +0900 Subject: [PATCH 015/132] Revert "[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scripts" and "[AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Qwen hicache config" Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 35 +- .../agentic/kimik2.5_fp4_mi355x.sh | 674 ++++++++++++++++-- .../agentic/minimaxm2.5_fp4_mi355x.sh | 256 ------- .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 +-- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 152 ++++ 5 files changed, 804 insertions(+), 425 deletions(-) delete mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 134af929a..7f1c8192d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,21 +872,6 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } -minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/MiniMax-M2.5-MXFP4 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } - - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } - minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2533,16 +2518,6 
@@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } -kimik2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } - minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -2599,15 +2574,19 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index d05b27253..139b12256 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,11 +14,15 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} 
-EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. +# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this +# script we need the concrete value so AgentX filters prompt+max_tokens against +# the same limit vLLM enforces. +if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then + MAX_MODEL_LEN=262144 +fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -29,22 +33,557 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true -# ---- Resolve traces and install deps ---------------------------------------- -# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the -# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf -# signal at high concurrency. 
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +write_lmcache_rocm_mp_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/sitecustomize.py" <<'PY' +"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" + +import os +import threading + +if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": + import builtins + import sys + + _orig_import = builtins.__import__ + + def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: + _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator + + if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): + return + + _orig_init = _LazyMemoryAllocator.__init__ + _orig_allocate = _LazyMemoryAllocator.allocate + _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate + + def _expand_to(self, target_size: int) -> None: + target_size = min( + self._final_size, + _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), + ) + lock = self._agentic_rocm_demand_expand_lock + with lock: + if target_size <= self._curr_size: + return + + start_size = self._curr_size + while self._curr_size < target_size: + commit_start = self._curr_size + commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) + while self._curr_size < commit_target: + self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) + self._curr_size += self.PIN_CHUNK_SIZE + self._commit_expansion(self._curr_size - commit_start) + + self._log_expansion_progress(self._curr_size - start_size) + + def 
_retry_with_demand_expansion(self, allocate_once): + obj = allocate_once() + step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) + step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) + + while obj is None and self._curr_size < self._final_size: + _expand_to(self, self._curr_size + step_bytes) + obj = allocate_once() + + return obj + + def _patched_init(self, *args, **kwargs): + _orig_init(self, *args, **kwargs) + self._agentic_rocm_demand_expand_lock = threading.Lock() + + # LMCache MP's upstream LazyMemoryAllocator currently expands to + # the final pinned size in a background thread. On ROCm Kimi TP4, + # vLLM reaches KV-cache registration only after that 2.5 TB pool + # is fully pinned, and the server-side IPC open path can stall + # before acknowledging register_kv_caches. Keep the same final + # capacity, but pin/commit extra host memory only when L1 + # allocations actually need it. + self._stop_expand.set() + self._expand_thread.join() + _lazy_memory_allocator.logger.info( + "Agentic ROCm patch: using demand-driven LMCache pinned " + "memory expansion; final capacity remains %s MB", + self._final_size >> 20, + ) + + def _patched_allocate( + self, + shapes, + dtypes, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), + ) + + def _patched_batched_allocate( + self, + shapes, + dtypes, + batch_size, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_batched_allocate( + self, shapes, dtypes, batch_size, fmt, allocator_type + ), + ) + + _LazyMemoryAllocator.__init__ = _patched_init + _LazyMemoryAllocator.allocate = _patched_allocate + _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate + _LazyMemoryAllocator._agentic_rocm_demand_patch = True + + def 
_patch_l1_memory_manager(_memory_manager) -> None: + _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) + _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) + if _L1MemoryManager is None or _LazyMemoryAllocator is None: + return + if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): + return + + _orig_get_memory_usage = _L1MemoryManager.get_memory_usage + + def _patched_get_memory_usage(self): + allocator = getattr(self, "_allocator", None) + if isinstance(allocator, _LazyMemoryAllocator): + address_manager = allocator.get_address_manager() + used_size = ( + address_manager.get_heap_size() - address_manager.get_free_size() + ) + return used_size, allocator._final_size + return _orig_get_memory_usage(self) + + _L1MemoryManager.get_memory_usage = _patched_get_memory_usage + _L1MemoryManager._agentic_rocm_final_capacity_patch = True + + def _maybe_patch_lazy_memory_allocator() -> None: + module = sys.modules.get("lmcache.v1.lazy_memory_allocator") + if module is not None and hasattr(module, "LazyMemoryAllocator"): + _patch_lazy_memory_allocator(module) + + def _maybe_patch_l1_memory_manager() -> None: + module = sys.modules.get("lmcache.v1.distributed.memory_manager") + if module is not None and hasattr(module, "L1MemoryManager"): + _patch_l1_memory_manager(module) + + def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): + module = _orig_import(name, globals, locals, fromlist, level) + if name == "lmcache.v1.lazy_memory_allocator" or ( + name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules + ): + _maybe_patch_lazy_memory_allocator() + if name == "lmcache.v1.distributed.memory_manager" or ( + name.startswith("lmcache") + and "lmcache.v1.distributed.memory_manager" in sys.modules + ): + _maybe_patch_l1_memory_manager() + return module + + builtins.__import__ = _agentic_rocm_import + _maybe_patch_lazy_memory_allocator() + 
_maybe_patch_l1_memory_manager() + +if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": + import torch + import lmcache.non_cuda_equivalents as lmc + + if not hasattr(lmc, "multi_layer_block_kv_transfer"): + _DTYPE_BY_NAME = { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "float32": torch.float32, + } + + def _dtype_from_env() -> torch.dtype: + name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") + try: + return _DTYPE_BY_NAME[name] + except KeyError as exc: + raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc + + def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + block_stride = shape_desc.block_stride_elems or ( + shape_desc.bs * shape_desc.nh * shape_desc.hs + ) + base = lmc._tensor_from_ptr( + ptr, + (shape_desc.nb * block_stride,), + dtype, + device, + ) + return torch.as_strided( + base, + (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), + (block_stride, shape_desc.nh * shape_desc.hs, 1), + ) + + def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return lmc._tensor_from_ptr( + ptr, + (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), + dtype, + device, + ) + + def multi_layer_block_kv_transfer( + group_kv_pointers, + tmp_buffer_ptrs, + block_ids, + paged_memory_device, + direction, + shape_desc, + lmcache_chunk_size, + gpu_kv_format, + skip_blocks=0, + ) -> None: + # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with + # shape [num_blocks, block_size, hidden_size]. LMCache's Python + # fallback has no block-transfer entrypoint yet, so implement the + # same gather/scatter contract with torch indexing on ROCm. 
+ if shape_desc.kv_size != 1: + raise NotImplementedError( + "ROCm LMCache MP block fallback currently supports MLA KV caches only" + ) + + dtype = _dtype_from_env() + device = ( + paged_memory_device + if isinstance(paged_memory_device, torch.device) + else torch.device(paged_memory_device) + ) + num_layers = int(group_kv_pointers.numel()) + blocks_per_chunk = lmcache_chunk_size // shape_desc.bs + direction_name = getattr(direction, "name", str(direction)) + + for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): + start = chunk_idx * blocks_per_chunk + end = start + blocks_per_chunk + chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) + + dest_slot_offset = 0 + if skip_blocks and chunk_idx == 0: + chunk_blocks = chunk_blocks[int(skip_blocks):] + dest_slot_offset = int(skip_blocks) * shape_desc.bs + if chunk_blocks.numel() == 0: + continue + + num_slots = int(chunk_blocks.numel()) * shape_desc.bs + tmp = _tmp_view( + int(tmp_ptr), + shape_desc, + num_layers, + lmcache_chunk_size, + dtype, + device, + ) + + for layer_idx in range(num_layers): + paged = _paged_view( + int(group_kv_pointers[layer_idx].item()), + shape_desc, + dtype, + device, + ) + tmp_slice = tmp[ + 0, + layer_idx, + dest_slot_offset : dest_slot_offset + num_slots, + :, + ] + if direction_name == "D2H": + gathered = paged.index_select(0, chunk_blocks).reshape( + num_slots, shape_desc.nh * shape_desc.hs + ) + tmp_slice.copy_(gathered) + elif direction_name == "H2D": + src = tmp_slice.reshape( + int(chunk_blocks.numel()), + shape_desc.bs, + shape_desc.nh * shape_desc.hs, + ) + paged.index_copy_(0, chunk_blocks, src) + else: + raise ValueError(f"Unsupported transfer direction: {direction}") + + lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer + +# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- +if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": + import chunked_connector_patch # noqa: F401 + +# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- +import scheduler_assertion_patch # noqa: F401 +PY +} + +write_chunked_connector_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/chunked_connector_patch.py" <<'PY' +""" +Monkey-patch for LMCacheMPConnector to add chunked KV loading. + +Fixes GPU block exhaustion deadlock at high concurrency by capping +the number of external tokens reported AND retrieved per scheduling step. + +Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this +module from sitecustomize.py before LMCache is loaded. +""" + +import logging +import os +import sys +import builtins + +logger = logging.getLogger("chunked_lmcache_patch") + +_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) + +# Per-request chunk tracking (module-level, survives across calls) +_chunk_state: dict[str, dict] = {} + + +def _apply_patch(): + """Patch LMCacheMPConnector in-place.""" + mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") + if mod is None: + return + cls = getattr(mod, "LMCacheMPConnector", None) + if cls is None or getattr(cls, "_chunked_patch_applied", False): + return + + LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) + _orig_get_matched = cls.get_num_new_matched_tokens + _orig_get_finished = cls.get_finished + + def _get_blocks_per_chunk(self): + block_size = getattr(self, "block_size", 1) + return max(1, _MAX_TOKENS // block_size) + + def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): + full_match = _orig_get_matched(self, request, num_computed_tokens) + if full_match <= 0 or _MAX_TOKENS <= 0: + return full_match + + req_id = request.request_id + block_size = getattr(self, "block_size", 1) + blocks_per_chunk = _get_blocks_per_chunk(self) + full_match_blocks = full_match // block_size + + state = _chunk_state.get(req_id) + if state is None or state.get("num_computed_at_start") != num_computed_tokens: + state = 
{ + "full_match_blocks": full_match_blocks, + "chunk_end_blocks": 0, + "num_computed_at_start": num_computed_tokens, + "lookup_done": False, + } + _chunk_state[req_id] = state + + if state["lookup_done"]: + return 0 + + remaining = state["full_match_blocks"] - state["chunk_end_blocks"] + if remaining <= 0: + state["lookup_done"] = True + return 0 + + this_chunk = min(remaining, blocks_per_chunk) + state["chunk_end_blocks"] += this_chunk + if state["chunk_end_blocks"] >= state["full_match_blocks"]: + state["lookup_done"] = True + + capped = this_chunk * block_size + if capped < full_match: + logger.debug( + "Chunked LMCache: req %s capped %d -> %d tokens " + "(chunk %d/%d blocks)", + req_id, full_match, capped, this_chunk, full_match_blocks, + ) + + # Cap the tracker's hit blocks to match what we report + tracker = getattr(request, "kv_transfer_params", None) + if tracker is not None: + orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) + if orig_hits > this_chunk: + tracker.num_lmcache_hit_blocks = this_chunk + + return capped + + def _patched_get_finished(self, scheduler_output): + result = _orig_get_finished(self, scheduler_output) + # Clean up chunk state for finished requests. + # vLLM passes scheduler_output as a set of request-ID strings + # (not a SchedulerOutput object), so iterate directly when it + # is a set/frozenset; fall back to the attribute path for + # forward compatibility. 
+ if isinstance(scheduler_output, (set, frozenset)): + finished = scheduler_output + else: + finished = getattr(scheduler_output, "finished_req_ids", []) + for req in finished: + _chunk_state.pop(req, None) + return result + + cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens + cls.get_finished = _patched_get_finished + cls._chunked_patch_applied = True + logger.info( + "Chunked LMCache connector patch applied " + "(max_tokens_per_load=%d)", _MAX_TOKENS, + ) + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "lmcache.integration.vllm.lmcache_mp_connector" + or ( + name.startswith("lmcache") + and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + +write_scheduler_assertion_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' +""" +Patch vLLM scheduler to handle stale finished_recving gracefully. + +The assertion at scheduler.py crashes when a KV transfer reports +"finished recving" but the request is already in RUNNING state. +This happens when transfers complete asynchronously and the scheduler +has already moved the request forward. + +Fix: Instead of asserting, log a warning and skip. 
+""" + +import logging +import sys +import builtins + +logger = logging.getLogger("scheduler_assertion_patch") + + +def _apply_patch(): + """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" + sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") + if sched_mod is None: + return + req_mod = sys.modules.get("vllm.v1.request") + if req_mod is None: + return + Scheduler = getattr(sched_mod, "Scheduler", None) + RequestStatus = getattr(req_mod, "RequestStatus", None) + if Scheduler is None or RequestStatus is None: + return + if getattr(Scheduler, "_kv_xfer_patch_applied", False): + return + + _orig_update = Scheduler._update_from_kv_xfer_finished + + def _patched_update(self, kv_connector_output): + if self.connector is not None: + self.connector.update_connector_output(kv_connector_output) + for req_id in kv_connector_output.finished_recving or (): + if req_id not in self.requests: + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.warning( + "Stale finished_recving for req %s in status %s; skipping.", + req_id, req.status.name, + ) + for req_id in kv_connector_output.finished_sending or (): + if req_id not in self.requests: + continue + self._free_blocks(self.requests[req_id]) + + Scheduler._update_from_kv_xfer_finished = _patched_update + Scheduler._kv_xfer_patch_applied = True + logger.info("Scheduler KV transfer assertion patch applied") + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "vllm.v1.core.sched.scheduler" + or ( + name.startswith("vllm") + and "vllm.v1.core.sched.scheduler" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + +# Workaround for MEC FW <177 RCCL memory 
reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -52,8 +591,6 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() - -# ---- Lmcache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -111,9 +648,7 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - #TODO: fix - TOTAL_CPU_DRAM_GB=3000 - TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + TOTAL_CPU_DRAM_GB=2500 # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -124,7 +659,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -132,20 +667,74 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - git clone https://github.com/LMCache/LMCache.git - cd LMCache - pip install -r requirements/build.txt - CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation - cd .. - + agentic_pip_install --quiet --no-cache-dir lmcache + # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and + # CuPy packages on ROCm. 
vLLM 0.21.0 treats ROCm as "cuda-like", and + # during Kimi fused-MoE model inspection it imports nixl_ep whenever + # that module is importable, even when this run is not using EP/NIXL + # kernels. The CUDA extension then fails immediately on AMD nodes with + # "ImportError: libcuda.so.1". + # + # LMCache MP also uses CuPy stream APIs while registering vLLM's KV + # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime + # with cudaErrorInsufficientDriver when LMCache touches the stream. Use + # the ROCm 7 CuPy wheel so the same API dispatches through HIP. + python3 -m pip uninstall -y \ + nixl nixl-cu12 nixl-cu13 nixl_ep \ + >/dev/null 2>&1 || true + python3 -m pip uninstall -y \ + cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ + >/dev/null 2>&1 || true + agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 + python3 - <<'PY' +import importlib.util +import sys + +spec = importlib.util.find_spec("nixl_ep") +if spec is not None: + locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) + print( + "Error: nixl_ep is still importable after LMCache install; " + "this ROCm Kimi run would import a CUDA-only nixl_ep module. 
" + f"location={locations}", + file=sys.stderr, + ) + sys.exit(1) + +try: + from cupy_backends.cuda.api import runtime as cupy_runtime +except Exception as exc: + print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) + sys.exit(1) + +if not getattr(cupy_runtime, "is_hip", False): + print( + "Error: CuPy is still using the CUDA backend after installing " + "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", + file=sys.stderr, + ) + sys.exit(1) +PY + LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" + write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" + write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" + write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" + export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 + export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 + export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 + # Cap external KV tokens loaded per scheduling step to prevent GPU + # block exhaustion deadlock at high concurrency (c>=32). Default + # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to + # disable chunking (only safe at low concurrency). + export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" + export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. - #TODO: fix - TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_GB=2500 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -153,7 +742,7 @@ case "$OFFLOADING" in # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. 
LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -161,11 +750,10 @@ case "$OFFLOADING" in # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. - LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" - export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -198,7 +786,6 @@ case "$OFFLOADING" in *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac -# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -207,34 +794,17 @@ fi echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -# Workaround for MEC FW <177 RCCL memory reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code + --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -251,4 +821,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh deleted file mode 100755 index f36fc59e9..000000000 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# OFFLOADING values: -# none - vLLM GPU KV only. -# cpu - vLLM native CPU offload. -# lmcache - LMCache MP server + vLLM LMCacheMPConnector. 
- -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -EP_SIZE=${EP_SIZE:-1} - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# ROCR/HIP visibility for vLLM 0.14+ -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 -# corpus has requests up to ~1M proxy tokens that would be rejected. -# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS=() -PREFIX_CACHE_ARGS=() - -# ---- Lmcache config ---------------------------------------------------------- -LMCACHE_PID="" - -cleanup_lmcache_server() { - if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then - kill "$LMCACHE_PID" 2>/dev/null || true - wait "$LMCACHE_PID" 2>/dev/null || true - fi -} - -trap cleanup_lmcache_server EXIT - -wait_for_lmcache_ready() { - { set +x; } 2>/dev/null - local attempts="${LMCACHE_READY_ATTEMPTS:-120}" - local tail_pid="" - - while [ ! -f "$LMCACHE_LOG" ]; do - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before creating log file. Exiting." 
>&2 - exit 1 - fi - sleep 1 - done - - tail -f -n +1 "$LMCACHE_LOG" & - tail_pid=$! - - for ((i = 1; i <= attempts; i++)); do - if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - return 0 - fi - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before becoming healthy. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 - fi - sleep 1 - done - - echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 -} - -case "$OFFLOADING" in - none) ;; - cpu) - unset VLLM_USE_SIMPLE_KV_OFFLOAD - # MI355X nodes have ~2.7 TiB of host DRAM available for offload; - # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for - # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=3000 - TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" - # Use vLLM's regular native KV-offload path (OffloadingConnector), - # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to - # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 - # would switch it to SimpleCPUOffloadConnector. We intentionally leave - # that env var UNSET here so the regular OffloadingConnector path is - # used. The shortcut --kv_offloading_backend native + --kv_offloading_size - # form constructs the KVTransferConfig at engine startup - # (vllm/config/vllm.py:662). 
- - # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) - # This gives extra cache hit than disabling hybrid kv cache manager - # srok, - # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma - # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 - OFFLOAD_ARGS=( - --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" - --disable-hybrid-kv-cache-manager - ) - ;; - lmcache) - { set +x; } 2>/dev/null - unset VLLM_USE_SIMPLE_KV_OFFLOAD - - git clone https://github.com/LMCache/LMCache.git - cd LMCache - pip install -r requirements/build.txt - CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation - cd .. - - python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null - - # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV - # pool, but let the external MP server own that pool so vLLM does not - # split --kv-offloading-size across TP ranks through the integrated - # LMCache backend. - TOTAL_CPU_DRAM_GB=3000 - LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" - LMCACHE_PORT="${LMCACHE_PORT:-5555}" - LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" - # LMCacheMPConnector concatenates lmcache.mp.host and port into the - # ZMQ endpoint. Bind the server to a raw host, but pass the connector a - # ZMQ-style host string. - LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" - LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" - # LMCache read locks are leases on chunks that lookup has promised - # vLLM can retrieve. The default 300s TTL is too short for this - # long-context agentic queue: TP8/conc32 can spend >300s between - # lookup and retrieve while GPU KV is saturated, which leaves the - # object present in L1 but no longer readable. 
Keep the 2.5 TB pool - # size unchanged and only extend the lookup-to-retrieve lease. - LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" - LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" - LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" - export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" - export LMCACHE_BLOCKING_TIMEOUT_SECS=120 - - set -x - echo "Starting LMCache MP server..." - LMCACHE_CMD=( - lmcache server - --host "$LMCACHE_HOST" - --port "$LMCACHE_PORT" - --http-host "$LMCACHE_HOST" - --http-port "$LMCACHE_HTTP_PORT" - --l1-size-gb "$LMCACHE_L1_SIZE_GB" - --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" - --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" - --chunk-size "$LMCACHE_CHUNK_SIZE" - --max-workers "$LMCACHE_MAX_WORKERS" - --eviction-policy LRU - ) - printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" - printf '\n' >> "$RESULT_DIR/lmcache_command.txt" - "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & - LMCACHE_PID=$! - echo "LMCache server PID: $LMCACHE_PID" - wait_for_lmcache_ready - - PREFIX_CACHE_ARGS=(--enable-prefix-caching) - # srok, - # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma - # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 - OFFLOAD_ARGS=( - --kv-transfer-config - "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" - --disable-hybrid-kv-cache-manager - ) - ;; - *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; -esac - -# ---- LLM server config ---------------------------------------------------------- -EP_ARGS=() -if [ "$EP_SIZE" -gt 1 ]; then - EP_ARGS=(--enable-expert-parallel) -fi - -echo "Starting vllm server..." 
-export PYTHONNOUSERSITE=1 - -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install -q amd-quark - -# Workaround for MEC FW <177 RCCL memory reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -{ set +x; } 2>/dev/null -VLLM_CMD=( - vllm serve "$MODEL" - --host 0.0.0.0 - --port "$PORT" - --tensor-parallel-size="$TP" - "${EP_ARGS[@]}" - --gpu-memory-utilization 0.95 - --kv-cache-dtype fp8 \ - --block-size=32 - --trust-remote-code - --attention-backend "ROCM_AITER_FA" - --max-num-seqs "$CONC" - "${PREFIX_CACHE_ARGS[@]}" - "${OFFLOAD_ARGS[@]}" -) -printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" -printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" -"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! -echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index 656e924dc..ff901b674 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -2,117 +2,51 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. -# -# Base server recipe follows the upstream MI300X reference -# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): -# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. 
-# The agentic harness (resolve_trace_source / build_replay_cmd / -# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and -# --disable-radix-cache is dropped because agentic replay needs prefix reuse. +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. # # Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE -# -# OFFLOADING values: -# none - SGLang GPU KV with the default RadixAttention prefix cache. -# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. +# MODEL, TP, CONC, RESULT_DIR, DURATION, EP_SIZE source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -EP_SIZE=${EP_SIZE:-1} - -SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to downloading into HF_HUB_CACHE +# and set MODEL_PATH to the model ID. Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true -# ---- Resolve traces and install deps ---------------------------------------- -# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the -# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf -# signal at high concurrency. 
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# ---- Cache / offload config ------------------------------------------------- +# ---- Start SGLang server ---------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per - # TP rank (one hierarchical KV, one hierarchical Mamba), so the - # node-total DRAM budget divides by TP and the host-pool count. - TOTAL_CPU_DRAM_GB=3000 - HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" - HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which - # requires page_size=1. Keep the safer direct/layer_first copy path; - # kernel/page_first faults on first prefill in this mode on ROCm. 
- HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" - if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then - HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" - fi - if [ "$HICACHE_SIZE_GB" -lt 1 ]; then - echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 - exit 1 - fi - echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" - CACHE_ARGS=( - --page-size "$HICACHE_PAGE_SIZE" - --enable-hierarchical-cache - --hicache-size "$HICACHE_SIZE_GB" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - ) - # HiCache startup reaches API readiness but SGLang's internal warmup - # request can time out on this path; let aiperf own benchmark traffic. - WARMUP_ARGS=(--skip-server-warmup) - # Don't force ROCm graph capture at every high concurrency point; conc=16 - # is the highest known-good capture size for this model/server path. - HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" - if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then - CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" - fi - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ @@ -122,10 +56,10 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --max-prefill-tokens 32768 \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 \ - "${CACHE_ARGS[@]}" \ - "${WARMUP_ARGS[@]}" \ + --context-length $MAX_MODEL_LEN \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -135,4 +69,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh new file mode 100755 index 000000000..cdded8860 --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV only; the default RadixAttention prefix cache stays on. +# hicache - SGLang HiCache with local CPU hierarchical cache. 
+ +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to downloading into HF_HUB_CACHE +# and set MODEL_PATH to the model ID. Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid + # GDN/Mamba path allocates two HiCache host pools per TP rank: one for + # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB + # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per + # host pool, not 250 GB. Keep overrides for one-off tuning. 
+ TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on + # MI355X, which requires page_size=1. The kernel/page_first HiCache + # transfer path faults on first prefill in this mode on ROCm, so keep + # the default on the safer direct/layer_first copy path. These remain + # env-overridable for future SGLang/ROCm fixes. + HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + # SGLang --hicache-size is per rank per host pool, while the workflow + # input is a node-total DRAM budget. Divide by TP and the number of + # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness, but SGLang's internal warmup + # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf + # own benchmark traffic instead of blocking server readiness on it. + WARMUP_ARGS=(--skip-server-warmup) + # Keep request concurrency as the swept variable, but do not force + # HiCache runs to capture ROCm graphs at every high concurrency point. + # The conc=32 HiCache job crashed after startup readiness, before any + # aiperf traffic, while conc=16 is the highest known-good capture size + # for this model/server path. Requests above the capture size can still + # run; they just do not require a larger captured graph at startup. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." +export PYTHONNOUSERSITE=1 + +{ set +x; } 2>/dev/null +SGLANG_CMD=( + python3 -m sglang.launch_server + --attention-backend triton + --model-path "$MODEL_PATH" --served-model-name "$MODEL" + --host=0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --ep-size "$EP_SIZE" + --trust-remote-code + --tokenizer-worker-num 6 + --enable-aiter-allreduce-fusion + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --max-running-requests "$CONC" + --max-prefill-tokens 32768 + --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" + --mem-fraction-static 0.8 + --context-length "$MAX_MODEL_LEN" + --enable-metrics + "${CACHE_ARGS[@]}" + "${WARMUP_ARGS[@]}" +) +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" +"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" From 5ec21d45fcd695a8491bae757f3d890894c39f39 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 11:12:19 -0500 Subject: [PATCH 016/132] utils(process_agentic_result): align cache metrics + theoretical-trace loader with reality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes in one commit, both surfaced by the dsv4-b300 sweep analysis (run 26863239242): 1. Cache-hit reporting was wrong for several engine/connector combos. Replaces the dead vllm:cpu_prefix_cache_* lookup (which never had a matching prometheus metric — SimpleCPUOffloadConnector folds CPU reloads back into vllm:prefix_cache_hits) with: - server_external_cache_hit_rate <- vllm:external_prefix_cache_* (populated only by LMCacheMPConnector and similar external connectors; this is the metric that actually matters for LMCache runs, where delay_cache_blocks=True drives local hit rate to 0%) - gpu_kv_cache_usage_pct <- vllm:kv_cache_usage_perc (V1) with fallback to vllm:gpu_cache_usage_perc (V0) The docstring now documents which fields are populated for which engine/connector combination so the matrix isn't a mystery. 2. _hf_traces_dir was hardcoded to a single corpus revision (051926), which broke for every other corpus the bench script supports (052726, 060226, 060226-256k) and made the test_processor_loads_traces_jsonl_for_theoretical_cache unit test permanently fail. Replaced with a scan over all datasets--semianalysisai--cc-traces-weka* dirs in the HF cache, picking the most-recently-modified usable snapshot. All 9 unit tests pass (was 8/9). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- utils/process_agentic_result.py | 88 ++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py index 3c4015ce6..90f1aaca9 100644 --- a/utils/process_agentic_result.py +++ b/utils/process_agentic_result.py @@ -37,7 +37,6 @@ # Trace metadata lookup: conversation_id (= trace id) -> per-turn dict with # ``hash_ids`` and ``output_length``. Built lazily from the HF dataset cache. _TRACE_METADATA_CACHE: dict[str, list[dict]] | None = None -_HF_DATASET = "semianalysisai/cc-traces-weka-with-subagents-051926" # ---- helpers --------------------------------------------------------------- @@ -118,10 +117,17 @@ def load_server_metrics(path: Path) -> dict: def _hf_traces_dir() -> Path | None: """Locate the HuggingFace cache directory for the weka traces dataset. - Returns the directory containing per-trace JSON files, or None if the - dataset isn't present locally. Mirrors the layout + Returns the directory containing per-trace JSON files, or None if no + weka dataset is present locally. Mirrors the layout huggingface_hub.snapshot_download() produces: ``$HF_HUB_CACHE/datasets----/snapshots//``. + + The bench script supports several corpus revisions + (cc-traces-weka-with-subagents-052726, ...-060226, ...-060226-256k, etc.) + and may switch between them per-recipe via WEKA_LOADER_OVERRIDE. Rather + than hardcode a single dataset name, scan all ``datasets--semianalysisai + --cc-traces-weka*`` directories in the cache and pick the most-recently- + modified snapshot that contains usable trace files. 
""" hub_cache = os.environ.get("HF_HUB_CACHE") or os.environ.get("HUGGINGFACE_HUB_CACHE") if hub_cache: @@ -130,17 +136,23 @@ def _hf_traces_dir() -> Path | None: home = os.environ.get("HF_HOME") cache_root = Path(home) / "hub" if home else Path.home() / ".cache" / "huggingface" / "hub" - org, name = _HF_DATASET.split("/", 1) - snapshots = cache_root / f"datasets--{org}--{name}" / "snapshots" - if not snapshots.is_dir(): + if not cache_root.is_dir(): return None - candidates = sorted(snapshots.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True) + + # Collect every weka-corpus snapshot dir across all matching dataset + # entries, sorted newest first. + snapshots: list[Path] = [] + for dataset_dir in cache_root.glob("datasets--semianalysisai--cc-traces-weka*"): + snap_root = dataset_dir / "snapshots" + if not snap_root.is_dir(): + continue + snapshots.extend(p for p in snap_root.iterdir() if p.is_dir()) + snapshots.sort(key=lambda p: p.stat().st_mtime, reverse=True) + # Prefer the snapshot that contains usable trace files. The published HF # dataset ships a single ``traces.jsonl`` (one trace per line); older / # local mirrors may use per-trace ``*.json`` files instead. Accept either. - for c in candidates: - if not c.is_dir(): - continue + for c in snapshots: if any(c.glob("*.jsonl")) or any(c.glob("*.json")): return c return None @@ -382,16 +394,36 @@ def compute_throughput_stats(records: list[dict], aggregate: dict) -> dict: def compute_cache_stats(records: list[dict], server_metrics: dict) -> dict: - """Cache-hit metrics: theoretical (from trace metadata) + actual (server).""" + """Cache-hit metrics: theoretical (from trace metadata) + actual (server). + + Server-metric coverage depends on the engine + KV connector combination, + so several fields are structurally null for some configs. 
The matrix: + + | engine + connector | populated server fields | + |----------------------------------------------|------------------------------------| + | vLLM, no connector | server_gpu_cache_hit_rate, | + | | gpu_kv_cache_usage_pct | + | vLLM + SimpleCPUOffloadConnector | same as above (the CPU tier | + | | extends the local LRU; reloads are | + | | counted as prefix_cache_hits — no | + | | separate vllm:cpu_prefix_cache_* | + | | counter exists) | + | vLLM + LMCacheMPConnector (kv_role=kv_both) | server_external_cache_hit_rate. | + | | server_gpu_cache_hit_rate goes to | + | | ~0 because delay_cache_blocks=True | + | | suppresses local hash registration | + | SGLang | not yet wired | + """ result: dict = { "theoretical_cache_hit_rate": None, "server_gpu_cache_hit_rate": None, - "server_cpu_cache_hit_rate": None, + "server_external_cache_hit_rate": None, + "gpu_kv_cache_usage_pct": None, + "cpu_kv_cache_usage_pct": None, "kv_offload_bytes_gpu_to_cpu": None, "kv_offload_bytes_cpu_to_gpu": None, "kv_offload_time_gpu_to_cpu": None, "kv_offload_time_cpu_to_gpu": None, - "cpu_kv_cache_usage_pct": None, "total_prompt_tokens": None, "total_generation_tokens": None, "total_requests_completed": None, @@ -476,15 +508,30 @@ def _final_value(metric_name: str) -> float | None: return agg return None + # Local GPU prefix cache (every vLLM config emits these). Note: with + # LMCacheMPConnector + kv_role=kv_both, the scheduler sets + # delay_cache_blocks=True on every load and these hits stay at ~0 even + # when overall cache efficiency is high — read server_external_*. 
hits = _final_value("vllm:prefix_cache_hits") queries = _final_value("vllm:prefix_cache_queries") if hits is not None and queries and queries > 0: result["server_gpu_cache_hit_rate"] = hits / queries - cpu_hits = _final_value("vllm:cpu_prefix_cache_hits") - cpu_queries = _final_value("vllm:cpu_prefix_cache_queries") - if cpu_hits is not None and cpu_queries and cpu_queries > 0: - result["server_cpu_cache_hit_rate"] = cpu_hits / cpu_queries + # External KV connector (LMCacheMPConnector and similar). Only populated + # when the connector implements get_num_new_matched_tokens; absent for + # SimpleCPUOffloadConnector and for pure-vLLM (no connector) runs. + ext_hits = _final_value("vllm:external_prefix_cache_hits") + ext_queries = _final_value("vllm:external_prefix_cache_queries") + if ext_hits is not None and ext_queries and ext_queries > 0: + result["server_external_cache_hit_rate"] = ext_hits / ext_queries + + # GPU KV pool fill ratio gauge. vLLM emits vllm:kv_cache_usage_perc on V1 + # and vllm:gpu_cache_usage_perc on V0 (kept for older deployments). 
+ kv_usage = _final_value("vllm:kv_cache_usage_perc") + if kv_usage is None: + kv_usage = _final_value("vllm:gpu_cache_usage_perc") + if kv_usage is not None: + result["gpu_kv_cache_usage_pct"] = kv_usage for src_key, dst_key in ( ("vllm:kv_offload_bytes_gpu_to_cpu", "kv_offload_bytes_gpu_to_cpu"), @@ -679,6 +726,13 @@ def main() -> int: ) if agg.get("server_gpu_cache_hit_rate") is not None: print(f" GPU cache hit rate: {agg['server_gpu_cache_hit_rate']:.1%}") + if agg.get("server_external_cache_hit_rate") is not None: + print( + f" External cache hit rate: " + f"{agg['server_external_cache_hit_rate']:.1%}" + ) + if agg.get("gpu_kv_cache_usage_pct") is not None: + print(f" GPU KV cache usage: {agg['gpu_kv_cache_usage_pct']:.1%}") if agg.get("response_cache_hit_rate") is not None: print(f" Response cache hit rate: {agg['response_cache_hit_rate']:.1%}") if agg.get("theoretical_cache_hit_rate") is not None: From d7841d8550f2a1c31b0c93badc65ff4dbaef8cf5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 11:42:33 -0500 Subject: [PATCH 017/132] feat(agentic): route DEP traffic through native vLLM router Signed-off-by: Cam Quilici --- .github/workflows/benchmark-tmpl.yml | 2 + .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 51 ++++++++++++++++++- utils/aiperf | 2 +- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2148def36..4c2c2ba5f 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -239,6 +239,7 @@ jobs: name: agentic_${{ env.RESULT_FILENAME }} path: | results/server.log + results/router.log results/lmcache_server.log results/benchmark.log results/config.yaml @@ -279,6 +280,7 @@ jobs: name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }} path: | ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} + ${{ inputs.scenario-type == 
'agentic-coding' && 'results/router.log' || '' }} ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }} if-no-files-found: ignore diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index fdb7a49b6..8cdb65138 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -19,6 +19,10 @@ set -x # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# Optional DEP router env vars: +# VLLM_USE_ROUTER=false disables the native vLLM router baseline. +# VLLM_ROUTER_POLICY overrides the default consistent_hash policy. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -51,6 +55,32 @@ export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 resolve_trace_source install_agentic_deps +# vllm-project/router expands the one HTTP backend into one logical worker per +# DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's +# X-Correlation-ID is stable for every turn of a conversation; alias it to the +# router's preferred X-Session-ID header. This also keeps affinity correct when +# testing older wheels that prioritize per-request X-Request-ID. 
+USE_VLLM_ROUTER=false +VLLM_BACKEND_PORT="$PORT" +case "${VLLM_USE_ROUTER:-true}" in + true) + if [ "$DP_ATTENTION" = "true" ]; then + USE_VLLM_ROUTER=true + VLLM_BACKEND_PORT="${VLLM_BACKEND_PORT_OVERRIDE:-$((PORT + 1))}" + VLLM_ROUTER_VERSION="${VLLM_ROUTER_VERSION:-0.1.14}" + VLLM_ROUTER_POLICY="${VLLM_ROUTER_POLICY:-consistent_hash}" + VLLM_ROUTER_METRICS_PORT="${VLLM_ROUTER_METRICS_PORT:-$((PORT + 10000))}" + export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 + agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" + fi + ;; + false) ;; + *) + echo "Error: unsupported VLLM_USE_ROUTER value '${VLLM_USE_ROUTER}' (expected one of: true, false)" >&2 + exit 1 + ;; +esac + # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 @@ -63,6 +93,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +ROUTER_LOG="$RESULT_DIR/router.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS="" @@ -134,7 +165,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ ---port "$PORT" \ +--port "$VLLM_BACKEND_PORT" \ --trust-remote-code \ --kv-cache-dtype fp8 \ --block-size 256 \ @@ -154,7 +185,23 @@ $OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_VLLM_ROUTER" = "true" ]; then + echo "Starting native vLLM router on port $PORT for $TP DP ranks..." 
+ vllm-router \ + --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \ + --policy "$VLLM_ROUTER_POLICY" \ + --intra-node-data-parallel-size "$TP" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! + echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" diff --git a/utils/aiperf b/utils/aiperf index 47e6e2060..ed5b4ce04 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 47e6e206001a85a3cc4c6212a1e0425f045bbcb3 +Subproject commit ed5b4ce04eee054fd3cb5f9a8510677df7cfe37b From fc5a792a869d327d9b08c92d1971f765cf788180 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 12:34:14 -0500 Subject: [PATCH 018/132] benchmarks(agentic): disable DCGM gpu_telemetry in aiperf invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aiperf's GpuMetricTimeSeries.append_snapshot freezes the metric schema on the first DCGM scrape; any optional field that's None on the first scrape (xid_errors most commonly, also power_violation, encoder_utilization) then raises KeyError when it first appears mid-run. The exception is caught at records_manager.py:609 so the run completes, but every late telemetry sample is dropped silently and the error count grows. We don't consume the gpu_telemetry_export.jsonl artifact in downstream processing (process_agentic_result.py only reads aiperf's server-metrics output and the per-request profile export). Server-side /metrics from vLLM/sglang flows through a separate path and is unaffected — KV cache usage, prefix cache hit rate, throughput etc. still populate. 
Until the aiperf upstream patch lands (dynamic schema extension in telemetry_models.py), --no-gpu-telemetry sidesteps the bug entirely. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e062b42f1..fd7d4fb44 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -1048,6 +1048,14 @@ build_replay_cmd() { # CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server # usage is authoritative). REPLAY_CMD+=" --use-server-token-count" + # Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries + # freezes its metric schema on the first DCGM scrape, then KeyErrors when + # an optional field (xid_errors, power_violation, encoder_utilization) + # first appears mid-run. We don't consume the gpu_telemetry artifact in + # downstream processing, and the server-metrics path (Prometheus /metrics + # from vLLM) is unaffected by this flag and still gives us KV usage, + # prefix cache hit rate, etc. + REPLAY_CMD+=" --no-gpu-telemetry" # aiperf's dataset manager (separate from the inference parser) loads # the model's tokenizer for trace-prompt tokenization regardless of # --use-server-token-count. 
Models like kimi (amd/Kimi-K2.5-MXFP4, From ba65df8c2c787d3904186c155969cb0b3f0f3e09 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 12:40:35 -0500 Subject: [PATCH 019/132] refactor(agentic): hardcode DSv4 B300 router settings --- .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 45 ++++++++----------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 8cdb65138..06bdbe432 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -19,27 +19,29 @@ set -x # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# Optional DEP router env vars: -# VLLM_USE_ROUTER=false disables the native vLLM router baseline. -# VLLM_ROUTER_POLICY overrides the default consistent_hash policy. source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then +if ! declare -p MAX_MODEL_LEN >/dev/null 2>&1; then + MAX_MODEL_LEN=1000000 +elif [[ -z "$MAX_MODEL_LEN" || "$MAX_MODEL_LEN" = "0" ]]; then MAX_MODEL_LEN=1000000 fi -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +if declare -p SLURM_JOB_ID >/dev/null 2>&1 && [ -n "$SLURM_JOB_ID" ]; then + SLURM_NODE=unknown + if declare -p SLURMD_NODENAME >/dev/null 2>&1 && [ -n "$SLURMD_NODENAME" ]; then + SLURM_NODE="$SLURMD_NODENAME" + fi + echo "JOB $SLURM_JOB_ID running on $SLURM_NODE" fi # `hf download` creates the target dir if missing and is itself idempotent. # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE # Either way, MODEL_PATH is what the server is launched with. 
-if [[ -n "${MODEL_PATH:-}" ]]; then +if declare -p MODEL_PATH >/dev/null 2>&1 && [ -n "$MODEL_PATH" ]; then if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then hf download "$MODEL" --local-dir "$MODEL_PATH" fi @@ -62,24 +64,15 @@ install_agentic_deps # testing older wheels that prioritize per-request X-Request-ID. USE_VLLM_ROUTER=false VLLM_BACKEND_PORT="$PORT" -case "${VLLM_USE_ROUTER:-true}" in - true) - if [ "$DP_ATTENTION" = "true" ]; then - USE_VLLM_ROUTER=true - VLLM_BACKEND_PORT="${VLLM_BACKEND_PORT_OVERRIDE:-$((PORT + 1))}" - VLLM_ROUTER_VERSION="${VLLM_ROUTER_VERSION:-0.1.14}" - VLLM_ROUTER_POLICY="${VLLM_ROUTER_POLICY:-consistent_hash}" - VLLM_ROUTER_METRICS_PORT="${VLLM_ROUTER_METRICS_PORT:-$((PORT + 10000))}" - export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 - agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" - fi - ;; - false) ;; - *) - echo "Error: unsupported VLLM_USE_ROUTER value '${VLLM_USE_ROUTER}' (expected one of: true, false)" >&2 - exit 1 - ;; -esac +if [ "$DP_ATTENTION" = "true" ]; then + USE_VLLM_ROUTER=true + VLLM_BACKEND_PORT=$((PORT + 1)) + VLLM_ROUTER_VERSION=0.1.14 + VLLM_ROUTER_POLICY=consistent_hash + VLLM_ROUTER_METRICS_PORT=$((PORT + 10000)) + export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 + agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" +fi # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. 
export VLLM_ENGINE_READY_TIMEOUT_S=3600 From 76a3f09b4d4f2db17f8f25e049077d9f30242328 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 12:40:35 -0500 Subject: [PATCH 020/132] fix(agentic): fail jobs with excessive aiperf errors --- benchmarks/benchmark_lib.sh | 21 ++++- utils/test_validate_agentic_result.py | 73 +++++++++++++++++ utils/validate_agentic_result.py | 111 ++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 utils/test_validate_agentic_result.py create mode 100644 utils/validate_agentic_result.py diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index fd7d4fb44..29a6b91c3 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -899,6 +899,7 @@ run_eval() { INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}" AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}" +AIPERF_FAILED_REQUEST_THRESHOLD=0.10 agentic_pip_install() { local pip_install=(python3 -m pip install) @@ -1034,7 +1035,7 @@ build_replay_cmd() { # transient low-rate failures from killing long sweeps while still # catching malformed payloads or server crashes before they get aggregated # as benchmarkable data. - REPLAY_CMD+=" --failed-request-threshold 0.10" + REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD" # Sample each trajectory's warmup start position uniformly from # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream). # Avoids starting trajectories right at turn 0 where the KV cache is @@ -1095,8 +1096,9 @@ build_replay_cmd() { write_agentic_result_json() { # Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json - # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing - # retry-based existence check is the single success gate. + # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. 
The workflow checks that + # this file exists; run_agentic_replay_and_write_outputs separately rejects + # aggregates whose request error rate exceeds the configured limit. local result_dir="$1" RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \ python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py" @@ -1110,6 +1112,7 @@ write_agentic_result_json() { run_agentic_replay_and_write_outputs() { local result_dir="$1" local replay_rc + local validation_rc echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt" @@ -1125,8 +1128,20 @@ run_agentic_replay_and_write_outputs() { python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \ "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true + set +e + python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \ + "$result_dir/aiperf_artifacts" \ + --failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD" + validation_rc=$? + set -e + if [ "$replay_rc" -ne 0 ]; then echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2 return "$replay_rc" fi + + if [ "$validation_rc" -ne 0 ]; then + echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2 + return "$validation_rc" + fi } diff --git a/utils/test_validate_agentic_result.py b/utils/test_validate_agentic_result.py new file mode 100644 index 000000000..f21bfa069 --- /dev/null +++ b/utils/test_validate_agentic_result.py @@ -0,0 +1,73 @@ +"""Tests for the agentic aiperf result validity gate.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from validate_agentic_result import validate_result + + +def _write_aggregate(tmp_path: Path, aggregate: dict, *, per_run: bool = False) -> Path: + artifact_dir = tmp_path / "aiperf_artifacts" + output_dir = artifact_dir / "run_0" if per_run else artifact_dir + output_dir.mkdir(parents=True) + with open(output_dir / 
"profile_export_aiperf.json", "w") as f: + json.dump(aggregate, f) + return artifact_dir + + +def test_passes_when_request_error_rate_is_within_limit(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + { + "request_count": {"avg": 90}, + "error_request_count": {"avg": 10}, + "completed_request_count": {"avg": 100}, + }, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_fails_when_request_error_rate_exceeds_limit(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + { + "request_count": {"avg": 2}, + "error_request_count": {"avg": 65}, + "completed_request_count": {"avg": 67}, + }, + ) + + errors = validate_result(artifact_dir, 0.10) + assert errors == [ + "aiperf request error rate exceeded the benchmark limit: " + "65/67 = 97.015% > 10.000%" + ] + + +def test_treats_missing_error_count_as_zero(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + {"request_count": {"avg": 12}}, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_supports_per_run_artifact_layout(tmp_path: Path): + artifact_dir = _write_aggregate( + tmp_path, + {"request_count": {"avg": 12}}, + per_run=True, + ) + + assert validate_result(artifact_dir, 0.10) == [] + + +def test_fails_when_aggregate_is_missing(tmp_path: Path): + errors = validate_result(tmp_path / "aiperf_artifacts", 0.10) + + assert len(errors) == 1 + assert errors[0].endswith("profile_export_aiperf.json not found") diff --git a/utils/validate_agentic_result.py b/utils/validate_agentic_result.py new file mode 100644 index 000000000..e54691059 --- /dev/null +++ b/utils/validate_agentic_result.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""Validate whether an aiperf agentic replay produced benchmarkable results.""" + +from __future__ import annotations + +import argparse +import json +import math +import sys +from pathlib import Path +from typing import Any + + +def _resolve_aggregate_path(artifact_dir: Path) -> Path: + """Find aiperf's aggregate JSON in 
the direct or per-run artifact layout.""" + direct = artifact_dir / "profile_export_aiperf.json" + if direct.is_file(): + return direct + + if artifact_dir.is_dir(): + for child in sorted(artifact_dir.iterdir()): + candidate = child / "profile_export_aiperf.json" + if child.is_dir() and candidate.is_file(): + return candidate + + return direct + + +def _metric_avg(aggregate: dict[str, Any], name: str) -> float | None: + """Read an aggregate metric's numeric average, if present.""" + metric = aggregate.get(name) + if metric is None: + return None + if not isinstance(metric, dict): + raise ValueError(f"{name} must be an object") + + value = metric.get("avg") + if value is None: + return None + if not isinstance(value, int | float) or isinstance(value, bool): + raise ValueError(f"{name}.avg must be numeric") + + value = float(value) + if not math.isfinite(value) or value < 0: + raise ValueError(f"{name}.avg must be a finite non-negative number") + return value + + +def validate_result(artifact_dir: Path, failed_request_threshold: float) -> list[str]: + """Return validation errors for an aiperf artifact directory.""" + aggregate_path = _resolve_aggregate_path(artifact_dir) + if not aggregate_path.is_file(): + return [f"{aggregate_path} not found"] + + try: + with open(aggregate_path) as f: + aggregate = json.load(f) + if not isinstance(aggregate, dict): + return [f"{aggregate_path} must contain a JSON object"] + + successes = _metric_avg(aggregate, "request_count") + errors = _metric_avg(aggregate, "error_request_count") or 0.0 + completed = _metric_avg(aggregate, "completed_request_count") + except (OSError, json.JSONDecodeError, ValueError) as exc: + return [f"failed to read {aggregate_path}: {exc}"] + + if successes is None: + return ["request_count.avg is missing"] + if completed is None: + completed = successes + errors + if completed <= 0: + return ["aiperf completed zero requests"] + + error_rate = errors / completed + if error_rate > failed_request_threshold: + 
return [ + "aiperf request error rate exceeded the benchmark limit: " + f"{errors:g}/{completed:g} = {error_rate:.3%} > " + f"{failed_request_threshold:.3%}" + ] + + print( + "Validated aiperf request error rate: " + f"{errors:g}/{completed:g} = {error_rate:.3%} <= " + f"{failed_request_threshold:.3%}" + ) + return [] + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("artifact_dir", type=Path) + parser.add_argument( + "--failed-request-threshold", + type=float, + required=True, + help="Maximum accepted error fraction, inclusive", + ) + args = parser.parse_args() + + if not 0 <= args.failed_request_threshold <= 1: + parser.error("--failed-request-threshold must be between 0 and 1") + + errors = validate_result(args.artifact_dir, args.failed_request_threshold) + for error in errors: + print(f"ERROR: {error}", file=sys.stderr) + return 1 if errors else 0 + + +if __name__ == "__main__": + sys.exit(main()) From 923186d1e3d196469d3bd524fa77831379abcce4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 12:56:45 -0500 Subject: [PATCH 021/132] feat(agentic): route B200 DEP traffic through native vLLM router --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index e80008f71..b704b06e3 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -55,11 +55,28 @@ nvidia-smi resolve_trace_source install_agentic_deps +# vllm-project/router expands the one HTTP backend into one logical worker per +# DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's +# X-Correlation-ID is stable for every turn of a conversation; alias it to the +# router's preferred X-Session-ID header. 
+USE_VLLM_ROUTER=false +VLLM_BACKEND_PORT="$PORT" +if [ "$DP_ATTENTION" = "true" ]; then + USE_VLLM_ROUTER=true + VLLM_BACKEND_PORT=$((PORT + 1)) + VLLM_ROUTER_VERSION=0.1.14 + VLLM_ROUTER_POLICY=consistent_hash + VLLM_ROUTER_METRICS_PORT=$((PORT + 10000)) + export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1 + agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION" +fi + # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" +ROUTER_LOG="$RESULT_DIR/router.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" mkdir -p "$RESULT_DIR" @@ -233,7 +250,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high VLLM_CMD=( vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 - --port "$PORT" + --port "$VLLM_BACKEND_PORT" --trust-remote-code --kv-cache-dtype fp8 --block-size 256 @@ -257,7 +274,23 @@ printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" SERVER_PID=$! echo "Server PID: $SERVER_PID" -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_VLLM_ROUTER" = "true" ]; then + echo "Starting native vLLM router on port $PORT for $TP DP ranks..." + vllm-router \ + --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \ + --policy "$VLLM_ROUTER_POLICY" \ + --intra-node-data-parallel-size "$TP" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! 
+ echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" From 5291955b6d914791331a32552eb4b806924a479f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 13:19:57 -0500 Subject: [PATCH 022/132] benchmarks(agentic): default DSv4 recipes to v6 (060226) corpus Removes the DSv4 carveout in resolve_trace_source that pinned DSv4 to the older 052726 v5 corpus. With v6 (060226) validated on both b300 and gb300 DSv4 runs, there's no reason to keep DSv4 on the older trace set. - benchmark_lib.sh: the model-prefix branch on MODEL_PREFIX==dsv4 is gone; default_loader is now 060226 unconditionally. WEKA_LOADER_OVERRIDE still works for any recipe that wants to pin an older or different variant (e.g. the 256k-capped flavor used by max_model_len-constrained setups). - dsv4_fp4_b300_vllm.sh: drops its now-redundant explicit WEKA_LOADER_OVERRIDE=...060226 export since that's the default now. Effect: every DSv4 agentic recipe (b200_vllm, b300_vllm, fp8_h200, mi355x_sglang, gb300_dynamo_vllm) automatically rides v6 with no per-recipe edit needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/benchmark_lib.sh | 13 ++++--------- .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 -- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 29a6b91c3..dbde467c2 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -926,17 +926,12 @@ resolve_trace_source() { # scenario. Used by recipes whose servers have non-default context # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the # unfiltered corpus and switches to the 256k-capped variant), or - # by recipes that want to pin a specific corpus generation rather - # than ride the model-prefix-aware default below. 
+ # by recipes that want to pin an older corpus generation. # - # Default (no override) is model-prefix-aware: - # DSv4 recipes -> 052726 (v5 corpus, the original baseline) - # everything else -> 060226 (v6 corpus, newer CC versions) - # DSv4 stays on 052726 for continuity with prior published baselines. + # Default (no override): semianalysis_cc_traces_weka_with_subagents_060226 + # (v6 corpus, newer CC versions). All recipes — including DSv4 — ride + # this default unless they opt out via WEKA_LOADER_OVERRIDE. local default_loader="semianalysis_cc_traces_weka_with_subagents_060226" - if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then - default_loader="semianalysis_cc_traces_weka_with_subagents" - fi local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset case "$loader" in diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 06bdbe432..efe39d9aa 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -52,8 +52,6 @@ fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- -# Opt this recipe out of the DSv4 052726 default; use the v6 corpus. 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 resolve_trace_source install_agentic_deps From 40736e8b419f0a1f1da46b7be1397a6a798dfe56 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 14:12:57 -0500 Subject: [PATCH 023/132] chore(agentic): bump aiperf for warmup progress logging --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index ed5b4ce04..db491528e 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit ed5b4ce04eee054fd3cb5f9a8510677df7cfe37b +Subproject commit db491528eb49e78df9e4f151bdcf4d6205b32099 From 70529f209c0ebc9998bb706be5e5e21370eb0518 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 15:50:19 -0500 Subject: [PATCH 024/132] chore(agentic): bump aiperf for phase-continuous replay --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index db491528e..f47bd5537 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit db491528eb49e78df9e4f151bdcf4d6205b32099 +Subproject commit f47bd5537e2cd43eab5b91395180a353e3856ab8 From 1ed000180e72a929abca8a95e81810788ca2df8a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 16:44:20 -0500 Subject: [PATCH 025/132] chore(agentic): bump aiperf snapshot accessor rename Signed-off-by: Cam Quilici --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index f47bd5537..f74a3c993 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit f47bd5537e2cd43eab5b91395180a353e3856ab8 +Subproject commit f74a3c993d415ece4c0147b6f05013bbad1bd358 From 1c84916e82d43211e1dcde4ef852699ddb7f7cc2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 17:17:49 -0500 Subject: [PATCH 026/132] fix(agentic): align B200 DSv4 with bespoke vLLM image --- .github/configs/nvidia-master.yaml | 2 +- 
.../single_node/agentic/dsv4_fp4_b200_vllm.sh | 42 ++++++++++++------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 380c799e1..fc5ca6ab2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9259,7 +9259,7 @@ glm5-fp8-gb300-dynamo-sglang: dp-attn: false dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index b704b06e3..84b5828e7 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -13,16 +13,18 @@ set -x # experts EP-sharded across DP ranks (per the vLLM blog recipe). # Highest aggregate throughput at large CONC. # -# Image is vllm/vllm-openai:v0.20.0-cu130. block_size=256, kv-cache-dtype=fp8, -# FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph capture with -# custom_ops=all (per the vLLM blog recipe at https://vllm.ai/blog/deepseek-v4). +# Image is cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045. +# block_size=256, kv-cache-dtype=fp8, FP4 indexer cache enabled, +# FULL_AND_PIECEWISE cudagraph capture with custom_ops=all (per the vLLM blog +# recipe at https://vllm.ai/blog/deepseek-v4). # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR # # OFFLOADING values: # none - vLLM GPU KV only, with DSv4 hybrid KV manager enabled. -# cpu - vLLM native OffloadingConnector, with hybrid KV manager enabled. +# cpu - SimpleCPUOffloadConnector lazy offload, with hybrid KV manager +# enabled. # lmcache-mp - Temporarily disabled for DSv4. LMCache PR #3261 must merge # first so LMCacheMPConnector can support HMA block-id tuples. 
@@ -74,6 +76,12 @@ fi # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient +# sliding-window allocations don't evict useful prefix entries. 32k matches +# the trace-replay tuning the PR author validated (0% -> 74% hit rate). +# Requires the custom cquil image configured for this recipe. +export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" @@ -136,26 +144,28 @@ case "$OFFLOADING" in none) ;; cpu) # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~1.2 TB total native - # CPU offload pool across the engine(s); previously 2.8 TB but every - # DP-attn worker stalled for 4+ min during pinned-CPU-tensor allocation - # and the shm_broadcast watchdog killed them (run 26246044726). 150 GB - # per worker (1.2 TB / 8) completes the alloc within the 60 s window. + # individual jobs to a fraction of that. Aim for ~1.2 TB total host + # CPU pool across the engine(s); previously 2.8 TB but every DP-attn + # worker stalled for 4+ min during pinned-CPU-tensor allocation and the + # shm_broadcast watchdog killed them (run 26246044726). 150 GB per + # worker (1.2 TB / 8) completes the alloc within the 60 s window. # - # Native --kv-offloading-size becomes OffloadingConnector's - # cpu_bytes_to_use. For DP-attn there are $TP independent DP engines, - # so pre-divide to keep aggregate host commit near TOTAL_CPU_DRAM_GB. - # For pure TP, vLLM treats the size as the total across TP ranks. + # SimpleCPUOffloadConnector divides cpu_bytes_to_use by + # parallel_config.world_size (= TP*PP, NOT including DP). 
For DP-attn + # there are $TP independent engines with world_size=1, so pre-divide + # to keep aggregate host commit near TOTAL_CPU_DRAM_GB. For pure TP, + # pass the total and let the connector divide across TP ranks. TOTAL_CPU_DRAM_GB=1200 if [ "$DP_ATTENTION" = "true" ]; then PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) else PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB fi - unset VLLM_USE_SIMPLE_KV_OFFLOAD + PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) + export VLLM_USE_SIMPLE_KV_OFFLOAD=1 OFFLOAD_ARGS=( - --kv-offloading-backend native - --kv-offloading-size "$PER_ENGINE_GB" + --kv-transfer-config + "{\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" ) ;; lmcache-mp) From 4bd54ce12cba5cb75df0f05e5c0960ea2c5ef14c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 17:18:46 -0500 Subject: [PATCH 027/132] chore(agentic): bump aiperf for heartbeat-only warmup logging --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index f74a3c993..be20977ad 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit f74a3c993d415ece4c0147b6f05013bbad1bd358 +Subproject commit be20977adf063aa5bb679d15293218eff6e64062 From 97576faef1064616229bf806f80cff7188baa344 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Jun 2026 20:18:07 -0500 Subject: [PATCH 028/132] test(agentic): run B300 CPU offload in eager mode --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index efe39d9aa..b8234af48 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -114,13 +114,11 @@ case "$OFFLOADING" in PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB fi 
PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) - # Use --kv-transfer-config JSON to also pass lazy_offload=true. Eager - # mode (default) hits an AssertionError in - # vllm/v1/core/kv_cache_utils.py:269 popleft_n at low/mid CONC; lazy - # mode defers the store path and clears low/mid CONC at 80-100%. + # Temporarily run eager mode to isolate whether lazy offloading is + # required to reproduce the SimpleCPUOffloadConnector CUDA failures. # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob. export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" + OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":false}}" ;; *) echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 From e08ba71be0a79b85fb399c5153b63de76d8db216 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 01:24:09 -0500 Subject: [PATCH 029/132] config(dsv4-fp4 agentic): run offloading=none with expanded concurrency sweeps Comment out the cpu (and prior none) entries for dsv4-fp4-b200/b300-vllm-agentic and run offloading=none only, expanding the concurrency lists: dense low-to-mid ramp on plain-TP entries and a cliff-spanning high-end ramp on the DEP entries (B300 mirrors B200). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/nvidia-master.yaml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fc5ca6ab2..d896a26c9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9270,11 +9270,14 @@ dsv4-fp4-b200-vllm-agentic: agentic-coding: - duration: 1800 search-space: + # TEMPORARILY COMMENTED OUT — running offloading=none only this iteration. # cpu offload only this iteration — none entries already validated in # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). # Re-add when investigating regressions in offload=none. - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + # - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } qwen3.5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:nightly-dev-20260422-de962f32 @@ -9416,16 +9419,21 @@ dsv4-fp4-b300-vllm-agentic: agentic-coding: - duration: 1800 search-space: + # TEMPORARILY COMMENTED OUT — running offloading=none only this iteration. # cpu offload only this iteration — none entries already validated in # earlier runs. Re-add when investigating regressions in offload=none. 
- - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } - - { tp: 4, offloading: none, conc-list: [16, 32, 64] } - - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [128, 256, 512] } + # - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } + # - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + # - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + # - { tp: 4, offloading: none, conc-list: [16, 32, 64] } + # - { tp: 8, offloading: none, conc-list: [16, 32, 64] } + # - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + # - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [128, 256, 512] } + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 From 60f3be03d7cd65d8711165011d8c4f9b64a6fc1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 11:16:33 -0500 Subject: [PATCH 030/132] fix(agentic): extend native router request timeout --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 1 + benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh 
b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 84b5828e7..87822c154 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -296,6 +296,7 @@ if [ "$USE_VLLM_ROUTER" = "true" ]; then --port "$PORT" \ --prometheus-host 127.0.0.1 \ --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ --disable-retries > "$ROUTER_LOG" 2>&1 & ROUTER_PID=$! echo "Router PID: $ROUTER_PID" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index b8234af48..e35cffa2d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -186,6 +186,7 @@ if [ "$USE_VLLM_ROUTER" = "true" ]; then --port "$PORT" \ --prometheus-host 127.0.0.1 \ --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ --disable-retries > "$ROUTER_LOG" 2>&1 & ROUTER_PID=$! echo "Router PID: $ROUTER_PID" From 3747263631ddeb5c6cdf096e3e71d42cece99f89 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 14:31:45 -0500 Subject: [PATCH 031/132] fix(agentic): use native B300 KV offloading --- .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 37 ++++++++----------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index e35cffa2d..38e8cf927 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -87,38 +87,31 @@ SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" mkdir -p "$RESULT_DIR" -OFFLOAD_ARGS="" +OFFLOAD_ARGS=() case "$OFFLOADING" in none) ;; cpu) # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~2.2 TB total host + # individual jobs to a fraction of that. 
Aim for ~2.5 TB total host # CPU pool across the engine(s). # - # SimpleCPUOffloadConnector divides cpu_bytes_to_use by - # parallel_config.world_size (= TP*PP, NOT including DP — see - # vllm/config/parallel.py docstring). So: - # - DP-attn=true → each of $TP DP engines has world_size=1 in - # its parallel_config; the connector does no internal divide, - # and each engine torch.zeros + pin_tensor allocates the full - # --kv_offloading_size value. Pre-divide by $TP here so the - # aggregate host commit ≈ TOTAL_CPU_DRAM_GB. - # - DP-attn=false → single engine with world_size=TP. Pass the - # full TOTAL_CPU_DRAM_GB; the connector's internal divide - # yields TOTAL/TP per rank, and TP-shared mmap (PR #37206) - # keeps the aggregate at TOTAL. - TOTAL_CPU_DRAM_GB=2200 + # --kv_offloading_size configures one native OffloadingConnector pool + # per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide + # the aggregate host budget across those engines. + TOTAL_CPU_DRAM_GB=2500 if [ "$DP_ATTENTION" = "true" ]; then PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) else PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB fi - PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) - # Temporarily run eager mode to isolate whether lazy offloading is - # required to reproduce the SimpleCPUOffloadConnector CUDA failures. - # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob. - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":false}}" + + # The native backend resolves to OffloadingConnector while this env var + # is unset. 
+ unset VLLM_USE_SIMPLE_KV_OFFLOAD + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$PER_ENGINE_GB" + ) ;; *) echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 @@ -170,7 +163,7 @@ vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --no-disable-hybrid-kv-cache-manager \ --max-model-len "$MAX_MODEL_LEN" \ --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & +"${OFFLOAD_ARGS[@]}" > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" From cb21694d72e79ca7c51ec0a9d05b6ec23e3daf57 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 14:32:50 -0500 Subject: [PATCH 032/132] (testing) add offload off scneario to dsv4 b300 --- .github/configs/nvidia-master.yaml | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d896a26c9..d3ea482c3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9259,7 +9259,7 @@ glm5-fp8-gb300-dynamo-sglang: dp-attn: false dsv4-fp4-b200-vllm-agentic: - image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045 + image: vllm/vllm-openai:nightly-d0975a4b50140a9d953f00955a1cbb2a4945edef model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -9408,7 +9408,8 @@ dsv4-fp8-h200-vllm-agentic: dsv4-fp4-b300-vllm-agentic: # image: vllm/vllm-openai:v0.22.0 # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045 - image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045 + # image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045' + image: vllm/vllm-openai:nightly-d0975a4b50140a9d953f00955a1cbb2a4945edef model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -9419,20 +9420,10 @@ dsv4-fp4-b300-vllm-agentic: agentic-coding: - duration: 1800 search-space: - # 
TEMPORARILY COMMENTED OUT — running offloading=none only this iteration. - # cpu offload only this iteration — none entries already validated in - # earlier runs. Re-add when investigating regressions in offload=none. - # - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - # - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - # - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } - # - { tp: 4, offloading: none, conc-list: [16, 32, 64] } - # - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - # - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - # - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [128, 256, 512] } - - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [32, 48, 64, 96, 128, 192, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: From 06a4ea7711ce8982f89aa027c8d08837d39fabbb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 16:24:50 -0500 Subject: [PATCH 033/132] test(agentic): enable blocking CUDA offload diagnostics --- .github/configs/nvidia-master.yaml | 10 ++++++---- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3ea482c3..9fb6960e6 100644 --- a/.github/configs/nvidia-master.yaml +++ 
b/.github/configs/nvidia-master.yaml @@ -9420,11 +9420,13 @@ dsv4-fp4-b300-vllm-agentic: agentic-coding: - duration: 1800 search-space: - - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } - - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + # TEMPORARY: run only native CPU-offload scenarios while diagnosing + # asynchronous CUDA failures. + # - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + # - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + # - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [32, 48, 64, 96, 128, 192, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + # - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 38e8cf927..a02d3f98d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -144,6 +144,9 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high +# Temporary diagnostic: surface asynchronous CUDA failures at the operation +# that caused them instead of at a later synchronization point. 
+export CUDA_LAUNCH_BLOCKING=1 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ From fb362a696555e3896c71566cc38021501b896537 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 16:25:28 -0500 Subject: [PATCH 034/132] chore(agentic): remove stale B200 sweep comments --- .github/configs/nvidia-master.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9fb6960e6..8ffcb2a28 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9270,12 +9270,6 @@ dsv4-fp4-b200-vllm-agentic: agentic-coding: - duration: 1800 search-space: - # TEMPORARILY COMMENTED OUT — running offloading=none only this iteration. - # cpu offload only this iteration — none entries already validated in - # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). - # Re-add when investigating regressions in offload=none. - # - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - # - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } From 2f27beacda4eaef937bdb0267c1d5fa663e32ae3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 16:51:34 -0500 Subject: [PATCH 035/132] feat(agentic): use Mooncake store for B300 offload --- .github/configs/nvidia-master.yaml | 4 +- .github/workflows/benchmark-tmpl.yml | 4 ++ .../single_node/agentic/dsv4_fp4_b300_vllm.sh | 56 ++++++++++++++----- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8ffcb2a28..3f6e52d30 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9414,8 +9414,8 @@ dsv4-fp4-b300-vllm-agentic: agentic-coding: - duration: 1800 
search-space: - # TEMPORARY: run only native CPU-offload scenarios while diagnosing - # asynchronous CUDA failures. + # TEMPORARY: run only MooncakeStore CPU-offload scenarios while + # diagnosing the native/SimpleCPU offload failures. # - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } # - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } # - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4c2c2ba5f..46f305fe8 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -241,6 +241,8 @@ jobs: results/server.log results/router.log results/lmcache_server.log + results/mooncake_master.log + results/mooncake_config.json results/benchmark.log results/config.yaml results/lmcache_command.txt @@ -282,6 +284,8 @@ jobs: ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }} ${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }} ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }} + ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }} if-no-files-found: ignore - name: Upload GPU metrics diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index a02d3f98d..536c2afd2 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -85,6 +85,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" +MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() @@ 
-93,24 +94,52 @@ case "$OFFLOADING" in cpu) # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits # individual jobs to a fraction of that. Aim for ~2.5 TB total host - # CPU pool across the engine(s). + # CPU pool across all GPU ranks. # - # --kv_offloading_size configures one native OffloadingConnector pool - # per vLLM engine. DP-attn starts one engine per DP rank, so pre-divide - # the aggregate host budget across those engines. + # Mooncake embedded mode contributes one global segment per GPU rank to + # a shared distributed store. Pre-divide the aggregate host budget + # across those rank-contributed segments. TOTAL_CPU_DRAM_GB=2500 - if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) - else - PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) + + MOONCAKE_VERSION=0.3.11.post1 + agentic_pip_install --quiet --no-cache-dir --no-deps \ + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null + + MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" + cat > "$MOONCAKE_CONFIG_PATH" < "$MOONCAKE_MASTER_LOG" 2>&1 & + MOONCAKE_MASTER_PID=$! + sleep 2 + if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then + echo "Mooncake master died during startup." >&2 + cat "$MOONCAKE_MASTER_LOG" >&2 + exit 1 fi - # The native backend resolves to OffloadingConnector while this env var - # is unset. unset VLLM_USE_SIMPLE_KV_OFFLOAD OFFLOAD_ARGS=( - --kv_offloading_backend native - --kv_offloading_size "$PER_ENGINE_GB" + --kv-transfer-config + '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}' ) ;; *) @@ -144,9 +173,6 @@ echo "Starting vllm server..." 
export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -# Temporary diagnostic: surface asynchronous CUDA failures at the operation -# that caused them instead of at a later synchronization point. -export CUDA_LAUNCH_BLOCKING=1 vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ From 97c4b65b419e279a156f570f20c7a60add323a47 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 17:55:21 -0500 Subject: [PATCH 036/132] test(agentic): validate Mooncake over TCP on B300 --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 536c2afd2..e741f522d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -92,14 +92,13 @@ OFFLOAD_ARGS=() case "$OFFLOADING" in none) ;; cpu) - # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. Aim for ~2.5 TB total host - # CPU pool across all GPU ranks. + # Leave enough host-memory headroom for model workers and the runtime. + # Aim for ~1 TB total host CPU pool across all GPU ranks. # # Mooncake embedded mode contributes one global segment per GPU rank to # a shared distributed store. Pre-divide the aggregate host budget # across those rank-contributed segments. 
- TOTAL_CPU_DRAM_GB=2500 + TOTAL_CPU_DRAM_GB=1000 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) MOONCAKE_VERSION=0.3.11.post1 @@ -116,7 +115,7 @@ case "$OFFLOADING" in "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT", "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", - "protocol": "rdma", + "protocol": "tcp", "device_name": "", "enable_offload": false } From 49b99675e35eb259e393b35daf1993fc6ac54af2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 18:20:01 -0500 Subject: [PATCH 037/132] fix(agentic): pool Mooncake TCP connections --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index e741f522d..c7f836570 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -121,6 +121,9 @@ case "$OFFLOADING" in } EOF export MOONCAKE_CONFIG_PATH + # Reuse TCP connections across KV transfers instead of exhausting the + # host's ephemeral port range with one short-lived socket per transfer. + export MC_TCP_ENABLE_CONNECTION_POOL=1 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 From c3cfe7440828c2398d8730f9cad78f74ee841aab Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 18:41:22 -0500 Subject: [PATCH 038/132] fix(agentic): increase Mooncake TCP transfer slices --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index c7f836570..70443174f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -124,6 +124,9 @@ EOF # Reuse TCP connections across KV transfers instead of exhausting the # host's ephemeral port range with one short-lived socket per transfer. export MC_TCP_ENABLE_CONNECTION_POOL=1 + # Mooncake defaults to 64 KiB TCP slices and can create thousands of + # concurrent sockets for one large KV batch. Use 4 MiB slices instead. + export MC_TCP_SLICE_SIZE=4194304 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 From f58528235d46fcd1019202327a4a871e3bf74795 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 18:59:09 -0500 Subject: [PATCH 039/132] test(agentic): pin Mooncake RDMA on B300 --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 70443174f..828dfa71f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -115,18 +115,12 @@ case "$OFFLOADING" in "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT", "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", - "protocol": "tcp", - "device_name": "", + "protocol": "rdma", + "device_name": "mlx5_0", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH - # Reuse TCP connections across KV transfers instead of exhausting the - # host's ephemeral port range with one short-lived socket per transfer. - export MC_TCP_ENABLE_CONNECTION_POOL=1 - # Mooncake defaults to 64 KiB TCP slices and can create thousands of - # concurrent sockets for one large KV batch. Use 4 MiB slices instead. - export MC_TCP_SLICE_SIZE=4194304 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 From 3599c78560446e44dd4de46a69830ce01ce7aa26 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 19:24:13 -0500 Subject: [PATCH 040/132] perf(agentic): map Mooncake RDMA NICs on B300 --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 828dfa71f..0f4385395 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -116,7 +116,7 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "mlx5_0", + "device_name": "mlx5_2,mlx5_8,mlx5_4,mlx5_0", "enable_offload": false } EOF From fce49968a882244120e3bd0f61b008108eb3fa93 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 19:42:08 -0500 Subject: [PATCH 041/132] fix(agentic): use shared Mooncake RDMA NIC on B300 --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 0f4385395..828dfa71f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -116,7 +116,7 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "mlx5_2,mlx5_8,mlx5_4,mlx5_0", + "device_name": "mlx5_0", "enable_offload": false } EOF From 8d6b735e74da31d5d015dd36ea6db30dfecf5b97 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 21:18:50 -0500 Subject: [PATCH 042/132] perf(agentic): tune Mooncake RDMA transfers on B300 --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 828dfa71f..5d64c03ff 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -123,6 +123,11 @@ EOF export MOONCAKE_CONFIG_PATH # Identical prefixes must hash to identical store keys across DP ranks. export PYTHONHASHSEED=0 + # Large agentic KV writes can exceed Mooncake Store's fixed 60-second + # transfer deadline at the default 64 KiB RDMA slice size. Reduce + # per-transfer bookkeeping and give the shared RNIC more workers. + export MC_SLICE_SIZE=1048576 + export MC_WORKERS_PER_CTX=4 echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..." mooncake_master --port "$MOONCAKE_MASTER_PORT" \ From d98d7aec3781658fb7234685494e08185ec7d7ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 21:39:46 -0500 Subject: [PATCH 043/132] perf(agentic): use full B300 Mooncake memory budget --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 5d64c03ff..557060ae7 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -93,12 +93,12 @@ case "$OFFLOADING" in none) ;; cpu) # Leave enough host-memory headroom for model workers and the runtime. - # Aim for ~1 TB total host CPU pool across all GPU ranks. + # Use the 2.5 TB host-memory budget across all GPU ranks. # # Mooncake embedded mode contributes one global segment per GPU rank to # a shared distributed store. Pre-divide the aggregate host budget # across those rank-contributed segments. 
- TOTAL_CPU_DRAM_GB=1000 + TOTAL_CPU_DRAM_GB=2500 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) MOONCAKE_VERSION=0.3.11.post1 From b83265daccd6aba51c5d983a841559f3a26a6794 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Jun 2026 23:13:18 -0500 Subject: [PATCH 044/132] perf(agentic): evict Mooncake cache before rank exhaustion --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 557060ae7..c73aa5c6a 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -129,8 +129,16 @@ EOF export MC_SLICE_SIZE=1048576 export MC_WORKERS_PER_CTX=4 + # The store is shared, but each rank contributes a separate segment. + # Start eviction before an imbalanced rank exhausts its segment, and + # reclaim enough space for several concurrent multi-GB batch puts. + MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO=0.80 + MOONCAKE_EVICTION_RATIO=0.10 + echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..." mooncake_master --port "$MOONCAKE_MASTER_PORT" \ + --eviction_high_watermark_ratio="$MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO" \ + --eviction_ratio="$MOONCAKE_EVICTION_RATIO" \ > "$MOONCAKE_MASTER_LOG" 2>&1 & MOONCAKE_MASTER_PID=$! sleep 2 From 4178b7897939c41a71fc4b14432833805135a4c6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Jun 2026 13:38:13 -0500 Subject: [PATCH 045/132] feat(agentic): default to 060526 weka corpus (DSv4 base, others 256k) Bump aiperf submodule to pick up the 060526 with-subagents loaders, and make resolve_trace_source() model-aware: DSv4 recipes default to the full semianalysis_cc_traces_weka_with_subagents_060526 corpus while every other recipe defaults to the 256k-capped _060526_256k variant (servers at max_model_len ~256k reject >256k requests). 
WEKA_LOADER_OVERRIDE still honored; older corpora remain pinnable. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/benchmark_lib.sh | 23 ++++++++++++++++++----- utils/aiperf | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index dbde467c2..627ed32ac 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -928,10 +928,17 @@ resolve_trace_source() { # unfiltered corpus and switches to the 256k-capped variant), or # by recipes that want to pin an older corpus generation. # - # Default (no override): semianalysis_cc_traces_weka_with_subagents_060226 - # (v6 corpus, newer CC versions). All recipes — including DSv4 — ride - # this default unless they opt out via WEKA_LOADER_OVERRIDE. - local default_loader="semianalysis_cc_traces_weka_with_subagents_060226" + # Default (no override): the 060526 v6 corpus, selected by model family. + # DSv4 (full context) rides the unfiltered base corpus; every non-DSv4 + # recipe defaults to the 256k-capped variant because those servers run at + # max_model_len ~256k and would reject >256k requests. Any recipe can still + # pin a specific corpus via WEKA_LOADER_OVERRIDE. 
+ local default_loader + if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then + default_loader="semianalysis_cc_traces_weka_with_subagents_060526" + else + default_loader="semianalysis_cc_traces_weka_with_subagents_060526_256k" + fi local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset case "$loader" in @@ -947,8 +954,14 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_060226_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060526) + dataset="semianalysisai/cc-traces-weka-with-subagents-060526" + ;; + semianalysis_cc_traces_weka_with_subagents_060526_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. 
Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k" >&2 exit 1 ;; esac diff --git a/utils/aiperf b/utils/aiperf index be20977ad..ada291f18 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit be20977adf063aa5bb679d15293218eff6e64062 +Subproject commit ada291f18eed788710259ee5d49e1dd8de5bb712 From 0731a8e0e20c1822dca9f5fb9bd3ddfb9a1dac0b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Jun 2026 13:40:12 -0500 Subject: [PATCH 046/132] go --- .github/configs/nvidia-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3f6e52d30..3928a1f39 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9416,11 +9416,11 @@ dsv4-fp4-b300-vllm-agentic: search-space: # TEMPORARY: run only MooncakeStore CPU-offload scenarios while # diagnosing the native/SimpleCPU offload failures. 
- # - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } - # - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - # - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [32, 48, 64, 96, 128, 192, 256] } - # - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 From e6fe59ceba8b8532ac3321e2cdd8ca56b2d1afb3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Jun 2026 14:41:26 -0500 Subject: [PATCH 047/132] feat(agentic): use Mooncake offload for DSv4 B200 --- .github/configs/nvidia-master.yaml | 1 + .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 198 ++++++------------ 2 files changed, 63 insertions(+), 136 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3928a1f39..c25e6eda4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9272,6 +9272,7 @@ dsv4-fp4-b200-vllm-agentic: search-space: - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } qwen3.5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:nightly-dev-20260422-de962f32 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 87822c154..8f0926bb3 100755 --- 
a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -13,20 +13,17 @@ set -x # experts EP-sharded across DP ranks (per the vLLM blog recipe). # Highest aggregate throughput at large CONC. # -# Image is cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045. -# block_size=256, kv-cache-dtype=fp8, FP4 indexer cache enabled, -# FULL_AND_PIECEWISE cudagraph capture with custom_ops=all (per the vLLM blog -# recipe at https://vllm.ai/blog/deepseek-v4). +# Image is configured in nvidia-master.yaml. block_size=256, +# kv-cache-dtype=fp8, FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph +# capture with custom_ops=all (per the vLLM blog recipe at +# https://vllm.ai/blog/deepseek-v4). # # Required env vars: # MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR # # OFFLOADING values: -# none - vLLM GPU KV only, with DSv4 hybrid KV manager enabled. -# cpu - SimpleCPUOffloadConnector lazy offload, with hybrid KV manager -# enabled. -# lmcache-mp - Temporarily disabled for DSv4. LMCache PR #3261 must merge -# first so LMCacheMPConnector can support HMA block-id tuples. +# none - vLLM GPU KV only. +# cpu - MooncakeStoreConnector with a shared 2.5 TB host-memory KV tier. 
source "$(dirname "$0")/../../benchmark_lib.sh" @@ -85,147 +82,76 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" -LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() -HYBRID_KV_ARGS=(--no-disable-hybrid-kv-cache-manager) -LMCACHE_PID="" - -cleanup_lmcache_server() { - if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then - kill "$LMCACHE_PID" 2>/dev/null || true - wait "$LMCACHE_PID" 2>/dev/null || true - fi -} - -trap cleanup_lmcache_server EXIT - -wait_for_lmcache_ready() { - { set +x; } 2>/dev/null - local attempts="${LMCACHE_READY_ATTEMPTS:-120}" - local tail_pid="" - - while [ ! -f "$LMCACHE_LOG" ]; do - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before creating log file. Exiting." >&2 - exit 1 - fi - sleep 1 - done - - tail -f -n +1 "$LMCACHE_LOG" & - tail_pid=$! - - for ((i = 1; i <= attempts; i++)); do - if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - return 0 - fi - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before becoming healthy. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 - fi - sleep 1 - done - - echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 -} case "$OFFLOADING" in none) ;; cpu) - # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits - # individual jobs to a fraction of that. 
Aim for ~1.2 TB total host - # CPU pool across the engine(s); previously 2.8 TB but every DP-attn - # worker stalled for 4+ min during pinned-CPU-tensor allocation and the - # shm_broadcast watchdog killed them (run 26246044726). 150 GB per - # worker (1.2 TB / 8) completes the alloc within the 60 s window. + # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough + # headroom for model workers and the runtime, and use the same 2.5 TB + # Mooncake budget validated by the B300 recipe. # - # SimpleCPUOffloadConnector divides cpu_bytes_to_use by - # parallel_config.world_size (= TP*PP, NOT including DP). For DP-attn - # there are $TP independent engines with world_size=1, so pre-divide - # to keep aggregate host commit near TOTAL_CPU_DRAM_GB. For pure TP, - # pass the total and let the connector divide across TP ranks. - TOTAL_CPU_DRAM_GB=1200 - if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP)) - else - PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB + # Embedded mode contributes one segment per GPU rank to a shared + # distributed store, so pre-divide the aggregate host-memory budget. + TOTAL_CPU_DRAM_GB=2500 + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) + + MOONCAKE_VERSION=0.3.11.post1 + agentic_pip_install --quiet --no-cache-dir --no-deps \ + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null + + MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" + cat > "$MOONCAKE_CONFIG_PATH" < "$MOONCAKE_MASTER_LOG" 2>&1 & + MOONCAKE_MASTER_PID=$! + sleep 2 + if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then + echo "Mooncake master died during startup." 
>&2 + cat "$MOONCAKE_MASTER_LOG" >&2 + exit 1 fi - PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024)) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS=( - --kv-transfer-config - "{\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}" - ) - ;; - lmcache-mp) - { set +x; } 2>/dev/null - # LMCacheMPConnector needs HMA support before it can run DSv4 with the - # hybrid KV manager. Re-enable this path after - # https://github.com/LMCache/LMCache/pull/3261 is merged. - echo "Error: OFFLOADING=lmcache-mp is disabled for DSv4 until LMCache PR #3261 adds HMA support." >&2 - exit 1 - - # LMCache docs recommend MP mode for production: start an external - # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For - # vLLM >= 0.20, prefer the LMCache-shipped connector module because it - # tracks the latest server protocol ahead of vLLM's vendored copy. - # - # Important DSv4 caveat: LMCacheMPConnector currently only accepts the - # non-hybrid KV block layout. The connector raises if vLLM returns the - # hybrid block-id tuple used by the CSA/HCA hybrid KV manager. This - # mode therefore disables the hybrid manager; `none` and `cpu` keep it - # enabled for the normal B200 DSv4 path. - agentic_pip_install --quiet --no-cache-dir lmcache - python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null - - TOTAL_CPU_DRAM_GB=2800 - LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" - LMCACHE_PORT="${LMCACHE_PORT:-5555}" - LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" - LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-200}" - LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" - LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" - - echo "Starting LMCache MP server..." 
- LMCACHE_CMD=( - lmcache server - --host "$LMCACHE_HOST" - --port "$LMCACHE_PORT" - --http-host "$LMCACHE_HOST" - --http-port "$LMCACHE_HTTP_PORT" - --l1-size-gb "$LMCACHE_L1_SIZE_GB" - --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" - --chunk-size "$LMCACHE_CHUNK_SIZE" - --max-workers "$LMCACHE_MAX_WORKERS" - --eviction-policy LRU - ) - printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" - printf '\n' >> "$RESULT_DIR/lmcache_command.txt" - "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & - LMCACHE_PID=$! - echo "LMCache server PID: $LMCACHE_PID" - wait_for_lmcache_ready - HYBRID_KV_ARGS=(--disable-hybrid-kv-cache-manager) + unset VLLM_USE_SIMPLE_KV_OFFLOAD OFFLOAD_ARGS=( --kv-transfer-config - "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}' ) ;; *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache-mp)" >&2 + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 exit 1 ;; esac @@ -273,7 +199,7 @@ VLLM_CMD=( --enable-auto-tool-choice --reasoning-parser deepseek_v4 --enable-prefix-caching - "${HYBRID_KV_ARGS[@]}" + --no-disable-hybrid-kv-cache-manager --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" "${OFFLOAD_ARGS[@]}" From cffe496b624a569548a03e251859dceb15ceab67 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Jun 2026 14:56:41 -0500 Subject: [PATCH 048/132] go --- .github/configs/nvidia-master.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c25e6eda4..652c65afd 100644 --- a/.github/configs/nvidia-master.yaml +++ 
b/.github/configs/nvidia-master.yaml @@ -9270,7 +9270,8 @@ dsv4-fp4-b200-vllm-agentic: agentic-coding: - duration: 1800 search-space: - - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40] } + - { tp: 8, offloading: cpu, conc-list: [40, 48, 52, 64, 72] } - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } From 5b0e1a0b4f2f8b05d59767e5d0dd3af391de57ea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 09:00:07 -0500 Subject: [PATCH 049/132] fix(agentic): bump aiperf for Weka context resets --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index ada291f18..44aa466d7 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit ada291f18eed788710259ee5d49e1dd8de5bb712 +Subproject commit 44aa466d7d681f5e2e4d7946174540e2a6521922 From 63c7c59787fac739b67f677e29a6ae483b925ce6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 09:08:07 -0500 Subject: [PATCH 050/132] test(agentic): reduce B200 Mooncake memory budget --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 8f0926bb3..9defbef0c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -23,7 +23,7 @@ set -x # # OFFLOADING values: # none - vLLM GPU KV only. -# cpu - MooncakeStoreConnector with a shared 2.5 TB host-memory KV tier. +# cpu - MooncakeStoreConnector with a shared 2.0 TB host-memory KV tier. 
source "$(dirname "$0")/../../benchmark_lib.sh" @@ -91,12 +91,12 @@ case "$OFFLOADING" in none) ;; cpu) # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough - # headroom for model workers and the runtime, and use the same 2.5 TB - # Mooncake budget validated by the B300 recipe. + # headroom for model workers and the runtime. Keep the Mooncake budget + # below the 2.5 TB allocation that failed GPU-buffer registration. # # Embedded mode contributes one segment per GPU rank to a shared # distributed store, so pre-divide the aggregate host-memory budget. - TOTAL_CPU_DRAM_GB=2500 + TOTAL_CPU_DRAM_GB=2000 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) MOONCAKE_VERSION=0.3.11.post1 From 5a566b3381212fdc541341dbd913ea8cd2e86ad2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 09:15:11 -0500 Subject: [PATCH 051/132] fix(agentic): use Mooncake TCP fallback on B200 --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 9defbef0c..adcbc23ac 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -23,7 +23,7 @@ set -x # # OFFLOADING values: # none - vLLM GPU KV only. -# cpu - MooncakeStoreConnector with a shared 2.0 TB host-memory KV tier. +# cpu - MooncakeStoreConnector with a shared 2.5 TB host-memory KV tier. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -91,12 +91,11 @@ case "$OFFLOADING" in none) ;; cpu) # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough - # headroom for model workers and the runtime. Keep the Mooncake budget - # below the 2.5 TB allocation that failed GPU-buffer registration. + # headroom for model workers and the runtime. 
# # Embedded mode contributes one segment per GPU rank to a shared # distributed store, so pre-divide the aggregate host-memory budget. - TOTAL_CPU_DRAM_GB=2000 + TOTAL_CPU_DRAM_GB=2500 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) MOONCAKE_VERSION=0.3.11.post1 @@ -113,18 +112,17 @@ case "$OFFLOADING" in "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT", "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", - "protocol": "rdma", - "device_name": "mlx5_0", + "protocol": "tcp", + "device_name": "", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH # Identical prefixes must hash to identical store keys across DP ranks. export PYTHONHASHSEED=0 - # Reduce per-transfer bookkeeping for large agentic KV writes and give - # the shared RNIC more transfer workers. - export MC_SLICE_SIZE=1048576 - export MC_WORKERS_PER_CTX=4 + # The B200 DGXC nodes do not expose nvidia_peermem, so GPUDirect RDMA + # cannot register vLLM's GPU KV buffers. The CUDA-enabled Mooncake + # wheel stages GPU buffers through host memory for TCP transfers. # Each rank contributes a separate segment. Evict early enough to # avoid an imbalanced rank exhausting its segment. From 6f2d292fe8881764429e4e6321ae56673b8ebe0c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 09:38:10 -0500 Subject: [PATCH 052/132] fix(agentic): reuse B200 Mooncake TCP connections --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index adcbc23ac..c8d18fe3a 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -123,6 +123,9 @@ EOF # The B200 DGXC nodes do not expose nvidia_peermem, so GPUDirect RDMA # cannot register vLLM's GPU KV buffers. The CUDA-enabled Mooncake # wheel stages GPU buffers through host memory for TCP transfers. 
+ # Reuse connections because agentic KV traffic otherwise exhausts the + # node's ephemeral TCP ports during warmup. + export MC_TCP_ENABLE_CONNECTION_POOL=1 # Each rank contributes a separate segment. Evict early enough to # avoid an imbalanced rank exhausting its segment. From 4e8ec162c0d3938e922b56714d24e34daaf9200c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 10:00:26 -0500 Subject: [PATCH 053/132] fix(agentic): use current Mooncake TCP transport on B200 --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index c8d18fe3a..1cfba2ab2 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -98,9 +98,19 @@ case "$OFFLOADING" in TOTAL_CPU_DRAM_GB=2500 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) - MOONCAKE_VERSION=0.3.11.post1 + # v0.3.11.post1 predates configurable TCP slice sizing and recent + # connection-pool correctness fixes. The B200 cluster cache contains a + # CUDA 13 wheel built from this pinned upstream main commit. + MOONCAKE_MAIN_COMMIT=4719229d88b10a7a8948a6b1e60705ffdb223077 + MOONCAKE_WHEEL="/aiperf_mmap_cache/mooncake/mooncake_main_4719229d_cuda13_py312.whl" + MOONCAKE_WHEEL_SHA256=88d66c34244f4487afdcef007b988bebf8b14091837214efe5a4dda6e28b4fc4 + if [[ ! 
-f "$MOONCAKE_WHEEL" ]]; then + echo "Missing Mooncake wheel for commit $MOONCAKE_MAIN_COMMIT: $MOONCAKE_WHEEL" >&2 + exit 1 + fi + echo "$MOONCAKE_WHEEL_SHA256 $MOONCAKE_WHEEL" | sha256sum --check - agentic_pip_install --quiet --no-cache-dir --no-deps \ - --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" + --force-reinstall "$MOONCAKE_WHEEL" python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null MOONCAKE_MASTER_PORT=$((PORT + 12000)) @@ -124,8 +134,10 @@ EOF # cannot register vLLM's GPU KV buffers. The CUDA-enabled Mooncake # wheel stages GPU buffers through host memory for TCP transfers. # Reuse connections because agentic KV traffic otherwise exhausts the - # node's ephemeral TCP ports during warmup. + # node's ephemeral TCP ports during warmup. Use 4 MiB slices instead + # of the old 64 KiB default to reduce concurrent socket sessions. export MC_TCP_ENABLE_CONNECTION_POOL=1 + export MC_TCP_SLICE_SIZE=4194304 # Each rank contributes a separate segment. Evict early enough to # avoid an imbalanced rank exhausting its segment. From bb55646cd9593ba344d20885f6ee047c73719c7c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 10:06:00 -0500 Subject: [PATCH 054/132] fix(agentic): preserve cached Mooncake wheel filename --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 1cfba2ab2..2a80a640f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -102,7 +102,7 @@ case "$OFFLOADING" in # connection-pool correctness fixes. The B200 cluster cache contains a # CUDA 13 wheel built from this pinned upstream main commit. 
MOONCAKE_MAIN_COMMIT=4719229d88b10a7a8948a6b1e60705ffdb223077 - MOONCAKE_WHEEL="/aiperf_mmap_cache/mooncake/mooncake_main_4719229d_cuda13_py312.whl" + MOONCAKE_WHEEL="/aiperf_mmap_cache/mooncake/mooncake_transfer_engine_cuda13-0.3.11.post1-cp312-cp312-manylinux_2_35_x86_64.whl" MOONCAKE_WHEEL_SHA256=88d66c34244f4487afdcef007b988bebf8b14091837214efe5a4dda6e28b4fc4 if [[ ! -f "$MOONCAKE_WHEEL" ]]; then echo "Missing Mooncake wheel for commit $MOONCAKE_MAIN_COMMIT: $MOONCAKE_WHEEL" >&2 From eebefc1c81faf9ca724a078b0e710d9a15a0389b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 10:23:32 -0500 Subject: [PATCH 055/132] test(agentic): use standalone Mooncake store on B200 --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 2a80a640f..ee71b4459 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -83,6 +83,7 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" +MOONCAKE_CLIENT_LOG="$RESULT_DIR/mooncake_client.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() @@ -93,10 +94,10 @@ case "$OFFLOADING" in # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough # headroom for model workers and the runtime. # - # Embedded mode contributes one segment per GPU rank to a shared - # distributed store, so pre-divide the aggregate host-memory budget. + # Use one standalone owner for the aggregate host-memory budget. The + # vLLM ranks are pure requesters, avoiding per-rank store-segment + # imbalance and reducing TCP destination fan-out. 
TOTAL_CPU_DRAM_GB=2500 - PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) # v0.3.11.post1 predates configurable TCP slice sizing and recent # connection-pool correctness fixes. The B200 cluster cache contains a @@ -114,13 +115,14 @@ case "$OFFLOADING" in python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null MOONCAKE_MASTER_PORT=$((PORT + 12000)) + MOONCAKE_CLIENT_PORT=$((PORT + 12001)) MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" cat > "$MOONCAKE_CONFIG_PATH" <&2 exit 1 fi + echo "Starting standalone Mooncake owner on port $MOONCAKE_CLIENT_PORT..." + mooncake_client \ + --host=127.0.0.1 \ + --metadata_server=P2PHANDSHAKE \ + --master_server_address="127.0.0.1:$MOONCAKE_MASTER_PORT" \ + --protocol=tcp \ + --port="$MOONCAKE_CLIENT_PORT" \ + --global_segment_size="${TOTAL_CPU_DRAM_GB}GB" \ + --device_names="" \ + > "$MOONCAKE_CLIENT_LOG" 2>&1 & + MOONCAKE_CLIENT_PID=$! + sleep 5 + if ! kill -0 "$MOONCAKE_CLIENT_PID" 2>/dev/null; then + echo "Standalone Mooncake owner died during startup." 
>&2 + cat "$MOONCAKE_CLIENT_LOG" >&2 + exit 1 + fi unset VLLM_USE_SIMPLE_KV_OFFLOAD OFFLOAD_ARGS=( From 1112011fa4f82f40fd0ced4a603f44d96fe84b10 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 10:56:13 -0500 Subject: [PATCH 056/132] fix(agentic): bound B200 Mooncake transfer batches --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 35 ++---- .../patch_vllm_mooncake_transfer_batches.py | 102 ++++++++++++++++++ 2 files changed, 111 insertions(+), 26 deletions(-) create mode 100755 benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index ee71b4459..f467da9e6 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -83,7 +83,6 @@ export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 SERVER_LOG="$RESULT_DIR/server.log" ROUTER_LOG="$RESULT_DIR/router.log" MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log" -MOONCAKE_CLIENT_LOG="$RESULT_DIR/mooncake_client.log" mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() @@ -94,10 +93,10 @@ case "$OFFLOADING" in # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough # headroom for model workers and the runtime. # - # Use one standalone owner for the aggregate host-memory budget. The - # vLLM ranks are pure requesters, avoiding per-rank store-segment - # imbalance and reducing TCP destination fan-out. + # Embedded mode contributes one segment per GPU rank to a shared + # distributed store, so pre-divide the aggregate host-memory budget. TOTAL_CPU_DRAM_GB=2500 + PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) # v0.3.11.post1 predates configurable TCP slice sizing and recent # connection-pool correctness fixes. 
The B200 cluster cache contains a @@ -113,16 +112,19 @@ case "$OFFLOADING" in agentic_pip_install --quiet --no-cache-dir --no-deps \ --force-reinstall "$MOONCAKE_WHEEL" python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null + # Mooncake TCP currently has no transfer-concurrency limit. Bound each + # vLLM store/load call so large DSv4 requests do not exhaust TCP ports. + export VLLM_MOONCAKE_MAX_TRANSFER_BATCH_KEYS=8 + python3 "$(dirname "$0")/patch_vllm_mooncake_transfer_batches.py" MOONCAKE_MASTER_PORT=$((PORT + 12000)) - MOONCAKE_CLIENT_PORT=$((PORT + 12001)) MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" cat > "$MOONCAKE_CONFIG_PATH" <&2 exit 1 fi - echo "Starting standalone Mooncake owner on port $MOONCAKE_CLIENT_PORT..." - mooncake_client \ - --host=127.0.0.1 \ - --metadata_server=P2PHANDSHAKE \ - --master_server_address="127.0.0.1:$MOONCAKE_MASTER_PORT" \ - --protocol=tcp \ - --port="$MOONCAKE_CLIENT_PORT" \ - --global_segment_size="${TOTAL_CPU_DRAM_GB}GB" \ - --device_names="" \ - > "$MOONCAKE_CLIENT_LOG" 2>&1 & - MOONCAKE_CLIENT_PID=$! - sleep 5 - if ! kill -0 "$MOONCAKE_CLIENT_PID" 2>/dev/null; then - echo "Standalone Mooncake owner died during startup." >&2 - cat "$MOONCAKE_CLIENT_LOG" >&2 - exit 1 - fi - unset VLLM_USE_SIMPLE_KV_OFFLOAD OFFLOAD_ARGS=( --kv-transfer-config diff --git a/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py new file mode 100755 index 000000000..beff775ab --- /dev/null +++ b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +"""Temporarily bound MooncakeStoreConnector transfer batches. + +Mooncake's TCP connection pool grows without a concurrency ceiling. Large +DeepSeek-V4 requests therefore create enough simultaneous per-layer transfers +to exhaust the node's TCP ports. 
This patch preserves the same keys and buffer +lists but submits them in smaller sequential batches. +""" + +import argparse +from pathlib import Path + + +HELPER_ANCHOR = '''def _rotate_list(values: list[_T], offset: int) -> list[_T]: + return values[offset:] + values[:offset] +''' + +HELPER = ''' + +_INFERENCEX_MOONCAKE_BATCH_PATCH = True + + +def _run_mooncake_transfer_batches(fn, keys, addrs, sizes, *args): + max_keys = int(os.getenv("VLLM_MOONCAKE_MAX_TRANSFER_BATCH_KEYS", "0")) + if max_keys <= 0 or len(keys) <= max_keys: + return fn(keys, addrs, sizes, *args) + + results = [] + for start in range(0, len(keys), max_keys): + end = start + max_keys + results.extend(fn(keys[start:end], addrs[start:end], sizes[start:end], *args)) + return results +''' + +PUT_CALL = '''res = self.store.batch_put_from_multi_buffers( + keys, + addrs, + sizes, + self.replicate_config, + )''' + +PATCHED_PUT_CALL = '''res = _run_mooncake_transfer_batches( + self.store.batch_put_from_multi_buffers, + keys, + addrs, + sizes, + self.replicate_config, + )''' + +GET_CALL = '''res = self.store.batch_get_into_multi_buffers( + batch_keys, batch_addrs, batch_sizes + )''' + +PATCHED_GET_CALL = '''res = _run_mooncake_transfer_batches( + self.store.batch_get_into_multi_buffers, + batch_keys, + batch_addrs, + batch_sizes, + )''' + + +def patch_worker(worker_path: Path) -> None: + source = worker_path.read_text() + if "_INFERENCEX_MOONCAKE_BATCH_PATCH = True" in source: + print(f"Mooncake transfer batching already patched: {worker_path}") + return + + replacements = ( + (HELPER_ANCHOR, HELPER_ANCHOR + HELPER), + (PUT_CALL, PATCHED_PUT_CALL), + (GET_CALL, PATCHED_GET_CALL), + ) + for old, new in replacements: + count = source.count(old) + if count != 1: + raise RuntimeError( + f"Expected exactly one patch target in {worker_path}, found {count}: " + f"{old.splitlines()[0]}" + ) + source = source.replace(old, new, 1) + + worker_path.write_text(source) + print(f"Patched Mooncake transfer batching: 
{worker_path}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--worker-path", type=Path) + args = parser.parse_args() + + worker_path = args.worker_path + if worker_path is None: + import vllm + + worker_path = Path(vllm.__file__).parent / ( + "distributed/kv_transfer/kv_connector/v1/mooncake/store/worker.py" + ) + patch_worker(worker_path) + + +if __name__ == "__main__": + main() From 191c4fe3afa94fe617f785d3ff988bfce66709ee Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 11:28:58 -0500 Subject: [PATCH 057/132] test(agentic): raise B200 Mooncake batch limit --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 2 +- .../single_node/agentic/patch_vllm_mooncake_transfer_batches.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index f467da9e6..d001e8c54 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -114,7 +114,7 @@ case "$OFFLOADING" in python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null # Mooncake TCP currently has no transfer-concurrency limit. Bound each # vLLM store/load call so large DSv4 requests do not exhaust TCP ports. 
- export VLLM_MOONCAKE_MAX_TRANSFER_BATCH_KEYS=8 + export INFERENCEX_MOONCAKE_MAX_TRANSFER_BATCH_KEYS=32 python3 "$(dirname "$0")/patch_vllm_mooncake_transfer_batches.py" MOONCAKE_MASTER_PORT=$((PORT + 12000)) diff --git a/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py index beff775ab..5c061606f 100755 --- a/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py +++ b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py @@ -21,7 +21,7 @@ def _run_mooncake_transfer_batches(fn, keys, addrs, sizes, *args): - max_keys = int(os.getenv("VLLM_MOONCAKE_MAX_TRANSFER_BATCH_KEYS", "0")) + max_keys = int(os.getenv("INFERENCEX_MOONCAKE_MAX_TRANSFER_BATCH_KEYS", "0")) if max_keys <= 0 or len(keys) <= max_keys: return fn(keys, addrs, sizes, *args) From 8894d58cf1d5c12ecc15906d554ca69fa5f19efc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 12:05:50 -0500 Subject: [PATCH 058/132] fix(agentic): extend B200 Mooncake read lease --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index d001e8c54..7996529e5 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -147,11 +147,15 @@ EOF # avoid an imbalanced rank exhausting its segment. MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO=0.80 MOONCAKE_EVICTION_RATIO=0.10 + # Mooncake's default 5s read lease is shorter than the observed + # transfer latency for large DSv4 hybrid-KV loads on B200 TCP. + MOONCAKE_KV_LEASE_TTL=60s echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..." 
mooncake_master --port "$MOONCAKE_MASTER_PORT" \ --eviction_high_watermark_ratio="$MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO" \ --eviction_ratio="$MOONCAKE_EVICTION_RATIO" \ + --default_kv_lease_ttl="$MOONCAKE_KV_LEASE_TTL" \ > "$MOONCAKE_MASTER_LOG" 2>&1 & MOONCAKE_MASTER_PID=$! sleep 2 From 077a4d08afcf77c7cf13340d89491214bf8b4733 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 15:06:25 -0500 Subject: [PATCH 059/132] test(agentic): use stock Mooncake DMA-BUF RDMA on B200 --- .../single_node/agentic/dsv4_fp4_b200_vllm.sh | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 7996529e5..a2b2db625 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -98,24 +98,10 @@ case "$OFFLOADING" in TOTAL_CPU_DRAM_GB=2500 PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP)) - # v0.3.11.post1 predates configurable TCP slice sizing and recent - # connection-pool correctness fixes. The B200 cluster cache contains a - # CUDA 13 wheel built from this pinned upstream main commit. - MOONCAKE_MAIN_COMMIT=4719229d88b10a7a8948a6b1e60705ffdb223077 - MOONCAKE_WHEEL="/aiperf_mmap_cache/mooncake/mooncake_transfer_engine_cuda13-0.3.11.post1-cp312-cp312-manylinux_2_35_x86_64.whl" - MOONCAKE_WHEEL_SHA256=88d66c34244f4487afdcef007b988bebf8b14091837214efe5a4dda6e28b4fc4 - if [[ ! 
-f "$MOONCAKE_WHEEL" ]]; then - echo "Missing Mooncake wheel for commit $MOONCAKE_MAIN_COMMIT: $MOONCAKE_WHEEL" >&2 - exit 1 - fi - echo "$MOONCAKE_WHEEL_SHA256 $MOONCAKE_WHEEL" | sha256sum --check - + MOONCAKE_VERSION=0.3.11.post1 agentic_pip_install --quiet --no-cache-dir --no-deps \ - --force-reinstall "$MOONCAKE_WHEEL" + --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION" python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null - # Mooncake TCP currently has no transfer-concurrency limit. Bound each - # vLLM store/load call so large DSv4 requests do not exhaust TCP ports. - export INFERENCEX_MOONCAKE_MAX_TRANSFER_BATCH_KEYS=32 - python3 "$(dirname "$0")/patch_vllm_mooncake_transfer_batches.py" MOONCAKE_MASTER_PORT=$((PORT + 12000)) MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json" @@ -126,22 +112,20 @@ case "$OFFLOADING" in "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT", "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", - "protocol": "tcp", - "device_name": "", + "protocol": "rdma", + "device_name": "mlx5_0", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH # Identical prefixes must hash to identical store keys across DP ranks. export PYTHONHASHSEED=0 - # The B200 DGXC nodes do not expose nvidia_peermem, so GPUDirect RDMA - # cannot register vLLM's GPU KV buffers. The CUDA-enabled Mooncake - # wheel stages GPU buffers through host memory for TCP transfers. - # Reuse connections because agentic KV traffic otherwise exhausts the - # node's ephemeral TCP ports during warmup. Use 4 MiB slices instead - # of the old 64 KiB default to reduce concurrent socket sessions. - export MC_TCP_ENABLE_CONNECTION_POOL=1 - export MC_TCP_SLICE_SIZE=4194304 + # B200 GPU memory registration works through DMA-BUF, but the compute + # nodes do not expose nvidia_peermem. Force Mooncake's DMA-BUF + # GPUDirect RDMA path instead of its legacy ibv_reg_mr path. 
+ export WITH_NVIDIA_PEERMEM=0 + export MC_SLICE_SIZE=1048576 + export MC_WORKERS_PER_CTX=4 # Each rank contributes a separate segment. Evict early enough to # avoid an imbalanced rank exhausting its segment. From acfeb45a06fa04309de8f7e4d8e893875be903ca Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 16:12:37 -0500 Subject: [PATCH 060/132] feat(agentic): default to 060826 weka corpus (DSv4 base, others 256k) Bump aiperf submodule to pick up the 060826 with-subagents loaders and switch resolve_trace_source() default from 060526 to 060826: DSv4 recipes use the full _060826 corpus, all others use _060826_256k. Older corpora remain pinnable via WEKA_LOADER_OVERRIDE. Co-Authored-By: Claude Opus 4.8 (1M context) --- benchmarks/benchmark_lib.sh | 14 ++++++++++---- utils/aiperf | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 627ed32ac..3fd56e7e4 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -928,16 +928,16 @@ resolve_trace_source() { # unfiltered corpus and switches to the 256k-capped variant), or # by recipes that want to pin an older corpus generation. # - # Default (no override): the 060526 v6 corpus, selected by model family. + # Default (no override): the 060826 v6 corpus, selected by model family. # DSv4 (full context) rides the unfiltered base corpus; every non-DSv4 # recipe defaults to the 256k-capped variant because those servers run at # max_model_len ~256k and would reject >256k requests. Any recipe can still # pin a specific corpus via WEKA_LOADER_OVERRIDE. 
local default_loader if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then - default_loader="semianalysis_cc_traces_weka_with_subagents_060526" + default_loader="semianalysis_cc_traces_weka_with_subagents_060826" else - default_loader="semianalysis_cc_traces_weka_with_subagents_060526_256k" + default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k" fi local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset @@ -960,8 +960,14 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_060526_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060826) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826" + ;; + semianalysis_cc_traces_weka_with_subagents_060826_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. 
Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2 exit 1 ;; esac diff --git a/utils/aiperf b/utils/aiperf index 44aa466d7..6da78261e 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 44aa466d7d681f5e2e4d7946174540e2a6521922 +Subproject commit 6da78261e5a6deac632231dbf96757049b5e5385 From 40df9158b10ab70e95c51e460f98a56bbe42104e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 19:15:04 -0500 Subject: [PATCH 061/132] chore(aiperf): suppress repeated histogram schema warnings --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index 6da78261e..8c4ba9def 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 6da78261e5a6deac632231dbf96757049b5e5385 +Subproject commit 8c4ba9deffea3ec98757545cedb7563c9d6a6a14 From 7acee3088957f47e158e629a8816e64703b7087e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 23:49:02 -0500 Subject: [PATCH 062/132] chore(aiperf): exclude warmup from realtime counters --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index 8c4ba9def..4c6525ab7 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 8c4ba9deffea3ec98757545cedb7563c9d6a6a14 +Subproject commit 4c6525ab71d4cd9fc01054410d5b88bfe4feff9e From d9e0089bfbc14ab9ff400cb841803634b3108ebf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 8 Jun 2026 23:59:48 -0500 Subject: [PATCH 063/132] chore(minimax agentic): vllm 0.22.1 + 060826-256k weka corpus Bump all 8 minimaxm2.5 agentic configs to vllm 0.22.1 
(vllm/vllm-openai and vllm/vllm-openai-rocm) and repoint their WEKA_LOADER_OVERRIDE from the undated _256k loader (052726-256k) to _060826_256k (Jun 8 256k corpus). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 6 +++--- .github/configs/nvidia-master.yaml | 10 +++++----- benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh | 2 +- .../single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 2 +- .../single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 2 +- .../single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 2 +- 10 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7f1c8192d..a50d37eab 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2519,7 +2519,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } minimaxm2.5-fp8-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -2537,7 +2537,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } minimaxm2.5-fp8-mi300x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -2555,7 +2555,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } minimaxm2.5-fp8-mi325x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x diff --git a/.github/configs/nvidia-master.yaml 
b/.github/configs/nvidia-master.yaml index 652c65afd..4d459d8ed 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9442,7 +9442,7 @@ gptoss-fp4-b200-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b200-dgxc @@ -9464,7 +9464,7 @@ minimaxm2.5-fp8-b200-vllm-agentic: # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -9485,7 +9485,7 @@ minimaxm2.5-fp8-b300-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:v0.22.1 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b200 @@ -9503,7 +9503,7 @@ minimaxm2.5-fp4-b200-vllm-agentic: # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -9521,7 +9521,7 @@ minimaxm2.5-fp8-h100-vllm-agentic: - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + image: vllm/vllm-openai:v0.22.1 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index b4a63eff3..f9b769636 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -36,7 +36,7 @@ nvidia-smi # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index 0724aba5b..d07c3af69 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -36,7 +36,7 @@ nvidia-smi # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index c291a2ceb..906ae7408 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -36,7 +36,7 @@ nvidia-smi # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index 516bc4696..c35afe33a 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -36,7 +36,7 @@ nvidia-smi # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index e6343b8ba..5b4782646 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -36,7 +36,7 @@ nvidia-smi # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 8988316d3..512eb0e6c 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -42,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index caa70de63..5e5a9f9a3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -42,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index cd114fe96..8e15e7850 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -42,7 +42,7 @@ amd-smi || true # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 # corpus has requests up to ~1M proxy tokens that would be rejected. # Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k resolve_trace_source install_agentic_deps From 09848efe257e54ee1152c2b36ecf1c06d1ec68c0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 00:33:15 -0500 Subject: [PATCH 064/132] feat(agentic): add DSv4 SGLang HiCache sweeps --- .github/configs/nvidia-master.yaml | 35 ++++ .../agentic/dsv4_fp4_b200_sglang.sh | 4 + .../agentic/dsv4_fp4_b300_sglang.sh | 4 + .../dsv4_fp4_blackwell_sglang_common.sh | 165 ++++++++++++++++++ 4 files changed, 208 insertions(+) create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh create mode 100755 benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4d459d8ed..3b3a4b533 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9275,6 +9275,23 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } +dsv4-fp4-b200-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dgxc + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40] } + - { tp: 8, offloading: hicache, conc-list: [40, 48, 52, 64, 72] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 8, ep: 8, dp-attn: true, offloading: hicache, conc-list: [64, 128, 196, 256, 512] } + qwen3.5-fp8-b200-sglang-agentic: image: 
lmsysorg/sglang:nightly-dev-20260422-de962f32 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9424,6 +9441,24 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [32, 48, 64, 96, 128, 192, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } +dsv4-fp4-b300-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh new file mode 100755 index 000000000..17cd10d1b --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +export DSV4_SGLANG_PLATFORM=B200 +source "$(dirname "$0")/dsv4_fp4_blackwell_sglang_common.sh" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh new file mode 100755 index 000000000..b51526feb --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +export DSV4_SGLANG_PLATFORM=B300 +source "$(dirname "$0")/dsv4_fp4_blackwell_sglang_common.sh" diff --git 
a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh new file mode 100755 index 000000000..0f3dd93b3 --- /dev/null +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -0,0 +1,165 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on Blackwell using +# SGLang. B200 and B300 use the same current upstream DSv4 recipes. +# +# OFFLOADING values: +# none - SGLang GPU KV cache with RadixAttention prefix caching. +# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. + +source "$(dirname "${BASH_SOURCE[0]}")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +nvidia-smi + +resolve_trace_source +install_agentic_deps + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + ;; + hicache) + # DeepSeek V4 HiCache currently rejects --hicache-size and supports + # capacity control only through a host/device token-capacity ratio. + # Ratio 4 matches SGLang's DSv4 HiCache correctness test and generally + # keeps host allocation within the 2-2.5 TB available on these nodes. 
+ HICACHE_RATIO="${HICACHE_RATIO:-4}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +PARALLEL_ARGS=(--tp "$TP") +MEM_FRACTION_STATIC=0.88 +CHUNKED_PREFILL_SIZE=8192 +if [ "$DP_ATTENTION" = "true" ]; then + PARALLEL_ARGS+=( + --dp "$TP" + --enable-dp-attention + --ep-size "$EP_SIZE" + --moe-a2a-backend deepep + --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.82 + CHUNKED_PREFILL_SIZE=16384 +else + PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + ) +fi + +if [ "$DP_ATTENTION" = "true" ]; then + PER_ENGINE_MAX_RUNNING=$(( (CONC + TP - 1) / TP )) +else + PER_ENGINE_MAX_RUNNING=$CONC +fi +[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export PYTHONNOUSERSITE=1 +export TORCH_CUDA_ARCH_LIST=10.0 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +if [ "$DP_ATTENTION" = "true" ]; then + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 +fi + +SGLANG_CMD=( 
+ python3 -m sglang.launch_server + --model-path "$MODEL_PATH" + --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + "${PARALLEL_ARGS[@]}" + --attention-backend compressed + --page-size 256 + --mem-fraction-static "$MEM_FRACTION_STATIC" + --swa-full-tokens-ratio 0.1 + --max-running-requests "$PER_ENGINE_MAX_RUNNING" + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --context-length "$MAX_MODEL_LEN" + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" + --disable-shared-experts-fusion + --tool-call-parser deepseekv4 + --reasoning-parser deepseek-v4 + --chat-template "$(dirname "${BASH_SOURCE[0]}")/../chat_templates/deepseek_v4_thinking.jinja" + --watchdog-timeout 1800 + --enable-metrics + "${CACHE_ARGS[@]}" +) + +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" + +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" + +echo "Starting SGLang server for ${DSV4_SGLANG_PLATFORM:-Blackwell}..." +"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +capture_cache_metrics() { + { + echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" + curl -fsS "http://localhost:$PORT/metrics" 2>/dev/null \ + | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ + || true + echo "============================================================" + } >> "$SERVER_LOG" +} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +capture_cache_metrics +trap capture_cache_metrics EXIT + +build_replay_cmd "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" From 864b8a9e8cf813525f25072c3899688306040828 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 09:03:14 -0500 Subject: [PATCH 065/132] fix(agentic): avoid SGLang DEP metrics port collision --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 0f3dd93b3..fa1bfbf69 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -67,6 +67,7 @@ case "$OFFLOADING" in esac PARALLEL_ARGS=(--tp "$TP") +METRICS_ARGS=(--enable-metrics) MEM_FRACTION_STATIC=0.88 CHUNKED_PREFILL_SIZE=8192 if [ "$DP_ATTENTION" = "true" ]; then @@ -78,6 +79,12 @@ if [ "$DP_ATTENTION" = "true" ]; then --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' --enable-prefill-delayer ) + # Current SGLang DP-attention startup binds the main metrics IPC port and + # then rejects that same port while constructing the DP controller's + # per-rank PortArgs. 
Until upstream fixes that self-collision, omit + # --enable-metrics for DEP; server.log still records HiCache allocation and + # transfer activity, while pure-TP runs retain the full Prometheus export. + METRICS_ARGS=() MEM_FRACTION_STATIC=0.82 CHUNKED_PREFILL_SIZE=16384 else @@ -129,7 +136,7 @@ SGLANG_CMD=( --reasoning-parser deepseek-v4 --chat-template "$(dirname "${BASH_SOURCE[0]}")/../chat_templates/deepseek_v4_thinking.jinja" --watchdog-timeout 1800 - --enable-metrics + "${METRICS_ARGS[@]}" "${CACHE_ARGS[@]}" ) @@ -158,8 +165,10 @@ capture_cache_metrics() { } wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -capture_cache_metrics -trap capture_cache_metrics EXIT +if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then + capture_cache_metrics + trap capture_cache_metrics EXIT +fi build_replay_cmd "$RESULT_DIR" run_agentic_replay_and_write_outputs "$RESULT_DIR" From 2e0f5fa2d9975dc8b96ccb30fce937a8b777fcf1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 09:18:48 -0500 Subject: [PATCH 066/132] fix(agentic): run DSv4 HiCache on pure TP --- .github/configs/nvidia-master.yaml | 9 ++-- .../dsv4_fp4_blackwell_sglang_common.sh | 41 +++++-------------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3b3a4b533..ba58c729e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9288,9 +9288,7 @@ dsv4-fp4-b200-sglang-agentic-hicache: - duration: 1800 search-space: - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40] } - - { tp: 8, offloading: hicache, conc-list: [40, 48, 52, 64, 72] } - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } - - { tp: 8, ep: 8, dp-attn: true, offloading: hicache, conc-list: [64, 128, 196, 256, 512] } + - { tp: 8, offloading: hicache, conc-list: [40, 48, 52, 64, 72, 84, 100, 128, 196, 256, 512] } 
qwen3.5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:nightly-dev-20260422-de962f32 @@ -9455,9 +9453,8 @@ dsv4-fp4-b300-sglang-agentic-hicache: search-space: - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } - - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index fa1bfbf69..301211b05 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -37,6 +37,11 @@ install_agentic_deps SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" +if [ "$DP_ATTENTION" = "true" ]; then + echo "Error: current SGLang nightly self-collides on internal IPC ports during single-node DP-attention startup; use pure TP until upstream fixes PortArgs initialization." 
>&2 + exit 1 +fi + CACHE_ARGS=() case "$OFFLOADING" in none) @@ -70,35 +75,12 @@ PARALLEL_ARGS=(--tp "$TP") METRICS_ARGS=(--enable-metrics) MEM_FRACTION_STATIC=0.88 CHUNKED_PREFILL_SIZE=8192 -if [ "$DP_ATTENTION" = "true" ]; then - PARALLEL_ARGS+=( - --dp "$TP" - --enable-dp-attention - --ep-size "$EP_SIZE" - --moe-a2a-backend deepep - --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - --enable-prefill-delayer - ) - # Current SGLang DP-attention startup binds the main metrics IPC port and - # then rejects that same port while constructing the DP controller's - # per-rank PortArgs. Until upstream fixes that self-collision, omit - # --enable-metrics for DEP; server.log still records HiCache allocation and - # transfer activity, while pure-TP runs retain the full Prometheus export. - METRICS_ARGS=() - MEM_FRACTION_STATIC=0.82 - CHUNKED_PREFILL_SIZE=16384 -else - PARALLEL_ARGS+=( - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - ) -fi +PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune +) -if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_MAX_RUNNING=$(( (CONC + TP - 1) / TP )) -else - PER_ENGINE_MAX_RUNNING=$CONC -fi +PER_ENGINE_MAX_RUNNING=$CONC [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING [ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 @@ -111,9 +93,6 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 -if [ "$DP_ATTENTION" = "true" ]; then - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 -fi SGLANG_CMD=( python3 -m sglang.launch_server From 422e080a4d710a525c4363223daca7bb5e00f87d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 09:33:25 -0500 Subject: [PATCH 067/132] fix(agentic): size DSv4 HiCache ratio by TP --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 14 
+++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 301211b05..19129e54d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -49,9 +49,17 @@ case "$OFFLOADING" in hicache) # DeepSeek V4 HiCache currently rejects --hicache-size and supports # capacity control only through a host/device token-capacity ratio. - # Ratio 4 matches SGLang's DSv4 HiCache correctness test and generally - # keeps host allocation within the 2-2.5 TB available on these nodes. - HICACHE_RATIO="${HICACHE_RATIO:-4}" + # DSv4 allocates several physical host sub-pools for each logical host + # token. At TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) while + # model loading/page cache is still resident and the OS kills a rank. + # Keep the proven ratio=4 TP4 path, but use ratio=2 at TP8 to leave + # enough transient host-memory headroom during initialization. 
+ if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=2 + else + DEFAULT_HICACHE_RATIO=4 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" From 18c46ca155825e1ca6c0a65ab9c654a26be7c996 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 09:59:08 -0500 Subject: [PATCH 068/132] perf(agentic): expand TP4 DSv4 HiCache tier --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 19129e54d..1aadc06fb 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -52,12 +52,14 @@ case "$OFFLOADING" in # DSv4 allocates several physical host sub-pools for each logical host # token. At TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) while # model loading/page cache is still resident and the OS kills a rank. - # Keep the proven ratio=4 TP4 path, but use ratio=2 at TP8 to leave - # enough transient host-memory headroom during initialization. + # TP4 ratio=4 works but fills its roughly 500 GB host tier during the + # C48/C64 focused tests and useful host hits collapse. Ratio=8 doubles + # that logical capacity while remaining below the node's host budget. + # Use ratio=2 at TP8 to leave enough transient headroom during startup. 
if [ "$TP" -ge 8 ]; then DEFAULT_HICACHE_RATIO=2 else - DEFAULT_HICACHE_RATIO=4 + DEFAULT_HICACHE_RATIO=8 fi HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" From b3d2068cdcc74c72ef3328b01460496b1dbfbfdc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 10:18:59 -0500 Subject: [PATCH 069/132] fix(agentic): tolerate DSv4 SGLang admission stalls --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 1aadc06fb..3f3ec0897 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -97,6 +97,13 @@ CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING export PYTHONNOUSERSITE=1 export TORCH_CUDA_ARCH_LIST=10.0 +# Agentic warmup dispatches hundreds of large prompts at once. SGLang's +# tokenizer process can leave request bytes unacknowledged for longer than +# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, +# causing Linux to abort otherwise-live localhost connections. Keep the +# six-hour request timeout unchanged, but allow up to 15 minutes for TCP +# progress before declaring the connection dead. 
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 export SGLANG_OPT_USE_JIT_NORM=1 From c14c9393fc6b1174e254cc778e23f948f55f0a75 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 10:31:17 -0500 Subject: [PATCH 070/132] perf(agentic): retain TP4 DSv4 HiCache working set --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 3f3ec0897..36171ff26 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -52,14 +52,15 @@ case "$OFFLOADING" in # DSv4 allocates several physical host sub-pools for each logical host # token. At TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) while # model loading/page cache is still resident and the OS kills a rank. - # TP4 ratio=4 works but fills its roughly 500 GB host tier during the - # C48/C64 focused tests and useful host hits collapse. Ratio=8 doubles - # that logical capacity while remaining below the node's host budget. + # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at + # C48/C64. Ratio=8 still cannot retain the C64 session working set long + # enough to produce host hits. Ratio=16 provides roughly 21M logical + # host tokens while remaining below the B300 node's host budget. # Use ratio=2 at TP8 to leave enough transient headroom during startup. 
if [ "$TP" -ge 8 ]; then DEFAULT_HICACHE_RATIO=2 else - DEFAULT_HICACHE_RATIO=8 + DEFAULT_HICACHE_RATIO=16 fi HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" From 746ba4caabdb0127ce3a940d7ffb36422ad622a5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 10:35:25 -0500 Subject: [PATCH 071/132] delete gptoss --- .../single_node/agentic/gptoss_fp4_b200.sh | 82 --------------- .../single_node/agentic/gptoss_fp4_h100.sh | 86 ---------------- .../single_node/agentic/gptoss_fp4_h200.sh | 86 ---------------- .../single_node/agentic/gptoss_fp4_mi300x.sh | 99 ------------------- .../single_node/agentic/gptoss_fp4_mi325x.sh | 98 ------------------ 5 files changed, 451 deletions(-) delete mode 100755 benchmarks/single_node/agentic/gptoss_fp4_b200.sh delete mode 100755 benchmarks/single_node/agentic/gptoss_fp4_h100.sh delete mode 100755 benchmarks/single_node/agentic/gptoss_fp4_h200.sh delete mode 100755 benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh delete mode 100755 benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh deleted file mode 100755 index 80d70e724..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on B200 using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. 
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}' -max-cudagraph-capture-size: 2048 -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="10.0" -export PYTHONNOUSERSITE=1 -export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 - -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh deleted file mode 100755 index 13e32d315..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H100 using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -async-scheduling: true -max-cudagraph-capture-size: 2048 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 - -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh deleted file mode 100755 index e0d967246..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H200 using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -nvidia-smi - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -cat > "$RESULT_DIR/config.yaml" << EOF -async-scheduling: true -max-cudagraph-capture-size: 2048 -max-model-len: $MAX_MODEL_LEN -EOF - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - export VLLM_USE_SIMPLE_KV_OFFLOAD=1 - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." -export TORCH_CUDA_ARCH_LIST="9.0" -export PYTHONNOUSERSITE=1 -export VLLM_MXFP4_USE_MARLIN=1 - -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---config "$RESULT_DIR/config.yaml" \ ---gpu-memory-utilization 0.9 \ ---tensor-parallel-size $TP \ ---max-num-seqs $CONC \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh deleted file mode 100755 index ff597c9a4..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI300X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi -amd-smi || true - -# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -export AMDGCN_USE_BUFFER_OPS=0 -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export PYTHONNOUSERSITE=1 - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." - -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---attention-backend ROCM_AITER_UNIFIED_ATTN \ --cc.pass_config.fuse_rope_kvcache=True \ --cc.use_inductor_graph_partition=True \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.85 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs $CONC \ ---block-size=64 \ ---kv-cache-dtype fp8 \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh deleted file mode 100755 index 1f8c29351..000000000 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI325X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, RESULT_DIR - -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION - -# Agentic matrix entries don't set max-model-len, so the workflow passes 0. -# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly. -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi - -# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. 
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -export AMDGCN_USE_BUFFER_OPS=0 -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 -export PYTHONNOUSERSITE=1 - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS="" -case "$OFFLOADING" in - none) - ;; - cpu) - OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2 - exit 1 - ;; -esac - -echo "Starting vllm server..." - -vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ ---host 0.0.0.0 \ ---port $PORT \ ---attention-backend ROCM_AITER_UNIFIED_ATTN \ --cc.pass_config.fuse_rope_kvcache=True \ --cc.use_inductor_graph_partition=True \ ---tensor-parallel-size=$TP \ ---gpu-memory-utilization 0.85 \ ---max-model-len $MAX_MODEL_LEN \ ---max-num-seqs $CONC \ ---block-size=64 \ ---kv-cache-dtype fp8 \ -$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" From e795090aa15776c5bd50aaabd38533cfadc0c9ad Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 10:56:18 -0500 Subject: [PATCH 072/132] perf(agentic): expand B200 DSv4 HiCache tier --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 36171ff26..152209bc7 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -50,15 +50,22 @@ case "$OFFLOADING" in # DeepSeek V4 HiCache currently rejects --hicache-size and supports # capacity control only through a host/device token-capacity ratio. # DSv4 allocates several physical host sub-pools for each logical host - # token. At TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) while - # model loading/page cache is still resident and the OS kills a rank. + # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) + # while model loading/page cache is still resident and the OS kills a + # rank, so leave transient startup headroom with ratio=2. B200 has a + # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a + # substantially larger useful CPU tier while staying within its node + # budget. # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at # C48/C64. Ratio=8 still cannot retain the C64 session working set long # enough to produce host hits. Ratio=16 provides roughly 21M logical # host tokens while remaining below the B300 node's host budget. 
- # Use ratio=2 at TP8 to leave enough transient headroom during startup. if [ "$TP" -ge 8 ]; then - DEFAULT_HICACHE_RATIO=2 + if [ "${DSV4_SGLANG_PLATFORM:-}" = "B200" ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=2 + fi else DEFAULT_HICACHE_RATIO=16 fi From 2bd002870df9db4da6505618a0c8031c9dd955b6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 10:56:49 -0500 Subject: [PATCH 073/132] chore(agentic): register DSv4 SGLang HiCache sweeps --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..4a2faf23f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3395,3 +3395,10 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627 + +- config-keys: + - dsv4-fp4-b200-sglang-agentic-hicache + - dsv4-fp4-b300-sglang-agentic-hicache + description: + - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1640 From 24adefb9689429adc44e72ba7c478f0d83221859 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 12:05:01 -0500 Subject: [PATCH 074/132] fix(agentic): use SGLang UnifiedTree deadlock fix --- .github/configs/nvidia-master.yaml | 4 ++-- perf-changelog.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ba58c729e..76f05ca1d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9276,7 +9276,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } dsv4-fp4-b200-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: 
lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -9440,7 +9440,7 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } dsv4-fp4-b300-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4a2faf23f..af0122f09 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3400,5 +3400,5 @@ - dsv4-fp4-b200-sglang-agentic-hicache - dsv4-fp4-b300-sglang-agentic-hicache description: - - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading" + - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading on the June 9 nightly, including the UnifiedRadixCache TP deadlock fix" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1640 From 3e0ea9c44c5fc74ef353f213406d1d8f926e7365 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 12:43:38 -0500 Subject: [PATCH 075/132] fix(agentic): use DSv4 Blackwell image on B200 --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 76f05ca1d..294685eda 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9276,7 +9276,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } dsv4-fp4-b200-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 + image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b model: 
deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc diff --git a/perf-changelog.yaml b/perf-changelog.yaml index af0122f09..c4111ef0d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3400,5 +3400,5 @@ - dsv4-fp4-b200-sglang-agentic-hicache - dsv4-fp4-b300-sglang-agentic-hicache description: - - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading on the June 9 nightly, including the UnifiedRadixCache TP deadlock fix" + - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading; use the B200-specific DeepSeek-V4 Blackwell image and the June 9 nightly on B300" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1640 From 21d1981f01ec9aafd8a52877e76fa0b59bab6fd5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 13:09:03 -0500 Subject: [PATCH 076/132] fix(agentic): preserve B200 specialized image workspace --- .../dsv4_fp4_blackwell_sglang_common.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 152209bc7..202d0a25e 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -9,7 +9,22 @@ set -x # none - SGLang GPU KV cache with RadixAttention prefix caching. # hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. -source "$(dirname "${BASH_SOURCE[0]}")/../../benchmark_lib.sh" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + +# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under +# /workspace, so its launcher mounts InferenceX at /ix instead. 
Resolve the +# agentic tooling and results against the actual repository mount so the image +# can keep its /workspace install and GitHub Actions can collect the outputs. +if [[ ! -d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then + export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" +fi +if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" +fi + +source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION @@ -138,7 +153,7 @@ SGLANG_CMD=( --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 - --chat-template "$(dirname "${BASH_SOURCE[0]}")/../chat_templates/deepseek_v4_thinking.jinja" + --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" --watchdog-timeout 1800 "${METRICS_ARGS[@]}" "${CACHE_ARGS[@]}" From 6f39022850e03663f385e686a566a06612fd23e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 13:20:53 -0500 Subject: [PATCH 077/132] fix(agentic): isolate AIPerf from B200 SGLang --- .../agentic/dsv4_fp4_blackwell_sglang_common.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 202d0a25e..6bb60da03 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -47,6 +47,17 @@ fi nvidia-smi resolve_trace_source + +# Keep AIPerf's Transformers-main dependency from replacing the older +# Transformers build pinned by the B200-specialized SGLang image. The server +# always launches with the image's original interpreter; AIPerf and result +# processing use the isolated environment when InferenceX is mounted at /ix. 
+SGLANG_PYTHON="$(command -v python3)" +if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" + "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" + export PATH="$AGENTIC_VENV/bin:$PATH" +fi install_agentic_deps SERVER_LOG="$RESULT_DIR/server.log" @@ -135,7 +146,7 @@ export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 SGLANG_CMD=( - python3 -m sglang.launch_server + "$SGLANG_PYTHON" -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 From 51334c07e163a949626695489e85bff2f6bd6f23 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 13:39:39 -0500 Subject: [PATCH 078/132] fix(agentic): align B200 DSv4 SGLang runtime flags --- .../dsv4_fp4_blackwell_sglang_common.sh | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh index 6bb60da03..391d9df49 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh @@ -117,13 +117,27 @@ esac PARALLEL_ARGS=(--tp "$TP") METRICS_ARGS=(--enable-metrics) -MEM_FRACTION_STATIC=0.88 CHUNKED_PREFILL_SIZE=8192 PARALLEL_ARGS+=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune ) +MODEL_ARGS=() +if [ "${DSV4_SGLANG_PLATFORM:-}" = "B200" ]; then + # Match the established B200 DSv4 recipe. The B200-specialized image + # deadlocks immediately after weight loading when forced through the + # B300-oriented compressed-attention/page-size overrides. 
+ MEM_FRACTION_STATIC=0.90 +else + MEM_FRACTION_STATIC=0.88 + MODEL_ARGS+=( + --attention-backend compressed + --page-size 256 + --disable-shared-experts-fusion + ) +fi + PER_ENGINE_MAX_RUNNING=$CONC [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING @@ -153,19 +167,17 @@ SGLANG_CMD=( --port "$PORT" --trust-remote-code "${PARALLEL_ARGS[@]}" - --attention-backend compressed - --page-size 256 --mem-fraction-static "$MEM_FRACTION_STATIC" --swa-full-tokens-ratio 0.1 --max-running-requests "$PER_ENGINE_MAX_RUNNING" --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" --context-length "$MAX_MODEL_LEN" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" - --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" --watchdog-timeout 1800 + "${MODEL_ARGS[@]}" "${METRICS_ARGS[@]}" "${CACHE_ARGS[@]}" ) From 06b9b108881f28712510365e9f7e953f1aed32a4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 14:01:17 -0500 Subject: [PATCH 079/132] fix(agentic): use stable SGLang DSv4 image on B200 --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 294685eda..8bb6217f0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9276,7 +9276,7 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 196, 256, 512] } dsv4-fp4-b200-sglang-agentic-hicache: - image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b + image: lmsysorg/sglang:v0.5.12.post1-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc From abfe21722102dfc7f5bcf66dbb6ba2b351db62db Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 14:23:13 -0500 Subject: [PATCH 
080/132] perf(agentic): load B200 DSv4 from node-local RAID --- runners/launch_b200-dgxc.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index bb3bf9ed1..6556639e1 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -367,6 +367,24 @@ else salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + # DSv4 is also staged on the compute nodes' local RAID. Loading the 806 GB + # checkpoint independently from Lustre on every TP rank leaves the loader + # threads blocked in Lustre I/O for hours. Select the local copy only after + # Slurm assigns a node, and retain the shared-Lustre path as a fallback for + # nodes whose local staging is incomplete. + if [[ "$MODEL_PREFIX" == "dsv4" && "$PRECISION" == "fp4" && "$FRAMEWORK" == "sglang" ]]; then + LOCAL_MODEL_PATH=/raid/models/DeepSeek-V4-Pro-NVFP4 + if srun --jobid="$JOB_ID" bash -c \ + 'test -f "$1/config.json" && test -f "$1/model.safetensors.index.json" && test "$(find "$1" -maxdepth 1 -name "model-*.safetensors" | wc -l)" -eq 64' \ + _ "$LOCAL_MODEL_PATH"; then + export MODEL_PATH="$LOCAL_MODEL_PATH" + export MODEL="$MODEL_PATH" + echo "Using node-local DSv4 checkpoint: $MODEL_PATH" + else + echo "Node-local DSv4 checkpoint unavailable; using shared checkpoint: $MODEL_PATH" + fi + fi + # Use flock to serialize concurrent imports to the same squash file # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes srun --jobid=$JOB_ID bash -c " From 8e08957c3ff41cd36c57468e0d4c1553824382ef Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 22:54:09 -0500 Subject: [PATCH 081/132] fix(agentic): enable selective Mooncake caching for DSv4 --- .github/configs/nvidia-master.yaml | 11 ++++++----- 
benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 7 +++---- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 8 +++----- runners/launch_b200-dgxc.sh | 9 ++++++++- runners/launch_b300-nv.sh | 8 +++++++- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8bb6217f0..00686c2de 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9259,7 +9259,9 @@ glm5-fp8-gb300-dynamo-sglang: dp-attn: false dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:nightly-d0975a4b50140a9d953f00955a1cbb2a4945edef + # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention + # prefix-cache retention when deciding which hybrid-KV blocks to store. + image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -9417,10 +9419,9 @@ dsv4-fp8-h200-vllm-agentic: # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp4-b300-vllm-agentic: - # image: vllm/vllm-openai:v0.22.0 - # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045 - # image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045' - image: vllm/vllm-openai:nightly-d0975a4b50140a9d953f00955a1cbb2a4945edef + # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention + # prefix-cache retention when deciding which hybrid-KV blocks to store. 
+ image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index a2b2db625..89779e824 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -73,10 +73,9 @@ fi # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 -# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient -# sliding-window allocations don't evict useful prefix entries. 32k matches -# the trace-replay tuning the PR author validated (0% -> 74% hit rate). -# Requires the custom cquil image configured for this recipe. +# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while +# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's +# store mask. 32k matches the trace-replay tuning validated for this workload. export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 # ---- Server config ---------------------------------------------------------- diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index c73aa5c6a..60aa019b1 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -75,11 +75,9 @@ fi # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 -# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient -# sliding-window allocations don't evict useful prefix entries. 32k matches -# the trace-replay tuning the PR author validated (0% -> 74% hit rate). -# Requires the custom image (cquil/vllm-openai:*-7ead0a0f...) that carries -# the patch; on stock images the env var is ignored. 
+# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while +# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's +# store mask. 32k matches the trace-replay tuning validated for this workload. export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 # ---- Server config ---------------------------------------------------------- diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 6556639e1..205cdf4d6 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -364,7 +364,14 @@ else # and gpu-15 names no longer exist. gpu-2 currently has 10 fully-idle GPU # nodes (all of gpu-2-[0-9]); gpu-1 has 2 drained (gpu-1-4, gpu-1-8). We # land on gpu-2 to avoid drained nodes and skip the per-node excludes. - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + SALLOC_MEMORY_ARGS=() + if [[ "$MODEL_PREFIX" == "dsv4" && "$FRAMEWORK" == "vllm" && "${OFFLOADING:-none}" == "cpu" ]]; then + # The embedded Mooncake segments total 2.5 TB. Without an explicit + # request, Slurm caps this exclusive job at 2 TB and OOM-kills it even + # though the B200 node has about 4 TB of physical RAM. + SALLOC_MEMORY_ARGS=(--mem=0) + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) # DSv4 is also staged on the compute nodes' local RAID. 
Loading the 806 GB diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index cb4a634c3..e644ca5f1 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -379,7 +379,13 @@ else fi ) - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" + SALLOC_MEMORY_ARGS=() + if [[ "$MODEL_PREFIX" == "dsv4" && "$FRAMEWORK" == "vllm" && "${OFFLOADING:-none}" == "cpu" ]]; then + # Give the 2.5 TB embedded Mooncake store the full memory allocation of + # the exclusive node instead of relying on the partition's default. + SALLOC_MEMORY_ARGS=(--mem=0) + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) srun --jobid=$JOB_ID \ From 3da2d69f58fda5d2edaae08d38762d01dfcec78b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 9 Jun 2026 23:12:39 -0500 Subject: [PATCH 082/132] fix(agentic): align B300 CUTLASS DSL bindings --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 60aa019b1..3de9c8bfb 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -55,6 +55,12 @@ nvidia-smi resolve_trace_source install_agentic_deps +# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings, +# which fails DSV4 indexer compilation with mlir_global_dtors(..., data). +# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved. 
+agentic_pip_install --quiet --force-reinstall --no-deps \ + 'nvidia-cutlass-dsl-libs-cu13==4.5.2' + # vllm-project/router expands the one HTTP backend into one logical worker per # DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's # X-Correlation-ID is stable for every turn of a conversation; alias it to the From 01fb21fe387ada1b0cc1392e593ff5ffaca23ff6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 10 Jun 2026 10:42:31 -0500 Subject: [PATCH 083/132] fix(agentic): align B200 CUTLASS DSL bindings --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 89779e824..514c6df8c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -54,6 +54,12 @@ nvidia-smi resolve_trace_source install_agentic_deps +# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings, +# which fails DSV4 indexer compilation with mlir_global_dtors(..., data). +# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved. +agentic_pip_install --quiet --force-reinstall --no-deps \ + 'nvidia-cutlass-dsl-libs-cu13==4.5.2' + # vllm-project/router expands the one HTTP backend into one logical worker per # DP rank and sends X-data-parallel-rank on forwarded requests. 
aiperf's # X-Correlation-ID is stable for every turn of a conversation; alias it to the From 7b2c50fac08b2966bd055311af17c1cbcc512fe4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 10 Jun 2026 10:57:54 -0500 Subject: [PATCH 084/132] perf(agentic): enable Mooncake RDMA device affinity --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 3 ++- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 514c6df8c..bfd37705c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -118,11 +118,12 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "mlx5_0", + "device_name": "", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH + export MC_ENABLE_DEST_DEVICE_AFFINITY=1 # Identical prefixes must hash to identical store keys across DP ranks. export PYTHONHASHSEED=0 # B200 GPU memory registration works through DMA-BUF, but the compute diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 3de9c8bfb..7fc30b60b 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -120,11 +120,12 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "mlx5_0", + "device_name": "", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH + export MC_ENABLE_DEST_DEVICE_AFFINITY=1 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 # Large agentic KV writes can exceed Mooncake Store's fixed 60-second From 3925ad78c9283ee0c202409b30809c3b08e8be98 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 10 Jun 2026 11:06:13 -0500 Subject: [PATCH 085/132] fix(agentic): grant full node memory to all host KV offload jobs B200 HiCache TP8 ratio=8 needs a ~3.4 TB pinned host pool, but the b200-dgxc salloc only added --mem=0 for the vLLM Mooncake cpu-offload case, so SGLang HiCache jobs ran under Slurm's implicit 2 TB cap and the rank-0 scheduler was OOM-killed during host pool allocation (job 21153 step .2: MaxRSS 1.9 TiB at the 2 TiB ReqMem ceiling). Extend --mem=0 to every OFFLOADING != none job on both Blackwell launchers; the nodes are exclusively allocated either way. Co-Authored-By: Claude Fable 5 --- runners/launch_b200-dgxc.sh | 9 +++++---- runners/launch_b300-nv.sh | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 205cdf4d6..2187617ae 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -365,10 +365,11 @@ else # nodes (all of gpu-2-[0-9]); gpu-1 has 2 drained (gpu-1-4, gpu-1-8). We # land on gpu-2 to avoid drained nodes and skip the per-node excludes. SALLOC_MEMORY_ARGS=() - if [[ "$MODEL_PREFIX" == "dsv4" && "$FRAMEWORK" == "vllm" && "${OFFLOADING:-none}" == "cpu" ]]; then - # The embedded Mooncake segments total 2.5 TB. Without an explicit - # request, Slurm caps this exclusive job at 2 TB and OOM-kills it even - # though the B200 node has about 4 TB of physical RAM. + if [[ "${OFFLOADING:-none}" != "none" ]]; then + # Host KV tiers (vLLM Mooncake cpu offload, SGLang HiCache) allocate + # multi-TB pinned host pools. Without an explicit request, Slurm caps + # this exclusive job at 2 TB and OOM-kills it even though the B200 + # node has about 4 TB of physical RAM. 
SALLOC_MEMORY_ARGS=(--mem=0) fi salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index e644ca5f1..1616ed490 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -380,9 +380,10 @@ else ) SALLOC_MEMORY_ARGS=() - if [[ "$MODEL_PREFIX" == "dsv4" && "$FRAMEWORK" == "vllm" && "${OFFLOADING:-none}" == "cpu" ]]; then - # Give the 2.5 TB embedded Mooncake store the full memory allocation of - # the exclusive node instead of relying on the partition's default. + if [[ "${OFFLOADING:-none}" != "none" ]]; then + # Host KV tiers (vLLM Mooncake cpu offload, SGLang HiCache) allocate + # multi-TB pinned host pools. Give them the full memory allocation of + # the exclusive node instead of Slurm's implicit 2 TB default. SALLOC_MEMORY_ARGS=(--mem=0) fi salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME" From 0b02f0b9c0833c90224fd3c0099289af8dfe7e8d Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 11:38:33 -0500 Subject: [PATCH 086/132] fix(agentic): re-pin GB300 agentic image to R30-validated v0.21.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The repo-wide vllm image bump to v0.22.0 (76aedd65) silently broke the GB300 agentic configs: srtctl resolves the recipe's model.container via the launcher's containers map keyed on $IMAGE, so the mismatch made pyxis pull v0.21.0 raw from Docker Hub per node instead of using the imported squash — and v0.22.0 + ai-dynamo 1.2.0.dev20260426 was never validated on GB300 disagg anyway. 
Co-Authored-By: Claude Opus 4.8 --- .github/configs/nvidia-master.yaml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 00686c2de..47c1de73d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9627,7 +9627,17 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. dsv4-fp4-gb300-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + # Pinned to the R30-validated stack (vllm v0.21.0-ubuntu2404 + ai-dynamo + # wheel 1.2.0.dev20260426). The repo-wide bump to v0.22.0 (76aedd65) broke + # this config silently: the agentic recipes' `model.container` field must + # match this image string for srtctl's containers-map lookup to resolve to + # the squash file the launcher imports — on mismatch srtctl passes the + # recipe string verbatim to pyxis, which re-pulls from Docker Hub on every + # node and ignores the imported squash. Bump this together with + # `model.container` in benchmarks/multi_node/srt-slurm-recipes/vllm/ + # deepseek-v4/agentic/*.yaml once v0.22.x + the dynamo wheel is validated + # on GB300 disagg. + image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 # gb300-nv (not generic gb300) — the generic label is shared by both NV @@ -9716,7 +9726,9 @@ dsv4-fp4-gb300-dynamo-vllm-agentic: # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe # applies to both clusters with no duplication. dsv4-fp4-gb300-cw-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + # Image pinned to match the agentic recipes' `model.container` — see the + # comment on dsv4-fp4-gb300-dynamo-vllm-agentic. 
+ image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw From 633e2631bf46a8492bb8f5d6e8ebed9b640f724c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 10 Jun 2026 11:55:41 -0500 Subject: [PATCH 087/132] fix(agentic): retain stable B200 Mooncake NIC pin --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index bfd37705c..514c6df8c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -118,12 +118,11 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "", + "device_name": "mlx5_0", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH - export MC_ENABLE_DEST_DEVICE_AFFINITY=1 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 # B200 GPU memory registration works through DMA-BUF, but the compute From db9603b6ffa89a6625b06f0262f558388277d7ba Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 10 Jun 2026 13:23:45 -0500 Subject: [PATCH 088/132] perf(agentic): test Mooncake RDMA affinity on B200 sweep --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 514c6df8c..bfd37705c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -118,11 +118,12 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "mlx5_0", + "device_name": "", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH + export MC_ENABLE_DEST_DEVICE_AFFINITY=1 # Identical prefixes must hash to identical store keys across DP ranks. export PYTHONHASHSEED=0 # B200 GPU memory registration works through DMA-BUF, but the compute From 3889ed33fc1263059e0edcd6592429336c7dd66b Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 13:30:54 -0500 Subject: [PATCH 089/132] feat(agentic): add GB200 DSv4 dynamo-vllm disagg agentic config Mirrors the gb300 agentic setup on the validated gb200 fixed-seq-len stack (vllm v0.20.0-ubuntu2404 + ai-dynamo 1.2.0.dev20260426): - recipes: agentic variants of mid-curve-megamoe (1p1d DEP8/DEP8, conc 32/192 tiers) and high-tpt-megamoe (2p1d, conc 4096 tier) with the standard agentic deltas (benchmark.type custom -> agentic_srt.sh, max-model-len dropped, prefix caching ON, NATS max_payload 32 MiB, container-remap-root). 
- launch_gb200-nv.sh: IS_AGENTIC branch now pins the same cquil11/srt-slurm-nv@6e34b8b as gb300, overlays the agentic recipes, passes --no-preflight (model on compute-local /mnt/numa1), and mounts persistent aiperf-mmap + HF hub caches on Lustre. - nvidia-master.yaml: dsv4-fp4-gb200-dynamo-vllm-agentic (runner gb200). - gb300: wire the same persistent HF_HUB_CACHE mount on nv + cw launchers and recipes (the corpus re-downloaded every run before). Co-Authored-By: Claude Opus 4.8 --- .github/configs/nvidia-master.yaml | 66 +++++++ .../disagg-gb200-1p1d-dep8-dep8-agentic.yaml | 168 ++++++++++++++++++ .../disagg-gb200-2p1d-dep8-dep8-agentic.yaml | 154 ++++++++++++++++ .../disagg-gb300-1p6d-dep4-tp4-agentic.yaml | 4 + ...gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml | 4 + runners/launch_gb200-nv.sh | 46 ++++- runners/launch_gb300-cw.sh | 7 + runners/launch_gb300-nv.sh | 9 + 8 files changed, 455 insertions(+), 3 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 47c1de73d..d8c35affa 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9786,6 +9786,72 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: ep: 8 dp-attn: true +# GB200 sibling of the gb300 agentic configs. Topologies come from the +# validated fixed-seq-len gb200 dynamo-vllm family (mid-curve-megamoe and +# high-tpt-megamoe) rather than the gb300 shapes — see the agentic recipe +# files for the agentic-specific deltas. Image matches the recipes' +# `model.container` (v0.20.0-ubuntu2404, the validated gb200 stack); the +# two must stay in lockstep — see dsv4-fp4-gb300-dynamo-vllm-agentic. 
+dsv4-fp4-gb200-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.20.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # Low-latency: 1p1d (DEP=8 / DEP=8) at conc 32. 5 nodes incl. infra. + - spec-decoding: none + conc-list: [32] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Mid: same 1p1d shape at conc 192. + - spec-decoding: none + conc-list: [192] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # High-throughput: 2p1d (2x DEP=8 prefill + DEP=8 decode) at conc 4096. + # 7 nodes incl. infra. + - spec-decoding: none + conc-list: [4096] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + qwen3.5-fp8-h100-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml new file mode 100644 index 000000000..e16b22bfc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml @@ -0,0 +1,168 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8-agentic" + +# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe.yaml. 
+# Topology is identical (1 prefill DEP=8 + 1 decode DEP=8, 16 GPUs across 4 +# GB200 nodes + 1 dedicated NATS/etcd infra node) so we can compare against +# the fixed-seq-len mid-curve baseline. Serves the low-latency (conc 32) and +# mid (conc 192) agentic tiers. +# +# Divergence vs the 8k1k sibling (same deltas as the gb300 agentic recipes): +# - benchmark.type: sa-bench -> custom (hands off to agentic_srt.sh) +# - max-model-len: removed (let vLLM derive from model config; agentic +# trajectories blow past any small explicit cap) +# - no-enable-prefix-caching: dropped (prefix caching MUST be on for +# trajectory reuse — entire point of agentic) +# - infra.nats_max_payload_mb: 32 (agentic prompts at 50k-200k DSv4 tokens +# serialize to 1-3 MB JSON; NATS' 1 MiB default +# rejects them — see gb300 1p6d recipe comment) +# - srun_options.container-remap-root: pyxis may map the calling user to a +# non-root uid in the container; agentic_srt.sh +# needs root for `apt-get install git`. No-op +# when the container user is already root. +# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser +# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't +# accept them (different arg parser than `vllm serve`). In disagg, chat +# parsing happens at the dynamo frontend, not at the worker. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default + # rejects long agentic prompts; 32 MiB gives ~10x headroom over the + # largest observed payload. + nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + 
moe-backend: deep_gemm_mega_moe + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +srun_options: + # See gb300 agentic recipes: pyxis may map the calling user to a non-root + # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get + # install git works. No-op when the container user is already root. + container-remap-root: "" + +benchmark: + type: custom + command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh + env: + INFMAX_CONTAINER_WORKSPACE: /infmax-workspace + RESULT_DIR: /logs/agentic + PORT: "8000" + IS_MULTINODE: "true" + # Container-side path of the aiperf mmap dataset cache; the host-side + # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts. + # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files + # per dataset on every run. 
+ AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml new file mode 100644 index 000000000..4ac4b36a8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml @@ -0,0 +1,154 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-agentic" + +# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe.yaml. +# Topology is identical (2 prefill DEP=8 each + 1 decode DEP=8, 24 GPUs across +# 6 GB200 nodes + 1 dedicated NATS/etcd infra node). Serves the +# high-throughput (conc 4096) agentic tier. +# +# Divergence vs the 8k1k sibling — same agentic deltas as the 1p1d recipe; +# see disagg-gb200-1p1d-dep8-dep8-agentic.yaml for the full rationale: +# - benchmark.type custom, max-model-len removed, prefix caching ON, +# nats_max_payload_mb 32, container-remap-root. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default + # rejects long agentic prompts; 32 MiB gives ~10x headroom over the + # largest observed payload. + nats_max_payload_mb: 32 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + 
moe-backend: deep_gemm_mega_moe + enforce-eager: true + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.95 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +srun_options: + # See gb300 agentic recipes: pyxis may map the calling user to a non-root + # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get + # install git works. No-op when the container user is already root. + container-remap-root: "" + +benchmark: + type: custom + command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh + env: + INFMAX_CONTAINER_WORKSPACE: /infmax-workspace + RESULT_DIR: /logs/agentic + PORT: "8000" + IS_MULTINODE: "true" + # Container-side path of the aiperf mmap dataset cache; the host-side + # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts. + # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files + # per dataset on every run. 
+ AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml index fb7b9fd97..2caf202a6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml @@ -175,3 +175,7 @@ benchmark: # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files # per dataset on every run. AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml index bb8fc6df8..98e25c450 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml @@ -174,3 +174,7 @@ benchmark: # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files # per dataset on every run. 
AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" + # Persistent HF hub cache (also wired via default_mounts) so the trace + # dataset isn't re-downloaded on every run. Overrides the workflow-level + # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. + HF_HUB_CACHE: "/hf_hub_cache" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dada98bd6..823e9cb71 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -148,8 +148,21 @@ fi # TODO(CJQ): make first class upon srt-slurm upstream refactor if [[ "$IS_AGENTIC" == "1" ]]; then - git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" + # Agentic multi-node uses the same pinned cquil11/srt-slurm-nv commit as + # launch_gb300-nv.sh — everything the agentic recipes need is there: + # - BenchmarkType.CUSTOM + benchmark.command + benchmark.env + # (the hook that hands off to benchmarks/multi_node/agentic_srt.sh) + # - DynamoConfig.wheel (recipes pin the ai-dynamo wheel) + # - srtctl apply --no-preflight (model path /mnt/numa1 is compute-node + # local NVMe, invisible to the login-node runner) + # - benchmark_stage srun_options propagation (container-remap-root + # must reach the agentic_srt.sh srun) + git clone https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" + git checkout 6e34b8b83229634d732e41a4e2d6595f46ef60b5 + mkdir -p recipes/vllm/deepseek-v4/agentic + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" \ + recipes/vllm/deepseek-v4/agentic elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -200,6 +213,24 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" + +# Agentic runs bind-mount two persistent caches into 
every worker container +# (Lustre, shared across nodes): aiperf's content-addressed dataset mmap +# cache (~65 GB per corpus, re-tokenized from scratch without it) and the +# HF hub cache holding the trace dataset download. The container-side paths +# are referenced by the agentic recipes' benchmark.env +# (AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache, HF_HUB_CACHE=/hf_hub_cache). +DEFAULT_MOUNTS_BLOCK="" +if [[ "$IS_AGENTIC" == "1" ]]; then + AIPERF_MMAP_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/ai-perf-cache" + HF_HUB_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/hf-hub-cache" + mkdir -p "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH" + chmod 777 "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH" 2>/dev/null || true + DEFAULT_MOUNTS_BLOCK="default_mounts: + ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache + ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache" +fi + echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml <&1) + SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) fi echo "$SRTCTL_OUTPUT" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 6a5c50e38..7a7a66afa 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -88,6 +88,12 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility # write to it. export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/ai-perf-cache" +# Persistent HF hub cache for the agentic trace datasets — see the +# launch_gb300-nv.sh comment. Mounted at /hf_hub_cache; agentic recipes set +# HF_HUB_CACHE=/hf_hub_cache in benchmark.env. 
+export HF_HUB_CACHE_HOST_PATH="/mnt/vast/hf-hub-cache" +mkdir -p "$HF_HUB_CACHE_HOST_PATH" + NGINX_IMAGE="nginx:1.27.4" # Squash files live alongside models on /mnt/vast (shared across nodes). @@ -221,6 +227,7 @@ srtctl_root: "${SRTCTL_ROOT}" default_mounts: ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache + ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache model_paths: dspro: "${MODEL_PATH}" diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index b47e103fd..e4597302f 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -17,6 +17,14 @@ export ENROOT_ROOTFS_WRITABLE=1 # write to it. export AIPERF_MMAP_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/ai-perf-cache" +# Persistent HF hub cache for the agentic trace datasets — mounted into +# worker containers at /hf_hub_cache; the agentic recipes set +# HF_HUB_CACHE=/hf_hub_cache in benchmark.env. Without it the workflow-level +# HF_HUB_CACHE (/mnt/hf_hub_cache) doesn't exist on these nodes and every +# run re-downloads the corpus into the ephemeral container overlay. +export HF_HUB_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/hf-hub-cache" +mkdir -p "$HF_HUB_CACHE_HOST_PATH" + export MODEL_PATH=$MODEL if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then @@ -189,6 +197,7 @@ srtctl_root: "${SRTCTL_ROOT}" # re-tokenized + re-written every job. default_mounts: "${AIPERF_MMAP_CACHE_HOST_PATH}": "/aiperf_mmap_cache" + "${HF_HUB_CACHE_HOST_PATH}": "/hf_hub_cache" # Model path aliases model_paths: From 0dffde77cbdd5e1a98a9b95a2a87a4d8a2a979d0 Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 13:54:36 -0500 Subject: [PATCH 090/132] fix(agentic): disable sbatch segment directive on gb200 srtctl defaults use_segment_sbatch_directive=true, which renders #SBATCH --segment=<segment_size>. 
Under partition fragmentation Slurm backfills a non-contiguous node set, segment placement fails at start, and the job dies with CANCELLED/Resources at RunTime=0 (gb200 agentic R1, job 18582, SchedNodeList=blue-cn[03-06,08-09,11]). The whole batch partition is one NVL72 rack, so the directive adds no MNNVL value here. Mirrors launch_gb300-nv.sh. Co-Authored-By: Claude Opus 4.8 --- runners/launch_gb200-nv.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 823e9cb71..a1db9ea66 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -255,6 +255,14 @@ containers: dynamo-sglang: ${SQUASH_FILE} "${IMAGE}": ${SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} +# srtctl defaults this to true, which adds #SBATCH --segment=<segment_size>. +# On watchtower the whole batch partition (blue-cn01-18) is a single NVL72 +# rack, so segment contiguity buys nothing for MNNVL — but it DOES make +# jobs unschedulable when the partition is fragmented: Slurm backfills a +# non-contiguous node set, fails segment placement at start, and the job +# dies with "CANCELLED Reason=Resources" at RunTime=0 (hit by the first +# gb200 agentic run, job 18582). Mirror launch_gb300-nv.sh and disable. +use_segment_sbatch_directive: false ${DEFAULT_MOUNTS_BLOCK} EOF From c2b341f41fe1e2d04b14fe08dd4d916fb321b4bc Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 14:23:16 -0500 Subject: [PATCH 091/132] fix(agentic): don't leak login-node VIRTUAL_ENV into gb200 orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sbatch --export=ALL propagates VIRTUAL_ENV from the launcher's `source .venv/bin/activate` into the compute-node job script, whose `uv run` prefetch step inspects the active venv and dies with "Broken symlink at .venv/bin/python3" — the login-node interpreter path doesn't exist on compute nodes (gb200 agentic R2, job 18587). 
Unset it before srtctl apply; srtctl still resolves via PATH. Co-Authored-By: Claude Opus 4.8 --- runners/launch_gb200-nv.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index a1db9ea66..eb8fd0681 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -280,6 +280,15 @@ echo "Submitting job with srtctl..." # Override the job name in the config file with the runner name sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" +# Don't leak the login-node venv to the compute-node orchestrator. sbatch's +# default --export=ALL propagates VIRTUAL_ENV (set by `source +# .venv/bin/activate` above) into job_script_minimal.j2, whose +# `uv run` step then tries to inspect the *active* venv — and dies with +# "Broken symlink at .venv/bin/python3" because the login-node interpreter +# path doesn't exist on compute nodes (gb200 agentic R2, job 18587). +# srtctl itself still resolves through PATH (.venv/bin is on it). +unset VIRTUAL_ENV + # --no-preflight is only safe on the agentic path, where the recipe resolves # model.path to /mnt/numa1 (compute-node-only NVMe) that the login-node # runner can't see. Fixed-seq-len recipes keep enforcement on. From 66a71daf862d79b43919943e11c246e1ea7edbc1 Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 16:30:37 -0500 Subject: [PATCH 092/132] fix(agentic): gb200 256k context cap + collision-proof slurm job names Two independent R3 failures: 1. KV OOM at engine init (job 18592): GB200's 186 GB HBM can't fit the 1M-context KV requirement vLLM derives when max-model-len is omitted (24.06 GiB/rank needed, ~7.9 GiB free after FP4 weights + MegaMOE buffers). Pin max-model-len 262144 on both workers and switch the replay to the 060826 256k-capped corpus via WEKA_LOADER_OVERRIDE + MAX_MODEL_LEN, mirroring the minimaxm2.5 agentic configs. 2. 
Foreign scancel (job 18593, CANCELLED by uid 1010): another runner fleet on watchtower uses the same gb200-nv_N job names and scancels by name across users from its pre-job hook. Prefix our job names with ifx- in launch_gb200-nv.sh; the workflow cleanup now scancels both names and filters its squeue wait by user so it can't hang on foreign jobs. Co-Authored-By: Claude Opus 4.8 --- .../workflows/benchmark-multinode-tmpl.yml | 13 +++++++++--- .../disagg-gb200-1p1d-dep8-dep8-agentic.yaml | 21 +++++++++++++++++-- .../disagg-gb200-2p1d-dep8-dep8-agentic.yaml | 12 +++++++++-- runners/launch_gb200-nv.sh | 10 +++++++-- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 81727ef39..c2a6d5d77 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -171,10 +171,17 @@ jobs: - name: Slurm cleanup (pre-run) run: &slurm-cleanup | if command -v squeue >/dev/null 2>&1; then - echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..." + # Clean both the bare runner name and the "ifx-" prefixed form. + # launch_gb200-nv.sh names jobs ifx- to dodge a foreign + # runner fleet on watchtower that scancels by the bare name + # across users (see the comment there). squeue is filtered to + # our user so the wait loop can't hang on a same-named foreign + # job we have no permission to cancel. + echo "[Slurm] Cleaning up jobs named: ${{ runner.name }}, ifx-${{ runner.name }} ..." 
scancel --name="${{ runner.name }}" || true - while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do - squeue --name="${{ runner.name }}" + scancel --name="ifx-${{ runner.name }}" || true + while [ -n "$(squeue --user="$USER" --name='${{ runner.name }},ifx-${{ runner.name }}' --noheader --format='%i')" ]; do + squeue --user="$USER" --name="${{ runner.name }},ifx-${{ runner.name }}" sleep 5 done fi diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml index e16b22bfc..3f33b73e5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml @@ -8,8 +8,18 @@ name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8-agentic" # # Divergence vs the 8k1k sibling (same deltas as the gb300 agentic recipes): # - benchmark.type: sa-bench -> custom (hands off to agentic_srt.sh) -# - max-model-len: removed (let vLLM derive from model config; agentic -# trajectories blow past any small explicit cap) +# - max-model-len: 262144 (NOT removed like on gb300). GB200 has +# 186 GB HBM vs GB300's 288 GB; letting vLLM +# derive DSv4's 1M context needs 24.06 GiB KV +# per DEP8 rank but only ~7.9 GiB is free after +# FP4 weights + MegaMOE buffers — engine init +# dies in _check_enough_kv_cache_memory (R3, +# job 18592). 256k fits (~6 GiB) and pairs with +# the 256k-capped corpus below, mirroring the +# minimaxm2.5 agentic configs. +# - WEKA_LOADER_OVERRIDE: 060826 *256k* corpus variant + MAX_MODEL_LEN env +# so aiperf filters replay inputs to the served +# context window. 
# - no-enable-prefix-caching: dropped (prefix caching MUST be on for # trajectory reuse — entire point of agentic) # - infra.nats_max_payload_mb: 32 (agentic prompts at 50k-200k DSv4 tokens @@ -108,6 +118,7 @@ backend: enable-ep-weight-filter: true moe-backend: deep_gemm_mega_moe enforce-eager: true + max-model-len: 262144 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -130,6 +141,7 @@ backend: enable-expert-parallel: true enable-ep-weight-filter: true moe-backend: deep_gemm_mega_moe + max-model-len: 262144 max-num-seqs: 512 max-cudagraph-capture-size: 512 max-num-batched-tokens: 512 @@ -166,3 +178,8 @@ benchmark: # dataset isn't re-downloaded on every run. Overrides the workflow-level # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. HF_HUB_CACHE: "/hf_hub_cache" + # The server runs at max-model-len 262144 (see header comment) — replay + # the 256k-capped corpus and tell aiperf to filter inputs to the served + # window, mirroring the minimaxm2.5 agentic configs. + WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k" + MAX_MODEL_LEN: "262144" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml index 4ac4b36a8..7a4719505 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml @@ -7,8 +7,9 @@ name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-agentic" # # Divergence vs the 8k1k sibling — same agentic deltas as the 1p1d recipe; # see disagg-gb200-1p1d-dep8-dep8-agentic.yaml for the full rationale: -# - benchmark.type custom, max-model-len removed, prefix caching ON, -# nats_max_payload_mb 32, container-remap-root. 
+# - benchmark.type custom, max-model-len 262144 (GB200 HBM can't fit the +# derived 1M-context KV requirement) + 256k-capped corpus override, +# prefix caching ON, nats_max_payload_mb 32, container-remap-root. model: path: "deepseek-v4-pro" @@ -94,6 +95,7 @@ backend: enable-ep-weight-filter: true moe-backend: deep_gemm_mega_moe enforce-eager: true + max-model-len: 262144 max-num-seqs: 16 max-num-batched-tokens: 32768 trust-remote-code: true @@ -116,6 +118,7 @@ backend: enable-expert-parallel: true enable-ep-weight-filter: true moe-backend: deep_gemm_mega_moe + max-model-len: 262144 max-num-seqs: 512 max-cudagraph-capture-size: 512 max-num-batched-tokens: 512 @@ -152,3 +155,8 @@ benchmark: # dataset isn't re-downloaded on every run. Overrides the workflow-level # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. HF_HUB_CACHE: "/hf_hub_cache" + # The server runs at max-model-len 262144 (see header comment) — replay + # the 256k-capped corpus and tell aiperf to filter inputs to the served + # window, mirroring the minimaxm2.5 agentic configs. + WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k" + MAX_MODEL_LEN: "262144" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index eb8fd0681..748ce983f 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -277,8 +277,14 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." -# Override the job name in the config file with the runner name -sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" +# Override the job name in the config file with the runner name, prefixed +# "ifx-": another runner fleet on watchtower (user slurm-shared, uid 1010, +# with Slurm operator rights) names ITS jobs after the same runner names +# (gb200-nv_N) and its pre-job cleanup scancels by job name across users — +# it killed our job 18593 mid-startup (CANCELLED by 1010). 
The distinct +# prefix keeps their --name match away from our jobs. The workflow's own +# pre-run cleanup scancels both the bare and ifx- prefixed names. +sed -i "s/^name:.*/name: \"ifx-${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" # Don't leak the login-node venv to the compute-node orchestrator. sbatch's # default --export=ALL propagates VIRTUAL_ENV (set by `source From ca247348f8398cef226f31ef845269aefc473b1b Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 16:56:38 -0500 Subject: [PATCH 093/132] fix(agentic): gb200 TEP8/TP8 topology + srtctl-level job-name prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R4 post-mortem (all 3 shards failed): 1. Even at max-model-len 262144, DSv4's hybrid KV needs 19.82 GiB per data-parallel rank to admit one max-len request — GB200 only has ~8.8 GiB free per GPU after FP4 weights + MegaMOE buffers. The DEP8 shapes are unservable for long-context agentic on this SKU. Replace with TEP8 prefill + TP8 decode (1p1d, flag sets mirrored from the validated gb300 TEP/TP recipes): TP shards KV 8-ways (~2.5 GiB/GPU at 256k). Drop the conc-4096 tier for now (single TP8 decode caps at 512 seqs; DEP decode hits the KV wall) and delete the DEP recipes. 2. The ifx- job-name prefix from 66a71daf never reached Slurm: srtctl's get_job_name() prefers the RUNNER_NAME env var over the recipe name, so job 18599 was still named gb200-nv_0 and got scancelled by the foreign fleet again (CANCELLED by 1010). Pass the prefixed name as RUNNER_NAME to srtctl apply itself. 
Co-Authored-By: Claude Opus 4.8 --- .github/configs/nvidia-master.yaml | 51 ++--- .../disagg-gb200-1p1d-dep8-dep8-agentic.yaml | 185 ------------------ ...> disagg-gb200-1p1d-tep8-tp8-agentic.yaml} | 47 +++-- runners/launch_gb200-nv.sh | 24 ++- 4 files changed, 61 insertions(+), 246 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/{disagg-gb200-2p1d-dep8-dep8-agentic.yaml => disagg-gb200-1p1d-tep8-tp8-agentic.yaml} (74%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d8c35affa..e6ed25e28 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9786,12 +9786,17 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: ep: 8 dp-attn: true -# GB200 sibling of the gb300 agentic configs. Topologies come from the -# validated fixed-seq-len gb200 dynamo-vllm family (mid-curve-megamoe and -# high-tpt-megamoe) rather than the gb300 shapes — see the agentic recipe -# files for the agentic-specific deltas. Image matches the recipes' -# `model.container` (v0.20.0-ubuntu2404, the validated gb200 stack); the -# two must stay in lockstep — see dsv4-fp4-gb300-dynamo-vllm-agentic. +# GB200 sibling of the gb300 agentic configs. Unlike gb300, the topology is +# TEP8 prefill + TP8 decode (NOT the fixed-seq-len DEP8/DEP8 megamoe family): +# DSv4's hybrid KV needs ~20 GiB per data-parallel rank to admit one +# 256k-token request, but GB200's 186 GB HBM leaves only ~8.8 GiB free after +# FP4 weights — TP shards the KV 8-ways so it fits. See the recipe header. +# No high-throughput conc-4096 tier yet: a single TP8 decode worker caps at +# max-num-seqs 512, and DEP decode (which scales seqs) hits the KV wall +# above; revisit with fp4 indexer cache or multi-worker TP8 decode. 
+# Image matches the recipes' `model.container` (v0.20.0-ubuntu2404, the +# validated gb200 stack); the two must stay in lockstep — see +# dsv4-fp4-gb300-dynamo-vllm-agentic. dsv4-fp4-gb200-dynamo-vllm-agentic: image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro @@ -9805,21 +9810,21 @@ dsv4-fp4-gb200-dynamo-vllm-agentic: agentic-coding: - duration: 1800 search-space: - # Low-latency: 1p1d (DEP=8 / DEP=8) at conc 32. 5 nodes incl. infra. + # Low-latency: 1p1d (TEP=8 / TP=8) at conc 32. 5 nodes incl. infra. - spec-decoding: none conc-list: [32] prefill: num-worker: 1 tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml" decode: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false # Mid: same 1p1d shape at conc 192. - spec-decoding: none conc-list: [192] @@ -9827,30 +9832,14 @@ dsv4-fp4-gb200-dynamo-vllm-agentic: num-worker: 1 tp: 8 ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # High-throughput: 2p1d (2x DEP=8 prefill + DEP=8 decode) at conc 4096. - # 7 nodes incl. infra. 
- - spec-decoding: none - conc-list: [4096] - prefill: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true + dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml" decode: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false qwen3.5-fp8-h100-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml deleted file mode 100644 index 3f33b73e5..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-dep8-dep8-agentic.yaml +++ /dev/null @@ -1,185 +0,0 @@ -name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8-agentic" - -# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe.yaml. -# Topology is identical (1 prefill DEP=8 + 1 decode DEP=8, 16 GPUs across 4 -# GB200 nodes + 1 dedicated NATS/etcd infra node) so we can compare against -# the fixed-seq-len mid-curve baseline. Serves the low-latency (conc 32) and -# mid (conc 192) agentic tiers. -# -# Divergence vs the 8k1k sibling (same deltas as the gb300 agentic recipes): -# - benchmark.type: sa-bench -> custom (hands off to agentic_srt.sh) -# - max-model-len: 262144 (NOT removed like on gb300). GB200 has -# 186 GB HBM vs GB300's 288 GB; letting vLLM -# derive DSv4's 1M context needs 24.06 GiB KV -# per DEP8 rank but only ~7.9 GiB is free after -# FP4 weights + MegaMOE buffers — engine init -# dies in _check_enough_kv_cache_memory (R3, -# job 18592). 256k fits (~6 GiB) and pairs with -# the 256k-capped corpus below, mirroring the -# minimaxm2.5 agentic configs. 
-# - WEKA_LOADER_OVERRIDE: 060826 *256k* corpus variant + MAX_MODEL_LEN env -# so aiperf filters replay inputs to the served -# context window. -# - no-enable-prefix-caching: dropped (prefix caching MUST be on for -# trajectory reuse — entire point of agentic) -# - infra.nats_max_payload_mb: 32 (agentic prompts at 50k-200k DSv4 tokens -# serialize to 1-3 MB JSON; NATS' 1 MiB default -# rejects them — see gb300 1p6d recipe comment) -# - srun_options.container-remap-root: pyxis may map the calling user to a -# non-root uid in the container; agentic_srt.sh -# needs root for `apt-get install git`. No-op -# when the container user is already root. -# Note: --enable-auto-tool-choice / --tool-call-parser / --reasoning-parser -# are NOT set on the worker. The dynamo-vllm worker entrypoint doesn't -# accept them (different arg parser than `vllm serve`). In disagg, chat -# parsing happens at the dynamo frontend, not at the worker. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default - # rejects long agentic prompts; 32 MiB gives ~10x headroom over the - # largest observed payload. 
- nats_max_payload_mb: 32 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe - enforce-eager: true - max-model-len: 262144 - max-num-seqs: 16 - max-num-batched-tokens: 32768 - trust-remote-code: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.95 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - 
enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe - max-model-len: 262144 - max-num-seqs: 512 - max-cudagraph-capture-size: 512 - max-num-batched-tokens: 512 - trust-remote-code: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -srun_options: - # See gb300 agentic recipes: pyxis may map the calling user to a non-root - # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get - # install git works. No-op when the container user is already root. - container-remap-root: "" - -benchmark: - type: custom - command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh - env: - INFMAX_CONTAINER_WORKSPACE: /infmax-workspace - RESULT_DIR: /logs/agentic - PORT: "8000" - IS_MULTINODE: "true" - # Container-side path of the aiperf mmap dataset cache; the host-side - # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts. - # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files - # per dataset on every run. - AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache" - # Persistent HF hub cache (also wired via default_mounts) so the trace - # dataset isn't re-downloaded on every run. Overrides the workflow-level - # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes. - HF_HUB_CACHE: "/hf_hub_cache" - # The server runs at max-model-len 262144 (see header comment) — replay - # the 256k-capped corpus and tell aiperf to filter inputs to the served - # window, mirroring the minimaxm2.5 agentic configs. 
- WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k" - MAX_MODEL_LEN: "262144" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml similarity index 74% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml index 7a4719505..1283fb205 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-2p1d-dep8-dep8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -1,15 +1,26 @@ -name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-agentic" +name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic" -# Agentic-coding variant of vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe.yaml. -# Topology is identical (2 prefill DEP=8 each + 1 decode DEP=8, 24 GPUs across -# 6 GB200 nodes + 1 dedicated NATS/etcd infra node). Serves the -# high-throughput (conc 4096) agentic tier. +# Agentic-coding recipe for GB200: 1 prefill (TEP=8) + 1 decode (TP=8), +# 16 GPUs across 4 GB200 nodes + 1 dedicated NATS/etcd infra node. # -# Divergence vs the 8k1k sibling — same agentic deltas as the 1p1d recipe; -# see disagg-gb200-1p1d-dep8-dep8-agentic.yaml for the full rationale: -# - benchmark.type custom, max-model-len 262144 (GB200 HBM can't fit the -# derived 1M-context KV requirement) + 256k-capped corpus override, -# prefix caching ON, nats_max_payload_mb 32, container-remap-root. 
+# Why TEP/TP instead of the fixed-seq-len DEP8/DEP8 family +# (disagg-gb200-mid-curve-megamoe.yaml): with data-parallel ranks each rank +# holds the FULL KV of its sequences, and DSv4's hybrid KV needs 19.82 GiB +# per rank just to admit one 256k-token request — but only ~8.8 GiB is free +# on a 186 GB GB200 GPU after FP4 weights + MegaMOE buffers (engine init +# died in _check_enough_kv_cache_memory; R4 jobs 18598/18600). Tensor +# parallelism shards the KV 8-ways (~2.5 GiB/GPU at 256k), which fits with +# room for concurrent sequences. Worker flag sets mirror the validated +# gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d +# agentic decode): no data-parallel, no deep_gemm_mega_moe. +# +# Standard agentic deltas (see the gb300 agentic recipes): +# - benchmark.type custom -> agentic_srt.sh +# - prefix caching ON (no no-enable-prefix-caching) +# - max-model-len 262144 + 060826 256k-capped corpus (GB200 cannot serve +# the full 1M DSv4 context, mirroring the minimaxm2.5 agentic configs) +# - infra.nats_max_payload_mb 32 (long agentic prompts exceed NATS' 1 MiB) +# - srun_options.container-remap-root (apt-get git in agentic_srt.sh) model: path: "deepseek-v4-pro" @@ -32,9 +43,9 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 4 + prefill_nodes: 2 decode_nodes: 2 - prefill_workers: 2 + prefill_workers: 1 decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 @@ -87,13 +98,10 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 1 + tensor-parallel-size: 8 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe enforce-eager: true max-model-len: 262144 max-num-seqs: 16 @@ -111,13 +119,9 @@ backend: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": 
"kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 1 + tensor-parallel-size: 8 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true enable-ep-weight-filter: true - moe-backend: deep_gemm_mega_moe max-model-len: 262144 max-num-seqs: 512 max-cudagraph-capture-size: 512 @@ -128,6 +132,7 @@ backend: compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 + all2all-backend: "flashinfer_nvlink_one_sided" no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 748ce983f..18f286965 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -277,14 +277,20 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE" echo "Submitting job with srtctl..." -# Override the job name in the config file with the runner name, prefixed -# "ifx-": another runner fleet on watchtower (user slurm-shared, uid 1010, -# with Slurm operator rights) names ITS jobs after the same runner names -# (gb200-nv_N) and its pre-job cleanup scancels by job name across users — -# it killed our job 18593 mid-startup (CANCELLED by 1010). The distinct -# prefix keeps their --name match away from our jobs. The workflow's own -# pre-run cleanup scancels both the bare and ifx- prefixed names. +# Override the job name with the runner name, prefixed "ifx-": another +# runner fleet on watchtower (user slurm-shared, uid 1010, with Slurm +# operator rights) names ITS jobs after the same runner names (gb200-nv_N) +# and its pre-job cleanup scancels by job name across users — it killed our +# jobs 18593 and 18599 mid-startup (CANCELLED by 1010). The distinct prefix +# keeps their --name match away from our jobs; the workflow's own pre-run +# cleanup scancels both the bare and ifx- prefixed names. 
+# +# NOTE the sed alone is not enough: srtctl's get_job_name() (cli/submit.py) +# prefers the RUNNER_NAME env var over the recipe name, so the prefixed +# RUNNER_NAME must be passed to `srtctl apply` itself (R4 job 18599 proved +# the recipe-name route gets ignored on CI runners). sed -i "s/^name:.*/name: \"ifx-${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}" +SRTCTL_RUNNER_NAME="ifx-${RUNNER_NAME}" # Don't leak the login-node venv to the compute-node orchestrator. sbatch's # default --export=ALL propagates VIRTUAL_ENV (set by `source @@ -304,9 +310,9 @@ if [[ "$IS_AGENTIC" == "1" ]]; then fi if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) + SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else - SRTCTL_OUTPUT=$(srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) fi echo "$SRTCTL_OUTPUT" From fe1d6956720d7aaeec301eeafdea336cc8a93c0e Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 17:51:10 -0500 Subject: [PATCH 094/132] fix(agentic): gb200 prefill headroom for long-context activation spikes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R5 job 18603: TEP8 prefill passed engine init but OOM'd on the first scheduled request — 'Tried to allocate 1.98 GiB ... 1.53 GiB free' with PyTorch already at 175.8/184 GiB. 
The fixed-seq megamoe settings (gpu-memory-utilization 0.95 + max-num-batched-tokens 32768) leave no runtime headroom for 256k-context prefill activation spikes. Drop to 0.9 / 16384, matching the green gb300 agentic prefill. Co-Authored-By: Claude Opus 4.8 --- .../agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml index 1283fb205..0ad29ccb9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -105,12 +105,19 @@ backend: enforce-eager: true max-model-len: 262144 max-num-seqs: 16 - max-num-batched-tokens: 32768 + # 16384 batched tokens + util 0.90 (the fixed-seq megamoe recipes use + # 32768 + 0.95, tuned for 9k contexts): at 256k contexts the first + # long prefill's activation spike (sparse indexer logits, mhc fused + # kernels) needs ~2 GiB of runtime headroom that 0.95 doesn't leave — + # R5 job 18603 died with "CUDA out of memory. Tried to allocate + # 1.98 GiB ... 1.53 GiB free" on the first scheduled request. Matches + # the green gb300 agentic prefill (0.9 / 16384). 
+ max-num-batched-tokens: 16384 trust-remote-code: true no-enable-flashinfer-autotune: true no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.95 + gpu-memory-utilization: 0.9 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true numa-bind: true From 64f43ed71d424fa2be38d02368e716adf35720bd Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 20:26:22 -0500 Subject: [PATCH 095/132] fix(agentic): bump gb200 agentic to vllm v0.21.0 for NIXL TP8<->TP8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R6 (both shards): decode worker's first NIXL get_finished() poll dies with KeyError on the remote prefill engine_id in transfer_topo.get_engine_info() — the prefill engine never registers in the decode's engine map. v0.20.0-only bug on the TP<->TP transfer path; the fixed-seq DEP family (per-rank TP=1) never exercises it. v0.21.0 + the same ai-dynamo wheel ran green NIXL transfers on gb300 agentic. Co-Authored-By: Claude Opus 4.8 --- .github/configs/nvidia-master.yaml | 7 ++++--- .../agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml | 12 +++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e6ed25e28..38ef2fcae 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9794,11 +9794,12 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: # No high-throughput conc-4096 tier yet: a single TP8 decode worker caps at # max-num-seqs 512, and DEP decode (which scales seqs) hits the KV wall # above; revisit with fp4 indexer cache or multi-worker TP8 decode. 
-# Image matches the recipes' `model.container` (v0.20.0-ubuntu2404, the -# validated gb200 stack); the two must stay in lockstep — see +# Image matches the recipes' `model.container` (v0.21.0-ubuntu2404 — the +# gb300-validated agentic stack; v0.20.0's NIXL connector breaks TP8<->TP8 +# transfers, see the recipe header); the two must stay in lockstep — see # dsv4-fp4-gb300-dynamo-vllm-agentic. dsv4-fp4-gb200-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.21.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml index 0ad29ccb9..7750d7e4b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -14,6 +14,16 @@ name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic" # gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d # agentic decode): no data-parallel, no deep_gemm_mega_moe. # +# Container is v0.21.0-ubuntu2404 (the gb300-validated agentic stack), NOT +# the v0.20.0 the gb200 fixed-seq family pins: v0.20.0's NIXL connector +# breaks on TP8<->TP8 transfers — the decode worker's first get_finished() +# poll dies with KeyError on the remote (prefill) engine_id in +# transfer_topo.get_engine_info() because the prefill engine never +# registers in the decode's engine map (R6, both shards, identical +# tracebacks). The fixed-seq DEP8/DEP8 family never hits this path +# (per-rank TP=1 transfer topology). v0.21.0 + the same ai-dynamo wheel +# ran green NIXL transfers on gb300 agentic (R30 + manual 8137). 
+# # Standard agentic deltas (see the gb300 agentic recipes): # - benchmark.type custom -> agentic_srt.sh # - prefix caching ON (no no-enable-prefix-caching) @@ -24,7 +34,7 @@ name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic" model: path: "deepseek-v4-pro" - container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + container: "vllm/vllm-openai:v0.21.0-ubuntu2404" precision: "fp4" dynamo: From 5df62c5c2677468fb42c2eaa21320bc39b331396 Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 20:45:05 -0500 Subject: [PATCH 096/132] fix(agentic): pin static NIXL engine_id for 2-node TP8 gb200 workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R7 (both shards): decode's NIXL handshake failed with 'Remote NIXL agent engine ID mismatch'. srtctl launches the 2-node TP8 worker as two processes (--node-rank 0 + --node-rank 1 --headless) and each generates its own random NixlConnector engine_id — ranks 0-3 and 4-7 of the same worker register under different UUIDs. Pin a static engine_id in kv-transfer-config (one per worker, distinct prefill vs decode) so both node processes agree. gb300 never hit this: its workers are all single-node (TP4 fits in 288 GB HBM; TP4 weights don't fit GB200's 186 GB, forcing the 2-node TP8 shape). 
Co-Authored-By: Claude Opus 4.8 --- .../disagg-gb200-1p1d-tep8-tp8-agentic.yaml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml index 7750d7e4b..f8d29e5fd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -105,7 +105,15 @@ backend: vllm_config: prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + # Static engine_id (one per worker, distinct between prefill/decode): + # the TP8 workers span 2 GB200 nodes, which srtctl launches as two + # processes (--node-rank 0 + --node-rank 1 --headless). Without a + # pinned engine_id each process generates its own random NIXL UUID, so + # ranks 0-3 and ranks 4-7 of the SAME worker register under different + # engine ids and the consumer's handshake dies with "Remote NIXL agent + # engine ID mismatch" on the first transfer (R7, both shards). + # Single-node-per-worker topologies (all gb300 recipes) never hit this. + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "11111111-1111-4111-8111-111111111111"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 8 @@ -133,7 +141,9 @@ backend: numa-bind: true tokenizer-mode: deepseek_v4 decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + # See prefill: static engine_id shared by both node processes of this + # 2-node TP8 worker (distinct from the prefill worker's id). 
+ kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "22222222-2222-4222-8222-222222222222"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 8 From 7f618296a79a1004b572f7713ed6be6791d5db5c Mon Sep 17 00:00:00 2001 From: cquil11 Date: Wed, 10 Jun 2026 21:20:45 -0500 Subject: [PATCH 097/132] fix(agentic): feed gb200 etcd CPUs + reject zero-request agentic results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit R8 c192: the decode worker's etcd lease expired 11 min after the server went healthy ('Keep-alive lease expired', worker deregistered), so the frontend 500'd every benchmark request with 'Instance not found'. Root cause matches gb300 R12: srtctl's infra step (etcd+nats) launches without --gres, and watchtower's CpusPerTres=gpu:35 default gives GPU-less steps a single CPU — starved etcd drops lease keep-alives. Add sbatch_directives.cpus-per-task=72 like the gb300 agentic recipes. The shard still went green because the workflow's agentic gate only checks that the aggregate JSON exists, and process_agentic_result.py writes an all-null aggregate when aiperf recorded zero valid requests. Gate now also requires num_requests_successful > 0. 
Co-Authored-By: Claude Opus 4.8 --- .github/workflows/benchmark-multinode-tmpl.yml | 10 ++++++++++ .../agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index c2a6d5d77..d46c75a5c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -225,6 +225,16 @@ jobs: elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then if [ -f "${RESULT_FILENAME}.json" ]; then echo "Found agentic result file: ${RESULT_FILENAME}.json" + # Existence is not enough: process_agentic_result.py writes the + # aggregate even when aiperf recorded zero valid requests (e.g. + # the server 500'd every request — gb200 R8 went green on an + # all-null result this way). Require at least one successful + # request. + ok=$(python3 -c "import json,sys; d=json.load(open('${RESULT_FILENAME}.json')); print(int(bool(d.get('num_requests_successful'))))" 2>/dev/null || echo 0) + if [ "$ok" != "1" ]; then + echo "Run failed: ${RESULT_FILENAME}.json has zero successful requests." >&2 + exit 1 + fi else echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." >&2 exit 1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml index f8d29e5fd..8587b5aae 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml @@ -164,6 +164,17 @@ backend: enable-sleep-mode: true tokenizer-mode: deepseek_v4 +# cpus-per-task=72: one full GB200 NUMA socket (144 cores split 2 x 72) per +# task. 
Critical for the *infra step* (etcd + nats), which srtctl spawns +# without --gres — on watchtower the per-GPU CPU default (CpusPerTres=gpu:35) +# doesn't apply to GPU-less steps, so etcd lands with 1 CPU, falls behind on +# lease keep-alives, and worker registrations silently expire mid-run: R8's +# decode worker logged "Keep-alive lease expired" 11 min after going healthy +# and the frontend 500'd every benchmark request with "Instance not found". +# Same failure mode and fix as the gb300 agentic recipes (their R12). +sbatch_directives: + cpus-per-task: "72" + srun_options: # See gb300 agentic recipes: pyxis may map the calling user to a non-root # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get From bfff4ccdd3ddfff81c8080453684154389278279 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 10:47:28 -0500 Subject: [PATCH 098/132] fix(agentic): restore stable B200 Mooncake NIC pin --- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index bfd37705c..514c6df8c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -118,12 +118,11 @@ case "$OFFLOADING" in "global_segment_size": "${PER_RANK_GB}GB", "local_buffer_size": "4GB", "protocol": "rdma", - "device_name": "", + "device_name": "mlx5_0", "enable_offload": false } EOF export MOONCAKE_CONFIG_PATH - export MC_ENABLE_DEST_DEVICE_AFFINITY=1 # Identical prefixes must hash to identical store keys across DP ranks. 
export PYTHONHASHSEED=0 # B200 GPU memory registration works through DMA-BUF, but the compute From a152f8600f5502569e3881b9891daef7b4715456 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:00:15 -0500 Subject: [PATCH 099/132] fix(agentic): route SGLang DP benchmarks --- .github/configs/nvidia-master.yaml | 3 + .../agentic/dsv4_fp4_b200_sglang.sh | 201 +++++++++++++- .../agentic/dsv4_fp4_b300_sglang.sh | 248 +++++++++++++++++- .../dsv4_fp4_blackwell_sglang_common.sh | 216 --------------- 4 files changed, 448 insertions(+), 220 deletions(-) delete mode 100755 benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 38ef2fcae..125d5d9b5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9456,6 +9456,9 @@ dsv4-fp4-b300-sglang-agentic-hicache: - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh index 17cd10d1b..8f816ca0c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh @@ -1,4 +1,201 @@ #!/usr/bin/env bash +set -euo pipefail +set -x -export DSV4_SGLANG_PLATFORM=B200 -source "$(dirname "$0")/dsv4_fp4_blackwell_sglang_common.sh" +# Agentic trace replay benchmark for 
DeepSeek-V4-Pro FP4 on B200 using SGLang. +# +# OFFLOADING values: +# none - SGLang GPU KV cache with RadixAttention prefix caching. +# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + +# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under +# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the +# agentic tooling and results against the actual repository mount so the image +# can keep its /workspace install and GitHub Actions can collect the outputs. +if [[ ! -d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then + export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" +fi +if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" +fi + +source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +nvidia-smi + +resolve_trace_source + +# Keep AIPerf's Transformers-main dependency from replacing the older +# Transformers build pinned by the B200-specialized SGLang image. The server +# always launches with the image's original interpreter; AIPerf and result +# processing use the isolated environment when InferenceX is mounted at /ix. 
+SGLANG_PYTHON="$(command -v python3)" +if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" + "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" + export PATH="$AGENTIC_VENV/bin:$PATH" +fi +install_agentic_deps + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +if [ "$DP_ATTENTION" = "true" ]; then + echo "Error: current SGLang nightly self-collides on internal IPC ports during single-node DP-attention startup; use pure TP until upstream fixes PortArgs initialization." >&2 + exit 1 +fi + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + ;; + hicache) + # DeepSeek V4 HiCache currently rejects --hicache-size and supports + # capacity control only through a host/device token-capacity ratio. + # DSv4 allocates several physical host sub-pools for each logical host + # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) + # while model loading/page cache is still resident and the OS kills a + # rank, so leave transient startup headroom with ratio=2. B200 has a + # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a + # substantially larger useful CPU tier while staying within its node + # budget. + # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at + # C48/C64. Ratio=8 still cannot retain the C64 session working set long + # enough to produce host hits. Ratio=16 provides roughly 21M logical + # host tokens while remaining below the B300 node's host budget. 
+ if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=8 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +PARALLEL_ARGS=(--tp "$TP") +METRICS_ARGS=(--enable-metrics) +CHUNKED_PREFILL_SIZE=8192 +PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune +) + +MODEL_ARGS=() +# The B200-specialized image deadlocks immediately after weight loading when +# forced through the B300 compressed-attention/page-size overrides. +MEM_FRACTION_STATIC=0.90 + +PER_ENGINE_MAX_RUNNING=$CONC +[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export PYTHONNOUSERSITE=1 +export TORCH_CUDA_ARCH_LIST=10.0 +# Agentic warmup dispatches hundreds of large prompts at once. SGLang's +# tokenizer process can leave request bytes unacknowledged for longer than +# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, +# causing Linux to abort otherwise-live localhost connections. Keep the +# six-hour request timeout unchanged, but allow up to 15 minutes for TCP +# progress before declaring the connection dead. 
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + +SGLANG_CMD=( + "$SGLANG_PYTHON" -m sglang.launch_server + --model-path "$MODEL_PATH" + --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --trust-remote-code + "${PARALLEL_ARGS[@]}" + --mem-fraction-static "$MEM_FRACTION_STATIC" + --swa-full-tokens-ratio 0.1 + --max-running-requests "$PER_ENGINE_MAX_RUNNING" + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --context-length "$MAX_MODEL_LEN" + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" + --tool-call-parser deepseekv4 + --reasoning-parser deepseek-v4 + --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" + --watchdog-timeout 1800 + "${MODEL_ARGS[@]}" + "${METRICS_ARGS[@]}" + "${CACHE_ARGS[@]}" +) + +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" + +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" + +echo "Starting SGLang server for B200..." +"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +capture_cache_metrics() { + { + echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" + curl -fsS "http://localhost:$PORT/metrics" 2>/dev/null \ + | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ + || true + echo "============================================================" + } >> "$SERVER_LOG" +} + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" +if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then + capture_cache_metrics + trap capture_cache_metrics EXIT +fi + +build_replay_cmd "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index b51526feb..915d401de 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -1,4 +1,248 @@ #!/usr/bin/env bash +set -euo pipefail +set -x -export DSV4_SGLANG_PLATFORM=B300 -source "$(dirname "$0")/dsv4_fp4_blackwell_sglang_common.sh" +# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on B300 using SGLang. +# +# OFFLOADING values: +# none - SGLang GPU KV cache with RadixAttention prefix caching. +# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" + +# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under +# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the +# agentic tooling and results against the actual repository mount so the image +# can keep its /workspace install and GitHub Actions can collect the outputs. +if [[ ! 
-d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then + export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" +fi +if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" +fi + +source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION + +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=1000000 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +nvidia-smi + +resolve_trace_source + +# Keep AIPerf's Transformers-main dependency from replacing the older +# Transformers build pinned by the B200-specialized SGLang image. The server +# always launches with the image's original interpreter; AIPerf and result +# processing use the isolated environment when InferenceX is mounted at /ix. +SGLANG_PYTHON="$(command -v python3)" +if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then + AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" + "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" + export PATH="$AGENTIC_VENV/bin:$PATH" +fi +install_agentic_deps + +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +case "$OFFLOADING" in + none) + ;; + hicache) + # DeepSeek V4 HiCache currently rejects --hicache-size and supports + # capacity control only through a host/device token-capacity ratio. + # DSv4 allocates several physical host sub-pools for each logical host + # token. 
On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) + # while model loading/page cache is still resident and the OS kills a + # rank, so leave transient startup headroom with ratio=2. B200 has a + # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a + # substantially larger useful CPU tier while staying within its node + # budget. + # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at + # C48/C64. Ratio=8 still cannot retain the C64 session working set long + # enough to produce host hits. Ratio=16 provides roughly 21M logical + # host tokens while remaining below the B300 node's host budget. + if [ "$TP" -ge 8 ]; then + DEFAULT_HICACHE_RATIO=2 + else + DEFAULT_HICACHE_RATIO=16 + fi + HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" + export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 + CACHE_ARGS=( + --enable-hierarchical-cache + --hicache-ratio "$HICACHE_RATIO" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + ) + echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +USE_SGLANG_ROUTER=false +SGLANG_BACKEND_PORT="$PORT" +ROUTER_LOG="$RESULT_DIR/router.log" +if [ "$DP_ATTENTION" = "true" ]; then + USE_SGLANG_ROUTER=true + SGLANG_BACKEND_PORT=$((PORT + 1)) + SGLANG_ROUTER_METRICS_PORT=$((PORT + 10000)) +fi + +PARALLEL_ARGS=(--tp "$TP") +METRICS_ARGS=(--enable-metrics) +MEM_FRACTION_STATIC=0.88 +CHUNKED_PREFILL_SIZE=8192 +if [ "$DP_ATTENTION" = "true" ]; then + PARALLEL_ARGS+=( + --dp "$TP" + --enable-dp-attention + --ep-size 
"$EP_SIZE" + --moe-a2a-backend deepep + --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + --enable-prefill-delayer + ) + METRICS_ARGS=() + MEM_FRACTION_STATIC=0.82 + CHUNKED_PREFILL_SIZE=16384 +else + PARALLEL_ARGS+=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + ) +fi + +MODEL_ARGS=( + --attention-backend compressed + --page-size 256 + --disable-shared-experts-fusion +) + +if [ "$DP_ATTENTION" = "true" ]; then + PER_ENGINE_MAX_RUNNING=$(( (CONC + TP - 1) / TP )) +else + PER_ENGINE_MAX_RUNNING=$CONC +fi +[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING +[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 + +export PYTHONNOUSERSITE=1 +export TORCH_CUDA_ARCH_LIST=10.0 +# Agentic warmup dispatches hundreds of large prompts at once. SGLang's +# tokenizer process can leave request bytes unacknowledged for longer than +# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, +# causing Linux to abort otherwise-live localhost connections. Keep the +# six-hour request timeout unchanged, but allow up to 15 minutes for TCP +# progress before declaring the connection dead. 
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +if [ "$DP_ATTENTION" = "true" ]; then + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 +fi + +SGLANG_CMD=( + "$SGLANG_PYTHON" -m sglang.launch_server + --model-path "$MODEL_PATH" + --served-model-name "$MODEL" + --host 0.0.0.0 + --port "$SGLANG_BACKEND_PORT" + --trust-remote-code + "${PARALLEL_ARGS[@]}" + --mem-fraction-static "$MEM_FRACTION_STATIC" + --swa-full-tokens-ratio 0.1 + --max-running-requests "$PER_ENGINE_MAX_RUNNING" + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --context-length "$MAX_MODEL_LEN" + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" + --tool-call-parser deepseekv4 + --reasoning-parser deepseek-v4 + --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" + --watchdog-timeout 1800 + "${MODEL_ARGS[@]}" + "${METRICS_ARGS[@]}" + "${CACHE_ARGS[@]}" +) + +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" + +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" + +echo "Starting SGLang server for B300..." +"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +capture_cache_metrics() { + { + echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" + curl -fsS "http://localhost:$SGLANG_BACKEND_PORT/metrics" 2>/dev/null \ + | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ + || true + echo "============================================================" + } >> "$SERVER_LOG" +} + +wait_for_server_ready --port "$SGLANG_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +if [ "$USE_SGLANG_ROUTER" = "true" ]; then + echo "Starting SGLang router on port $PORT for $TP DP ranks..." + "$SGLANG_PYTHON" -m sglang_router.launch_router \ + --worker-urls "http://localhost:$SGLANG_BACKEND_PORT" \ + --policy manual \ + --assignment-mode min_load \ + --request-id-headers x-correlation-id \ + --dp-aware \ + --host 0.0.0.0 \ + --port "$PORT" \ + --prometheus-host 127.0.0.1 \ + --prometheus-port "$SGLANG_ROUTER_METRICS_PORT" \ + --request-timeout-secs 3600 \ + --disable-retries > "$ROUTER_LOG" 2>&1 & + ROUTER_PID=$! + echo "Router PID: $ROUTER_PID" + wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID" +fi + +if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then + capture_cache_metrics + trap capture_cache_metrics EXIT +fi + +build_replay_cmd "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh b/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh deleted file mode 100755 index 391d9df49..000000000 --- a/benchmarks/single_node/agentic/dsv4_fp4_blackwell_sglang_common.sh +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on Blackwell using -# SGLang. B200 and B300 use the same current upstream DSv4 recipes. 
-# -# OFFLOADING values: -# none - SGLang GPU KV cache with RadixAttention prefix caching. -# hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache. - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" -export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}" - -# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under -# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the -# agentic tooling and results against the actual repository mount so the image -# can keep its /workspace install and GitHub Actions can collect the outputs. -if [[ ! -d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then - export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT" -fi -if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then - export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}" -fi - -source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION - -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=1000000 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -nvidia-smi - -resolve_trace_source - -# Keep AIPerf's Transformers-main dependency from replacing the older -# Transformers build pinned by the B200-specialized SGLang image. The server -# always launches with the image's original interpreter; AIPerf and result -# processing use the isolated environment when InferenceX is mounted at /ix. 
-SGLANG_PYTHON="$(command -v python3)" -if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then - AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}" - "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV" - export PATH="$AGENTIC_VENV/bin:$PATH" -fi -install_agentic_deps - -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -if [ "$DP_ATTENTION" = "true" ]; then - echo "Error: current SGLang nightly self-collides on internal IPC ports during single-node DP-attention startup; use pure TP until upstream fixes PortArgs initialization." >&2 - exit 1 -fi - -CACHE_ARGS=() -case "$OFFLOADING" in - none) - ;; - hicache) - # DeepSeek V4 HiCache currently rejects --hicache-size and supports - # capacity control only through a host/device token-capacity ratio. - # DSv4 allocates several physical host sub-pools for each logical host - # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total) - # while model loading/page cache is still resident and the OS kills a - # rank, so leave transient startup headroom with ratio=2. B200 has a - # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a - # substantially larger useful CPU tier while staying within its node - # budget. - # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at - # C48/C64. Ratio=8 still cannot retain the C64 session working set long - # enough to produce host hits. Ratio=16 provides roughly 21M logical - # host tokens while remaining below the B300 node's host budget. 
- if [ "$TP" -ge 8 ]; then - if [ "${DSV4_SGLANG_PLATFORM:-}" = "B200" ]; then - DEFAULT_HICACHE_RATIO=8 - else - DEFAULT_HICACHE_RATIO=2 - fi - else - DEFAULT_HICACHE_RATIO=16 - fi - HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}" - export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1 - CACHE_ARGS=( - --enable-hierarchical-cache - --hicache-ratio "$HICACHE_RATIO" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - ) - echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT" - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - -PARALLEL_ARGS=(--tp "$TP") -METRICS_ARGS=(--enable-metrics) -CHUNKED_PREFILL_SIZE=8192 -PARALLEL_ARGS+=( - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune -) - -MODEL_ARGS=() -if [ "${DSV4_SGLANG_PLATFORM:-}" = "B200" ]; then - # Match the established B200 DSv4 recipe. The B200-specialized image - # deadlocks immediately after weight loading when forced through the - # B300-oriented compressed-attention/page-size overrides. - MEM_FRACTION_STATIC=0.90 -else - MEM_FRACTION_STATIC=0.88 - MODEL_ARGS+=( - --attention-backend compressed - --page-size 256 - --disable-shared-experts-fusion - ) -fi - -PER_ENGINE_MAX_RUNNING=$CONC -[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 -CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING -[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 - -export PYTHONNOUSERSITE=1 -export TORCH_CUDA_ARCH_LIST=10.0 -# Agentic warmup dispatches hundreds of large prompts at once. 
SGLang's -# tokenizer process can leave request bytes unacknowledged for longer than -# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst, -# causing Linux to abort otherwise-live localhost connections. Keep the -# six-hour request timeout unchanged, but allow up to 15 minutes for TCP -# progress before declaring the connection dead. -export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 -export SGLANG_OPT_USE_JIT_NORM=1 -export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 -export SGLANG_OPT_USE_TOPK_V2=1 -export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - -SGLANG_CMD=( - "$SGLANG_PYTHON" -m sglang.launch_server - --model-path "$MODEL_PATH" - --served-model-name "$MODEL" - --host 0.0.0.0 - --port "$PORT" - --trust-remote-code - "${PARALLEL_ARGS[@]}" - --mem-fraction-static "$MEM_FRACTION_STATIC" - --swa-full-tokens-ratio 0.1 - --max-running-requests "$PER_ENGINE_MAX_RUNNING" - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - --context-length "$MAX_MODEL_LEN" - --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" - --tool-call-parser deepseekv4 - --reasoning-parser deepseek-v4 - --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja" - --watchdog-timeout 1800 - "${MODEL_ARGS[@]}" - "${METRICS_ARGS[@]}" - "${CACHE_ARGS[@]}" -) - -printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" -printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" - -{ - echo "=== SGLANG_* env vars at launch ===" - env | grep -E '^SGLANG_' | sort - echo "===================================" -} | tee "$SERVER_LOG" - -echo "Starting SGLang server for ${DSV4_SGLANG_PLATFORM:-Blackwell}..." -"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -capture_cache_metrics() { - { - echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ===" - curl -fsS "http://localhost:$PORT/metrics" 2>/dev/null \ - | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \ - || true - echo "============================================================" - } >> "$SERVER_LOG" -} - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" -if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then - capture_cache_metrics - trap capture_cache_metrics EXIT -fi - -build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" From 07f94d4f3c049214ae68fabebc26b7684c89044c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:05:29 -0500 Subject: [PATCH 100/132] test(agentic): isolate B300 SGLang DP canary --- .github/configs/nvidia-master.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 125d5d9b5..2301b4368 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9452,13 +9452,7 @@ dsv4-fp4-b300-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } - - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } - - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } - - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } - - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: 
image: vllm/vllm-openai:v0.22.0 From 481cbde7545fd2bdb8848633bddbe5ce409925dd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:13:06 -0500 Subject: [PATCH 101/132] fix(agentic): avoid SGLang DP port collisions --- .../agentic/dsv4_fp4_b300_sglang.sh | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 915d401de..cc44bd4ca 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -59,6 +59,32 @@ if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then fi install_agentic_deps +# The 2026-06-09 image creates every local DP scheduler from the same HTTP +# port, so PortArgs derives the same RPC/metrics TCP range for every rank and +# rank 1 collides with rank 0 before startup completes. Give each scheduler a +# private derivation port until the upstream image includes the fix. 
+if [ "$DP_ATTENTION" = "true" ] && [ "${SGLANG_BUILD_COMMIT:-}" = "317fc6a9ddb62eb0320a7517c267bef4f9040853" ]; then + "$SGLANG_PYTHON" - <<'PY' +from pathlib import Path + +controller_path = Path("/sgl-workspace/sglang/python/sglang/srt/managers/data_parallel_controller.py") +source = controller_path.read_text() +old = " tmp_port_args = PortArgs.init_new(server_args)\n" +new = ( + " rank_server_args = dataclasses.replace(\n" + " server_args, port=server_args.port + (dp_rank + 1) * 100\n" + " )\n" + " tmp_port_args = PortArgs.init_new(rank_server_args)\n" +) +if old in source: + source = source.replace("import faulthandler\n", "import dataclasses\nimport faulthandler\n", 1) + source = source.replace(old, new, 1) + controller_path.write_text(source) +elif new not in source: + raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") +PY +fi + SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" From 1379f98aba1813f1ec63b56005f3b82afbed9e98 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:29:40 -0500 Subject: [PATCH 102/132] fix(agentic): verify SGLang DP port patch --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index cc44bd4ca..14753f7d8 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -63,7 +63,7 @@ install_agentic_deps # port, so PortArgs derives the same RPC/metrics TCP range for every rank and # rank 1 collides with rank 0 before startup completes. Give each scheduler a # private derivation port until the upstream image includes the fix. 
-if [ "$DP_ATTENTION" = "true" ] && [ "${SGLANG_BUILD_COMMIT:-}" = "317fc6a9ddb62eb0320a7517c267bef4f9040853" ]; then +if [ "$DP_ATTENTION" = "true" ]; then "$SGLANG_PYTHON" - <<'PY' from pathlib import Path @@ -82,6 +82,8 @@ if old in source: controller_path.write_text(source) elif new not in source: raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") +if new not in controller_path.read_text(): + raise RuntimeError(f"Failed to patch SGLang DP controller: {controller_path}") PY fi From f2e67fa66ab3f666a49c6073b24788b35485af87 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:33:54 -0500 Subject: [PATCH 103/132] fix(agentic): bypass SGLang DP self-checks --- .../single_node/agentic/dsv4_fp4_b300_sglang.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 14753f7d8..76e189b25 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -84,6 +84,18 @@ elif new not in source: raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") if new not in controller_path.read_text(): raise RuntimeError(f"Failed to patch SGLang DP controller: {controller_path}") + +server_args_path = Path("/sgl-workspace/sglang/python/sglang/srt/server_args.py") +source = server_args_path.read_text() +old = ( + ' wait_port_available(rpc_port, "rpc_port")\n' + ' wait_port_available(metrics_port, "metrics_port")\n' +) +if old in source: + source = source.replace(old, "", 1) + server_args_path.write_text(source) +if old in server_args_path.read_text(): # re-read after the write so this check can actually fire + raise RuntimeError(f"Failed to patch SGLang DP port checks: {server_args_path}") PY fi From cd480e58e6b7ccc1a9d68a764bc7a591fa0da711 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:39:23 -0500 Subject: [PATCH 104/132] perf(agentic): fund B300 SGLang DEP KV 
cache --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 76e189b25..97fb962a2 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -168,7 +168,7 @@ if [ "$DP_ATTENTION" = "true" ]; then --enable-prefill-delayer ) METRICS_ARGS=() - MEM_FRACTION_STATIC=0.82 + MEM_FRACTION_STATIC=0.84 CHUNKED_PREFILL_SIZE=16384 else PARALLEL_ARGS+=( From cf3956a063ecbde905da34353e911740b309d8cb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:44:40 -0500 Subject: [PATCH 105/132] fix(agentic): size SGLang DEP CUDA graphs globally --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 97fb962a2..a6510552d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -189,7 +189,7 @@ else PER_ENGINE_MAX_RUNNING=$CONC fi [ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 -CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING +CUDA_GRAPH_MAX_BS=$CONC [ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 export PYTHONNOUSERSITE=1 From b178cc1c24f188f5dc14b66727a654d0c460e418 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 12:48:33 -0500 Subject: [PATCH 106/132] fix(agentic): isolate SGLang DP rendezvous ports --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index a6510552d..3bce57a22 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ 
b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -84,6 +84,11 @@ elif new not in source: raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") if new not in controller_path.read_text(): raise RuntimeError(f"Failed to patch SGLang DP controller: {controller_path}") +source = controller_path.read_text() +old = " rank_port_args.nccl_port = port_args.nccl_port\n" +if old in source: + source = source.replace(old, "", 1) + controller_path.write_text(source) server_args_path = Path("/sgl-workspace/sglang/python/sglang/srt/server_args.py") source = server_args_path.read_text() From ca26948615d02bbbc92c627cddeb3fa3e85c5401 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:03:14 -0500 Subject: [PATCH 107/132] fix(agentic): use loopback for SGLang DEP rendezvous --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 3bce57a22..407a7dfe5 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -84,12 +84,6 @@ elif new not in source: raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") if new not in controller_path.read_text(): raise RuntimeError(f"Failed to patch SGLang DP controller: {controller_path}") -source = controller_path.read_text() -old = " rank_port_args.nccl_port = port_args.nccl_port\n" -if old in source: - source = source.replace(old, "", 1) - controller_path.write_text(source) - server_args_path = Path("/sgl-workspace/sglang/python/sglang/srt/server_args.py") source = server_args_path.read_text() old = ( @@ -167,6 +161,7 @@ if [ "$DP_ATTENTION" = "true" ]; then PARALLEL_ARGS+=( --dp "$TP" --enable-dp-attention + --dist-init-addr "127.0.0.1:$((PORT + 2000))" --ep-size "$EP_SIZE" --moe-a2a-backend deepep 
--deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' From bf54707b1ab292ef3a7508d6f8467173edc67e3c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:09:21 -0500 Subject: [PATCH 108/132] fix(agentic): bound SGLang DEP context length --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 407a7dfe5..303b1f72d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -28,7 +28,11 @@ source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=1000000 + if [ "$DP_ATTENTION" = "true" ]; then + MAX_MODEL_LEN=65536 + else + MAX_MODEL_LEN=1000000 + fi fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then From 057be40f6accbf0876ee4962172cf8e82f1b8073 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:15:10 -0500 Subject: [PATCH 109/132] fix(agentic): reserve SGLang DEP request capacity --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 303b1f72d..f5456698d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -29,7 +29,7 @@ check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then if [ "$DP_ATTENTION" = "true" ]; then - MAX_MODEL_LEN=65536 + MAX_MODEL_LEN=32768 else MAX_MODEL_LEN=1000000 fi From 
e1e72dc4430cf7a6889419d38c4511731e9f975e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:20:47 -0500 Subject: [PATCH 110/132] fix(agentic): pass global SGLang DEP concurrency --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index f5456698d..455b93839 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -187,12 +187,7 @@ MODEL_ARGS=( --disable-shared-experts-fusion ) -if [ "$DP_ATTENTION" = "true" ]; then - PER_ENGINE_MAX_RUNNING=$(( (CONC + TP - 1) / TP )) -else - PER_ENGINE_MAX_RUNNING=$CONC -fi -[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1 +MAX_RUNNING_REQUESTS=$CONC CUDA_GRAPH_MAX_BS=$CONC [ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64 @@ -225,7 +220,7 @@ SGLANG_CMD=( "${PARALLEL_ARGS[@]}" --mem-fraction-static "$MEM_FRACTION_STATIC" --swa-full-tokens-ratio 0.1 - --max-running-requests "$PER_ENGINE_MAX_RUNNING" + --max-running-requests "$MAX_RUNNING_REQUESTS" --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" --context-length "$MAX_MODEL_LEN" --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" From 1568580f4b5a63b042a940bd7e5dcf8a4e3ffca5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:26:55 -0500 Subject: [PATCH 111/132] fix(agentic): keep SGLang DEP transport local --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 455b93839..0aae6ef83 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -207,6 +207,7 @@ export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 if [ "$DP_ATTENTION" = "true" ]; then + export NVSHMEM_DISABLE_IB=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 fi From d50678e8a83d78b740b36c14f84df84992a8afa2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:33:02 -0500 Subject: [PATCH 112/132] fix(agentic): disable NVSHMEM IBGDA for DEP --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 0aae6ef83..557803a5c 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -208,6 +208,7 @@ export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 if [ "$DP_ATTENTION" = "true" ]; then export NVSHMEM_DISABLE_IB=1 + export NVSHMEM_IB_ENABLE_IBGDA=0 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 fi From 496a047e186b4cbb7eda8f35ccf03f10056be314 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:38:53 -0500 Subject: [PATCH 113/132] fix(agentic): use proven SGLang DP MoE backend --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 557803a5c..939d961d2 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -167,8 +167,8 @@ if [ "$DP_ATTENTION" = "true" ]; then --enable-dp-attention --dist-init-addr "127.0.0.1:$((PORT + 2000))" --ep-size "$EP_SIZE" - --moe-a2a-backend deepep - --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune --enable-prefill-delayer ) METRICS_ARGS=() @@ -206,12 
+206,6 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 -if [ "$DP_ATTENTION" = "true" ]; then - export NVSHMEM_DISABLE_IB=1 - export NVSHMEM_IB_ENABLE_IBGDA=0 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 -fi - SGLANG_CMD=( "$SGLANG_PYTHON" -m sglang.launch_server --model-path "$MODEL_PATH" From a346dc1779d4317edb30d22461583f6ef259f57d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:46:17 -0500 Subject: [PATCH 114/132] fix(agentic): retain trace-compatible DEP context --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 939d961d2..5abdb2801 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -29,7 +29,7 @@ check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then if [ "$DP_ATTENTION" = "true" ]; then - MAX_MODEL_LEN=32768 + MAX_MODEL_LEN=131072 else MAX_MODEL_LEN=1000000 fi From 1ed44191c7c67f34bed4db7c08f3ffc04b630867 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:56:02 -0500 Subject: [PATCH 115/132] fix(agentic): preserve default SGLang context --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 5abdb2801..204cd311d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -28,11 +28,7 @@ source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh" check_env_vars MODEL TP CONC OFFLOADING 
TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - if [ "$DP_ATTENTION" = "true" ]; then - MAX_MODEL_LEN=131072 - else - MAX_MODEL_LEN=1000000 - fi + MAX_MODEL_LEN=1000000 fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then From 04dfbddbadf076877c3ffe4c433e388a0d38fe3e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 13:58:44 -0500 Subject: [PATCH 116/132] chore(agentic): update B300 SGLang image --- .github/configs/nvidia-master.yaml | 2 +- .../agentic/dsv4_fp4_b300_sglang.sh | 39 ------------------- 2 files changed, 1 insertion(+), 40 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2301b4368..dcf555061 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9441,7 +9441,7 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } dsv4-fp4-b300-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 + image: lmsysorg/sglang:v0.5.13-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 204cd311d..54469a5cd 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -59,45 +59,6 @@ if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then fi install_agentic_deps -# The 2026-06-09 image creates every local DP scheduler from the same HTTP -# port, so PortArgs derives the same RPC/metrics TCP range for every rank and -# rank 1 collides with rank 0 before startup completes. Give each scheduler a -# private derivation port until the upstream image includes the fix. 
-if [ "$DP_ATTENTION" = "true" ]; then - "$SGLANG_PYTHON" - <<'PY' -from pathlib import Path - -controller_path = Path("/sgl-workspace/sglang/python/sglang/srt/managers/data_parallel_controller.py") -source = controller_path.read_text() -old = " tmp_port_args = PortArgs.init_new(server_args)\n" -new = ( - " rank_server_args = dataclasses.replace(\n" - " server_args, port=server_args.port + (dp_rank + 1) * 100\n" - " )\n" - " tmp_port_args = PortArgs.init_new(rank_server_args)\n" -) -if old in source: - source = source.replace("import faulthandler\n", "import dataclasses\nimport faulthandler\n", 1) - source = source.replace(old, new, 1) - controller_path.write_text(source) -elif new not in source: - raise RuntimeError(f"Unexpected SGLang DP controller source: {controller_path}") -if new not in controller_path.read_text(): - raise RuntimeError(f"Failed to patch SGLang DP controller: {controller_path}") -server_args_path = Path("/sgl-workspace/sglang/python/sglang/srt/server_args.py") -source = server_args_path.read_text() -old = ( - ' wait_port_available(rpc_port, "rpc_port")\n' - ' wait_port_available(metrics_port, "metrics_port")\n' -) -if old in source: - source = source.replace(old, "", 1) - server_args_path.write_text(source) -elif old in server_args_path.read_text(): - raise RuntimeError(f"Failed to patch SGLang DP port checks: {server_args_path}") -PY -fi - SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" From 9f900c9fb8032dbf2537a31e58a43b7f3bf94227 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 14:22:35 -0500 Subject: [PATCH 117/132] fix(agentic): accept SGLang usage stream chunks --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index 4c6525ab7..ea2daff0e 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 4c6525ab71d4cd9fc01054410d5b88bfe4feff9e +Subproject commit ea2daff0e01fa497ac359450c96dec3bf6245167 From 
f9711ad2d03691d1ab2f9038ba96b114fc48b86c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 14:47:15 -0500 Subject: [PATCH 118/132] fix(agentic): surface SGLang stream errors --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index ea2daff0e..ff2b646c0 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit ea2daff0e01fa497ac359450c96dec3bf6245167 +Subproject commit ff2b646c0425aff9307a0e73161b23d77003a357 From 1aebf72e69d067eeb3865bf2542db3f66e85fda3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 15:08:29 -0500 Subject: [PATCH 119/132] fix(agentic): truncate prompts to SGLang KV capacity --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 54469a5cd..c2a4dd9c1 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -176,6 +176,7 @@ SGLANG_CMD=( --max-running-requests "$MAX_RUNNING_REQUESTS" --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" --context-length "$MAX_MODEL_LEN" + --allow-auto-truncate --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4 From f84b175231b85f03f82eeadd15ecd8414a70a242 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 15:14:38 -0500 Subject: [PATCH 120/132] chore: add temporary B300 memory profile matrix --- .github/configs/aiperf-memory-profile-b300.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .github/configs/aiperf-memory-profile-b300.yaml diff --git a/.github/configs/aiperf-memory-profile-b300.yaml b/.github/configs/aiperf-memory-profile-b300.yaml new file mode 100644 index 000000000..dfd16853f --- /dev/null +++ b/.github/configs/aiperf-memory-profile-b300.yaml @@ -0,0 +1,13 @@ 
+dsv4-fp4-b300-vllm-agentic-memory-profile: + image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } From b163d3084b2604715431b3d413a3121f6c888cd2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 16:04:44 -0500 Subject: [PATCH 121/132] perf(agentic): restore B300 SGLang DP KV capacity --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index c2a4dd9c1..c2121850a 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -129,7 +129,7 @@ if [ "$DP_ATTENTION" = "true" ]; then --enable-prefill-delayer ) METRICS_ARGS=() - MEM_FRACTION_STATIC=0.84 + MEM_FRACTION_STATIC=0.88 CHUNKED_PREFILL_SIZE=16384 else PARALLEL_ARGS+=( From b02eb37925ed958e99a964d80ad60a8c5cf64140 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 16:24:56 -0500 Subject: [PATCH 122/132] feat(agentic): restore B300 SGLang sweep matrix --- .github/configs/nvidia-master.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dcf555061..c8364c851 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9452,7 +9452,13 @@ dsv4-fp4-b300-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] } + - { tp: 8, 
offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] } + - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] } + - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 From 865995c743f29c46b2b11b805962db855dc96fce Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 16:50:45 -0500 Subject: [PATCH 123/132] chore: remove temporary B300 memory profile matrix --- .github/configs/aiperf-memory-profile-b300.yaml | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 .github/configs/aiperf-memory-profile-b300.yaml diff --git a/.github/configs/aiperf-memory-profile-b300.yaml b/.github/configs/aiperf-memory-profile-b300.yaml deleted file mode 100644 index dfd16853f..000000000 --- a/.github/configs/aiperf-memory-profile-b300.yaml +++ /dev/null @@ -1,13 +0,0 @@ -dsv4-fp4-b300-vllm-agentic-memory-profile: - image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } From b4220233c2880e10b80f17bdd343eeee508c3435 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 16:52:11 -0500 Subject: [PATCH 124/132] fix(agentic): use Triton bundled ptxas --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh 
b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index c2121850a..fb1cd2c71 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -163,6 +163,13 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +TRITON_BUNDLED_PTXAS=$( + "$SGLANG_PYTHON" -c \ + 'from pathlib import Path; import triton; print(Path(triton.__file__).parent / "backends/nvidia/bin/ptxas")' +) +if [ -x "$TRITON_BUNDLED_PTXAS" ]; then + export TRITON_PTXAS_PATH="$TRITON_BUNDLED_PTXAS" +fi SGLANG_CMD=( "$SGLANG_PYTHON" -m sglang.launch_server --model-path "$MODEL_PATH" From d904e5d5b42c7c903d97ba97876902df4da129fb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 17:36:00 -0500 Subject: [PATCH 125/132] fix(agentic): discover CUDA ptxas for Triton --- .../single_node/agentic/dsv4_fp4_b300_sglang.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index fb1cd2c71..606b47943 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -163,12 +163,14 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 -TRITON_BUNDLED_PTXAS=$( - "$SGLANG_PYTHON" -c \ - 'from pathlib import Path; import triton; print(Path(triton.__file__).parent / "backends/nvidia/bin/ptxas")' -) -if [ -x "$TRITON_BUNDLED_PTXAS" ]; then - export TRITON_PTXAS_PATH="$TRITON_BUNDLED_PTXAS" +TRITON_PTXAS_PATH=$(find \ + /usr/local/cuda/bin \ + /usr/local/lib/python*/dist-packages/nvidia \ + /usr/local/lib/python*/site-packages/nvidia \ + -type f -name ptxas -perm -u+x -print -quit 2>/dev/null || true) +if [ -n 
"$TRITON_PTXAS_PATH" ]; then + export TRITON_PTXAS_PATH + echo "Using ptxas for Triton: $TRITON_PTXAS_PATH" fi SGLANG_CMD=( "$SGLANG_PYTHON" -m sglang.launch_server From 88909ca47bfbaf963b05a87235615640ce87f300 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 18:01:42 -0500 Subject: [PATCH 126/132] fix(agentic): search versioned CUDA toolkits --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 606b47943..cfad20166 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -164,7 +164,7 @@ export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 TRITON_PTXAS_PATH=$(find \ - /usr/local/cuda/bin \ + /usr/local/cuda* \ /usr/local/lib/python*/dist-packages/nvidia \ /usr/local/lib/python*/site-packages/nvidia \ -type f -name ptxas -perm -u+x -print -quit 2>/dev/null || true) From 1fc2591b4ff6a092fa6294d2b6016174ad589993 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 11 Jun 2026 18:28:05 -0500 Subject: [PATCH 127/132] fix(agentic): precompile B300 DeepGEMM kernels --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index cfad20166..0f1981f99 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -157,7 +157,7 @@ export TORCH_CUDA_ARCH_LIST=10.0 # six-hour request timeout unchanged, but allow up to 15 minutes for TCP # progress before declaring the connection dead. 
export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 From e3389bb0d9f9c9705650f4f79099542eadaa687e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 12 Jun 2026 10:11:44 -0500 Subject: [PATCH 128/132] fix(agentic): restore known-good B300 SGLang image --- .github/configs/nvidia-master.yaml | 2 +- .../single_node/agentic/dsv4_fp4_b300_sglang.sh | 11 +---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c8364c851..c22aeef6f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9441,7 +9441,7 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] } dsv4-fp4-b300-sglang-agentic-hicache: - image: lmsysorg/sglang:v0.5.13-cu130 + image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index 0f1981f99..c2121850a 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -157,21 +157,12 @@ export TORCH_CUDA_ARCH_LIST=10.0 # six-hour request timeout unchanged, but allow up to 15 minutes for TCP # progress before declaring the connection dead. 
export AIPERF_HTTP_TCP_USER_TIMEOUT=900000 -export SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 -TRITON_PTXAS_PATH=$(find \ - /usr/local/cuda* \ - /usr/local/lib/python*/dist-packages/nvidia \ - /usr/local/lib/python*/site-packages/nvidia \ - -type f -name ptxas -perm -u+x -print -quit 2>/dev/null || true) -if [ -n "$TRITON_PTXAS_PATH" ]; then - export TRITON_PTXAS_PATH - echo "Using ptxas for Triton: $TRITON_PTXAS_PATH" -fi SGLANG_CMD=( "$SGLANG_PYTHON" -m sglang.launch_server --model-path "$MODEL_PATH" From 199ed1df89e4f5ce735900e12c60c78a9c5c4369 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 12 Jun 2026 13:34:28 -0500 Subject: [PATCH 129/132] fix(agentic): use valid B200 runner pool --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c22aeef6f..97a03f97e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -8454,7 +8454,7 @@ glm5-fp8-b200-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: zai-org/GLM-5-FP8 model-prefix: glm5 - runner: b200-dgxc + runner: b200 precision: fp8 framework: dynamo-sglang multinode: true From 1bdc56ebd08b12ca2e6adbca10c5423fa3467a43 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 12 Jun 2026 13:35:11 -0500 Subject: [PATCH 130/132] fix(agentic): use available B200 SGLang runners --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 97a03f97e..7a15b43a3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9281,7 +9281,7 @@ 
dsv4-fp4-b200-sglang-agentic-hicache: image: lmsysorg/sglang:v0.5.12.post1-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: b200-dgxc + runner: b200 precision: fp4 framework: sglang multinode: false From dfc27db9446bf8a78b3068cea2b90af1e9f2e33a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 12 Jun 2026 13:52:55 -0500 Subject: [PATCH 131/132] fix(agentic): collect B300 DP backend metrics --- benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh index c2121850a..dcc41f688 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh @@ -128,7 +128,6 @@ if [ "$DP_ATTENTION" = "true" ]; then --disable-flashinfer-autotune --enable-prefill-delayer ) - METRICS_ARGS=() MEM_FRACTION_STATIC=0.88 CHUNKED_PREFILL_SIZE=16384 else @@ -238,4 +237,7 @@ if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then fi build_replay_cmd "$RESULT_DIR" +if [ "$DP_ATTENTION" = "true" ]; then + REPLAY_CMD+=" --server-metrics http://localhost:$SGLANG_BACKEND_PORT/metrics" +fi run_agentic_replay_and_write_outputs "$RESULT_DIR" From dd77c829b5306bfbf0815bf919a9467d04708a6b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 12 Jun 2026 13:53:17 -0500 Subject: [PATCH 132/132] fix(agentic): collect B200 backend metrics explicitly --- benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh index 8f816ca0c..b159f9022 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh @@ -198,4 +198,5 @@ if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then fi build_replay_cmd "$RESULT_DIR" +REPLAY_CMD+=" --server-metrics 
http://localhost:$PORT/metrics" run_agentic_replay_and_write_outputs "$RESULT_DIR"