Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
132 commits
Select commit Hold shift + click to select a range
f632aa4
agentic(trace-source): default non-DSv4 to v6 (060226) corpus
camatsemianalysis Jun 2, 2026
5544a44
configs(master): consolidate agentic recipes at end + split combined …
cquil11 Jun 2, 2026
76aedd6
configs(master): bump all vllm images to v0.22.0
cquil11 Jun 2, 2026
6dede7b
configs(master): strip stale narrative comments
cquil11 Jun 2, 2026
3257275
chore(aiperf): bump submodule for 060226 loader allowlist fix
cquil11 Jun 2, 2026
321fd44
(testing) b300 dsv4 simple offloading
cquil11 Jun 2, 2026
3283934
runners(b300-nv): remap container UID to root for apt-get install
cquil11 Jun 2, 2026
360bcf0
benchmarks(agentic): skip hf download when MODEL_PATH is pre-staged
cquil11 Jun 2, 2026
57d4adb
benchmarks(agentic): launch server from MODEL_PATH, not the HF id
cquil11 Jun 2, 2026
1bccc5c
benchmarks(dsv4-b300): enable VLLM_PREFIX_CACHE_RETENTION_INTERVAL
cquil11 Jun 2, 2026
0946107
configs(dsv4-b300-vllm-agentic): bump cquil image to 6c529f30 for ret…
cquil11 Jun 3, 2026
38c365c
benchmarks(dsv4-b300-vllm): override trace loader to 060226 (v6)
cquil11 Jun 3, 2026
ee8d743
[AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Q…
seungrokj Jun 3, 2026
616f4db
[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scr…
seungrokj Jun 3, 2026
574d891
Revert "[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/…
seungrokj Jun 3, 2026
5ec21d4
utils(process_agentic_result): align cache metrics + theoretical-trac…
cquil11 Jun 3, 2026
d7841d8
feat(agentic): route DEP traffic through native vLLM router
cquil11 Jun 3, 2026
fc5a792
benchmarks(agentic): disable DCGM gpu_telemetry in aiperf invocation
cquil11 Jun 3, 2026
ba65df8
refactor(agentic): hardcode DSv4 B300 router settings
cquil11 Jun 3, 2026
76a3f09
fix(agentic): fail jobs with excessive aiperf errors
cquil11 Jun 3, 2026
923186d
feat(agentic): route B200 DEP traffic through native vLLM router
cquil11 Jun 3, 2026
5291955
benchmarks(agentic): default DSv4 recipes to v6 (060226) corpus
cquil11 Jun 3, 2026
40736e8
chore(agentic): bump aiperf for warmup progress logging
cquil11 Jun 3, 2026
70529f2
chore(agentic): bump aiperf for phase-continuous replay
cquil11 Jun 3, 2026
1ed0001
chore(agentic): bump aiperf snapshot accessor rename
cquil11 Jun 3, 2026
1c84916
fix(agentic): align B200 DSv4 with bespoke vLLM image
cquil11 Jun 3, 2026
4bd54ce
chore(agentic): bump aiperf for heartbeat-only warmup logging
cquil11 Jun 3, 2026
97576fa
test(agentic): run B300 CPU offload in eager mode
cquil11 Jun 4, 2026
e08ba71
config(dsv4-fp4 agentic): run offloading=none with expanded concurren…
cquil11 Jun 4, 2026
60f3be0
fix(agentic): extend native router request timeout
cquil11 Jun 4, 2026
3747263
fix(agentic): use native B300 KV offloading
cquil11 Jun 4, 2026
cb21694
(testing) add offload off scenario to dsv4 b300
cquil11 Jun 4, 2026
06a4ea7
test(agentic): enable blocking CUDA offload diagnostics
cquil11 Jun 4, 2026
fb362a6
chore(agentic): remove stale B200 sweep comments
cquil11 Jun 4, 2026
2f27bea
feat(agentic): use Mooncake store for B300 offload
cquil11 Jun 4, 2026
97c4b65
test(agentic): validate Mooncake over TCP on B300
cquil11 Jun 4, 2026
49b9967
fix(agentic): pool Mooncake TCP connections
cquil11 Jun 4, 2026
c3cfe74
fix(agentic): increase Mooncake TCP transfer slices
cquil11 Jun 4, 2026
f585282
test(agentic): pin Mooncake RDMA on B300
cquil11 Jun 4, 2026
3599c78
perf(agentic): map Mooncake RDMA NICs on B300
cquil11 Jun 5, 2026
fce4996
fix(agentic): use shared Mooncake RDMA NIC on B300
cquil11 Jun 5, 2026
8d6b735
perf(agentic): tune Mooncake RDMA transfers on B300
cquil11 Jun 5, 2026
d98d7ae
perf(agentic): use full B300 Mooncake memory budget
cquil11 Jun 5, 2026
b83265d
perf(agentic): evict Mooncake cache before rank exhaustion
cquil11 Jun 5, 2026
4178b78
feat(agentic): default to 060526 weka corpus (DSv4 base, others 256k)
cquil11 Jun 5, 2026
0731a8e
go
cquil11 Jun 5, 2026
e6fe59c
feat(agentic): use Mooncake offload for DSv4 B200
cquil11 Jun 5, 2026
cffe496
go
cquil11 Jun 5, 2026
5b0e1a0
fix(agentic): bump aiperf for Weka context resets
cquil11 Jun 8, 2026
63c7c59
test(agentic): reduce B200 Mooncake memory budget
cquil11 Jun 8, 2026
5a566b3
fix(agentic): use Mooncake TCP fallback on B200
cquil11 Jun 8, 2026
6f2d292
fix(agentic): reuse B200 Mooncake TCP connections
cquil11 Jun 8, 2026
4e8ec16
fix(agentic): use current Mooncake TCP transport on B200
cquil11 Jun 8, 2026
bb55646
fix(agentic): preserve cached Mooncake wheel filename
cquil11 Jun 8, 2026
eebefc1
test(agentic): use standalone Mooncake store on B200
cquil11 Jun 8, 2026
1112011
fix(agentic): bound B200 Mooncake transfer batches
cquil11 Jun 8, 2026
191c4fe
test(agentic): raise B200 Mooncake batch limit
cquil11 Jun 8, 2026
8894d58
fix(agentic): extend B200 Mooncake read lease
cquil11 Jun 8, 2026
077a4d0
test(agentic): use stock Mooncake DMA-BUF RDMA on B200
cquil11 Jun 8, 2026
acfeb45
feat(agentic): default to 060826 weka corpus (DSv4 base, others 256k)
cquil11 Jun 8, 2026
40df915
chore(aiperf): suppress repeated histogram schema warnings
cquil11 Jun 9, 2026
7acee30
chore(aiperf): exclude warmup from realtime counters
cquil11 Jun 9, 2026
d9e0089
chore(minimax agentic): vllm 0.22.1 + 060826-256k weka corpus
cquil11 Jun 9, 2026
09848ef
feat(agentic): add DSv4 SGLang HiCache sweeps
cquil11 Jun 9, 2026
864b8a9
fix(agentic): avoid SGLang DEP metrics port collision
cquil11 Jun 9, 2026
2e0f5fa
fix(agentic): run DSv4 HiCache on pure TP
cquil11 Jun 9, 2026
422e080
fix(agentic): size DSv4 HiCache ratio by TP
cquil11 Jun 9, 2026
18c46ca
perf(agentic): expand TP4 DSv4 HiCache tier
cquil11 Jun 9, 2026
b3d2068
fix(agentic): tolerate DSv4 SGLang admission stalls
cquil11 Jun 9, 2026
c14c939
perf(agentic): retain TP4 DSv4 HiCache working set
cquil11 Jun 9, 2026
746ba4c
delete gptoss
cquil11 Jun 9, 2026
e795090
perf(agentic): expand B200 DSv4 HiCache tier
cquil11 Jun 9, 2026
2bd0028
chore(agentic): register DSv4 SGLang HiCache sweeps
cquil11 Jun 9, 2026
24adefb
fix(agentic): use SGLang UnifiedTree deadlock fix
cquil11 Jun 9, 2026
3e0ea9c
fix(agentic): use DSv4 Blackwell image on B200
cquil11 Jun 9, 2026
21d1981
fix(agentic): preserve B200 specialized image workspace
cquil11 Jun 9, 2026
6f39022
fix(agentic): isolate AIPerf from B200 SGLang
cquil11 Jun 9, 2026
51334c0
fix(agentic): align B200 DSv4 SGLang runtime flags
cquil11 Jun 9, 2026
06b9b10
fix(agentic): use stable SGLang DSv4 image on B200
cquil11 Jun 9, 2026
abfe217
perf(agentic): load B200 DSv4 from node-local RAID
cquil11 Jun 9, 2026
8e08957
fix(agentic): enable selective Mooncake caching for DSv4
cquil11 Jun 10, 2026
3da2d69
fix(agentic): align B300 CUTLASS DSL bindings
cquil11 Jun 10, 2026
01fb21f
fix(agentic): align B200 CUTLASS DSL bindings
cquil11 Jun 10, 2026
7b2c50f
perf(agentic): enable Mooncake RDMA device affinity
cquil11 Jun 10, 2026
3925ad7
fix(agentic): grant full node memory to all host KV offload jobs
cquil11 Jun 10, 2026
0b02f0b
fix(agentic): re-pin GB300 agentic image to R30-validated v0.21.0
cquil11 Jun 10, 2026
633e263
fix(agentic): retain stable B200 Mooncake NIC pin
cquil11 Jun 10, 2026
db9603b
perf(agentic): test Mooncake RDMA affinity on B200 sweep
cquil11 Jun 10, 2026
3889ed3
feat(agentic): add GB200 DSv4 dynamo-vllm disagg agentic config
cquil11 Jun 10, 2026
0dffde7
fix(agentic): disable sbatch segment directive on gb200
cquil11 Jun 10, 2026
c2b341f
fix(agentic): don't leak login-node VIRTUAL_ENV into gb200 orchestrator
cquil11 Jun 10, 2026
66a71da
fix(agentic): gb200 256k context cap + collision-proof slurm job names
cquil11 Jun 10, 2026
ca24734
fix(agentic): gb200 TEP8/TP8 topology + srtctl-level job-name prefix
cquil11 Jun 10, 2026
fe1d695
fix(agentic): gb200 prefill headroom for long-context activation spikes
cquil11 Jun 10, 2026
64f43ed
fix(agentic): bump gb200 agentic to vllm v0.21.0 for NIXL TP8<->TP8
cquil11 Jun 11, 2026
5df62c5
fix(agentic): pin static NIXL engine_id for 2-node TP8 gb200 workers
cquil11 Jun 11, 2026
7f61829
fix(agentic): feed gb200 etcd CPUs + reject zero-request agentic results
cquil11 Jun 11, 2026
bfff4cc
fix(agentic): restore stable B200 Mooncake NIC pin
cquil11 Jun 11, 2026
a152f86
fix(agentic): route SGLang DP benchmarks
cquil11 Jun 11, 2026
07f94d4
test(agentic): isolate B300 SGLang DP canary
cquil11 Jun 11, 2026
481cbde
fix(agentic): avoid SGLang DP port collisions
cquil11 Jun 11, 2026
1379f98
fix(agentic): verify SGLang DP port patch
cquil11 Jun 11, 2026
f2e67fa
fix(agentic): bypass SGLang DP self-checks
cquil11 Jun 11, 2026
cd480e5
perf(agentic): fund B300 SGLang DEP KV cache
cquil11 Jun 11, 2026
cf3956a
fix(agentic): size SGLang DEP CUDA graphs globally
cquil11 Jun 11, 2026
b178cc1
fix(agentic): isolate SGLang DP rendezvous ports
cquil11 Jun 11, 2026
ca26948
fix(agentic): use loopback for SGLang DEP rendezvous
cquil11 Jun 11, 2026
bf54707
fix(agentic): bound SGLang DEP context length
cquil11 Jun 11, 2026
057be40
fix(agentic): reserve SGLang DEP request capacity
cquil11 Jun 11, 2026
e1e72dc
fix(agentic): pass global SGLang DEP concurrency
cquil11 Jun 11, 2026
1568580
fix(agentic): keep SGLang DEP transport local
cquil11 Jun 11, 2026
d50678e
fix(agentic): disable NVSHMEM IBGDA for DEP
cquil11 Jun 11, 2026
496a047
fix(agentic): use proven SGLang DP MoE backend
cquil11 Jun 11, 2026
a346dc1
fix(agentic): retain trace-compatible DEP context
cquil11 Jun 11, 2026
1ed4419
fix(agentic): preserve default SGLang context
cquil11 Jun 11, 2026
04dfbdd
chore(agentic): update B300 SGLang image
cquil11 Jun 11, 2026
9f900c9
fix(agentic): accept SGLang usage stream chunks
cquil11 Jun 11, 2026
f9711ad
fix(agentic): surface SGLang stream errors
cquil11 Jun 11, 2026
1aebf72
fix(agentic): truncate prompts to SGLang KV capacity
cquil11 Jun 11, 2026
f84b175
chore: add temporary B300 memory profile matrix
cquil11 Jun 11, 2026
b163d30
perf(agentic): restore B300 SGLang DP KV capacity
cquil11 Jun 11, 2026
b02eb37
feat(agentic): restore B300 SGLang sweep matrix
cquil11 Jun 11, 2026
865995c
chore: remove temporary B300 memory profile matrix
cquil11 Jun 11, 2026
b422023
fix(agentic): use Triton bundled ptxas
cquil11 Jun 11, 2026
d904e5d
fix(agentic): discover CUDA ptxas for Triton
cquil11 Jun 11, 2026
88909ca
fix(agentic): search versioned CUDA toolkits
cquil11 Jun 11, 2026
1fc2591
fix(agentic): precompile B300 DeepGEMM kernels
cquil11 Jun 11, 2026
e3389bb
fix(agentic): restore known-good B300 SGLang image
cquil11 Jun 12, 2026
199ed1d
fix(agentic): use valid B200 runner pool
cquil11 Jun 12, 2026
1bdc56e
fix(agentic): use available B200 SGLang runners
cquil11 Jun 12, 2026
dfc27db
fix(agentic): collect B300 DP backend metrics
cquil11 Jun 12, 2026
dd77c82
fix(agentic): collect B200 backend metrics explicitly
cquil11 Jun 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 145 additions & 249 deletions .github/configs/amd-master.yaml

Large diffs are not rendered by default.

914 changes: 449 additions & 465 deletions .github/configs/nvidia-master.yaml

Large diffs are not rendered by default.

23 changes: 20 additions & 3 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,17 @@ jobs:
- name: Slurm cleanup (pre-run)
run: &slurm-cleanup |
if command -v squeue >/dev/null 2>&1; then
echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
# Clean both the bare runner name and the "ifx-" prefixed form.
# launch_gb200-nv.sh names jobs ifx-<runner> to dodge a foreign
# runner fleet on watchtower that scancels by the bare name
# across users (see the comment there). squeue is filtered to
# our user so the wait loop can't hang on a same-named foreign
# job we have no permission to cancel.
echo "[Slurm] Cleaning up jobs named: ${{ runner.name }}, ifx-${{ runner.name }} ..."
scancel --name="${{ runner.name }}" || true
while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
squeue --name="${{ runner.name }}"
scancel --name="ifx-${{ runner.name }}" || true
while [ -n "$(squeue --user="$USER" --name='${{ runner.name }},ifx-${{ runner.name }}' --noheader --format='%i')" ]; do
squeue --user="$USER" --name="${{ runner.name }},ifx-${{ runner.name }}"
sleep 5
done
fi
Expand Down Expand Up @@ -218,6 +225,16 @@ jobs:
elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then
if [ -f "${RESULT_FILENAME}.json" ]; then
echo "Found agentic result file: ${RESULT_FILENAME}.json"
# Existence is not enough: process_agentic_result.py writes the
# aggregate even when aiperf recorded zero valid requests (e.g.
# the server 500'd every request — gb200 R8 went green on an
# all-null result this way). Require at least one successful
# request.
ok=$(python3 -c "import json,sys; d=json.load(open('${RESULT_FILENAME}.json')); print(int(bool(d.get('num_requests_successful'))))" 2>/dev/null || echo 0)
if [ "$ok" != "1" ]; then
echo "Run failed: ${RESULT_FILENAME}.json has zero successful requests." >&2
exit 1
fi
else
echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." >&2
exit 1
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,10 @@ jobs:
name: agentic_${{ env.RESULT_FILENAME }}
path: |
results/server.log
results/router.log
results/lmcache_server.log
results/mooncake_master.log
results/mooncake_config.json
results/benchmark.log
results/config.yaml
results/lmcache_command.txt
Expand Down Expand Up @@ -279,7 +282,10 @@ jobs:
name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
path: |
${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
if-no-files-found: ignore

- name: Upload GPU metrics
Expand Down
68 changes: 61 additions & 7 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,7 @@ run_eval() {
INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
AIPERF_FAILED_REQUEST_THRESHOLD=0.10

agentic_pip_install() {
local pip_install=(python3 -m pip install)
Expand All @@ -924,8 +925,21 @@ resolve_trace_source() {
# public-dataset loader names allowed by the inferencex-agentx-mvp
# scenario. Used by recipes whose servers have non-default context
# caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
# unfiltered 052726 corpus and switches to the 256k-capped variant).
local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
# unfiltered corpus and switches to the 256k-capped variant), or
# by recipes that want to pin an older corpus generation.
#
# Default (no override): the 060826 v6 corpus, selected by model family.
# DSv4 (full context) rides the unfiltered base corpus; every non-DSv4
# recipe defaults to the 256k-capped variant because those servers run at
# max_model_len ~256k and would reject >256k requests. Any recipe can still
# pin a specific corpus via WEKA_LOADER_OVERRIDE.
local default_loader
if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then
default_loader="semianalysis_cc_traces_weka_with_subagents_060826"
else
default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k"
fi
local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
local dataset
case "$loader" in
semianalysis_cc_traces_weka_with_subagents)
Expand All @@ -934,13 +948,31 @@ resolve_trace_source() {
semianalysis_cc_traces_weka_with_subagents_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060226)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
;;
semianalysis_cc_traces_weka_with_subagents_060226_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060526)
dataset="semianalysisai/cc-traces-weka-with-subagents-060526"
;;
semianalysis_cc_traces_weka_with_subagents_060526_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060826)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826"
;;
semianalysis_cc_traces_weka_with_subagents_060826_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k"
;;
*)
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2
exit 1
;;
esac
TRACE_SOURCE_FLAG="--public-dataset $loader"
echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
# Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
# for model weights) so subsequent runs read from cache instead of
# re-downloading every job.
Expand Down Expand Up @@ -1017,7 +1049,7 @@ build_replay_cmd() {
# transient low-rate failures from killing long sweeps while still
# catching malformed payloads or server crashes before they get aggregated
# as benchmarkable data.
REPLAY_CMD+=" --failed-request-threshold 0.10"
REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD"
# Sample each trajectory's warmup start position uniformly from
# [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
# Avoids starting trajectories right at turn 0 where the KV cache is
Expand All @@ -1031,6 +1063,14 @@ build_replay_cmd() {
# CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server
# usage is authoritative).
REPLAY_CMD+=" --use-server-token-count"
# Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries
# freezes its metric schema on the first DCGM scrape, then KeyErrors when
# an optional field (xid_errors, power_violation, encoder_utilization)
# first appears mid-run. We don't consume the gpu_telemetry artifact in
# downstream processing, and the server-metrics path (Prometheus /metrics
# from vLLM) is unaffected by this flag and still gives us KV usage,
# prefix cache hit rate, etc.
REPLAY_CMD+=" --no-gpu-telemetry"
# aiperf's dataset manager (separate from the inference parser) loads
# the model's tokenizer for trace-prompt tokenization regardless of
# --use-server-token-count. Models like kimi (amd/Kimi-K2.5-MXFP4,
Expand Down Expand Up @@ -1070,8 +1110,9 @@ build_replay_cmd() {

write_agentic_result_json() {
# Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json
# into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing
# retry-based existence check is the single success gate.
# into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that
# this file exists; run_agentic_replay_and_write_outputs separately rejects
# aggregates whose request error rate exceeds the configured limit.
local result_dir="$1"
RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \
python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py"
Expand All @@ -1085,6 +1126,7 @@ write_agentic_result_json() {
run_agentic_replay_and_write_outputs() {
local result_dir="$1"
local replay_rc
local validation_rc

echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"

Expand All @@ -1100,8 +1142,20 @@ run_agentic_replay_and_write_outputs() {
python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
"$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true

set +e
python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \
"$result_dir/aiperf_artifacts" \
--failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD"
validation_rc=$?
set -e

if [ "$replay_rc" -ne 0 ]; then
echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
return "$replay_rc"
fi

if [ "$validation_rc" -ne 0 ]; then
echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2
return "$validation_rc"
fi
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic"

# Agentic-coding recipe for GB200: 1 prefill (TEP=8) + 1 decode (TP=8),
# 16 GPUs across 4 GB200 nodes + 1 dedicated NATS/etcd infra node.
#
# Why TEP/TP instead of the fixed-seq-len DEP8/DEP8 family
# (disagg-gb200-mid-curve-megamoe.yaml): with data-parallel ranks each rank
# holds the FULL KV of its sequences, and DSv4's hybrid KV needs 19.82 GiB
# per rank just to admit one 256k-token request — but only ~8.8 GiB is free
# on a 186 GB GB200 GPU after FP4 weights + MegaMOE buffers (engine init
# died in _check_enough_kv_cache_memory; R4 jobs 18598/18600). Tensor
# parallelism shards the KV 8 ways (~2.5 GiB/GPU at 256k), which fits with
# room for concurrent sequences. Worker flag sets mirror the validated
# gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d
# agentic decode): no data-parallel, no deep_gemm_mega_moe.
#
# Container is v0.21.0-ubuntu2404 (the gb300-validated agentic stack), NOT
# the v0.20.0 the gb200 fixed-seq family pins: v0.20.0's NIXL connector
# breaks on TP8<->TP8 transfers — the decode worker's first get_finished()
# poll dies with KeyError on the remote (prefill) engine_id in
# transfer_topo.get_engine_info() because the prefill engine never
# registers in the decode's engine map (R6, both shards, identical
# tracebacks). The fixed-seq DEP8/DEP8 family never hits this path
# (per-rank TP=1 transfer topology). v0.21.0 + the same ai-dynamo wheel
# ran green NIXL transfers on gb300 agentic (R30 + manual 8137).
#
# Standard agentic deltas (see the gb300 agentic recipes):
#   - benchmark.type custom -> agentic_srt.sh
#   - prefix caching ON (no no-enable-prefix-caching)
#   - max-model-len 262144 + 060826 256k-capped corpus (GB200 cannot serve
#     the full 1M DSv4 context, mirroring the minimaxm2.5 agentic configs)
#   - infra.nats_max_payload_mb 32 (long agentic prompts exceed NATS' 1 MiB)
#   - srun_options.container-remap-root (apt-get git in agentic_srt.sh)

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  time_limit: "8:00:00"

# 1440 attempts at a 10 s interval is up to ~4 h of polling — presumably
# attempts are spaced by interval_seconds; confirm against srtctl.
health_check:
  max_attempts: 1440
  interval_seconds: 10

# One prefill worker and one decode worker, each 8 GPUs spread over 2 nodes
# of 4 GPUs (16 GPUs / 4 nodes total, matching the header comment).
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8

infra:
  etcd_nats_dedicated_node: true
  # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default
  # rejects long agentic prompts; 32 MiB gives ~10x headroom over the
  # largest observed payload.
  nats_max_payload_mb: 32

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null
  prefill_environment:
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_P2P_LEVEL: "NVL"
  decode_environment:
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    NCCL_P2P_LEVEL: "NVL"

  vllm_config:
    prefill:
      # Static engine_id (one per worker, distinct between prefill/decode):
      # the TP8 workers span 2 GB200 nodes, which srtctl launches as two
      # processes (--node-rank 0 + --node-rank 1 --headless). Without a
      # pinned engine_id each process generates its own random NIXL UUID, so
      # ranks 0-3 and ranks 4-7 of the SAME worker register under different
      # engine ids and the consumer's handshake dies with "Remote NIXL agent
      # engine ID mismatch" on the first transfer (R7, both shards).
      # Single-node-per-worker topologies (all gb300 recipes) never hit this.
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "11111111-1111-4111-8111-111111111111"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 8
      pipeline-parallel-size: 1
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      enforce-eager: true
      max-model-len: 262144
      max-num-seqs: 16
      # 16384 batched tokens + util 0.90 (the fixed-seq megamoe recipes use
      # 32768 + 0.95, tuned for 9k contexts): at 256k contexts the first
      # long prefill's activation spike (sparse indexer logits, mhc fused
      # kernels) needs ~2 GiB of runtime headroom that 0.95 doesn't leave —
      # R5 job 18603 died with "CUDA out of memory. Tried to allocate
      # 1.98 GiB ... 1.53 GiB free" on the first scheduled request. Matches
      # the green gb300 agentic prefill (0.9 / 16384).
      max-num-batched-tokens: 16384
      trust-remote-code: true
      no-enable-flashinfer-autotune: true
      no-async-scheduling: true
      block-size: 256
      gpu-memory-utilization: 0.9
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      numa-bind: true
      tokenizer-mode: deepseek_v4
    decode:
      # See prefill: static engine_id shared by both node processes of this
      # 2-node TP8 worker (distinct from the prefill worker's id).
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "22222222-2222-4222-8222-222222222222"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 8
      pipeline-parallel-size: 1
      enable-ep-weight-filter: true
      max-model-len: 262144
      max-num-seqs: 512
      max-cudagraph-capture-size: 512
      max-num-batched-tokens: 512
      trust-remote-code: true
      no-enable-flashinfer-autotune: true
      block-size: 256
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      all2all-backend: "flashinfer_nvlink_one_sided"
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4

# cpus-per-task=72: one full GB200 NUMA socket (144 cores split 2 x 72) per
# task. Critical for the *infra step* (etcd + nats), which srtctl spawns
# without --gres — on watchtower the per-GPU CPU default (CpusPerTres=gpu:35)
# doesn't apply to GPU-less steps, so etcd lands with 1 CPU, falls behind on
# lease keep-alives, and worker registrations silently expire mid-run: R8's
# decode worker logged "Keep-alive lease expired" 11 min after going healthy
# and the frontend 500'd every benchmark request with "Instance not found".
# Same failure mode and fix as the gb300 agentic recipes (their R12).
sbatch_directives:
  cpus-per-task: "72"

srun_options:
  # See gb300 agentic recipes: pyxis may map the calling user to a non-root
  # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get
  # install git works. No-op when the container user is already root.
  container-remap-root: ""

benchmark:
  type: custom
  command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh
  env:
    INFMAX_CONTAINER_WORKSPACE: /infmax-workspace
    RESULT_DIR: /logs/agentic
    PORT: "8000"
    IS_MULTINODE: "true"
    # Container-side path of the aiperf mmap dataset cache; the host-side
    # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts.
    # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
    # per dataset on every run.
    AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
    # Persistent HF hub cache (also wired via default_mounts) so the trace
    # dataset isn't re-downloaded on every run. Overrides the workflow-level
    # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes.
    HF_HUB_CACHE: "/hf_hub_cache"
    # The server runs at max-model-len 262144 (see header comment) — replay
    # the 256k-capped corpus and tell aiperf to filter inputs to the served
    # window, mirroring the minimaxm2.5 agentic configs.
    WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k"
    MAX_MODEL_LEN: "262144"
Loading