SemiAnalysisAI · cquil11 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -171,10 +171,17 @@ jobs:
       - name: Slurm cleanup (pre-run)
         run: &slurm-cleanup |
           if command -v squeue >/dev/null 2>&1; then
-            echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
+            # Clean both the bare runner name and the "ifx-" prefixed form.
+            # launch_gb200-nv.sh names jobs ifx-<runner> to dodge a foreign
+            # runner fleet on watchtower that scancels by the bare name
+            # across users (see the comment there). squeue is filtered to
+            # our user so the wait loop can't hang on a same-named foreign
+            # job we have no permission to cancel.
+            echo "[Slurm] Cleaning up jobs named: ${{ runner.name }}, ifx-${{ runner.name }} ..."
             scancel --name="${{ runner.name }}" || true
-            while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
-              squeue --name="${{ runner.name }}"
+            scancel --name="ifx-${{ runner.name }}" || true
+            while [ -n "$(squeue --user="$USER" --name='${{ runner.name }},ifx-${{ runner.name }}' --noheader --format='%i')" ]; do
+              squeue --user="$USER" --name="${{ runner.name }},ifx-${{ runner.name }}"
               sleep 5
             done
           fi
@@ -218,6 +225,16 @@ jobs:
           elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then
             if [ -f "${RESULT_FILENAME}.json" ]; then
               echo "Found agentic result file: ${RESULT_FILENAME}.json"
+              # Existence is not enough: process_agentic_result.py writes the
+              # aggregate even when aiperf recorded zero valid requests (e.g.
+              # the server 500'd every request — gb200 R8 went green on an
+              # all-null result this way). Require at least one successful
+              # request; an unreadable or malformed file also counts as failure.
+              ok=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(int(bool(d.get('num_requests_successful'))))" "${RESULT_FILENAME}.json" 2>/dev/null || echo 0)
+              if [ "$ok" != "1" ]; then
+                echo "Run failed: ${RESULT_FILENAME}.json has zero successful requests." >&2
+                exit 1
+              fi
             else
               echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." >&2
               exit 1

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -239,7 +239,10 @@ jobs:
           name: agentic_${{ env.RESULT_FILENAME }}
           path: |
             results/server.log
+            results/router.log
             results/lmcache_server.log
+            results/mooncake_master.log
+            results/mooncake_config.json
             results/benchmark.log
             results/config.yaml
             results/lmcache_command.txt
@@ -279,7 +282,10 @@ jobs:
           name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
           path: |
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
           if-no-files-found: ignore
 
       - name: Upload GPU metrics

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -899,6 +899,7 @@ run_eval() {
 INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
 AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
 AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
+AIPERF_FAILED_REQUEST_THRESHOLD="${AIPERF_FAILED_REQUEST_THRESHOLD:-0.10}"  # env-overridable; shared by aiperf and validate_agentic_result.py
 
 agentic_pip_install() {
     local pip_install=(python3 -m pip install)
@@ -924,8 +925,21 @@ resolve_trace_source() {
     # public-dataset loader names allowed by the inferencex-agentx-mvp
     # scenario. Used by recipes whose servers have non-default context
     # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
-    # unfiltered 052726 corpus and switches to the 256k-capped variant).
-    local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
+    # unfiltered corpus and switches to the 256k-capped variant), or
+    # by recipes that want to pin an older corpus generation.
+    #
+    # Default (no override): the 060826 v6 corpus, selected by model family.
+    # DSv4 (full context) rides the unfiltered base corpus; every non-DSv4
+    # recipe defaults to the 256k-capped variant because those servers run at
+    # max_model_len ~256k and would reject >256k requests. Any recipe can still
+    # pin a specific corpus via WEKA_LOADER_OVERRIDE.
+    local default_loader
+    if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then
+        default_loader="semianalysis_cc_traces_weka_with_subagents_060826"
+    else
+        default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k"
+    fi
+    local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
     local dataset
     case "$loader" in
         semianalysis_cc_traces_weka_with_subagents)
@@ -934,13 +948,31 @@ resolve_trace_source() {
         semianalysis_cc_traces_weka_with_subagents_256k)
             dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
             ;;
+        semianalysis_cc_traces_weka_with_subagents_060226)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060226_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060526)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060526"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060526_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060826)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060826"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060826_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k"
+            ;;
         *)
-            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
+            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2
             exit 1
             ;;
     esac
     TRACE_SOURCE_FLAG="--public-dataset $loader"
-    echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
+    echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
     # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
     # for model weights) so subsequent runs read from cache instead of
     # re-downloading every job.
@@ -1017,7 +1049,7 @@ build_replay_cmd() {
     # transient low-rate failures from killing long sweeps while still
     # catching malformed payloads or server crashes before they get aggregated
     # as benchmarkable data.
-    REPLAY_CMD+=" --failed-request-threshold 0.10"
+    REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD"
     # Sample each trajectory's warmup start position uniformly from
     # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
     # Avoids starting trajectories right at turn 0 where the KV cache is
@@ -1031,6 +1063,14 @@ build_replay_cmd() {
     # CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server
     # usage is authoritative).
     REPLAY_CMD+=" --use-server-token-count"
+    # Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries
+    # freezes its metric schema on the first DCGM scrape, then KeyErrors when
+    # an optional field (xid_errors, power_violation, encoder_utilization)
+    # first appears mid-run. We don't consume the gpu_telemetry artifact in
+    # downstream processing, and the server-metrics path (Prometheus /metrics
+    # from vLLM) is unaffected by this flag and still gives us KV usage,
+    # prefix cache hit rate, etc.
+    REPLAY_CMD+=" --no-gpu-telemetry"
     # aiperf's dataset manager (separate from the inference parser) loads
     # the model's tokenizer for trace-prompt tokenization regardless of
     # --use-server-token-count. Models like kimi (amd/Kimi-K2.5-MXFP4,
@@ -1070,8 +1110,9 @@ build_replay_cmd() {
 
 write_agentic_result_json() {
     # Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json
-    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing
-    # retry-based existence check is the single success gate.
+    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that
+    # this file exists (multinode also requires >=1 successful request) and
+    # run_agentic_replay_and_write_outputs rejects over-threshold error rates.
     local result_dir="$1"
     RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \
         python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py"
@@ -1085,6 +1126,7 @@ write_agentic_result_json() {
 run_agentic_replay_and_write_outputs() {
     local result_dir="$1"
     local replay_rc
+    local validation_rc
 
     echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"
 
@@ -1100,8 +1142,20 @@ run_agentic_replay_and_write_outputs() {
     python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
         "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true
 
+    set +e
+    python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \
+        "$result_dir/aiperf_artifacts" \
+        --failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD"
+    validation_rc=$?
+    set -e
+
     if [ "$replay_rc" -ne 0 ]; then
         echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
         return "$replay_rc"
     fi
+
+    if [ "$validation_rc" -ne 0 ]; then
+        echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2
+        return "$validation_rc"
+    fi
 }
diff --git a/...i_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/...i_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml
@@ -0,0 +1,205 @@
+name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic"
+
+# Agentic-coding recipe for GB200: 1 prefill (TEP=8) + 1 decode (TP=8),
+# 16 GPUs across 4 GB200 nodes + 1 dedicated NATS/etcd infra node.
+#
+# Why TEP/TP instead of the fixed-seq-len DEP8/DEP8 family
+# (disagg-gb200-mid-curve-megamoe.yaml): with data-parallel ranks each rank
+# holds the FULL KV of its sequences, and DSv4's hybrid KV needs 19.82 GiB
+# per rank just to admit one 256k-token request — but only ~8.8 GiB is free
+# on a 186 GB GB200 GPU after FP4 weights + MegaMOE buffers (engine init
+# died in _check_enough_kv_cache_memory; R4 jobs 18598/18600). Tensor
+# parallelism shards the KV 8 ways (~2.5 GiB/GPU at 256k), which fits with
+# room for concurrent sequences. Worker flag sets mirror the validated
+# gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d
+# agentic decode): no data-parallel, no deep_gemm_mega_moe.
+#
+# Container is v0.21.0-ubuntu2404 (the gb300-validated agentic stack), NOT
+# the v0.20.0 the gb200 fixed-seq family pins: v0.20.0's NIXL connector
+# breaks on TP8<->TP8 transfers — the decode worker's first get_finished()
+# poll dies with KeyError on the remote (prefill) engine_id in
+# transfer_topo.get_engine_info() because the prefill engine never
+# registers in the decode's engine map (R6, both shards, identical
+# tracebacks). The fixed-seq DEP8/DEP8 family never hits this path
+# (per-rank TP=1 transfer topology). v0.21.0 + the same ai-dynamo wheel
+# ran green NIXL transfers on gb300 agentic (R30 + manual 8137).
+#
+# Standard agentic deltas (see the gb300 agentic recipes):
+#   - benchmark.type custom -> agentic_srt.sh
+#   - prefix caching ON (no no-enable-prefix-caching)
+#   - max-model-len 262144 + 060826 256k-capped corpus (GB200 cannot serve
+#     the full 1M DSv4 context, mirroring the minimaxm2.5 agentic configs)
+#   - infra.nats_max_payload_mb 32 (long agentic prompts exceed NATS' 1 MiB)
+#   - srun_options.container-remap-root (apt-get install git in agentic_srt.sh)
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+  # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default
+  # rejects long agentic prompts; 32 MiB gives ~10x headroom over the
+  # largest observed payload.
+  nats_max_payload_mb: 32
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  vllm_config:
+    prefill:
+      # Static engine_id (one per worker, distinct between prefill/decode):
+      # the TP8 workers span 2 GB200 nodes, which srtctl launches as two
+      # processes (--node-rank 0 + --node-rank 1 --headless). Without a
+      # pinned engine_id each process generates its own random NIXL UUID, so
+      # ranks 0-3 and ranks 4-7 of the SAME worker register under different
+      # engine ids and the consumer's handshake dies with "Remote NIXL agent
+      # engine ID mismatch" on the first transfer (R7, both shards).
+      # Single-node-per-worker topologies (all gb300 recipes) never hit this.
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "11111111-1111-4111-8111-111111111111"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      enforce-eager: true
+      max-model-len: 262144
+      max-num-seqs: 16
+      # 16384 batched tokens + util 0.90 (the fixed-seq megamoe recipes use
+      # 32768 + 0.95, tuned for 9k contexts): at 256k contexts the first
+      # long prefill's activation spike (sparse indexer logits, mhc fused
+      # kernels) needs ~2 GiB of runtime headroom that 0.95 doesn't leave —
+      # R5 job 18603 died with "CUDA out of memory. Tried to allocate
+      # 1.98 GiB ... 1.53 GiB free" on the first scheduled request. Matches
+      # the green gb300 agentic prefill (0.9 / 16384).
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      # See prefill: static engine_id shared by both node processes of this
+      # 2-node TP8 worker (distinct from the prefill worker's id).
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "22222222-2222-4222-8222-222222222222"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-ep-weight-filter: true
+      max-model-len: 262144
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+# cpus-per-task=72: one full GB200 NUMA socket (144 cores split 2 x 72) per
+# task. Critical for the *infra step* (etcd + nats), which srtctl spawns
+# without --gres — on watchtower the per-GPU CPU default (CpusPerTres=gpu:35)
+# doesn't apply to GPU-less steps, so etcd lands with 1 CPU, falls behind on
+# lease keep-alives, and worker registrations silently expire mid-run: R8's
+# decode worker logged "Keep-alive lease expired" 11 min after going healthy
+# and the frontend 500'd every benchmark request with "Instance not found".
+# Same failure mode and fix as the gb300 agentic recipes (their R12).
+sbatch_directives:
+  cpus-per-task: "72"
+
+srun_options:
+  # See gb300 agentic recipes: pyxis may map the calling user to a non-root
+  # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get
+  # install git works. No-op when the container user is already root.
+  container-remap-root: ""
+
+benchmark:
+  type: custom
+  command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh
+  env:
+    INFMAX_CONTAINER_WORKSPACE: /infmax-workspace
+    RESULT_DIR: /logs/agentic
+    PORT: "8000"
+    IS_MULTINODE: "true"
+    # Container-side path of the aiperf mmap dataset cache; the host-side
+    # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts.
+    # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
+    # per dataset on every run.
+    AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
+    # Persistent HF hub cache (also wired via default_mounts) so the trace
+    # dataset isn't re-downloaded on every run. Overrides the workflow-level
+    # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes.
+    HF_HUB_CACHE: "/hf_hub_cache"
+    # The server runs at max-model-len 262144 (see header comment) — replay
+    # the 256k-capped corpus and tell aiperf to filter inputs to the served
+    # window, mirroring the minimaxm2.5 agentic configs.
+    WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k"
+    MAX_MODEL_LEN: "262144"