diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index fb3966ce6..a50d37eab 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }
 
-# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
-# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
-glm5.1-fp4-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/GLM-5.1-MXFP4
@@ -744,7 +705,7 @@ glm5.1-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
 kimik2.5-int4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi355x
@@ -763,7 +724,7 @@ kimik2.5-int4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi325x
@@ -782,7 +743,7 @@ kimik2.5-int4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi300x
@@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
-kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
-
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: amd/Kimi-K2.5-MXFP4
@@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
 
-# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi355x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
-  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
-  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
-  # cpu-offload sweep points to use the same offload path as the NVIDIA
-  # agentic-coding configs.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
-
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: MiniMaxAI/MiniMax-M2.5
@@ -994,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi300x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
-    # KV cliff ~52. Compute saturates first.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 minimaxm2.5-fp8-mi325x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -1058,32 +937,8 @@ minimaxm2.5-fp8-mi325x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi325x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
-    # similar HBM profile). Compute saturates first; cpu-offload window
-    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
-    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.17.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -1524,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_MTP_SIZE=2"
 
 kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x-disagg
@@ -1578,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           - "DECODE_NODES=2"
 
 minimaxm2.5-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x-disagg
@@ -1971,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
   
-
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2082,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
         conc-list: [ 128 ]
@@ -2140,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
 
-
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
 dsv4-fp4-mi355x-sglang:
   image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2201,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp:
       - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
 
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2408,44 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
-
-qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
-
-dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
-
 dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2674,20 +2465,145 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
       
+qwen3.5-fp8-mi355x-sglang-agentic:  # agentic-coding sibling of qwen3.5-fp8-mi355x-sglang
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414  # NOTE(review): older pin than the -agentic-hicache entry (v0.5.12); confirm intentional
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }  # single row: no offloading, conc 1-32
+
+glm5.1-fp4-mi355x-sglang-agentic:  # agentic-coding sibling of glm5.1-fp4-mi355x-sglang
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction, so sweep offloading=none only; the fixed-seq mi355x glm5.1 sweep stops at tp=4 conc=16, so cap conc conservatively (32 here)
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+kimik2.5-fp4-mi355x-vllm-agentic:  # agentic-coding sibling of kimik2.5-fp4-mi355x-vllm
+  image: vllm/vllm-openai-rocm:v0.22.0  # NOTE(review): the MiniMax -vllm-agentic entries pin v0.22.1; confirm v0.22.0 is intended here
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload is swept only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
+      # TP=4 probe: half-node layout doubles per-GPU weight footprint
+      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
+      # cliff-region concurrencies on both offload modes so TP=4 and TP=8
+      # can be compared directly at the same conc points.
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+
+minimaxm2.5-fp8-mi355x-vllm-agentic:  # agentic-coding sibling of minimaxm2.5-fp8-mi355x-vllm
+  image: vllm/vllm-openai-rocm:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first, so cpu offload likely won't help, but it is worth confirming.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }  # full sweep, conc 1-96
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }  # cpu offload from conc 48 up (spans the ~60 ceiling and ~91 cliff)
+
+minimaxm2.5-fp8-mi300x-vllm-agentic:  # agentic-coding sibling of minimaxm2.5-fp8-mi300x-vllm
+  image: vllm/vllm-openai-rocm:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
+    # KV cliff ~52. Compute is expected to saturate before the KV cliff.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }  # full sweep, conc 1-48
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }  # cpu offload only around the estimated ~25 compute ceiling
+
+minimaxm2.5-fp8-mi325x-vllm-agentic:  # agentic-coding sibling of minimaxm2.5-fp8-mi325x-vllm
+  image: vllm/vllm-openai-rocm:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
+    # similar HBM profile). Compute saturates first. Conc grid mirrors MI300X
+    # for cross-hardware comparability. NOTE(review): pinned to a release tag,
+    # not the rocm nightly; confirm SimpleCPUOffloadConnector vs native path.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:  # hicache variant of qwen3.5-fp8-mi355x-sglang-agentic
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521  # newer sglang pin than the plain -agentic entry (v0.5.10rc0)
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }  # no-offload row on this image, conc 1-32
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }  # hicache offload row, conc 16-64
+
+dsv4-fp4-mi355x-vllm-agentic:  # agentic-coding sibling of dsv4-fp4-mi355x-vllm
+  image: vllm/vllm-openai-rocm:v0.22.0  # same image tag as the dsv4-fp4-mi355x-vllm base entry
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }  # TP=8, low concurrency only
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }  # TP=4, conc 1-16
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }  # TP=4 + EP=4 with dp-attn, conc 16-48
 
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
-
-# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
-# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
-# comparability. Offload sweep is none-only (SGLang has no equivalent of
-# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
 dsv4-fp4-mi355x-sglang-agentic:
   image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2702,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
-
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d3b1b6729..7a15b43a3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt:
           ep: 8
           dp-attn: true
 
-    agentic-coding:
-    - duration: 300
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
-          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: false
-
 dsr1-fp8-b200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
 
-# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200-dsv4' -> 'b200-dgxc'
-dsv4-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
-      # Re-add when investigating regressions in offload=none.
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1845,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp:
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp4-b200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -2105,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml
-# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads
-# as `bmk_agentic_*`). Re-enable once that workflow is aligned.
-# qwen3.5-bf16-b200-sglang-agentic:
-#   image: lmsysorg/sglang:v0.5.12-cu130
-#   model: Qwen/Qwen3.5-397B-A17B
-#   model-prefix: qwen3.5
-#   runner: b200
-#   precision: bf16
-#   framework: sglang
-#   multinode: false
-#   scenarios:
-#     agentic-coding:
-#     - duration: 1800
-#       search-space:
-#       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2143,25 +2085,6 @@ qwen3.5-fp8-b200-sglang:
       - { tp: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
-# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
@@ -2245,26 +2168,6 @@ glm5-fp8-b200-sglang-mtp:
   # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1
   # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8
   # B200 SGLang recipe as-is until B300-specific tuning is available.
-# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main.
-glm5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:v0.5.12-cu130
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
-
 glm5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
@@ -2411,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
     
-
 qwen3.5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2553,39 +2455,8 @@ kimik2.5-int4-b200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-int4-b200-vllm-agentic:
-  # Bumped from v0.19.1 — that release tripped a bug in
-  # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to')
-  # during warmup `profile_run` on the agentic-coding path
-  # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
-  # flashinfer fix.
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-int4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b300
@@ -2624,29 +2495,6 @@ kimik2.5-int4-h200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'h200' -> 'h200-dgxc'
-kimik2.5-int4-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
-  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
-  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
-  # don't have that mount and would re-materialize 65 GB to /tmp every job.
-  runner: h200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
-      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
-
 kimik2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
@@ -2668,40 +2516,8 @@ kimik2.5-fp4-b200-vllm:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-fp4-b200-vllm-agentic:
-  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
-  # cleared the agentic-coding warmup crash on max_model_len=131072 +
-  # prefix caching.
-  image: vllm/vllm-openai:v0.20.2
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
-      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
-      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b300
@@ -2763,34 +2579,6 @@ dsr1-fp8-b300-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp }
 
-# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130'
-#   - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4'
-#   - model-prefix: 'dsr1' -> 'kimik2.5'
-#   - precision: 'fp8' -> 'fp4'
-#   - framework: 'sglang' -> 'vllm'
-kimik2.5-fp4-b300-vllm-agentic:
-  # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
-  # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
-  # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
-  # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
-  # INT4 B300 sister already uses successfully.
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2880,7 +2668,7 @@ dsr1-fp8-h200-sglang-mtp:
 # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
 # flag is omitted. Max-model-len is pinned at 800k per the recipe.
 dsv4-fp8-h200-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2904,7 +2692,7 @@ dsv4-fp8-h200-vllm:
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp8-h200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2924,31 +2712,6 @@ dsv4-fp8-h200-vllm-mtp:
       - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
-# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
-# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
-# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up.
-# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
-dsv4-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:deepseekv4-cu129
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
-
-# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
-# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
-# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
-
 dsv4-fp8-h200-sglang:
   image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3024,30 +2787,6 @@ dsv4-fp4-b300-vllm:
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
-# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
-dsv4-fp4-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs. Re-add when investigating regressions in offload=none.
-      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
-
 dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3095,7 +2834,7 @@ dsv4-fp4-b300-trt-mtp:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp }
 
 dsv4-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -4284,27 +4023,6 @@ gptoss-fp4-b200-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
-# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
-gptoss-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-
 minimaxm2.5-fp8-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -4330,35 +4048,8 @@ minimaxm2.5-fp8-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-#   - runner: 'b200' -> 'b200-dgxc'
-minimaxm2.5-fp8-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b200-dgxc
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
-    # Push none past the KV cliff (96, 128) to make the no-offload throughput
-    # collapse visible; cpu range overlaps fully for same-conc comparison.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 minimaxm2.5-fp8-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b300
@@ -4381,31 +4072,6 @@ minimaxm2.5-fp8-b300-vllm:
       - { tp: 2, conc-start: 64, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 8 }
 
-# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-minimaxm2.5-fp8-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b300
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
-    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
-    # collapse is visible; cpu range overlaps fully so each high-conc point
-    # has a same-conc no-offload counterpart for direct comparison.
-    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
-    # observed in v6 cpu data right past conc=96.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-
 minimaxm2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -4438,31 +4104,8 @@ minimaxm2.5-fp4-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: nvidia/MiniMax-M2.5-NVFP4
-  model-prefix: minimaxm2.5
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 minimaxm2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
   runner: b300
@@ -4489,7 +4132,7 @@ minimaxm2.5-fp4-b300-vllm:
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
 gptoss-fp4-h100-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h100
@@ -4530,29 +4173,6 @@ minimaxm2.5-fp8-h100-vllm:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
 
-# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h100-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h100
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
-    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
-    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
-
 dsr1-fp8-h100-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
@@ -4757,28 +4377,6 @@ minimaxm2.5-fp8-h200-vllm:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
-    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
-
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: nvidia/DeepSeek-R1-0528-NVFP4-v2
@@ -8267,7 +7865,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           dp-attn: true
 
 kimik2.5-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.18.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: gb200
@@ -8369,7 +7967,7 @@ kimik2.5-fp4-gb200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-b200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-multinode
@@ -8425,7 +8023,7 @@ dsv4-fp4-b200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8525,7 +8123,7 @@ dsv4-fp4-gb200-dynamo-vllm:
 # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image
 # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
 dsv4-fp4-gb200-dynamo-vllm-mtp2:
-  image: vllm/vllm-openai:v0.20.1-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8605,7 +8203,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           dp-attn: true
 
 dsv4-fp4-b300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -8661,7 +8259,7 @@ dsv4-fp4-b300-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-nv
@@ -8856,7 +8454,7 @@ glm5-fp8-b200-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
-  runner: b200-dgxc
+  runner: b200
   precision: fp8
   framework: dynamo-sglang
   multinode: true
@@ -9202,27 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp:
           ep: 8
           dp-attn: true
 
-
-kimik2.5-int4-h100-vllm:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: h100
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    # New entry, agentic-coding only: this PR intentionally does NOT add
-    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
-    # fixed-seq-len test surface identical to origin/main.
-    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
-    # early. Sweep saturates conc=20 to keep total HBM headroom.
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
-      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
-
 qwen3.5-fp8-h100-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9681,12 +9258,340 @@ glm5-fp8-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
+dsv4-fp4-b200-vllm-agentic:
+  # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention
+  # prefix-cache retention when deciding which hybrid-KV blocks to store.
+  image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none,  conc-list: [1, 4, 8, 16, 32, 40] }
+      - { tp: 8, offloading: cpu,  conc-list: [40, 48, 52, 64, 72] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,   conc-list: [64, 128, 196, 256, 512] }
+
+dsv4-fp4-b200-sglang-agentic-hicache:
+  image: lmsysorg/sglang:v0.5.12.post1-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none,     conc-list: [1, 4, 8, 16, 32, 40] }
+      - { tp: 8, offloading: hicache,  conc-list: [40, 48, 52, 64, 72, 84, 100, 128, 196, 256, 512] }
+
+qwen3.5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+glm5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
+
+kimik2.5-int4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
+
+# NOTE (kimik2.5-int4-b300-vllm): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so that config reuses the existing
+# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+kimik2.5-int4-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
+  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
+  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
+  # don't have that mount and would re-materialize 65 GB to /tmp every job.
+  runner: h200-dgxc
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
+      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
+
+kimik2.5-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
+      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so this config reuses the existing
+# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+kimik2.5-fp4-b300-vllm-agentic:
+  # Historical: v0.20.2 (cu129) lacked the flashinfer kernels for B300's
+  # reported SM (sm_12x); workers hit "Only SM 10.x and 11.x are supported"
+  # in the trtllm_fp4_block_scale_moe path, so this entry was pinned to the
+  # Blackwell-targeted v0.20.0-cu130 build (also used by the INT4 B300
+  # sister at the time). The image below has since moved to v0.22.0.
+  image: vllm/vllm-openai:v0.22.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
+dsv4-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
+
+# NOTE(review): stray copy of the dsv4-fp8-h200-vllm-mtp header (MTP via
+# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'); it does
+# not describe the entry below, and its v0.20.1/deepseekv4-cu129 tags are stale.
+
+dsv4-fp4-b300-vllm-agentic:
+  # Includes vllm-project/vllm#44774 so Mooncake honors sparse-attention
+  # prefix-cache retention when deciding which hybrid-KV blocks to store.
+  image: cquil/vllm-openai:v0.22.1-dcc957098904749bf375ffbf85aba6c74dfc9fe9
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # TEMPORARY: cpu offload is limited to the single MooncakeStore row below
+      # while the native/SimpleCPU offload failures are diagnosed (TODO confirm).
+      - { tp: 4, offloading: none,  conc-list: [1, 4, 8, 16, 32] }
+      - { tp: 8, offloading: none,  conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [8, 16, 32, 64, 128] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,   conc-list: [32, 48, 64, 96, 128, 192, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
+
+dsv4-fp4-b300-sglang-agentic-hicache:
+  image: lmsysorg/sglang:nightly-dev-cu13-20260609-317fc6a9
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 4, 8, 16, 32] }
+      - { tp: 8, offloading: none, conc-list: [1, 4, 8, 16, 32, 40, 48, 52, 64, 72] }
+      - { tp: 4, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] }
+      - { tp: 8, offloading: hicache, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [8, 16, 32, 64, 128] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: hicache, conc-list: [32, 48, 64, 96, 128, 192, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [52, 64, 72, 84, 100, 128, 196, 256, 512] }
+
+gptoss-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+
+minimaxm2.5-fp8-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b200-dgxc
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
+    # Push none past the KV cliff (96, 128) to make the no-offload throughput
+    # collapse visible; cpu range overlaps fully for same-conc comparison.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
+
+  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so this config reuses the existing
+  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+minimaxm2.5-fp8-b300-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b300
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
+    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
+    # collapse is visible; cpu range overlaps fully so each high-conc point
+    # has a same-conc no-offload counterpart for direct comparison.
+    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
+    # observed in v6 cpu data right past conc=96.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+
+minimaxm2.5-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.1
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+  # NOTE (minimaxm2.5-fp4-b300-vllm): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so that config reuses the existing
+  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+minimaxm2.5-fp8-h100-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h100
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
+    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
+    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
+
+minimaxm2.5-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
+    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
+
+kimik2.5-int4-h100-vllm:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: h100
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    # New entry, agentic-coding only: this PR intentionally does NOT add
+    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
+    # fixed-seq-len test surface identical to origin/main.
+    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
+    # early. Sweep saturates conc=20 to keep total HBM headroom.
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
+      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
 
 qwen3.5-fp8-b300-sglang-agentic-hicache:
   image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
@@ -9704,7 +9609,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache:
       - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
 
 kimik2.5-fp4-b200-vllm-agentic-lmcache:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -9724,16 +9629,17 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache:
 # does not have a B300-specific recipe, so this config reuses the existing
 # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
-# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
-# origin/main so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape
-#     mirroring the conc=192 point in the base entry's fixed-seq-len sweep.
-#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
-#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
-#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
-#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
 dsv4-fp4-gb300-dynamo-vllm-agentic:
+  # Pinned to the R30-validated stack (vllm v0.21.0-ubuntu2404 + ai-dynamo
+  # wheel 1.2.0.dev20260426). The repo-wide bump to v0.22.0 (76aedd65) broke
+  # this config silently: the agentic recipes' `model.container` field must
+  # match this image string for srtctl's containers-map lookup to resolve to
+  # the squash file the launcher imports — on mismatch srtctl passes the
+  # recipe string verbatim to pyxis, which re-pulls from Docker Hub on every
+  # node and ignores the imported squash. Bump this together with
+  # `model.container` in benchmarks/multi_node/srt-slurm-recipes/vllm/
+  # deepseek-v4/agentic/*.yaml once v0.22.x + the dynamo wheel is validated
+  # on GB300 disagg.
   image: vllm/vllm-openai:v0.21.0-ubuntu2404
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
@@ -9823,6 +9729,8 @@ dsv4-fp4-gb300-dynamo-vllm-agentic:
 # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe
 # applies to both clusters with no duplication.
 dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
+  # Image pinned to match the agentic recipes' `model.container` — see the
+  # comment on dsv4-fp4-gb300-dynamo-vllm-agentic.
   image: vllm/vllm-openai:v0.21.0-ubuntu2404
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
@@ -9881,16 +9789,62 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
           ep: 8
           dp-attn: true
 
-# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below;
-# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main
-# so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-#   - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster).
-# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130).
-# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with-
-# subagents corpus. hicache arm capped at conc 16 since high-conc + hicache
-# tends to flake on first runs and conc 16 covers the cliff. The bench script
-# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant.
+# GB200 sibling of the gb300 agentic configs. Unlike gb300, the topology is
+# TEP8 prefill + TP8 decode (NOT the fixed-seq-len DEP8/DEP8 megamoe family):
+# DSv4's hybrid KV needs ~20 GiB per data-parallel rank to admit one
+# 256k-token request, but GB200's 186 GB HBM leaves only ~8.8 GiB free after
+# FP4 weights — TP shards the KV 8-ways so it fits. See the recipe header.
+# No high-throughput conc-4096 tier yet: a single TP8 decode worker caps at
+# max-num-seqs 512, and DEP decode (which scales seqs) hits the KV wall
+# above; revisit with fp4 indexer cache or multi-worker TP8 decode.
+# Image matches the recipes' `model.container` (v0.21.0-ubuntu2404 — the
+# gb300-validated agentic stack; v0.20.0's NIXL connector breaks TP8<->TP8
+# transfers, see the recipe header); the two must stay in lockstep — see
+# dsv4-fp4-gb300-dynamo-vllm-agentic.
+dsv4-fp4-gb200-dynamo-vllm-agentic:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # Low-latency: 1p1d (TEP=8 / TP=8) at conc 32. 5 nodes incl. infra.
+      - spec-decoding: none
+        conc-list: [32]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+      # Mid: same 1p1d shape at conc 192.
+      - spec-decoding: none
+        conc-list: [192]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+
 qwen3.5-fp8-h100-sglang-agentic:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9905,3 +9859,33 @@ qwen3.5-fp8-h100-sglang-agentic:
       search-space:
       - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 12, 14, 16] }
       - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
+
+# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only.
+dsr1-fp4-b200-dynamo-trt-agentic:
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  runner: b200-multinode
+  precision: fp4
+  framework: dynamo-trt
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 300
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
+          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 81727ef39..d46c75a5c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -171,10 +171,17 @@ jobs:
       - name: Slurm cleanup (pre-run)
         run: &slurm-cleanup |
           if command -v squeue >/dev/null 2>&1; then
-            echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
+            # Clean both the bare runner name and the "ifx-" prefixed form.
+            # launch_gb200-nv.sh names jobs ifx-<runner> to dodge a foreign
+            # runner fleet on watchtower that scancels by the bare name
+            # across users (see the comment there). squeue is filtered to
+            # our user so the wait loop can't hang on a same-named foreign
+            # job we have no permission to cancel.
+            echo "[Slurm] Cleaning up jobs named: ${{ runner.name }}, ifx-${{ runner.name }} ..."
             scancel --name="${{ runner.name }}" || true
-            while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
-              squeue --name="${{ runner.name }}"
+            scancel --name="ifx-${{ runner.name }}" || true
+            while [ -n "$(squeue --user="$USER" --name='${{ runner.name }},ifx-${{ runner.name }}' --noheader --format='%i')" ]; do
+              squeue --user="$USER" --name="${{ runner.name }},ifx-${{ runner.name }}"
               sleep 5
             done
           fi
@@ -218,6 +225,16 @@ jobs:
           elif [ "${{ inputs.scenario-type }}" = "agentic-coding" ]; then
             if [ -f "${RESULT_FILENAME}.json" ]; then
               echo "Found agentic result file: ${RESULT_FILENAME}.json"
+              # Existence is not enough: process_agentic_result.py writes the
+              # aggregate even when aiperf recorded zero valid requests (e.g.
+              # the server 500'd every request — gb200 R8 went green on an
+              # all-null result this way). Require at least one successful
+              # request; the path is passed via argv, not spliced into the code.
+              ok=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print(int(bool(d.get('num_requests_successful'))))" "${RESULT_FILENAME}.json" 2>/dev/null || echo 0)
+              if [ "$ok" != "1" ]; then
+                echo "Run failed: ${RESULT_FILENAME}.json has zero successful requests." >&2
+                exit 1
+              fi
             else
               echo "Run failed: Agentic benchmark result ${RESULT_FILENAME}.json not found." >&2
               exit 1
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 2148def36..46f305fe8 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -239,7 +239,10 @@ jobs:
           name: agentic_${{ env.RESULT_FILENAME }}
           path: |
             results/server.log
+            results/router.log
             results/lmcache_server.log
+            results/mooncake_master.log
+            results/mooncake_config.json
             results/benchmark.log
             results/config.yaml
             results/lmcache_command.txt
@@ -279,7 +282,10 @@ jobs:
           name: ${{ inputs.eval-only && 'eval_server_logs_' || 'server_logs_' }}${{ env.RESULT_FILENAME }}
           path: |
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/server.log' || 'server.log' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/router.log' || '' }}
             ${{ inputs.scenario-type == 'agentic-coding' && 'results/lmcache_server.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_master.log' || '' }}
+            ${{ inputs.scenario-type == 'agentic-coding' && 'results/mooncake_config.json' || '' }}
           if-no-files-found: ignore
 
       - name: Upload GPU metrics
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e3080b4bf..3fd56e7e4 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -899,6 +899,7 @@ run_eval() {
 INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
 AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
 AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
+AIPERF_FAILED_REQUEST_THRESHOLD=0.10
 
 agentic_pip_install() {
     local pip_install=(python3 -m pip install)
@@ -924,8 +925,21 @@ resolve_trace_source() {
     # public-dataset loader names allowed by the inferencex-agentx-mvp
     # scenario. Used by recipes whose servers have non-default context
     # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
-    # unfiltered 052726 corpus and switches to the 256k-capped variant).
-    local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
+    # unfiltered corpus and switches to the 256k-capped variant), or
+    # by recipes that want to pin an older corpus generation.
+    #
+    # Default (no override): the 060826 v6 corpus, selected by model family.
+    # DSv4 (full context) rides the unfiltered base corpus; every non-DSv4
+    # recipe defaults to the 256k-capped variant because those servers run at
+    # max_model_len ~256k and would reject >256k requests. Any recipe can still
+    # pin a specific corpus via WEKA_LOADER_OVERRIDE.
+    local default_loader
+    if [[ "${MODEL_PREFIX:-}" == dsv4* ]]; then
+        default_loader="semianalysis_cc_traces_weka_with_subagents_060826"
+    else
+        default_loader="semianalysis_cc_traces_weka_with_subagents_060826_256k"
+    fi
+    local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
     local dataset
     case "$loader" in
         semianalysis_cc_traces_weka_with_subagents)
@@ -934,13 +948,31 @@ resolve_trace_source() {
         semianalysis_cc_traces_weka_with_subagents_256k)
             dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
             ;;
+        semianalysis_cc_traces_weka_with_subagents_060226)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060226_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060526)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060526"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060526_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060526-256k"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060826)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060826"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060826_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060826-256k"
+            ;;
         *)
-            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
+            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k, semianalysis_cc_traces_weka_with_subagents_060526, semianalysis_cc_traces_weka_with_subagents_060526_256k, semianalysis_cc_traces_weka_with_subagents_060826, semianalysis_cc_traces_weka_with_subagents_060826_256k" >&2
             exit 1
             ;;
     esac
     TRACE_SOURCE_FLAG="--public-dataset $loader"
-    echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
+    echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
     # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
     # for model weights) so subsequent runs read from cache instead of
     # re-downloading every job.
@@ -1017,7 +1049,7 @@ build_replay_cmd() {
     # transient low-rate failures from killing long sweeps while still
     # catching malformed payloads or server crashes before they get aggregated
     # as benchmarkable data.
-    REPLAY_CMD+=" --failed-request-threshold 0.10"
+    REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD"
     # Sample each trajectory's warmup start position uniformly from
     # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
     # Avoids starting trajectories right at turn 0 where the KV cache is
@@ -1031,6 +1063,14 @@ build_replay_cmd() {
     # CPU on minimax-m2.5 at high concurrency. Lossless for vLLM (server
     # usage is authoritative).
     REPLAY_CMD+=" --use-server-token-count"
+    # Disable DCGM GPU telemetry collection. aiperf's GpuMetricTimeSeries
+    # freezes its metric schema on the first DCGM scrape, then KeyErrors when
+    # an optional field (xid_errors, power_violation, encoder_utilization)
+    # first appears mid-run. We don't consume the gpu_telemetry artifact in
+    # downstream processing, and the server-metrics path (Prometheus /metrics
+    # from vLLM) is unaffected by this flag and still gives us KV usage,
+    # prefix cache hit rate, etc.
+    REPLAY_CMD+=" --no-gpu-telemetry"
     # aiperf's dataset manager (separate from the inference parser) loads
     # the model's tokenizer for trace-prompt tokenization regardless of
     # --use-server-token-count. Models like kimi (amd/Kimi-K2.5-MXFP4,
@@ -1070,8 +1110,9 @@ build_replay_cmd() {
 
 write_agentic_result_json() {
     # Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json
-    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing
-    # retry-based existence check is the single success gate.
+    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that
+    # this file exists; run_agentic_replay_and_write_outputs separately rejects
+    # aggregates whose request error rate exceeds the configured limit.
     local result_dir="$1"
     RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \
         python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py"
@@ -1085,6 +1126,7 @@ write_agentic_result_json() {
 run_agentic_replay_and_write_outputs() {
     local result_dir="$1"
     local replay_rc
+    local validation_rc
 
     echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"
 
@@ -1100,8 +1142,20 @@ run_agentic_replay_and_write_outputs() {
     python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
         "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true
 
+    set +e
+    python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \
+        "$result_dir/aiperf_artifacts" \
+        --failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD"
+    validation_rc=$?
+    set -e
+
     if [ "$replay_rc" -ne 0 ]; then
         echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
         return "$replay_rc"
     fi
+
+    if [ "$validation_rc" -ne 0 ]; then
+        echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2
+        return "$validation_rc"
+    fi
 }
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml
new file mode 100644
index 000000000..8587b5aae
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb200-1p1d-tep8-tp8-agentic.yaml
@@ -0,0 +1,205 @@
+name: "svf-vllm-disagg-gb200-1p1d-tep8-tp8-agentic"
+
+# Agentic-coding recipe for GB200: 1 prefill (TEP=8) + 1 decode (TP=8),
+# 16 GPUs across 4 GB200 nodes + 1 dedicated NATS/etcd infra node.
+#
+# Why TEP/TP instead of the fixed-seq-len DEP8/DEP8 family
+# (disagg-gb200-mid-curve-megamoe.yaml): with data-parallel ranks each rank
+# holds the FULL KV of its sequences, and DSv4's hybrid KV needs 19.82 GiB
+# per rank just to admit one 256k-token request — but only ~8.8 GiB is free
+# on a 186 GB GB200 GPU after FP4 weights + MegaMOE buffers (engine init
+# died in _check_enough_kv_cache_memory; R4 jobs 18598/18600). Tensor
+# parallelism shards the KV 8-ways (~2.5 GiB/GPU at 256k), which fits with
+# room for concurrent sequences. Worker flag sets mirror the validated
+# gb300 TEP/TP recipes (disagg-gb300-1p17d-tep4-tp4.yaml and the 1p6d
+# agentic decode): no data-parallel, no deep_gemm_mega_moe.
+#
+# Container is v0.21.0-ubuntu2404 (the gb300-validated agentic stack), NOT
+# the v0.20.0 the gb200 fixed-seq family pins: v0.20.0's NIXL connector
+# breaks on TP8<->TP8 transfers — the decode worker's first get_finished()
+# poll dies with KeyError on the remote (prefill) engine_id in
+# transfer_topo.get_engine_info() because the prefill engine never
+# registers in the decode's engine map (R6, both shards, identical
+# tracebacks). The fixed-seq DEP8/DEP8 family never hits this path
+# (per-rank TP=1 transfer topology). v0.21.0 + the same ai-dynamo wheel
+# ran green NIXL transfers on gb300 agentic (R30 + manual 8137).
+#
+# Standard agentic deltas (see the gb300 agentic recipes):
+#   - benchmark.type custom -> agentic_srt.sh
+#   - prefix caching ON (no no-enable-prefix-caching)
+#   - max-model-len 262144 + 060826 256k-capped corpus (GB200 cannot serve
+#     the full 1M DSv4 context, mirroring the minimaxm2.5 agentic configs)
+#   - infra.nats_max_payload_mb 32 (long agentic prompts exceed NATS' 1 MiB)
+#   - srun_options.container-remap-root (apt-get git in agentic_srt.sh)
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+  # See the gb300 1p6d agentic recipe for rationale — NATS' 1 MiB default
+  # rejects long agentic prompts; 32 MiB gives ~10x headroom over the
+  # largest observed payload.
+  nats_max_payload_mb: 32
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  vllm_config:
+    prefill:
+      # Static engine_id (one per worker, distinct between prefill/decode):
+      # the TP8 workers span 2 GB200 nodes, which srtctl launches as two
+      # processes (--node-rank 0 + --node-rank 1 --headless). Without a
+      # pinned engine_id each process generates its own random NIXL UUID, so
+      # ranks 0-3 and ranks 4-7 of the SAME worker register under different
+      # engine ids and the consumer's handshake dies with "Remote NIXL agent
+      # engine ID mismatch" on the first transfer (R7, both shards).
+      # Single-node-per-worker topologies (all gb300 recipes) never hit this.
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "11111111-1111-4111-8111-111111111111"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enable-ep-weight-filter: true
+      enforce-eager: true
+      max-model-len: 262144
+      max-num-seqs: 16
+      # 16384 batched tokens + util 0.90 (the fixed-seq megamoe recipes use
+      # 32768 + 0.95, tuned for 9k contexts): at 256k contexts the first
+      # long prefill's activation spike (sparse indexer logits, mhc fused
+      # kernels) needs ~2 GiB of runtime headroom that 0.95 doesn't leave —
+      # R5 job 18603 died with "CUDA out of memory. Tried to allocate
+      # 1.98 GiB ... 1.53 GiB free" on the first scheduled request. Matches
+      # the green gb300 agentic prefill (0.9 / 16384).
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      numa-bind: true
+      tokenizer-mode: deepseek_v4
+    decode:
+      # See prefill: static engine_id shared by both node processes of this
+      # 2-node TP8 worker (distinct from the prefill worker's id).
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "22222222-2222-4222-8222-222222222222"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-ep-weight-filter: true
+      max-model-len: 262144
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+# cpus-per-task=72: one full GB200 NUMA socket (144 cores split 2 x 72) per
+# task. Critical for the *infra step* (etcd + nats), which srtctl spawns
+# without --gres — on watchtower the per-GPU CPU default (CpusPerTres=gpu:35)
+# doesn't apply to GPU-less steps, so etcd lands with 1 CPU, falls behind on
+# lease keep-alives, and worker registrations silently expire mid-run: R8's
+# decode worker logged "Keep-alive lease expired" 11 min after going healthy
+# and the frontend 500'd every benchmark request with "Instance not found".
+# Same failure mode and fix as the gb300 agentic recipes (their R12).
+sbatch_directives:
+  cpus-per-task: "72"
+
+srun_options:
+  # See gb300 agentic recipes: pyxis may map the calling user to a non-root
+  # uid inside the container; remap to uid 0 so agentic_srt.sh's apt-get
+  # install git works. No-op when the container user is already root.
+  container-remap-root: ""
+
+benchmark:
+  type: custom
+  command: bash /infmax-workspace/benchmarks/multi_node/agentic_srt.sh
+  env:
+    INFMAX_CONTAINER_WORKSPACE: /infmax-workspace
+    RESULT_DIR: /logs/agentic
+    PORT: "8000"
+    IS_MULTINODE: "true"
+    # Container-side path of the aiperf mmap dataset cache; the host-side
+    # mount is wired via launch_gb200-nv.sh's srtslurm.yaml default_mounts.
+    # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
+    # per dataset on every run.
+    AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
+    # Persistent HF hub cache (also wired via default_mounts) so the trace
+    # dataset isn't re-downloaded on every run. Overrides the workflow-level
+    # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes.
+    HF_HUB_CACHE: "/hf_hub_cache"
+    # The server runs at max-model-len 262144 (see header comment) — replay
+    # the 256k-capped corpus and tell aiperf to filter inputs to the served
+    # window, mirroring the minimaxm2.5 agentic configs.
+    WEKA_LOADER_OVERRIDE: "semianalysis_cc_traces_weka_with_subagents_060826_256k"
+    MAX_MODEL_LEN: "262144"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml
index fb7b9fd97..2caf202a6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-1p6d-dep4-tp4-agentic.yaml
@@ -175,3 +175,7 @@ benchmark:
     # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
     # per dataset on every run.
     AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
+    # Persistent HF hub cache (also wired via default_mounts) so the trace
+    # dataset isn't re-downloaded on every run. Overrides the workflow-level
+    # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes.
+    HF_HUB_CACHE: "/hf_hub_cache"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml
index bb8fc6df8..98e25c450 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic/disagg-gb300-4p1d-dep4-dep8-24-c4096-agentic.yaml
@@ -174,3 +174,7 @@ benchmark:
     # Without this, aiperf re-tokenizes + re-writes ~65 GB of mmap files
     # per dataset on every run.
     AIPERF_DATASET_MMAP_CACHE_DIR: "/aiperf_mmap_cache"
+    # Persistent HF hub cache (also wired via default_mounts) so the trace
+    # dataset isn't re-downloaded on every run. Overrides the workflow-level
+    # HF_HUB_CACHE=/mnt/hf_hub_cache, which doesn't exist on these nodes.
+    HF_HUB_CACHE: "/hf_hub_cache"
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index f9955adc7..16dc3bfd5 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# download and use the model ID. Either way, the server launches from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -33,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path $MODEL \
+--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index ff76b768d..3b2561fe2 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# download and use the model ID. Either way, the server launches from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 amd-smi || true
 
@@ -34,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path=$MODEL_PATH --served-model-name=$MODEL \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh
new file mode 100755
index 000000000..b159f9022
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_sglang.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on B200 using SGLang.
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV cache with RadixAttention prefix caching.
+#   hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
+
+# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under
+# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the
+# agentic tooling and results against the actual repository mount so the image
+# can keep its /workspace install and GitHub Actions can collect the outputs.
+if [[ ! -d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then
+    export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT"
+fi
+if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then
+    export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}"
+fi
+
+source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=1000000
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+nvidia-smi
+
+resolve_trace_source
+
+# Keep AIPerf's Transformers-main dependency from replacing the older
+# Transformers build pinned by the B200-specialized SGLang image. The server
+# always launches with the image's original interpreter; AIPerf and result
+# processing use the isolated environment when InferenceX is mounted at /ix.
+SGLANG_PYTHON="$(command -v python3)"
+if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then
+    AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}"
+    "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV"
+    export PATH="$AGENTIC_VENV/bin:$PATH"
+fi
+install_agentic_deps
+
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+if [ "$DP_ATTENTION" = "true" ]; then
+    echo "Error: current SGLang nightly self-collides on internal IPC ports during single-node DP-attention startup; use pure TP until upstream fixes PortArgs initialization." >&2
+    exit 1
+fi
+
+CACHE_ARGS=()
+case "$OFFLOADING" in
+    none)
+        ;;
+    hicache)
+        # DeepSeek V4 HiCache currently rejects --hicache-size and supports
+        # capacity control only through a host/device token-capacity ratio.
+        # DSv4 allocates several physical host sub-pools for each logical host
+        # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total)
+        # while model loading/page cache is still resident and the OS kills a
+        # rank, so leave transient startup headroom with ratio=2. B200 has a
+        # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a
+        # substantially larger useful CPU tier while staying within its node
+        # budget.
+        # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at
+        # C48/C64. Ratio=8 still cannot retain the C64 session working set long
+        # enough to produce host hits. Ratio=16 provides roughly 21M logical
+        # host tokens while remaining below the B300 node's host budget.
+        if [ "$TP" -ge 8 ]; then
+            DEFAULT_HICACHE_RATIO=8
+        else
+            DEFAULT_HICACHE_RATIO=16
+        fi
+        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
+        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-ratio "$HICACHE_RATIO"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+        )
+        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+PARALLEL_ARGS=(--tp "$TP")
+METRICS_ARGS=(--enable-metrics)
+CHUNKED_PREFILL_SIZE=8192
+PARALLEL_ARGS+=(
+    --moe-runner-backend flashinfer_mxfp4
+    --disable-flashinfer-autotune
+)
+
+MODEL_ARGS=()
+# The B200-specialized image deadlocks immediately after weight loading when
+# forced through the B300 compressed-attention/page-size overrides.
+MEM_FRACTION_STATIC=0.90
+
+PER_ENGINE_MAX_RUNNING=$CONC
+[ "$PER_ENGINE_MAX_RUNNING" -lt 1 ] && PER_ENGINE_MAX_RUNNING=1
+CUDA_GRAPH_MAX_BS=$PER_ENGINE_MAX_RUNNING
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+
+export PYTHONNOUSERSITE=1
+export TORCH_CUDA_ARCH_LIST=10.0
+# Agentic warmup dispatches hundreds of large prompts at once. SGLang's
+# tokenizer process can leave request bytes unacknowledged for longer than
+# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst,
+# causing Linux to abort otherwise-live localhost connections. Keep the
+# six-hour request timeout unchanged, but allow up to 15 minutes for TCP
+# progress before declaring the connection dead.
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+export SGLANG_OPT_USE_JIT_NORM=1
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+export SGLANG_OPT_USE_TOPK_V2=1
+export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+
+SGLANG_CMD=(
+    "$SGLANG_PYTHON" -m sglang.launch_server
+    --model-path "$MODEL_PATH"
+    --served-model-name "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --trust-remote-code
+    "${PARALLEL_ARGS[@]}"
+    --mem-fraction-static "$MEM_FRACTION_STATIC"
+    --swa-full-tokens-ratio 0.1
+    --max-running-requests "$PER_ENGINE_MAX_RUNNING"
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
+    --context-length "$MAX_MODEL_LEN"
+    --chunked-prefill-size "$CHUNKED_PREFILL_SIZE"
+    --tool-call-parser deepseekv4
+    --reasoning-parser deepseek-v4
+    --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja"
+    --watchdog-timeout 1800
+    "${MODEL_ARGS[@]}"
+    "${METRICS_ARGS[@]}"
+    "${CACHE_ARGS[@]}"
+)
+
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+
+{
+    echo "=== SGLANG_* env vars at launch ==="
+    env | grep -E '^SGLANG_' | sort
+    echo "==================================="
+} | tee "$SERVER_LOG"
+
+echo "Starting SGLang server for B200..."
+"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+capture_cache_metrics() {
+    {
+        echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ==="
+        curl -fsS "http://localhost:$PORT/metrics" 2>/dev/null \
+            | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \
+            || true
+        echo "============================================================"
+    } >> "$SERVER_LOG"
+}
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then
+    capture_cache_metrics
+    trap capture_cache_metrics EXIT
+fi
+
+build_replay_cmd "$RESULT_DIR"
+REPLAY_CMD+=" --server-metrics http://localhost:$PORT/metrics"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 108347479..514c6df8c 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -13,18 +13,17 @@ set -x
 #       experts EP-sharded across DP ranks (per the vLLM blog recipe).
 #       Highest aggregate throughput at large CONC.
 #
-# Image is vllm/vllm-openai:v0.20.0-cu130. block_size=256, kv-cache-dtype=fp8,
-# FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph capture with
-# custom_ops=all (per the vLLM blog recipe at https://vllm.ai/blog/deepseek-v4).
+# Image is configured in nvidia-master.yaml. block_size=256,
+# kv-cache-dtype=fp8, FP4 indexer cache enabled, FULL_AND_PIECEWISE cudagraph
+# capture with custom_ops=all (per the vLLM blog recipe at
+# https://vllm.ai/blog/deepseek-v4).
 #
 # Required env vars:
 #   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
 #
 # OFFLOADING values:
-#   none        - vLLM GPU KV only, with DSv4 hybrid KV manager enabled.
-#   cpu         - vLLM native OffloadingConnector, with hybrid KV manager enabled.
-#   lmcache-mp  - Temporarily disabled for DSv4. LMCache PR #3261 must merge
-#                 first so LMCacheMPConnector can support HMA block-id tuples.
+#   none - vLLM GPU KV only.
+#   cpu  - MooncakeStoreConnector with a shared 2.5 TB host-memory KV tier.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -38,157 +37,130 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# and use the repo id; either way, MODEL_PATH is what the server launches with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
+# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings,
+# which fails DSV4 indexer compilation with mlir_global_dtors(..., data).
+# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved.
+agentic_pip_install --quiet --force-reinstall --no-deps \
+    'nvidia-cutlass-dsl-libs-cu13==4.5.2'
+
+# vllm-project/router expands the one HTTP backend into one logical worker per
+# DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's
+# X-Correlation-ID is stable for every turn of a conversation; alias it to the
+# router's preferred X-Session-ID header.
+USE_VLLM_ROUTER=false
+VLLM_BACKEND_PORT="$PORT"
+if [ "$DP_ATTENTION" = "true" ]; then
+    USE_VLLM_ROUTER=true
+    VLLM_BACKEND_PORT=$((PORT + 1))
+    VLLM_ROUTER_VERSION=0.1.14
+    VLLM_ROUTER_POLICY=consistent_hash
+    VLLM_ROUTER_METRICS_PORT=$((PORT + 10000))
+    export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1
+    agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION"
+fi
+
 # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
+# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while
+# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's
+# store mask. 32k matches the trace-replay tuning validated for this workload.
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
+
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
-LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+ROUTER_LOG="$RESULT_DIR/router.log"
+MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log"
 mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
-HYBRID_KV_ARGS=(--no-disable-hybrid-kv-cache-manager)
-LMCACHE_PID=""
-
-cleanup_lmcache_server() {
-    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
-        kill "$LMCACHE_PID" 2>/dev/null || true
-        wait "$LMCACHE_PID" 2>/dev/null || true
-    fi
-}
-
-trap cleanup_lmcache_server EXIT
-
-wait_for_lmcache_ready() {
-    { set +x; } 2>/dev/null
-    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
-    local tail_pid=""
-
-    while [ ! -f "$LMCACHE_LOG" ]; do
-        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
-            echo "LMCache server died before creating log file. Exiting." >&2
-            exit 1
-        fi
-        sleep 1
-    done
-
-    tail -f -n +1 "$LMCACHE_LOG" &
-    tail_pid=$!
-
-    for ((i = 1; i <= attempts; i++)); do
-        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
-            kill "$tail_pid" 2>/dev/null || true
-            wait "$tail_pid" 2>/dev/null || true
-            return 0
-        fi
-        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
-            echo "LMCache server died before becoming healthy. Log follows:" >&2
-            kill "$tail_pid" 2>/dev/null || true
-            wait "$tail_pid" 2>/dev/null || true
-            cat "$LMCACHE_LOG" >&2 || true
-            exit 1
-        fi
-        sleep 1
-    done
-
-    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
-    kill "$tail_pid" 2>/dev/null || true
-    wait "$tail_pid" 2>/dev/null || true
-    cat "$LMCACHE_LOG" >&2 || true
-    exit 1
-}
 
 case "$OFFLOADING" in
     none) ;;
     cpu)
-        # b200-dgxc compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
-        # individual jobs to a fraction of that. Aim for ~1.2 TB total native
-        # CPU offload pool across the engine(s); previously 2.8 TB but every
-        # DP-attn worker stalled for 4+ min during pinned-CPU-tensor allocation
-        # and the shm_broadcast watchdog killed them (run 26246044726). 150 GB
-        # per worker (1.2 TB / 8) completes the alloc within the 60 s window.
+        # B200 DGXC compute nodes have about 3.9 TB host RAM. Leave enough
+        # headroom for model workers and the runtime.
         #
-        # Native --kv-offloading-size becomes OffloadingConnector's
-        # cpu_bytes_to_use. For DP-attn there are $TP independent DP engines,
-        # so pre-divide to keep aggregate host commit near TOTAL_CPU_DRAM_GB.
-        # For pure TP, vLLM treats the size as the total across TP ranks.
-        TOTAL_CPU_DRAM_GB=1200
-        if [ "$DP_ATTENTION" = "true" ]; then
-            PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
-        else
-            PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
+        # Embedded mode contributes one segment per GPU rank to a shared
+        # distributed store, so pre-divide the aggregate host-memory budget.
+        TOTAL_CPU_DRAM_GB=2500
+        PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP))
+
+        MOONCAKE_VERSION=0.3.11.post1
+        agentic_pip_install --quiet --no-cache-dir --no-deps \
+            --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION"
+        python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null
+
+        MOONCAKE_MASTER_PORT=$((PORT + 12000))
+        MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json"
+        cat > "$MOONCAKE_CONFIG_PATH" <<EOF
+{
+  "mode": "embedded",
+  "metadata_server": "P2PHANDSHAKE",
+  "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT",
+  "global_segment_size": "${PER_RANK_GB}GB",
+  "local_buffer_size": "4GB",
+  "protocol": "rdma",
+  "device_name": "mlx5_0",
+  "enable_offload": false
+}
+EOF
+        export MOONCAKE_CONFIG_PATH
+        # Identical prefixes must hash to identical store keys across DP ranks.
+        export PYTHONHASHSEED=0
+        # B200 GPU memory registration works through DMA-BUF, but the compute
+        # nodes do not expose nvidia_peermem. Force Mooncake's DMA-BUF
+        # GPUDirect RDMA path instead of its legacy ibv_reg_mr path.
+        export WITH_NVIDIA_PEERMEM=0
+        export MC_SLICE_SIZE=1048576
+        export MC_WORKERS_PER_CTX=4
+
+        # Each rank contributes a separate segment. Evict early enough to
+        # avoid an imbalanced rank exhausting its segment.
+        MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO=0.80
+        MOONCAKE_EVICTION_RATIO=0.10
+        # Mooncake's default 5s read lease is shorter than the observed
+        # transfer latency for large DSv4 hybrid-KV loads on B200 TCP.
+        MOONCAKE_KV_LEASE_TTL=60s
+
+        echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..."
+        mooncake_master --port "$MOONCAKE_MASTER_PORT" \
+            --eviction_high_watermark_ratio="$MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO" \
+            --eviction_ratio="$MOONCAKE_EVICTION_RATIO" \
+            --default_kv_lease_ttl="$MOONCAKE_KV_LEASE_TTL" \
+            > "$MOONCAKE_MASTER_LOG" 2>&1 &
+        MOONCAKE_MASTER_PID=$!
+        sleep 2
+        if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then
+            echo "Mooncake master died during startup." >&2
+            cat "$MOONCAKE_MASTER_LOG" >&2
+            exit 1
         fi
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
-        OFFLOAD_ARGS=(
-            --kv-offloading-backend native
-            --kv-offloading-size "$PER_ENGINE_GB"
-        )
-        ;;
-    lmcache-mp)
-        { set +x; } 2>/dev/null
-        # LMCacheMPConnector needs HMA support before it can run DSv4 with the
-        # hybrid KV manager. Re-enable this path after
-        # https://github.com/LMCache/LMCache/pull/3261 is merged.
-        echo "Error: OFFLOADING=lmcache-mp is disabled for DSv4 until LMCache PR #3261 adds HMA support." >&2
-        exit 1
-
-        # LMCache docs recommend MP mode for production: start an external
-        # `lmcache server`, then point vLLM's LMCacheMPConnector at it. For
-        # vLLM >= 0.20, prefer the LMCache-shipped connector module because it
-        # tracks the latest server protocol ahead of vLLM's vendored copy.
-        #
-        # Important DSv4 caveat: LMCacheMPConnector currently only accepts the
-        # non-hybrid KV block layout. The connector raises if vLLM returns the
-        # hybrid block-id tuple used by the CSA/HCA hybrid KV manager. This
-        # mode therefore disables the hybrid manager; `none` and `cpu` keep it
-        # enabled for the normal B200 DSv4 path.
-        agentic_pip_install --quiet --no-cache-dir lmcache
-        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
-
-        TOTAL_CPU_DRAM_GB=2800
-        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
-        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
-        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
-        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-200}"
-        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
-        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
-
-        echo "Starting LMCache MP server..."
-        LMCACHE_CMD=(
-            lmcache server
-            --host "$LMCACHE_HOST"
-            --port "$LMCACHE_PORT"
-            --http-host "$LMCACHE_HOST"
-            --http-port "$LMCACHE_HTTP_PORT"
-            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
-            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
-            --chunk-size "$LMCACHE_CHUNK_SIZE"
-            --max-workers "$LMCACHE_MAX_WORKERS"
-            --eviction-policy LRU
-        )
-        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
-        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
-        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
-        LMCACHE_PID=$!
-        echo "LMCache server PID: $LMCACHE_PID"
-        wait_for_lmcache_ready
-
-        HYBRID_KV_ARGS=(--disable-hybrid-kv-cache-manager)
         OFFLOAD_ARGS=(
             --kv-transfer-config
-            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}'
         )
         ;;
     *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu, lmcache-mp)" >&2
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
         exit 1
         ;;
 esac
@@ -221,9 +193,9 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
-    --port "$PORT"
+    --port "$VLLM_BACKEND_PORT"
     --trust-remote-code
     --kv-cache-dtype fp8
     --block-size 256
@@ -236,7 +208,7 @@ VLLM_CMD=(
     --enable-auto-tool-choice
     --reasoning-parser deepseek_v4
     --enable-prefix-caching
-    "${HYBRID_KV_ARGS[@]}"
+    --no-disable-hybrid-kv-cache-manager
     --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS"
     "${OFFLOAD_ARGS[@]}"
@@ -247,7 +219,24 @@ printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+if [ "$USE_VLLM_ROUTER" = "true" ]; then
+    echo "Starting native vLLM router on port $PORT for $TP DP ranks..."
+    vllm-router \
+        --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \
+        --policy "$VLLM_ROUTER_POLICY" \
+        --intra-node-data-parallel-size "$TP" \
+        --host 0.0.0.0 \
+        --port "$PORT" \
+        --prometheus-host 127.0.0.1 \
+        --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \
+        --request-timeout-secs 3600 \
+        --disable-retries > "$ROUTER_LOG" 2>&1 &
+    ROUTER_PID=$!
+    echo "Router PID: $ROUTER_PID"
+    wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID"
+fi
 
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh
new file mode 100755
index 000000000..dcc41f688
--- /dev/null
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_sglang.sh
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for DeepSeek-V4-Pro FP4 on B300 using SGLang.
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV cache with RadixAttention prefix caching.
+#   hicache - SGLang HiCache local CPU tier with DSv4 UnifiedRadixCache.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INFERENCEX_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
+
+# The B200 DeepSeek-V4 Blackwell image installs SGLang editable under
+# /workspace, so its launcher mounts InferenceX at /ix instead. Resolve the
+# agentic tooling and results against the actual repository mount so the image
+# can keep its /workspace install and GitHub Actions can collect the outputs.
+if [[ ! -d "$INFMAX_CONTAINER_WORKSPACE/utils/aiperf" ]]; then
+    export INFMAX_CONTAINER_WORKSPACE="$INFERENCEX_ROOT"
+fi
+if [[ "${RESULT_DIR:-}" == /workspace/* && "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then
+    export RESULT_DIR="$INFMAX_CONTAINER_WORKSPACE/${RESULT_DIR#/workspace/}"
+fi
+
+source "$INFERENCEX_ROOT/benchmarks/benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
+
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=1000000
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+nvidia-smi
+
+resolve_trace_source
+
+# Keep AIPerf's Transformers-main dependency from replacing the older
+# Transformers build pinned by the B200-specialized SGLang image. The server
+# always launches with the image's original interpreter; AIPerf and result
+# processing use the isolated environment when InferenceX is mounted at /ix.
+SGLANG_PYTHON="$(command -v python3)"
+if [[ "$INFMAX_CONTAINER_WORKSPACE" != /workspace ]]; then
+    AGENTIC_VENV="${AGENTIC_VENV:-/tmp/inferencex-agentic-venv}"
+    "$SGLANG_PYTHON" -m venv "$AGENTIC_VENV"
+    export PATH="$AGENTIC_VENV/bin:$PATH"
+fi
+install_agentic_deps
+
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+case "$OFFLOADING" in
+    none)
+        ;;
+    hicache)
+        # DeepSeek V4 HiCache currently rejects --hicache-size and supports
+        # capacity control only through a host/device token-capacity ratio.
+        # DSv4 allocates several physical host sub-pools for each logical host
+        # token. On B300 TP8, ratio=4 consumes about 237 GB/rank (1.9 TB total)
+        # while model loading/page cache is still resident and the OS kills a
+        # rank, so leave transient startup headroom with ratio=2. B200 has a
+        # smaller device KV pool and 3.8 TiB of host RAM, so ratio=8 provides a
+        # substantially larger useful CPU tier while staying within its node
+        # budget.
+        # TP4 ratio=4 works at C32 but fills its roughly 500 GB host tier at
+        # C48/C64. Ratio=8 still cannot retain the C64 session working set long
+        # enough to produce host hits. Ratio=16 provides roughly 21M logical
+        # host tokens while remaining below the B300 node's host budget.
+        if [ "$TP" -ge 8 ]; then
+            DEFAULT_HICACHE_RATIO=2
+        else
+            DEFAULT_HICACHE_RATIO=16
+        fi
+        HICACHE_RATIO="${HICACHE_RATIO:-$DEFAULT_HICACHE_RATIO}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-page_first_direct}"
+        export SGLANG_ENABLE_UNIFIED_RADIX_TREE=1
+        CACHE_ARGS=(
+            --enable-hierarchical-cache
+            --hicache-ratio "$HICACHE_RATIO"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+        )
+        echo "HiCache DSv4 CPU tier: ratio=$HICACHE_RATIO, write_policy=$HICACHE_WRITE_POLICY, io_backend=$HICACHE_IO_BACKEND, mem_layout=$HICACHE_MEM_LAYOUT"
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+USE_SGLANG_ROUTER=false
+SGLANG_BACKEND_PORT="$PORT"
+ROUTER_LOG="$RESULT_DIR/router.log"
+if [ "$DP_ATTENTION" = "true" ]; then
+    USE_SGLANG_ROUTER=true
+    SGLANG_BACKEND_PORT=$((PORT + 1))
+    SGLANG_ROUTER_METRICS_PORT=$((PORT + 10000))
+fi
+
+PARALLEL_ARGS=(--tp "$TP")
+METRICS_ARGS=(--enable-metrics)
+MEM_FRACTION_STATIC=0.88
+CHUNKED_PREFILL_SIZE=8192
+if [ "$DP_ATTENTION" = "true" ]; then
+    PARALLEL_ARGS+=(
+        --dp "$TP"
+        --enable-dp-attention
+        --dist-init-addr "127.0.0.1:$((PORT + 2000))"
+        --ep-size "$EP_SIZE"
+        --moe-runner-backend flashinfer_mxfp4
+        --disable-flashinfer-autotune
+        --enable-prefill-delayer
+    )
+    MEM_FRACTION_STATIC=0.88
+    CHUNKED_PREFILL_SIZE=16384
+else
+    PARALLEL_ARGS+=(
+        --moe-runner-backend flashinfer_mxfp4
+        --disable-flashinfer-autotune
+    )
+fi
+
+MODEL_ARGS=(
+    --attention-backend compressed
+    --page-size 256
+    --disable-shared-experts-fusion
+)
+
+MAX_RUNNING_REQUESTS=$CONC
+CUDA_GRAPH_MAX_BS=$CONC
+[ "$CUDA_GRAPH_MAX_BS" -gt 64 ] && CUDA_GRAPH_MAX_BS=64
+
+export PYTHONNOUSERSITE=1
+export TORCH_CUDA_ARCH_LIST=10.0
+# Agentic warmup dispatches hundreds of large prompts at once. SGLang's
+# tokenizer process can leave request bytes unacknowledged for longer than
+# AIPerf's 30-second TCP_USER_TIMEOUT while it admits that initial burst,
+# causing Linux to abort otherwise-live localhost connections. Keep the
+# six-hour request timeout unchanged, but allow up to 15 minutes for TCP
+# progress before declaring the connection dead.
+export AIPERF_HTTP_TCP_USER_TIMEOUT=900000
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+export SGLANG_OPT_USE_JIT_NORM=1
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+export SGLANG_OPT_USE_TOPK_V2=1
+export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+SGLANG_CMD=(
+    "$SGLANG_PYTHON" -m sglang.launch_server
+    --model-path "$MODEL_PATH"
+    --served-model-name "$MODEL"
+    --host 0.0.0.0
+    --port "$SGLANG_BACKEND_PORT"
+    --trust-remote-code
+    "${PARALLEL_ARGS[@]}"
+    --mem-fraction-static "$MEM_FRACTION_STATIC"
+    --swa-full-tokens-ratio 0.1
+    --max-running-requests "$MAX_RUNNING_REQUESTS"
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
+    --context-length "$MAX_MODEL_LEN"
+    --allow-auto-truncate
+    --chunked-prefill-size "$CHUNKED_PREFILL_SIZE"
+    --tool-call-parser deepseekv4
+    --reasoning-parser deepseek-v4
+    --chat-template "$SCRIPT_DIR/../chat_templates/deepseek_v4_thinking.jinja"
+    --watchdog-timeout 1800
+    "${MODEL_ARGS[@]}"
+    "${METRICS_ARGS[@]}"
+    "${CACHE_ARGS[@]}"
+)
+
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+
+{
+    echo "=== SGLANG_* env vars at launch ==="
+    env | grep -E '^SGLANG_' | sort
+    echo "==================================="
+} | tee "$SERVER_LOG"
+
+echo "Starting SGLang server for B300..."
+"${SGLANG_CMD[@]}" >> "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+capture_cache_metrics() {
+    {
+        echo "=== SGLang cache metrics snapshot $(date --iso-8601=seconds) ==="
+        curl -fsS "http://localhost:$SGLANG_BACKEND_PORT/metrics" 2>/dev/null \
+            | grep -E '^(sglang:(cache_hit_rate|cached_tokens_total|prompt_tokens_total|hicache_host_used_tokens|hicache_host_total_tokens|token_usage|num_requests_running|num_requests_waiting))' \
+            || true
+        echo "============================================================"
+    } >> "$SERVER_LOG"
+}
+
+wait_for_server_ready --port "$SGLANG_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+if [ "$USE_SGLANG_ROUTER" = "true" ]; then
+    echo "Starting SGLang router on port $PORT for $TP DP ranks..."
+    "$SGLANG_PYTHON" -m sglang_router.launch_router \
+        --worker-urls "http://localhost:$SGLANG_BACKEND_PORT" \
+        --policy manual \
+        --assignment-mode min_load \
+        --request-id-headers x-correlation-id \
+        --dp-aware \
+        --host 0.0.0.0 \
+        --port "$PORT" \
+        --prometheus-host 127.0.0.1 \
+        --prometheus-port "$SGLANG_ROUTER_METRICS_PORT" \
+        --request-timeout-secs 3600 \
+        --disable-retries > "$ROUTER_LOG" 2>&1 &
+    ROUTER_PID=$!
+    echo "Router PID: $ROUTER_PID"
+    wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID"
+fi
+
+if [ "${#METRICS_ARGS[@]}" -gt 0 ]; then
+    capture_cache_metrics
+    trap capture_cache_metrics EXIT
+fi
+
+build_replay_cmd "$RESULT_DIR"
+if [ "$DP_ATTENTION" = "true" ]; then
+    REPLAY_CMD+=" --server-metrics http://localhost:$SGLANG_BACKEND_PORT/metrics"
+fi
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index f6748a5f8..7fc30b60b 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -24,62 +24,140 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
 
 check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+if ! declare -p MAX_MODEL_LEN >/dev/null 2>&1; then
+    MAX_MODEL_LEN=1000000
+elif [[ -z "$MAX_MODEL_LEN" || "$MAX_MODEL_LEN" = "0" ]]; then
     MAX_MODEL_LEN=1000000
 fi
 
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+if declare -p SLURM_JOB_ID >/dev/null 2>&1 && [ -n "$SLURM_JOB_ID" ]; then
+    SLURM_NODE=unknown
+    if declare -p SLURMD_NODENAME >/dev/null 2>&1 && [ -n "$SLURMD_NODENAME" ]; then
+        SLURM_NODE="$SLURMD_NODENAME"
+    fi
+    echo "JOB $SLURM_JOB_ID running on $SLURM_NODE"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
+# vLLM v0.22.1 can ship CUTLASS DSL 4.5.2 with stale native MLIR bindings,
+# which fails DSV4 indexer compilation with mlir_global_dtors(..., data).
+# Reinstall the matching native wheel until NVIDIA/cutlass#3259 is resolved.
+agentic_pip_install --quiet --force-reinstall --no-deps \
+    'nvidia-cutlass-dsl-libs-cu13==4.5.2'
+
+# vllm-project/router expands the one HTTP backend into one logical worker per
+# DP rank and sends X-data-parallel-rank on forwarded requests. aiperf's
+# X-Correlation-ID is stable for every turn of a conversation; alias it to the
+# router's preferred X-Session-ID header. This also keeps affinity correct when
+# testing older wheels that prioritize per-request X-Request-ID.
+USE_VLLM_ROUTER=false
+VLLM_BACKEND_PORT="$PORT"
+if [ "$DP_ATTENTION" = "true" ]; then
+    USE_VLLM_ROUTER=true
+    VLLM_BACKEND_PORT=$((PORT + 1))
+    VLLM_ROUTER_VERSION=0.1.14
+    VLLM_ROUTER_POLICY=consistent_hash
+    VLLM_ROUTER_METRICS_PORT=$((PORT + 10000))
+    export AIPERF_HTTP_X_SESSION_ID_FROM_CORRELATION_ID=1
+    agentic_pip_install --quiet "vllm-router==$VLLM_ROUTER_VERSION"
+fi
+
 # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
+# vllm-project/vllm#43447 keeps local SWA prefix-cache tails sparsely, while
+# vllm-project/vllm#44774 applies the same reachability policy to Mooncake's
+# store mask. 32k matches the trace-replay tuning validated for this workload.
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
+
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
+ROUTER_LOG="$RESULT_DIR/router.log"
+MOONCAKE_MASTER_LOG="$RESULT_DIR/mooncake_master.log"
 mkdir -p "$RESULT_DIR"
 
-OFFLOAD_ARGS=""
+OFFLOAD_ARGS=()
 case "$OFFLOADING" in
     none) ;;
     cpu)
-        # B300 compute nodes have ~3.8 TiB host RAM; SLURM cgroup limits
-        # individual jobs to a fraction of that. Aim for ~2.2 TB total host
-        # CPU pool across the engine(s).
+        # Leave enough host-memory headroom for model workers and the runtime.
+        # Use the 2.5 TB host-memory budget across all GPU ranks.
         #
-        # SimpleCPUOffloadConnector divides cpu_bytes_to_use by
-        # parallel_config.world_size (= TP*PP, NOT including DP — see
-        # vllm/config/parallel.py docstring). So:
-        #   - DP-attn=true  → each of $TP DP engines has world_size=1 in
-        #     its parallel_config; the connector does no internal divide,
-        #     and each engine torch.zeros + pin_tensor allocates the full
-        #     --kv_offloading_size value. Pre-divide by $TP here so the
-        #     aggregate host commit ≈ TOTAL_CPU_DRAM_GB.
-        #   - DP-attn=false → single engine with world_size=TP. Pass the
-        #     full TOTAL_CPU_DRAM_GB; the connector's internal divide
-        #     yields TOTAL/TP per rank, and TP-shared mmap (PR #37206)
-        #     keeps the aggregate at TOTAL.
-        TOTAL_CPU_DRAM_GB=2200
-        if [ "$DP_ATTENTION" = "true" ]; then
-            PER_ENGINE_GB=$((TOTAL_CPU_DRAM_GB / TP))
-        else
-            PER_ENGINE_GB=$TOTAL_CPU_DRAM_GB
+        # Mooncake embedded mode contributes one global segment per GPU rank to
+        # a shared distributed store. Pre-divide the aggregate host budget
+        # across those rank-contributed segments.
+        TOTAL_CPU_DRAM_GB=2500
+        PER_RANK_GB=$((TOTAL_CPU_DRAM_GB / TP))
+
+        MOONCAKE_VERSION=0.3.11.post1
+        agentic_pip_install --quiet --no-cache-dir --no-deps \
+            --force-reinstall "mooncake-transfer-engine-cuda13==$MOONCAKE_VERSION"
+        python3 -c "from mooncake.store import MooncakeDistributedStore" >/dev/null
+
+        MOONCAKE_MASTER_PORT=$((PORT + 12000))
+        MOONCAKE_CONFIG_PATH="$RESULT_DIR/mooncake_config.json"
+        cat > "$MOONCAKE_CONFIG_PATH" <<EOF
+{
+  "mode": "embedded",
+  "metadata_server": "P2PHANDSHAKE",
+  "master_server_address": "127.0.0.1:$MOONCAKE_MASTER_PORT",
+  "global_segment_size": "${PER_RANK_GB}GB",
+  "local_buffer_size": "4GB",
+  "protocol": "rdma",
+  "device_name": "",
+  "enable_offload": false
+}
+EOF
+        export MOONCAKE_CONFIG_PATH
+        export MC_ENABLE_DEST_DEVICE_AFFINITY=1
+        # Identical prefixes must hash to identical store keys across DP ranks.
+        export PYTHONHASHSEED=0
+        # Large agentic KV writes can exceed Mooncake Store's fixed 60-second
+        # transfer deadline at the default 64 KiB RDMA slice size. Reduce
+        # per-transfer bookkeeping and give the shared RNIC more workers.
+        export MC_SLICE_SIZE=1048576
+        export MC_WORKERS_PER_CTX=4
+
+        # The store is shared, but each rank contributes a separate segment.
+        # Start eviction before an imbalanced rank exhausts its segment, and
+        # reclaim enough space for several concurrent multi-GB batch puts.
+        MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO=0.80
+        MOONCAKE_EVICTION_RATIO=0.10
+
+        echo "Starting Mooncake master on port $MOONCAKE_MASTER_PORT..."
+        mooncake_master --port "$MOONCAKE_MASTER_PORT" \
+            --eviction_high_watermark_ratio="$MOONCAKE_EVICTION_HIGH_WATERMARK_RATIO" \
+            --eviction_ratio="$MOONCAKE_EVICTION_RATIO" \
+            > "$MOONCAKE_MASTER_LOG" 2>&1 &
+        MOONCAKE_MASTER_PID=$!
+        sleep 2
+        if ! kill -0 "$MOONCAKE_MASTER_PID" 2>/dev/null; then
+            echo "Mooncake master died during startup." >&2
+            cat "$MOONCAKE_MASTER_LOG" >&2
+            exit 1
         fi
-        PER_ENGINE_BYTES=$((PER_ENGINE_GB * 1024 * 1024 * 1024))
-        # Use --kv-transfer-config JSON to also pass lazy_offload=true. Eager
-        # mode (default) hits an AssertionError in
-        # vllm/v1/core/kv_cache_utils.py:269 popleft_n at low/mid CONC; lazy
-        # mode defers the store path and clears low/mid CONC at 80-100%.
-        # See SimpleCPUOffloadConnector PR #37160 for the lazy_offload knob.
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv-transfer-config {\"kv_connector\":\"SimpleCPUOffloadConnector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"cpu_bytes_to_use\":$PER_ENGINE_BYTES,\"lazy_offload\":true}}"
+
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            '{"kv_connector":"MooncakeStoreConnector","kv_role":"kv_both","kv_connector_extra_config":{"load_async":true}}'
+        )
         ;;
     *)
         echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
@@ -113,9 +191,9 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve "$MODEL" \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
---port "$PORT" \
+--port "$VLLM_BACKEND_PORT" \
 --trust-remote-code \
 --kv-cache-dtype fp8 \
 --block-size 256 \
@@ -131,11 +209,28 @@ vllm serve "$MODEL" \
 --no-disable-hybrid-kv-cache-manager \
 --max-model-len "$MAX_MODEL_LEN" \
 --max-num-seqs "$PER_ENGINE_MAX_NUM_SEQS" \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
+"${OFFLOAD_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
 
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+wait_for_server_ready --port "$VLLM_BACKEND_PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+if [ "$USE_VLLM_ROUTER" = "true" ]; then
+    echo "Starting native vLLM router on port $PORT for $TP DP ranks..."
+    vllm-router \
+        --worker-urls "http://localhost:$VLLM_BACKEND_PORT" \
+        --policy "$VLLM_ROUTER_POLICY" \
+        --intra-node-data-parallel-size "$TP" \
+        --host 0.0.0.0 \
+        --port "$PORT" \
+        --prometheus-host 127.0.0.1 \
+        --prometheus-port "$VLLM_ROUTER_METRICS_PORT" \
+        --request-timeout-secs 3600 \
+        --disable-retries > "$ROUTER_LOG" 2>&1 &
+    ROUTER_PID=$!
+    echo "Router PID: $ROUTER_PID"
+    wait_for_server_ready --port "$PORT" --server-log "$ROUTER_LOG" --server-pid "$ROUTER_PID"
+fi
 
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 99aec25fe..029c8ea7f 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -135,7 +145,7 @@ fi
 
 echo "Starting sglang server..."
 python3 -m sglang.launch_server \
-    --model-path "$MODEL" \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index 0a0177983..799c2bf26 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -40,7 +50,7 @@ export PYTHONNOUSERSITE=1
 
 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is
 # used for GPU allocation by the runner and as the DP size.
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 500b456f5..3b85a31cd 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -42,7 +52,7 @@ echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 259c19586..b3597cf52 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
deleted file mode 100755
index 6e921db58..000000000
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for GPT-OSS 120B FP4 on B200 using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
-
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-nvidia-smi
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-cat > "$RESULT_DIR/config.yaml" << EOF
-kv-cache-dtype: fp8
-compilation-config: '{"pass_config":{"fuse_allreduce_rms":true,"eliminate_noops":true}}'
-max-cudagraph-capture-size: 2048
-max-num-batched-tokens: 8192
-max-model-len: $MAX_MODEL_LEN
-EOF
-
-OFFLOAD_ARGS=""
-case "$OFFLOADING" in
-    none) ;;
-    cpu)
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
-        ;;
-    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
-esac
-
-echo "Starting vllm server..."
-export TORCH_CUDA_ARCH_LIST="10.0"
-export PYTHONNOUSERSITE=1
-export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
-
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---config "$RESULT_DIR/config.yaml" \
---gpu-memory-utilization 0.9 \
---tensor-parallel-size $TP \
---max-num-seqs $CONC \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
deleted file mode 100755
index 557986b0d..000000000
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H100 using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
-
-# Agentic matrix entries don't set max-model-len, so the workflow passes 0.
-# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-nvidia-smi
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-cat > "$RESULT_DIR/config.yaml" << EOF
-async-scheduling: true
-max-cudagraph-capture-size: 2048
-max-model-len: $MAX_MODEL_LEN
-EOF
-
-OFFLOAD_ARGS=""
-case "$OFFLOADING" in
-    none)
-        ;;
-    cpu)
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
-        exit 1
-        ;;
-esac
-
-echo "Starting vllm server..."
-export TORCH_CUDA_ARCH_LIST="9.0"
-export PYTHONNOUSERSITE=1
-export VLLM_MXFP4_USE_MARLIN=1
-
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---config "$RESULT_DIR/config.yaml" \
---gpu-memory-utilization 0.9 \
---tensor-parallel-size $TP \
---max-num-seqs $CONC \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
deleted file mode 100755
index 1592a8d5c..000000000
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for GPT-OSS 120B FP4 on H200 using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
-
-# Agentic matrix entries don't set max-model-len, so the workflow passes 0.
-# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-nvidia-smi
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-cat > "$RESULT_DIR/config.yaml" << EOF
-async-scheduling: true
-max-cudagraph-capture-size: 2048
-max-model-len: $MAX_MODEL_LEN
-EOF
-
-OFFLOAD_ARGS=""
-case "$OFFLOADING" in
-    none)
-        ;;
-    cpu)
-        export VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
-        exit 1
-        ;;
-esac
-
-echo "Starting vllm server..."
-export TORCH_CUDA_ARCH_LIST="9.0"
-export PYTHONNOUSERSITE=1
-export VLLM_MXFP4_USE_MARLIN=1
-
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---config "$RESULT_DIR/config.yaml" \
---gpu-memory-utilization 0.9 \
---tensor-parallel-size $TP \
---max-num-seqs $CONC \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
deleted file mode 100755
index eb1883ff1..000000000
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI300X using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
-
-# Agentic matrix entries don't set max-model-len, so the workflow passes 0.
-# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-rocm-smi
-amd-smi || true
-
-# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory.
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
-version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
-if [[ "$version" == "" || $version -lt 177 ]]; then
-  export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES
-if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
-    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
-fi
-
-export AMDGCN_USE_BUFFER_OPS=0
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export PYTHONNOUSERSITE=1
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-OFFLOAD_ARGS=""
-case "$OFFLOADING" in
-    none)
-        ;;
-    cpu)
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
-        exit 1
-        ;;
-esac
-
-echo "Starting vllm server..."
-
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---attention-backend ROCM_AITER_UNIFIED_ATTN \
--cc.pass_config.fuse_rope_kvcache=True \
--cc.use_inductor_graph_partition=True \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.85 \
---max-model-len $MAX_MODEL_LEN \
---max-num-seqs $CONC \
---block-size=64 \
---kv-cache-dtype fp8 \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
deleted file mode 100755
index 99e29c819..000000000
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for GPT-OSS 120B FP4 on MI325X using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION
-
-# Agentic matrix entries don't set max-model-len, so the workflow passes 0.
-# ${:-DEFAULT} only fires on unset/empty, so handle 0 explicitly.
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-rocm-smi
-
-# If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory.
-# See https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
-version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
-if [[ "$version" == "" || $version -lt 177 ]]; then
-  export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-# Ray compatibility in vLLM 0.14+ needs HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES
-if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
-    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
-fi
-
-export AMDGCN_USE_BUFFER_OPS=0
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-export PYTHONNOUSERSITE=1
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-OFFLOAD_ARGS=""
-case "$OFFLOADING" in
-    none)
-        ;;
-    cpu)
-        OFFLOAD_ARGS="--kv_offloading_backend native --kv_offloading_size $TOTAL_CPU_DRAM_GB --disable-hybrid-kv-cache-manager"
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, cpu)" >&2
-        exit 1
-        ;;
-esac
-
-echo "Starting vllm server..."
-
-vllm serve $MODEL \
---host 0.0.0.0 \
---port $PORT \
---attention-backend ROCM_AITER_UNIFIED_ATTN \
--cc.pass_config.fuse_rope_kvcache=True \
--cc.use_inductor_graph_partition=True \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.85 \
---max-model-len $MAX_MODEL_LEN \
---max-num-seqs $CONC \
---block-size=64 \
---kv-cache-dtype fp8 \
-$OFFLOAD_ARGS > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index ad0b4495a..34b45c9ec 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -178,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index 8cebe4f20..9667003e1 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -85,7 +95,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index fd0ce3677..139b12256 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# With MODEL_PATH set, download into it only if the dir is missing or empty.
+# Otherwise (stand-alone runs) pull into the default Hugging Face cache unless
+# MODEL is already a local absolute path. MODEL_PATH is what the server loads.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -786,7 +796,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index 697d3fa45..5685f098c 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -45,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index 2fd3b381c..cb6c67f4b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -45,7 +55,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index 97929e43e..1bfa0c33b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -56,7 +66,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
index 38ef72b56..f9b769636 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
@@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -58,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 $PARALLEL_ARGS \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
index 4ce131cba..d07c3af69 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
@@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
index 9f2d83a0b..906ae7408 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
@@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
index d21690da6..c35afe33a 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
@@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -58,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
index ed59991cb..5b4782646 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
@@ -19,14 +19,24 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -58,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
index 260bbdc68..512eb0e6c 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -32,7 +42,7 @@ amd-smi || true
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -64,7 +74,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
index edac27a45..5e5a9f9a3 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -32,7 +42,7 @@ amd-smi || true
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -61,7 +71,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index 39dd63293..8e15e7850 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -32,7 +42,7 @@ amd-smi || true
 # MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
 # corpus has requests up to ~1M proxy tokens that would be rejected.
 # Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060826_256k
 
 resolve_trace_source
 install_agentic_deps
@@ -65,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py
new file mode 100755
index 000000000..5c061606f
--- /dev/null
+++ b/benchmarks/single_node/agentic/patch_vllm_mooncake_transfer_batches.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Temporarily bound MooncakeStoreConnector transfer batches.
+
+Mooncake's TCP connection pool grows without a concurrency ceiling. Large
+DeepSeek-V4 requests therefore create enough simultaneous per-layer transfers
+to exhaust the node's TCP ports. This patch preserves the same keys and buffer
+lists but submits them in smaller sequential batches.
+"""
+
+import argparse
+from pathlib import Path
+
+
+HELPER_ANCHOR = '''def _rotate_list(values: list[_T], offset: int) -> list[_T]:
+    return values[offset:] + values[:offset]
+'''
+
+HELPER = '''
+
+_INFERENCEX_MOONCAKE_BATCH_PATCH = True
+
+
+def _run_mooncake_transfer_batches(fn, keys, addrs, sizes, *args):
+    # Env cap on keys per call; unset, empty, or <= 0 disables batching.
+    max_keys = int(os.getenv("INFERENCEX_MOONCAKE_MAX_TRANSFER_BATCH_KEYS") or 0)
+    if max_keys <= 0 or len(keys) <= max_keys:
+        return fn(keys, addrs, sizes, *args)
+    results = []
+    for start in range(0, len(keys), max_keys):
+        end = start + max_keys
+        results.extend(fn(keys[start:end], addrs[start:end], sizes[start:end], *args))
+    return results
+'''
+
+PUT_CALL = '''res = self.store.batch_put_from_multi_buffers(
+                    keys,
+                    addrs,
+                    sizes,
+                    self.replicate_config,
+                )'''
+
+PATCHED_PUT_CALL = '''res = _run_mooncake_transfer_batches(
+                    self.store.batch_put_from_multi_buffers,
+                    keys,
+                    addrs,
+                    sizes,
+                    self.replicate_config,
+                )'''
+
+GET_CALL = '''res = self.store.batch_get_into_multi_buffers(
+                    batch_keys, batch_addrs, batch_sizes
+                )'''
+
+PATCHED_GET_CALL = '''res = _run_mooncake_transfer_batches(
+                    self.store.batch_get_into_multi_buffers,
+                    batch_keys,
+                    batch_addrs,
+                    batch_sizes,
+                )'''
+
+
+def patch_worker(worker_path: Path) -> None:
+    """Rewrite worker_path in place so Mooncake put/get calls are batched.
+
+    No-op when already patched; RuntimeError (no write) unless each target occurs once."""
+    source = worker_path.read_text(encoding="utf-8")  # not locale-dependent
+    if "_INFERENCEX_MOONCAKE_BATCH_PATCH = True" in source:
+        print(f"Mooncake transfer batching already patched: {worker_path}")
+        return
+    for old, new in (
+        (HELPER_ANCHOR, HELPER_ANCHOR + HELPER),
+        (PUT_CALL, PATCHED_PUT_CALL),
+        (GET_CALL, PATCHED_GET_CALL),
+    ):
+        count = source.count(old)
+        if count != 1:
+            raise RuntimeError(
+                f"Expected exactly one patch target in {worker_path}, found {count}: "
+                f"{old.splitlines()[0]}"
+            )
+        source = source.replace(old, new, 1)
+    worker_path.write_text(source, encoding="utf-8")
+    print(f"Patched Mooncake transfer batching: {worker_path}")
+
+
+def main() -> None:
+    """Patch --worker-path, or the installed vLLM Mooncake store worker if omitted."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--worker-path", type=Path)
+    target = cli.parse_args().worker_path
+    if target is None:
+        # vllm is imported only here, so an explicit --worker-path never needs it.
+        import vllm
+
+        package_dir = Path(vllm.__file__).parent
+        relative = "distributed/kv_transfer/kv_connector/v1/mooncake/store/worker.py"
+        target = package_dir / relative
+    patch_worker(target)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index 4ba87976b..d06d82ec8 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 3432af5c9..ad49b2b67 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
index 9d9c1d7d5..4f9b12659 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -85,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH"  # served name comes from the explicit flag below
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
index 95f0397a0..b280fff8b 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -98,7 +108,7 @@ fi
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH"  # served name comes from the explicit flag below
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index aef9650ca..ff901b674 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -36,7 +46,7 @@ export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
index 5427d0d31..cdded8860 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# (no download for an absolute-path MODEL). Either way, serve from MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -110,7 +120,7 @@ export PYTHONNOUSERSITE=1
 SGLANG_CMD=(
     python3 -m sglang.launch_server
     --attention-backend triton
-    --model-path "$MODEL"
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
     --host=0.0.0.0
     --port "$PORT"
     --tensor-parallel-size "$TP"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d201e9f3b..c4111ef0d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3395,3 +3395,10 @@
   description:
     - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627
+
+- config-keys:
+    - dsv4-fp4-b200-sglang-agentic-hicache
+    - dsv4-fp4-b300-sglang-agentic-hicache
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B200 and B300 SGLang agentic benchmarks with HiCache CPU KV offloading; use the B200-specific DeepSeek-V4 Blackwell image and the June 9 nightly on B300"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1640
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index bb3bf9ed1..2187617ae 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -364,9 +364,35 @@ else
     # and gpu-15 names no longer exist. gpu-2 currently has 10 fully-idle GPU
     # nodes (all of gpu-2-[0-9]); gpu-1 has 2 drained (gpu-1-4, gpu-1-8). We
     # land on gpu-2 to avoid drained nodes and skip the per-node excludes.
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    SALLOC_MEMORY_ARGS=()
+    if [[ "${OFFLOADING:-none}" != "none" ]]; then
+        # Host KV tiers (vLLM Mooncake cpu offload, SGLang HiCache) allocate
+        # multi-TB pinned host pools. Without an explicit request, Slurm caps
+        # this exclusive job at 2 TB and OOM-kills it even though the B200
+        # node has about 4 TB of physical RAM.
+        SALLOC_MEMORY_ARGS=(--mem=0)
+    fi
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
+    # DSv4 is also staged on the compute nodes' local RAID. Loading the 806 GB
+    # checkpoint independently from Lustre on every TP rank leaves the loader
+    # threads blocked in Lustre I/O for hours. Select the local copy only after
+    # Slurm assigns a node, and retain the shared-Lustre path as a fallback for
+    # nodes whose local staging is incomplete.
+    if [[ "$MODEL_PREFIX" == "dsv4" && "$PRECISION" == "fp4" && "$FRAMEWORK" == "sglang" ]]; then
+        LOCAL_MODEL_PATH=/raid/models/DeepSeek-V4-Pro-NVFP4
+        if srun --jobid="$JOB_ID" bash -c \
+            'test -f "$1/config.json" && test -f "$1/model.safetensors.index.json" && test "$(find "$1" -maxdepth 1 -name "model-*.safetensors" | wc -l)" -eq 64' \
+            _ "$LOCAL_MODEL_PATH"; then
+            export MODEL_PATH="$LOCAL_MODEL_PATH"
+            export MODEL="$MODEL_PATH"
+            echo "Using node-local DSv4 checkpoint: $MODEL_PATH"
+        else
+            echo "Node-local DSv4 checkpoint unavailable; using shared checkpoint: $MODEL_PATH"
+        fi
+    fi
+
     # Use flock to serialize concurrent imports to the same squash file
     # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes
     srun --jobid=$JOB_ID bash -c "
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 67e8b48cc..1616ed490 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -379,7 +379,14 @@ else
         fi
     )
 
-    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
+    SALLOC_MEMORY_ARGS=()
+    if [[ "${OFFLOADING:-none}" != "none" ]]; then
+        # Host KV tiers (vLLM Mooncake cpu offload, SGLang HiCache) allocate
+        # multi-TB pinned host pools. Give them the full memory allocation of
+        # the exclusive node instead of Slurm's implicit 2 TB default.
+        SALLOC_MEMORY_ARGS=(--mem=0)
+    fi
+    salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT -N 1 --gres=gpu:$TP --exclusive "${SALLOC_MEMORY_ARGS[@]}" --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID \
@@ -387,6 +394,7 @@ else
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
+        --container-remap-root \
         --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash "$BENCH_SCRIPT"
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index dada98bd6..18f286965 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -148,8 +148,21 @@ fi
 
 # TODO(CJQ): make first class upon srt-slurm upstream refactor
 if [[ "$IS_AGENTIC" == "1" ]]; then
-    git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR"
+    # Agentic multi-node uses the same pinned cquil11/srt-slurm-nv commit as
+    # launch_gb300-nv.sh — everything the agentic recipes need is there:
+    #   - BenchmarkType.CUSTOM + benchmark.command + benchmark.env
+    #     (the hook that hands off to benchmarks/multi_node/agentic_srt.sh)
+    #   - DynamoConfig.wheel (recipes pin the ai-dynamo wheel)
+    #   - srtctl apply --no-preflight (model path /mnt/numa1 is compute-node
+    #     local NVMe, invisible to the login-node runner)
+    #   - benchmark_stage srun_options propagation (container-remap-root
+    #     must reach the agentic_srt.sh srun)
+    git clone https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
+    git checkout 6e34b8b83229634d732e41a4e2d6595f46ef60b5
+    mkdir -p recipes/vllm/deepseek-v4/agentic
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/agentic" \
+        recipes/vllm/deepseek-v4/agentic
 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
@@ -200,6 +213,24 @@ echo "Configs available at: $SRT_REPO_DIR/"
 
 # Create srtslurm.yaml for srtctl (used by both frameworks)
 SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+
+# Agentic runs bind-mount two persistent caches into every worker container
+# (Lustre, shared across nodes): aiperf's content-addressed dataset mmap
+# cache (~65 GB per corpus, re-tokenized from scratch without it) and the
+# HF hub cache holding the trace dataset download. The container-side paths
+# are referenced by the agentic recipes' benchmark.env
+# (AIPERF_DATASET_MMAP_CACHE_DIR=/aiperf_mmap_cache, HF_HUB_CACHE=/hf_hub_cache).
+DEFAULT_MOUNTS_BLOCK=""
+if [[ "$IS_AGENTIC" == "1" ]]; then
+    AIPERF_MMAP_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/ai-perf-cache"
+    HF_HUB_CACHE_HOST_PATH="/mnt/lustre01/users-public/sa-shared/hf-hub-cache"
+    mkdir -p "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH"
+    chmod 777 "$AIPERF_MMAP_CACHE_HOST_PATH" "$HF_HUB_CACHE_HOST_PATH" 2>/dev/null || true
+    DEFAULT_MOUNTS_BLOCK="default_mounts:
+  ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache
+  ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache"
+fi
+
 echo "Creating srtslurm.yaml configuration..."
 cat > srtslurm.yaml <<EOF
 # SRT SLURM Configuration for GB200
@@ -224,6 +255,15 @@ containers:
   dynamo-sglang: ${SQUASH_FILE}
   "${IMAGE}": ${SQUASH_FILE}
   nginx-sqsh: ${NGINX_SQUASH_FILE}
+# srtctl defaults this to true, which adds #SBATCH --segment=<total_nodes>.
+# On watchtower the whole batch partition (blue-cn01-18) is a single NVL72
+# rack, so segment contiguity buys nothing for MNNVL — but it DOES make
+# jobs unschedulable when the partition is fragmented: Slurm backfills a
+# non-contiguous node set, fails segment placement at start, and the job
+# dies with "CANCELLED Reason=Resources" at RunTime=0 (hit by the first
+# gb200 agentic run, job 18582). Mirror launch_gb300-nv.sh and disable.
+use_segment_sbatch_directive: false
+${DEFAULT_MOUNTS_BLOCK}
 EOF
 
 echo "Generated srtslurm.yaml:"
@@ -237,13 +277,42 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 
 echo "Submitting job with srtctl..."
 
-# Override the job name in the config file with the runner name
-sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}"
+# Override the job name with the runner name, prefixed "ifx-": another
+# runner fleet on watchtower (user slurm-shared, uid 1010, with Slurm
+# operator rights) names ITS jobs after the same runner names (gb200-nv_N)
+# and its pre-job cleanup scancels by job name across users — it killed our
+# jobs 18593 and 18599 mid-startup (CANCELLED by 1010). The distinct prefix
+# keeps their --name match away from our jobs; the workflow's own pre-run
+# cleanup scancels both the bare and ifx- prefixed names.
+#
+# NOTE the sed alone is not enough: srtctl's get_job_name() (cli/submit.py)
+# prefers the RUNNER_NAME env var over the recipe name, so the prefixed
+# RUNNER_NAME must be passed to `srtctl apply` itself (R4 job 18599 proved
+# the recipe-name route gets ignored on CI runners).
+sed -i "s/^name:.*/name: \"ifx-${RUNNER_NAME}\"/" "${CONFIG_FILE%%:*}"
+SRTCTL_RUNNER_NAME="ifx-${RUNNER_NAME}"
+
+# Don't leak the login-node venv to the compute-node orchestrator. sbatch's
+# default --export=ALL propagates VIRTUAL_ENV (set by `source
+# .venv/bin/activate` above) into job_script_minimal.j2, whose
+# `uv run` step then tries to inspect the *active* venv — and dies with
+# "Broken symlink at .venv/bin/python3" because the login-node interpreter
+# path doesn't exist on compute nodes (gb200 agentic R2, job 18587).
+# srtctl itself still resolves through PATH (.venv/bin is on it).
+unset VIRTUAL_ENV
+
+# --no-preflight is only safe on the agentic path, where the recipe resolves
+# model.path to /mnt/numa1 (compute-node-only NVMe) that the login-node
+# runner can't see. Fixed-seq-len recipes keep enforcement on.
+PREFLIGHT_FLAG=""
+if [[ "$IS_AGENTIC" == "1" ]]; then
+    PREFLIGHT_FLAG="--no-preflight"
+fi
 
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
-    SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
+    SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
 else
-    SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+    SRTCTL_OUTPUT=$(RUNNER_NAME="$SRTCTL_RUNNER_NAME" srtctl apply $PREFLIGHT_FLAG -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
 fi
 echo "$SRTCTL_OUTPUT"
 
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 6a5c50e38..7a7a66afa 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -88,6 +88,12 @@ export NVIDIA_DRIVER_CAPABILITIES=compute,utility
 # write to it.
 export AIPERF_MMAP_CACHE_HOST_PATH="/mnt/vast/ai-perf-cache"
 
+# Persistent HF hub cache for the agentic trace datasets — see the
+# launch_gb300-nv.sh comment. Mounted at /hf_hub_cache; agentic recipes set
+# HF_HUB_CACHE=/hf_hub_cache in benchmark.env.
+export HF_HUB_CACHE_HOST_PATH="/mnt/vast/hf-hub-cache"
+mkdir -p "$HF_HUB_CACHE_HOST_PATH"
+
 NGINX_IMAGE="nginx:1.27.4"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
@@ -221,6 +227,7 @@ srtctl_root: "${SRTCTL_ROOT}"
 default_mounts:
   ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels
   ${AIPERF_MMAP_CACHE_HOST_PATH}: /aiperf_mmap_cache
+  ${HF_HUB_CACHE_HOST_PATH}: /hf_hub_cache
 
 model_paths:
   dspro: "${MODEL_PATH}"
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index b47e103fd..e4597302f 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -17,6 +17,14 @@ export ENROOT_ROOTFS_WRITABLE=1
 # write to it.
 export AIPERF_MMAP_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/ai-perf-cache"
 
+# Persistent HF hub cache for the agentic trace datasets — mounted into
+# worker containers at /hf_hub_cache; the agentic recipes set
+# HF_HUB_CACHE=/hf_hub_cache in benchmark.env. Without it the workflow-level
+# HF_HUB_CACHE (/mnt/hf_hub_cache) doesn't exist on these nodes and every
+# run re-downloads the corpus into the ephemeral container overlay.
+export HF_HUB_CACHE_HOST_PATH="/data/home/sa-shared/gharunners/hf-hub-cache"
+mkdir -p "$HF_HUB_CACHE_HOST_PATH"
+
 export MODEL_PATH=$MODEL
 
 if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
@@ -189,6 +197,7 @@ srtctl_root: "${SRTCTL_ROOT}"
 # re-tokenized + re-written every job.
 default_mounts:
   "${AIPERF_MMAP_CACHE_HOST_PATH}": "/aiperf_mmap_cache"
+  "${HF_HUB_CACHE_HOST_PATH}": "/hf_hub_cache"
 
 # Model path aliases
 model_paths:
diff --git a/utils/aiperf b/utils/aiperf
index 062a5de92..ff2b646c0 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9
+Subproject commit ff2b646c0425aff9307a0e73161b23d77003a357
diff --git a/utils/process_agentic_result.py b/utils/process_agentic_result.py
index 3c4015ce6..90f1aaca9 100644
--- a/utils/process_agentic_result.py
+++ b/utils/process_agentic_result.py
@@ -37,7 +37,6 @@
 # Trace metadata lookup: conversation_id (= trace id) -> per-turn dict with
 # ``hash_ids`` and ``output_length``. Built lazily from the HF dataset cache.
 _TRACE_METADATA_CACHE: dict[str, list[dict]] | None = None
-_HF_DATASET = "semianalysisai/cc-traces-weka-with-subagents-051926"
 
 
 # ---- helpers ---------------------------------------------------------------
@@ -118,10 +117,17 @@ def load_server_metrics(path: Path) -> dict:
 def _hf_traces_dir() -> Path | None:
     """Locate the HuggingFace cache directory for the weka traces dataset.
 
-    Returns the directory containing per-trace JSON files, or None if the
-    dataset isn't present locally. Mirrors the layout
+    Returns the directory containing per-trace JSON files, or None if no
+    weka dataset is present locally. Mirrors the layout
     huggingface_hub.snapshot_download() produces:
     ``$HF_HUB_CACHE/datasets--<org>--<name>/snapshots/<revision>/``.
+
+    The bench script supports several corpus revisions
+    (cc-traces-weka-with-subagents-052726, ...-060226, ...-060226-256k, etc.)
+    and may switch between them per-recipe via WEKA_LOADER_OVERRIDE. Rather
+    than hardcode a single dataset name, scan all ``datasets--semianalysisai
+    --cc-traces-weka*`` directories in the cache and pick the most-recently-
+    modified snapshot that contains usable trace files.
     """
     hub_cache = os.environ.get("HF_HUB_CACHE") or os.environ.get("HUGGINGFACE_HUB_CACHE")
     if hub_cache:
@@ -130,17 +136,23 @@ def _hf_traces_dir() -> Path | None:
         home = os.environ.get("HF_HOME")
         cache_root = Path(home) / "hub" if home else Path.home() / ".cache" / "huggingface" / "hub"
 
-    org, name = _HF_DATASET.split("/", 1)
-    snapshots = cache_root / f"datasets--{org}--{name}" / "snapshots"
-    if not snapshots.is_dir():
+    if not cache_root.is_dir():
         return None
-    candidates = sorted(snapshots.iterdir(), key=lambda p: p.stat().st_mtime, reverse=True)
+
+    # Collect every weka-corpus snapshot dir across all matching dataset
+    # entries, sorted newest first.
+    snapshots: list[Path] = []
+    for dataset_dir in cache_root.glob("datasets--semianalysisai--cc-traces-weka*"):
+        snap_root = dataset_dir / "snapshots"
+        if not snap_root.is_dir():
+            continue
+        snapshots.extend(p for p in snap_root.iterdir() if p.is_dir())
+    snapshots.sort(key=lambda p: p.stat().st_mtime, reverse=True)
+
     # Prefer the snapshot that contains usable trace files. The published HF
     # dataset ships a single ``traces.jsonl`` (one trace per line); older /
     # local mirrors may use per-trace ``*.json`` files instead. Accept either.
-    for c in candidates:
-        if not c.is_dir():
-            continue
+    for c in snapshots:
         if any(c.glob("*.jsonl")) or any(c.glob("*.json")):
             return c
     return None
@@ -382,16 +394,36 @@ def compute_throughput_stats(records: list[dict], aggregate: dict) -> dict:
 
 
 def compute_cache_stats(records: list[dict], server_metrics: dict) -> dict:
-    """Cache-hit metrics: theoretical (from trace metadata) + actual (server)."""
+    """Cache-hit metrics: theoretical (from trace metadata) + actual (server).
+
+    Server-metric coverage depends on the engine + KV connector combination,
+    so several fields are structurally null for some configs. The matrix:
+
+    | engine + connector                           | populated server fields            |
+    |----------------------------------------------|------------------------------------|
+    | vLLM, no connector                           | server_gpu_cache_hit_rate,         |
+    |                                              | gpu_kv_cache_usage_pct             |
+    | vLLM + SimpleCPUOffloadConnector             | same as above (the CPU tier        |
+    |                                              | extends the local LRU; reloads are |
+    |                                              | counted as prefix_cache_hits — no  |
+    |                                              | separate vllm:cpu_prefix_cache_*   |
+    |                                              | counter exists)                    |
+    | vLLM + LMCacheMPConnector (kv_role=kv_both)  | server_external_cache_hit_rate.    |
+    |                                              | server_gpu_cache_hit_rate goes to  |
+    |                                              | ~0 because delay_cache_blocks=True |
+    |                                              | suppresses local hash registration |
+    | SGLang                                       | not yet wired                      |
+    """
     result: dict = {
         "theoretical_cache_hit_rate": None,
         "server_gpu_cache_hit_rate": None,
-        "server_cpu_cache_hit_rate": None,
+        "server_external_cache_hit_rate": None,
+        "gpu_kv_cache_usage_pct": None,
+        "cpu_kv_cache_usage_pct": None,
         "kv_offload_bytes_gpu_to_cpu": None,
         "kv_offload_bytes_cpu_to_gpu": None,
         "kv_offload_time_gpu_to_cpu": None,
         "kv_offload_time_cpu_to_gpu": None,
-        "cpu_kv_cache_usage_pct": None,
         "total_prompt_tokens": None,
         "total_generation_tokens": None,
         "total_requests_completed": None,
@@ -476,15 +508,30 @@ def _final_value(metric_name: str) -> float | None:
                 return agg
         return None
 
+    # Local GPU prefix cache (every vLLM config emits these). Note: with
+    # LMCacheMPConnector + kv_role=kv_both, the scheduler sets
+    # delay_cache_blocks=True on every load and these hits stay at ~0 even
+    # when overall cache efficiency is high — read server_external_*.
     hits = _final_value("vllm:prefix_cache_hits")
     queries = _final_value("vllm:prefix_cache_queries")
     if hits is not None and queries and queries > 0:
         result["server_gpu_cache_hit_rate"] = hits / queries
 
-    cpu_hits = _final_value("vllm:cpu_prefix_cache_hits")
-    cpu_queries = _final_value("vllm:cpu_prefix_cache_queries")
-    if cpu_hits is not None and cpu_queries and cpu_queries > 0:
-        result["server_cpu_cache_hit_rate"] = cpu_hits / cpu_queries
+    # External KV connector (LMCacheMPConnector and similar). Only populated
+    # when the connector implements get_num_new_matched_tokens; absent for
+    # SimpleCPUOffloadConnector and for pure-vLLM (no connector) runs.
+    ext_hits = _final_value("vllm:external_prefix_cache_hits")
+    ext_queries = _final_value("vllm:external_prefix_cache_queries")
+    if ext_hits is not None and ext_queries and ext_queries > 0:
+        result["server_external_cache_hit_rate"] = ext_hits / ext_queries
+
+    # GPU KV pool fill ratio gauge. vLLM emits vllm:kv_cache_usage_perc on V1
+    # and vllm:gpu_cache_usage_perc on V0 (kept for older deployments).
+    kv_usage = _final_value("vllm:kv_cache_usage_perc")
+    if kv_usage is None:
+        kv_usage = _final_value("vllm:gpu_cache_usage_perc")
+    if kv_usage is not None:
+        result["gpu_kv_cache_usage_pct"] = kv_usage
 
     for src_key, dst_key in (
         ("vllm:kv_offload_bytes_gpu_to_cpu", "kv_offload_bytes_gpu_to_cpu"),
@@ -679,6 +726,13 @@ def main() -> int:
         )
     if agg.get("server_gpu_cache_hit_rate") is not None:
         print(f"  GPU cache hit rate: {agg['server_gpu_cache_hit_rate']:.1%}")
+    if agg.get("server_external_cache_hit_rate") is not None:
+        print(
+            f"  External cache hit rate: "
+            f"{agg['server_external_cache_hit_rate']:.1%}"
+        )
+    if agg.get("gpu_kv_cache_usage_pct") is not None:
+        print(f"  GPU KV cache usage:  {agg['gpu_kv_cache_usage_pct']:.1%}")
     if agg.get("response_cache_hit_rate") is not None:
         print(f"  Response cache hit rate: {agg['response_cache_hit_rate']:.1%}")
     if agg.get("theoretical_cache_hit_rate") is not None:
diff --git a/utils/test_validate_agentic_result.py b/utils/test_validate_agentic_result.py
new file mode 100644
index 000000000..f21bfa069
--- /dev/null
+++ b/utils/test_validate_agentic_result.py
@@ -0,0 +1,73 @@
+"""Tests for the agentic aiperf result validity gate."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from validate_agentic_result import validate_result
+
+
+def _write_aggregate(tmp_path: Path, aggregate: dict, *, per_run: bool = False) -> Path:
+    """Write ``aggregate`` as aiperf's export JSON (under run_0/ if per_run)."""
+    root = tmp_path / "aiperf_artifacts"
+    target = (root / "run_0") if per_run else root
+    target.mkdir(parents=True)
+    (target / "profile_export_aiperf.json").write_text(json.dumps(aggregate))
+    return root
+
+
+def test_passes_when_request_error_rate_is_within_limit(tmp_path: Path):
+    """10 errors out of 100 is exactly the 10% limit, which is inclusive."""
+    counts = {
+        "request_count": {"avg": 90},
+        "error_request_count": {"avg": 10},
+        "completed_request_count": {"avg": 100},
+    }
+    artifacts = _write_aggregate(tmp_path, counts)
+
+    problems = validate_result(artifacts, 0.10)
+    assert problems == []
+
+
+def test_fails_when_request_error_rate_exceeds_limit(tmp_path: Path):
+    """65 errors out of 67 requests is reported as one formatted error."""
+    counts = {
+        "request_count": {"avg": 2},
+        "error_request_count": {"avg": 65},
+        "completed_request_count": {"avg": 67},
+    }
+    artifacts = _write_aggregate(tmp_path, counts)
+    expected = (
+        "aiperf request error rate exceeded the benchmark limit: "
+        "65/67 = 97.015% > 10.000%"
+    )
+
+    problems = validate_result(artifacts, 0.10)
+    assert problems == [expected]
+
+
+def test_treats_missing_error_count_as_zero(tmp_path: Path):
+    """An export with only request_count validates cleanly."""
+    only_successes = {"request_count": {"avg": 12}}
+    artifacts = _write_aggregate(tmp_path, only_successes)
+
+    problems = validate_result(artifacts, 0.10)
+    assert problems == []
+
+
+def test_supports_per_run_artifact_layout(tmp_path: Path):
+    """The export is also found one level down, inside run_0/."""
+    only_successes = {"request_count": {"avg": 12}}
+    artifacts = _write_aggregate(tmp_path, only_successes, per_run=True)
+
+    # No error count is written, so the run must validate cleanly.
+    problems = validate_result(artifacts, 0.10)
+    assert problems == []
+
+
+def test_fails_when_aggregate_is_missing(tmp_path: Path):
+    """A directory that was never created yields a single not-found error."""
+    problems = validate_result(tmp_path / "aiperf_artifacts", 0.10)
+    assert len(problems) == 1
+    assert problems[-1].endswith("profile_export_aiperf.json not found")
diff --git a/utils/validate_agentic_result.py b/utils/validate_agentic_result.py
new file mode 100644
index 000000000..e54691059
--- /dev/null
+++ b/utils/validate_agentic_result.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""Validate whether an aiperf agentic replay produced benchmarkable results."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any
+
+
+def _resolve_aggregate_path(artifact_dir: Path) -> Path:
+    """Locate profile_export_aiperf.json directly or one run directory down."""
+    filename = "profile_export_aiperf.json"
+    top_level = artifact_dir / filename
+    if top_level.is_file() or not artifact_dir.is_dir():
+        # Either found, or there is nothing to scan; return the direct path.
+        return top_level
+
+    for entry in sorted(artifact_dir.iterdir()):
+        nested = entry / filename
+        if entry.is_dir() and nested.is_file():
+            return nested
+    return top_level
+
+
+def _metric_avg(aggregate: dict[str, Any], name: str) -> float | None:
+    """Return the metric's ``avg`` as a float; None if absent, ValueError if bad."""
+    metric = aggregate.get(name)
+    if metric is None:
+        return None
+    if not isinstance(metric, dict):
+        raise ValueError(f"{name} must be an object")
+    value = metric.get("avg")
+    if value is None:
+        return None
+    # bool is an int subclass, so reject it explicitly. The tuple form keeps
+    # this working before Python 3.10, where isinstance rejects ``int | float``.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise ValueError(f"{name}.avg must be numeric")
+    # Range-check the raw value: float() on an oversized int raises OverflowError.
+    if not 0 <= value <= sys.float_info.max:  # also false for NaN and inf
+        raise ValueError(f"{name}.avg must be a finite non-negative number")
+    return float(value)
+
+
+def validate_result(artifact_dir: Path, failed_request_threshold: float) -> list[str]:
+    """Check an aiperf artifact directory against the request error-rate limit.
+
+    Returns a list of human-readable problems; an empty list means the run
+    passed. A rate exactly equal to the threshold passes, and is printed.
+    """
+    aggregate_path = _resolve_aggregate_path(artifact_dir)
+    if not aggregate_path.is_file():
+        return [f"{aggregate_path} not found"]
+
+    try:
+        with open(aggregate_path) as handle:
+            payload = json.load(handle)
+        if not isinstance(payload, dict):
+            return [f"{aggregate_path} must contain a JSON object"]
+        succeeded = _metric_avg(payload, "request_count")
+        failed = _metric_avg(payload, "error_request_count") or 0.0
+        total = _metric_avg(payload, "completed_request_count")
+    except (OSError, json.JSONDecodeError, ValueError) as exc:
+        return [f"failed to read {aggregate_path}: {exc}"]
+
+    if succeeded is None:
+        return ["request_count.avg is missing"]
+    # With no completed total in the export, derive it from successes + errors.
+    total = succeeded + failed if total is None else total
+    if total <= 0:
+        return ["aiperf completed zero requests"]
+
+    rate = failed / total
+    summary = f"{failed:g}/{total:g} = {rate:.3%}"
+    limit = f"{failed_request_threshold:.3%}"
+    if rate > failed_request_threshold:
+        return [
+            "aiperf request error rate exceeded the benchmark limit: "
+            f"{summary} > {limit}"
+        ]
+
+    print("Validated aiperf request error rate: " f"{summary} <= {limit}")
+    return []
+
+
+def main() -> int:
+    """Parse the CLI, run validation, and return 1 if it found errors, else 0."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("artifact_dir", type=Path)
+    threshold_help = "Maximum accepted error fraction, inclusive"
+    cli.add_argument(
+        "--failed-request-threshold", type=float, required=True, help=threshold_help
+    )
+    opts = cli.parse_args()
+
+    limit = opts.failed_request_threshold
+    if not (0 <= limit <= 1):  # written this way so NaN is rejected too
+        cli.error("--failed-request-threshold must be between 0 and 1")
+
+    problems = validate_result(opts.artifact_dir, limit)
+    for problem in problems:
+        print(f"ERROR: {problem}", file=sys.stderr)
+    return int(bool(problems))
+
+
+if __name__ == "__main__":
+    sys.exit(main())