fix: switch gb200 sglang mtp recipes to NGC prebuilt dynamo image

Oseltamivir · claude · Oseltamivir · commit d38a486f36e9 · 2026-05-13T14:28:04.000-07:00
Switching transport to nixl in cdf21c3 did not unblock the GB200 sglang MTP sweep (run 25812320128 still produced 0 output tokens, ~17K aborts/job). Both mooncake and nixl backends fail at the same SGLang code path (decode._update_handshake_waiters) because of an upstream-of-transport bug: dynamo's per-DP-rank etcd registration only completes for node-rank-0 of multi-node prefill workers on the GB200 NV cluster. Decode then sees "Prefill server not fully registered yet (4 workers registered)" for the entire run and aborts every handshake. The same dynamo hash and the same recipe shape register all 8 DP ranks fine on GB300, isolating the break to the dynamo source build at hash 34d55a5 on aarch64/MNNVL. Switch the 7 GB200 sglang MTP recipes to NVIDIA's prebuilt NGC image (nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3), matching upstream's deepseek-v4-pro/sglang/disagg-gb200/deploy.yaml. Sets dynamo.install to false and drops the pinned dynamo.hash (NGC ships dynamo prebuilt), which also removes the per-worker maturin+rustup source build that originally drove the 100-min health-poll extension. Keeps the extended health wall as a safety margin for first-time NGC pull plus DSV4-Pro model load. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep8-mtp.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep8-mtp-c8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep8-mtp-c8192.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep8-mtp-c12288.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep8-mtp-c12288.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep8-mtp-c16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep8-mtp-c16384.yaml
@@ -6,18 +6,17 @@ frontend:
   num_additional_frontends: 8
 
 dynamo:
-  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
-  install: true
+  install: false  # NGC sglang-runtime image ships dynamo prebuilt.
 
-# 100-min readiness wall (default 30 min) so the per-worker dynamo
-# source build has room to finish before health-poll gives up.
+# 100-min readiness wall (default 30 min) tolerates first-time NGC
+# image pull on each worker node plus DSV4-Pro multi-node load.
 health_check:
   max_attempts: 600
   interval_seconds: 10
 
 model:
   path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
+  container: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0-deepseek-v4-cuda13-dev.3"
   precision: "mxfp4"
 
 sbatch_directives: