diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ad469b28e..675250e2b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11267,3 +11267,47 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           tp: 4
           ep: 4
           dp-attn: true
+
+
+# llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0).
+#
+# Simplest possible multi-node llm-d-vllm shape:
+#   1 prefill node (DP=8 EP=8 dp-attn) + 1 decode node (DP=8 EP=8 dp-attn).
+#   Total 2 H200 nodes. No DeepEP, no NVSHMEM ibgda, no full-mesh RDMA.
+#   KV transfer prefill -> decode via NIXL point-to-point.
+#
+# Apples-to-apples shape vs Dynamo's H200 1P+1D entries (which use
+# sglang or trt; this is the same topology but with vLLM and the llm-d
+# router).
+dsr1-fp8-h200-llm-d-vllm-simple:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0  # NOTE(review): presumably built from benchmarks/llm-d/Dockerfile — confirm
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode
+  precision: fp8
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 4, 16, 64, 256 ]
+        prefill:  # 1 prefill node: DP=8 EP=8 dp-attn (see header comment)
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+          - "RANDOM_RANGE_RATIO=0.05"  # required by the wrapper's check_env_vars
+          - "CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml"  # recipe under benchmarks/multi_node/llm-d-recipes/
+        decode:  # 1 decode node: DP=8 EP=8 dp-attn (see header comment)
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/llm-d/Dockerfile b/benchmarks/llm-d/Dockerfile
new file mode 100644
index 000000000..0e5228136
--- /dev/null
+++ b/benchmarks/llm-d/Dockerfile
@@ -0,0 +1,22 @@
+# Combined image for the InferenceX llm-d-vllm framework.
+#
+# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP +
+# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top
+# so every node in a SLURM allocation can play any role (prefill, decode,
+# or coordinator) from a single image.
+#
+# Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT
+# baked in. They are mounted at runtime by job.slurm so config-only
+# iteration does not require an image rebuild. See
+# benchmarks/multi_node/llm-d/job.slurm for the expected mount layout.
+
+FROM ghcr.io/llm-d/llm-d-cuda:v0.7.0
+# NOTE(review): the two llm-d router images below use the floating `:main` tag, so rebuilds are not reproducible — consider pinning.
+COPY --from=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main \
+       /app/epp /usr/local/bin/epp
+# pd-sidecar is the routing-sidecar named in the header comment.
+COPY --from=ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main \
+       /app/pd-sidecar /usr/local/bin/pd-sidecar
+# Envoy binary taken from the version-pinned distroless image.
+COPY --from=envoyproxy/envoy:distroless-v1.33.2 \
+     /usr/local/bin/envoy /usr/local/bin/
diff --git a/benchmarks/llm-d/README.md b/benchmarks/llm-d/README.md
new file mode 100644
index 000000000..cd6e0bf51
--- /dev/null
+++ b/benchmarks/llm-d/README.md
@@ -0,0 +1,16 @@
+# llm-d-vllm framework artifacts
+
+This directory holds the static pieces of the `llm-d-vllm` benchmark
+framework: the image `Dockerfile` plus the default configs that `job.slurm` mounts at runtime.
+
+| File | Purpose |
+|---|---|
+| `Dockerfile` | Combined image: vLLM (DeepEP-enabled), EPP, routing-sidecar, Envoy. One image, every node uses what its role requires. |
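+| `epp-config.yaml` | Fallback EPP scheduling config, mounted at `/etc/epp/config.yaml`. Used when no recipe overrides it via `CONFIG_FILE`. `disagg-profile-handler` with `prefill` / `decode` profiles (`prefix-cache-scorer`, `queue-scorer`, `kv-cache-utilization-scorer`, `active-request-scorer`) + `max-score-picker` over the file-discovery endpoint set. |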
+| `envoy.yaml` | Static Envoy: listener `:8080`, ext_proc to `127.0.0.1:9002`, ORIGINAL_DST cluster reading `x-gateway-destination-endpoint`. |
+
+The runtime pieces (per-node `server.sh`, the SLURM job script, recipe
+files, and the endpoint discovery mechanism) live under
+`benchmarks/multi_node/llm-d/` and `benchmarks/multi_node/llm-d-recipes/`.
+See the README in `benchmarks/multi_node/llm-d/` for the endpoints-file
+generation flow.
diff --git a/benchmarks/llm-d/envoy.yaml b/benchmarks/llm-d/envoy.yaml
new file mode 100644
index 000000000..20bbe60a6
--- /dev/null
+++ b/benchmarks/llm-d/envoy.yaml
@@ -0,0 +1,85 @@
+# Envoy front door for the llm-d-vllm framework.
+#
+# Listener  : 0.0.0.0:8080  (benchmark client target)
+# ext_proc  : EPP on 127.0.0.1:9002
+# Cluster   : ORIGINAL_DST, picks the address from the
+#             x-gateway-destination-endpoint header that EPP sets.
+
+static_resources:
+  listeners:
+    - name: main
+      address:
+        socket_address: { address: 0.0.0.0, port_value: 8080 }  # benchmark client target
+      filter_chains:
+        - filters:
+            - name: envoy.filters.network.http_connection_manager
+              typed_config:
+                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
+                stat_prefix: ingress_http
+                codec_type: AUTO
+                stream_idle_timeout: 0s  # 0s disables the per-stream idle timeout
+                request_timeout: 0s  # 0s disables the whole-request timeout
+                route_config:
+                  name: route
+                  virtual_hosts:
+                    - name: vh
+                      domains: ["*"]
+                      routes:
+                        - match: { prefix: "/" }
+                          route:
+                            cluster: original_dst
+                            timeout: 0s  # 0s disables the route (upstream response) timeout
+                http_filters:
+                  - name: envoy.filters.http.ext_proc
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
+                      grpc_service:
+                        envoy_grpc:
+                          cluster_name: epp  # the STATIC cluster defined under clusters: below
+                        timeout: 10s
+                      # message_timeout caps how long Envoy will wait for any
+                      # one ext_proc message ack from EPP. Generation can take
+                      # many seconds; 1000s mirrors the upstream llm-d guide.
+                      message_timeout: 1000s
+                      # FULL_DUPLEX_STREAMED for both directions: the dev EPP
+                      # (ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main)
+                      # does not ack BUFFERED body mode and Envoy times out
+                      # with 504. Trailer modes also have to be SEND for the
+                      # request lifecycle to terminate cleanly.
+                      processing_mode:
+                        request_header_mode: SEND
+                        response_header_mode: SEND
+                        request_body_mode: FULL_DUPLEX_STREAMED
+                        response_body_mode: FULL_DUPLEX_STREAMED
+                        request_trailer_mode: SEND
+                        response_trailer_mode: SEND
+                  - name: envoy.filters.http.router  # terminal filter; must come last in the chain
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
+  clusters:
+    - name: epp
+      type: STATIC
+      connect_timeout: 1s
+      typed_extension_protocol_options:
+        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
+          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
+          explicit_http_config:
+            http2_protocol_options: {}  # ext_proc is a gRPC service, so the upstream must speak HTTP/2
+      load_assignment:
+        cluster_name: epp
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address: { address: 127.0.0.1, port_value: 9002 }  # EPP on the same node
+    - name: original_dst
+      type: ORIGINAL_DST
+      lb_policy: CLUSTER_PROVIDED  # ORIGINAL_DST clusters require the cluster-provided load balancer
+      connect_timeout: 5s
+      original_dst_lb_config:
+        use_http_header: true
+        http_header_name: x-gateway-destination-endpoint  # header that EPP sets with the chosen backend
+
+admin:
+  address:
+    socket_address: { address: 127.0.0.1, port_value: 9901 }  # loopback only: containers use --network host, and the admin API can alter or stop Envoy
diff --git a/benchmarks/llm-d/epp-config.yaml b/benchmarks/llm-d/epp-config.yaml
new file mode 100644
index 000000000..3ff0eea87
--- /dev/null
+++ b/benchmarks/llm-d/epp-config.yaml
@@ -0,0 +1,61 @@
+# Default EPP scheduling config (fallback when CONFIG_FILE is unset).
+#
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream.
+#
+# Single delta vs upstream: file-discovery. The upstream guide assumes
+# a Kubernetes control plane drives endpoint discovery; in our SLURM
+# setup the coordinator node writes /tmp/endpoints.yaml at job start
+# (see benchmarks/multi_node/llm-d/README.md) and EPP loads it via the
+# file-discovery plugin instead.
+
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  # Endpoint discovery (replaces upstream's K8s discovery).
+  - name: file-disc  # referenced by dataLayer.discovery.pluginRef at the bottom of this file
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml  # written by the coordinator node at job start
+      watchFile: false  # endpoints are static for the job lifetime
+
+  # P/D routing - identical to upstream pd-disaggregation guide.
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider  # presumably resolved by type name, since no explicit name is set — confirm
+  - type: prefill-filter
+  - type: decode-filter
+  - type: prefix-cache-scorer
+  - type: queue-scorer
+  - type: kv-cache-utilization-scorer
+  - type: active-request-scorer
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill  # filter, then three weighted scorers, then the picker
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode  # filter, then two weighted scorers, then the picker
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc  # the file-discovery plugin declared first under plugins:
diff --git a/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
new file mode 100755
index 000000000..61978c199
--- /dev/null
+++ b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Wrapper for the DSR1-FP8 H200 llm-d-vllm benchmark. Validates every env
+# var it dereferences (the script runs under `set -u`), then calls
+# benchmarks/multi_node/llm-d/submit.sh, which prints JOB_ID on stdout.
+# Same shape as benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh.
+
+set -euo pipefail
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    MODEL_PATH \
+    MODEL_NAME \
+    GITHUB_WORKSPACE \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-$(hostname)}"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/llm-d" || exit 1
+
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Concurrency list is passed 'x'-delimited (sglang-disagg wrapper convention).
+# `if !` keeps a failing submit.sh from tripping `set -e` before we report it.
+if ! JOB_ID=$(bash ./submit.sh \
+    "$PREFILL_NODES" \
+    "$DECODE_NODES" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$RANDOM_RANGE_RATIO") || [[ -z "$JOB_ID" ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
new file mode 100644
index 000000000..e3ca415b6
--- /dev/null
+++ b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
@@ -0,0 +1,101 @@
+# DeepSeek-R1-0528 fp8 on H200, simple 1P+1D P/D disagg.
+#
+# Phase 0 starting point - the simplest possible llm-d-vllm multi-node
+# config:
+#   1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   1 decode  node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   total 2 H200 nodes / 16 GPUs.
+#
+# No DeepEP, no NVSHMEM ibgda, no full-mesh-RDMA requirement, no
+# cross-node MoE all-to-all. KV transfer between prefill and decode goes
+# through NIXL point-to-point. This mirrors the shape of the simplest
+# Dynamo H200 multi-node disagg entries (e.g. dsr1-fp8-h200-dynamo-sglang
+# 1P+1D EP=8) but with vLLM as the engine and llm-d as the router.
+#
+# Selected via additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml
+# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.
+
+# ---- EPP scheduling config ----
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream. Single delta: file-discovery
+# replaces upstream's K8s endpoint discovery, since this benchmark runs
+# under SLURM. The coordinator node writes /tmp/endpoints.yaml at job
+# start (see benchmarks/multi_node/llm-d/README.md).
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  - name: file-disc  # referenced by dataLayer.discovery.pluginRef below
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml  # written by the coordinator node at job start
+      watchFile: false  # endpoints are static for the job lifetime
+
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider  # presumably resolved by type name, since no explicit name is set — confirm
+  - type: prefill-filter
+  - type: decode-filter
+  - type: prefix-cache-scorer  # NOTE(review): both roles run vLLM with --no-enable-prefix-caching in this recipe; confirm this scorer is still wanted
+  - type: queue-scorer
+  - type: kv-cache-utilization-scorer
+  - type: active-request-scorer
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill  # filter, then three weighted scorers, then the picker
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode  # filter, then two weighted scorers, then the picker
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc  # the file-discovery plugin declared first under plugins:
+
+# ---- Per-role vLLM flags ----
+# Common flags (--enable-expert-parallel, --tensor-parallel-size,
+# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
+# server.sh. The cross-node DP coordination flags
+# (--data-parallel-hybrid-lb, --data-parallel-size-local, etc.) are NOT
+# emitted because LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
+prefill:  # per-role block: extra-args and env applied to prefill nodes
+  extra-args: >-
+    --gpu-memory-utilization 0.85
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 32768
+    --max-num-seqs 16
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # explicit empty mapping: nothing extra for this role
+
+decode:  # per-role block: extra-args and env applied to decode nodes
+  extra-args: >-
+    --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 256
+    --max-num-seqs 256
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # explicit empty mapping: nothing extra for this role
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "04:00:00"  # keep quoted: an unquoted HH:MM:SS can be read as a sexagesimal number by YAML 1.1 parsers
diff --git a/benchmarks/multi_node/llm-d/README.md b/benchmarks/multi_node/llm-d/README.md
new file mode 100644
index 000000000..b57333dea
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/README.md
@@ -0,0 +1,133 @@
+# llm-d-vllm multi-node SLURM scaffolding
+
+This directory holds the SLURM-side orchestration for the `llm-d-vllm`
+benchmark framework. It mirrors the AMD `sglang-disagg` pattern under
+`benchmarks/multi_node/amd_utils/` (NOT the Dynamo / srt-slurm pattern):
+InferenceX itself owns the SLURM job, no vendor multi-node tool involved.
+
+| File | Role |
+|---|---|
+| `submit.sh` | sbatch wrapper. Validates env, exports tuning vars, returns `JOB_ID`. May read `slurm.time_limit` from the recipe to override `TIME_LIMIT`. |
+| `job.slurm` | sbatch entrypoint. Allocates `PREFILL_NODES + DECODE_NODES` nodes, derives per-node IPs, runs one Docker container per node via `srun`, threads role assignment env into each. |
+| `server.sh` | Per-node entry. Reads `NODE_RANK = SLURM_PROCID`, picks role, starts vLLM (with the wide-EP / DeepEP / NIXL flag set from the llm-d wide-EP-lws guide), starts the pd-sidecar on each leader, and on the decode leader additionally writes `endpoints.yaml`, starts EPP + Envoy, runs `benchmark_serving.py`, and writes the bench-done marker that `job.slurm` waits for before it `scancel`s the job (the image has no SLURM client tools). |
+
+## Topology
+
+For an `xP` prefill nodes / `yD` decode nodes run, total nodes = `xP + yD`.
+There is **no dedicated coordinator node**. The decode leader doubles as
+the coordinator (EPP + Envoy + bench), exactly like the AMD path's
+"decode rank 0" coordinator role.
+
+| Rank | Role |
+|---|---|
+| `0` | prefill leader (`LWS_WORKER_INDEX=0`, DP rank 0) + pd-sidecar |
+| `1 .. xP-1` | prefill workers |
+| `xP` | decode leader + pd-sidecar + EPP + Envoy + benchmark client |
+| `xP+1 .. xP+yD-1` | decode workers |
+
+Each instance (prefill or decode) is one vLLM engine spanning multiple
+nodes via `--data-parallel-hybrid-lb`. With `xP=2, yD=2,
+GPUS_PER_NODE=8` you get DP=16 prefill + DP=16 decode (the wide-EP
+reference). Per-rank split: `--data-parallel-size 16
+--data-parallel-size-local 8 --data-parallel-start-rank
+$((LWS_WORKER_INDEX * 8))`.
+
+## How `endpoints.yaml` is generated (file-discovery contract)
+
+The EPP runs in **no-Kubernetes mode**, using the `file-discovery` plugin
+from `llm-d-inference-scheduler` (branch `filediscovery-4`). At startup
+it reads `/tmp/endpoints.yaml`; the file lists every backend the EPP can
+route to, with role labels.
+
+The file is generated at runtime by `server.sh` on the decode leader
+(rank `PREFILL_NODES`). Because all node IPs are only known after
+`sbatch` allocates the job, the file cannot be baked into the image and
+is not part of the repo.
+
+Generation flow:
+
+1. `submit.sh` calls `sbatch -N (xP+yD)`. `sbatch` allocates nodes.
+2. `job.slurm` resolves each node's IP via `srun ip route get 1.1.1.1`,
+   slices them into `PREFILL_LEADER_IP` (= IPS[0]) and `DECODE_LEADER_IP`
+   (= IPS[PREFILL_NODES]), and passes both into the container as env
+   vars.
+3. On the decode leader, `server.sh` writes `/tmp/endpoints.yaml`
+   inside the container with one entry per leader:
+
+   ```yaml
+   endpoints:
+     - name: prefill-0
+       address: <PREFILL_LEADER_IP>
+       port: "8000"            # pd-sidecar port
+       labels:
+         llm-d.ai/role: prefill
+     - name: decode-0
+       address: <DECODE_LEADER_IP>
+       port: "8000"
+       labels:
+         llm-d.ai/role: decode
+   ```
+
+4. The EPP (started immediately after) loads the file via
+   `dataLayer.discovery.pluginRef: file-disc` (see
+   `benchmarks/llm-d/epp-config.yaml`). The plugin enumerates the
+   endpoints into the EPP datastore before the EPP starts serving
+   `ext_proc`, so Envoy never gets a request before discovery is ready.
+5. The `disagg-profile-handler` in the EPP config uses `prefill-filter`
+   and `decode-filter` to pick the right backend per request phase,
+   matching on the `llm-d.ai/role` label.
+
+### Why one entry per *leader* (not per node)
+
+In the wide-EP guide each instance is a single vLLM engine that spans
+multiple nodes via `--data-parallel-hybrid-lb`. With hybrid-lb, the
+leader pod (`LWS_WORKER_INDEX=0`) accepts external traffic and
+distributes it internally across the local DP ranks; in our LWS-free
+SLURM mapping, the prefill-leader and decode-leader are the only nodes
+addressable from outside. Adding an entry per worker would cause EPP to
+route directly to a worker, bypassing the engine's internal load
+balancing.
+
+If we later want to expose all pods of an instance (the alternative
+hybrid-lb interpretation: external LB across nodes too), we can extend
+the loop in `server.sh` to emit one entry per `IPS[i]` in the prefill
+range and one per `IPS[i]` in the decode range, all carrying the same
+role label. EPP then chooses among them via the profile's scorers and `max-score-picker`.
+
+### Live reload
+
+`watchFile: false` in `epp-config.yaml`. Endpoints are static for the
+job lifetime - no reason to pay for `fsnotify` here. Set `watchFile:
+true` (and rewrite `/tmp/endpoints.yaml` from the coordinator) only if
+you want to drain or add an instance mid-run.
+
+### Validation rules (enforced by the plugin)
+
+- `address` must be a literal IPv4 address (no IPv6, no hostnames).
+- `port` is a string in `1..65535`.
+- File capped at 1 MiB.
+- Names must be unique within their namespace (we use the default
+  namespace, so they must be globally unique in the file).
+
+The IPs we collect from `ip route get 1.1.1.1` are always IPv4 on the
+H200 / B200 cluster's primary fabric; if you point at a different
+interface and it returns an IPv6 address, EPP will reject the file at
+startup.
+
+## Recipe files
+
+`benchmarks/multi_node/llm-d-recipes/<name>.yaml` is selected via
+`CONFIG_FILE=<name>.yaml` in the master config's `additional-settings`.
+Each recipe carries:
+
+- top-level `plugins:` / `schedulingProfiles:` / `dataLayer:` - fed into
+  the EPP via `--config-file`. Lets you change routing strategy without
+  rebuilding the image.
+- `prefill:` / `decode:` blocks with `extra-args` (appended to the vLLM
+  launch command on each node of that role) and `env` (exported before
+  vLLM starts).
+- `slurm.time_limit` - overrides `TIME_LIMIT` for that recipe.
+
+When `CONFIG_FILE` is unset or the file is missing, the EPP falls back
+to `/etc/epp/config.yaml` (bind-mounted by `job.slurm` from `benchmarks/llm-d/epp-config.yaml`), and vLLM runs with no
+extra flags beyond the wide-EP common set in `server.sh`.
diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm
new file mode 100644
index 000000000..46a026ced
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/job.slurm
@@ -0,0 +1,147 @@
+#!/bin/bash
+#SBATCH --job-name=llm-d-bench
+#SBATCH --ntasks-per-node=1
+# --output, --error, -N, -n, --time set by submit.sh
+#
+# Allocates PREFILL_NODES + DECODE_NODES nodes, derives per-node IPs, then
+# srun-runs server.sh inside one Docker container per node. NODE_RANK
+# (= SLURM_PROCID) drives role selection inside server.sh.
+
+set -euo pipefail
+
+echo "=== llm-d job start ==="
+echo "UTC: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')"
+
+# Repo root (benchmarks/multi_node/llm-d/job.slurm -> ../../..). sbatch runs a spooled copy of this script, so "$0" is trusted only if it resolves to a tree containing server.sh; otherwise derive the root from SLURM_SUBMIT_DIR.
+DI_REPO_DIR=$(cd "$(dirname "$0")/../../.." && pwd)
+[[ -f "$DI_REPO_DIR/benchmarks/multi_node/llm-d/server.sh" ]] || DI_REPO_DIR=$(cd "${SLURM_SUBMIT_DIR:-.}/../../.." && pwd)
+export DI_REPO_DIR
+
+ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
+TOTAL_NODES=$(echo "$ALL_NODES" | wc -l)
+echo "Allocated nodes ($TOTAL_NODES): $(echo "$ALL_NODES" | tr '\n' ' ')"
+if [[ "$TOTAL_NODES" -ne "$NUM_NODES" ]]; then
+    echo "Error: SLURM allocated $TOTAL_NODES nodes, expected $NUM_NODES" >&2
+    exit 1
+fi
+
+# Per-node IPs in rank order.
+IPS=()
+for NODE in $ALL_NODES; do
+    IP=$(srun --nodes=1 --ntasks=1 --nodelist="$NODE" \
+         bash -c 'ip route get 1.1.1.1 | awk "/src/ {print \$7}"')
+    IPS+=("$IP")
+done
+echo "Node IPs: ${IPS[*]}"
+
+# Rank slicing:
+#   prefill leader = rank 0
+#   prefill workers = ranks 1 .. PREFILL_NODES-1
+#   decode  leader = rank PREFILL_NODES (also coordinator: EPP + Envoy + bench)
+#   decode  workers = ranks PREFILL_NODES+1 .. NUM_NODES-1
+PREFILL_LEADER_IP="${IPS[0]}"
+DECODE_LEADER_IP="${IPS[$PREFILL_NODES]}"
+
+# DP leader addresses for vLLM --data-parallel-address (rank 0 of each instance).
+PREFILL_DP_ADDR="$PREFILL_LEADER_IP"
+DECODE_DP_ADDR="$DECODE_LEADER_IP"
+
+ALL_IP_LIST=$(IFS=,; echo "${IPS[*]}")
+
+SANITIZED_USER=$(echo "${USER:-runner}" | tr -c 'a-zA-Z0-9_.-' '_')
+DOCKER_CONT_NAME="llmd_bench_${SANITIZED_USER}_${SLURM_JOB_ID}"
+export DOCKER_CONT_NAME
+export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+DOCKER_MOUNT_PATH="/workspace"
+
+cleanup() {
+    echo "[${SLURM_JOB_ID}] cleanup on $(hostname)"
+    [[ -n "${WATCHER_PID:-}" ]] && kill "$WATCHER_PID" 2>/dev/null || true
+}
+trap cleanup INT TERM HUP EXIT
+
+# Coordinator-done watcher. server.sh on the decode coordinator writes
+# this marker after the bench finishes; we then scancel the allocation
+# from outside the container (the image has no SLURM client tools).
+# Without this, workers `wait` on local vLLM forever and the job runs
+# to TIME_LIMIT.
+BENCH_DONE_MARKER="$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID"
+rm -f "$BENCH_DONE_MARKER"
+(
+    while [[ ! -f "$BENCH_DONE_MARKER" ]]; do sleep 5; done
+    echo "[${SLURM_JOB_ID}] coordinator finished; scancel'ing job"
+    scancel "$SLURM_JOB_ID" 2>/dev/null || true
+) &
+WATCHER_PID=$!
+
+# One docker run per node, one task per node. server.sh dispatches by NODE_RANK.
+srun \
+    --kill-on-bad-exit=1 \
+    --signal=TERM@30 \
+    --unbuffered \
+    bash -lc "
+set -euo pipefail
+echo \"Rank \$SLURM_PROCID on \$(hostname)\"
+
+sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f || true
+
+exec sudo docker run --rm \
+    --init \
+    --stop-timeout 10 \
+    --network host \
+    --ipc host \
+    --gpus all \
+    --ulimit memlock=-1 --ulimit stack=67108864 \
+    --shm-size 32G \
+    --cap-add SYS_PTRACE --cap-add IPC_LOCK --cap-add SYS_RAWIO \
+    --device /dev/infiniband \
+    --security-opt seccomp=unconfined \
+    --privileged \
+    -v ${MODEL_DIR}:/models:ro \
+    -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \
+    -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \
+    -v ${DI_REPO_DIR}/benchmarks/multi_node/llm-d-recipes:/etc/llmd-recipes:ro \
+    -v ${DI_REPO_DIR}/benchmarks/llm-d/epp-config.yaml:/etc/epp/config.yaml:ro \
+    -v ${DI_REPO_DIR}/benchmarks/llm-d/envoy.yaml:/etc/envoy/envoy.yaml:ro \
+    -e SLURM_JOB_ID=\$SLURM_JOB_ID \
+    -e NODE_RANK=\$SLURM_PROCID \
+    -e NUM_NODES=$NUM_NODES \
+    -e PREFILL_NODES=$PREFILL_NODES \
+    -e DECODE_NODES=$DECODE_NODES \
+    -e ALL_IPS=$ALL_IP_LIST \
+    -e PREFILL_LEADER_IP=$PREFILL_LEADER_IP \
+    -e DECODE_LEADER_IP=$DECODE_LEADER_IP \
+    -e PREFILL_DP_ADDR=$PREFILL_DP_ADDR \
+    -e DECODE_DP_ADDR=$DECODE_DP_ADDR \
+    -e MODEL_DIR=/models \
+    -e MODEL_NAME=$MODEL_NAME \
+    -e GPUS_PER_NODE=$GPUS_PER_NODE \
+    -e PREFILL_DP_SIZE=$PREFILL_DP_SIZE \
+    -e DECODE_DP_SIZE=$DECODE_DP_SIZE \
+    -e BENCH_INPUT_LEN=$BENCH_INPUT_LEN \
+    -e BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN \
+    -e BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY \
+    -e BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE \
+    -e BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO \
+    -e BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER \
+    -e BENCHMARK_LOGS_DIR=/benchmark_logs \
+    -e RUN_EVAL=$RUN_EVAL \
+    -e EVAL_ONLY=$EVAL_ONLY \
+    -e EVAL_CONC=$EVAL_CONC \
+    -e FRAMEWORK=$FRAMEWORK \
+    -e PRECISION=$PRECISION \
+    -e MODEL_PREFIX=$MODEL_PREFIX \
+    -e RUNNER_TYPE=$RUNNER_TYPE \
+    -e RESULT_FILENAME=$RESULT_FILENAME \
+    -e SPEC_DECODING=$SPEC_DECODING \
+    -e IS_MULTINODE=$IS_MULTINODE \
+    -e CONFIG_FILE=$CONFIG_FILE \
+    --name \"${DOCKER_CONT_NAME}_\$SLURM_PROCID\" \
+    \"\$DOCKER_IMAGE_NAME\" bash -lc '
+        set -o pipefail
+        ${DOCKER_MOUNT_PATH}/benchmarks/multi_node/llm-d/server.sh \
+            2>&1 | tee /benchmark_logs/slurm_job-'\"\$SLURM_JOB_ID\"'_rank_'\"\$SLURM_PROCID\"'.log
+    '
+"
+
+srun bash -c "sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f" || true
diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
new file mode 100755
index 000000000..b5c264be6
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/server.sh
@@ -0,0 +1,322 @@
+#!/usr/bin/env bash
+#
+# Per-node entrypoint for the llm-d-vllm wide-EP P/D disagg benchmark.
+# NODE_RANK is set by srun (= $SLURM_PROCID) in job.slurm.
+#
+# Roles:
+#   Rank 0                         -> prefill leader (DP rank 0) + pd-sidecar
+#   Ranks 1 .. PREFILL_NODES-1     -> prefill workers
+#   Rank PREFILL_NODES             -> decode leader (DP rank 0) + pd-sidecar
+#                                     + EPP + Envoy + benchmark client
+#                                     (the coordinator, like AMD's decode-0)
+#   Ranks PREFILL_NODES+1 ..       -> decode workers
+#
+# Each "instance" (prefill or decode) is a single vLLM engine spanning
+# PREFILL_NODES (or DECODE_NODES) nodes; only multi-node instances pass
+# --data-parallel-hybrid-lb. Leaders accept external traffic; workers serve local DP ranks.
+
+set -o errexit -o nounset -o pipefail
+
+. /workspace/benchmarks/benchmark_lib.sh
+
+# Topology inputs: keep values already present in the environment, else default.
+: "${NODE_RANK:=${SLURM_PROCID:-0}}" "${PREFILL_NODES:=1}"
+: "${DECODE_NODES:=1}" "${GPUS_PER_NODE:=8}"
+# Fixed listen ports used by the processes started further down.
+VLLM_PORT=8200
+SIDECAR_PORT=8000
+ENVOY_PORT=8080
+EPP_GRPC_PORT=9002
+EPP_HEALTH_PORT=9003
+EPP_METRICS_PORT=9090
+
+# Filesystem path to the weights inside the container. job.slurm mounts
+# the host model directory at /models and sets MODEL_DIR=/models, so the
+# weights live directly under MODEL_DIR. MODEL_NAME is the OpenAI-API
+# served name passed via --served-model-name; it is not part of the
+# filesystem path.
+MODEL="${MODEL_DIR}"
+HOST_IP=$(ip route get 1.1.1.1 | awk '{for (i = 1; i < NF; i++) if ($i == "src") print $(i + 1)}')
+# Default NIC for NCCL / Gloo / NVSHMEM bootstrap, taken from the default
+# route. Both lookups search for the "src" / "dev" keyword instead of a fixed
+# column, because routes without a "via" gateway shift the field positions.
+DEFAULT_IFACE=$(ip -o -4 route show to default | awk '{for (i = 1; i < NF; i++) if ($i == "dev") {print $(i + 1); exit}}')
+DEFAULT_IFACE="${DEFAULT_IFACE:-eth0}"
+
+VLLM_LOG="/benchmark_logs/vllm_rank${NODE_RANK}.log"
+SIDECAR_LOG="/benchmark_logs/sidecar_rank${NODE_RANK}.log"
+EPP_LOG="/benchmark_logs/epp.log"
+ENVOY_LOG="/benchmark_logs/envoy.log"
+
+echo "=== rank=$NODE_RANK host=$HOST_IP model=$MODEL ==="
+
+# ----------------------------------------------------------------
+# Role assignment
+# ----------------------------------------------------------------
+if (( NODE_RANK < PREFILL_NODES )); then
+    # Ranks [0, PREFILL_NODES) belong to the prefill instance.
+    ROLE="prefill"
+    LWS_WORKER_INDEX="$NODE_RANK"
+    LWS_GROUP_SIZE="$PREFILL_NODES"
+    DP_SIZE="$PREFILL_DP_SIZE" DP_ADDR="$PREFILL_DP_ADDR"
+elif (( NODE_RANK < PREFILL_NODES + DECODE_NODES )); then
+    # The next DECODE_NODES ranks belong to the decode instance.
+    ROLE="decode"
+    LWS_WORKER_INDEX=$((NODE_RANK - PREFILL_NODES))
+    LWS_GROUP_SIZE="$DECODE_NODES"
+    DP_SIZE="$DECODE_DP_SIZE" DP_ADDR="$DECODE_DP_ADDR"
+else
+    echo "ERROR: NODE_RANK=$NODE_RANK out of range" >&2
+    exit 1
+fi
+
+TP_SIZE=1
+DP_SIZE_LOCAL="$GPUS_PER_NODE"
+START_RANK=$((LWS_WORKER_INDEX * GPUS_PER_NODE))
+
+echo "ROLE=$ROLE DP_SIZE=$DP_SIZE DP_ADDR=$DP_ADDR LWS_WORKER_INDEX=$LWS_WORKER_INDEX START_RANK=$START_RANK"
+
+# ----------------------------------------------------------------
+# Recipe: import this role's 'extra-args' and 'env', shell-quoted by shlex.
+# ----------------------------------------------------------------
+ROLE_EXTRA_ARGS=""  # stays empty when there is no recipe or no extra-args
+if [[ -n "${CONFIG_FILE:-}" ]]; then
+    RECIPE_PATH="/etc/llmd-recipes/${CONFIG_FILE}"
+    if [[ -f "$RECIPE_PATH" ]]; then
+        echo "Loading $ROLE recipe from $RECIPE_PATH"
+        eval "$(python3 - "$RECIPE_PATH" "$ROLE" <<'PY'
+import shlex, sys, yaml  # argv[1] = recipe path, argv[2] = role
+recipe = yaml.safe_load(open(sys.argv[1])) or {}
+section = recipe.get(sys.argv[2]) or {}
+extra = (section.get('extra-args') or '').strip()
+print(f'ROLE_EXTRA_ARGS={shlex.quote(extra)}')  # repr() is not shell quoting
+for k, v in (section.get('env') or {}).items():
+    print(f'export {k}={shlex.quote(str(v))}')
+PY
+)"
+    else
+        echo "WARNING: CONFIG_FILE=$CONFIG_FILE but $RECIPE_PATH not found; using defaults" >&2
+    fi
+fi
+
+# ----------------------------------------------------------------
+# Multi-node DP / NIXL P/D env: needed in any topology.
+# ----------------------------------------------------------------
+# Socket interfaces: honour a caller-provided value, else the default-route NIC.
+export GLOO_SOCKET_IFNAME="${GLOO_SOCKET_IFNAME:-$DEFAULT_IFACE}"
+export NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-$DEFAULT_IFACE}"
+export VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}"
+# Unconditional vLLM / NIXL settings for every rank.
+export VLLM_NIXL_SIDE_CHANNEL_HOST="$HOST_IP"
+export VLLM_SKIP_P2P_CHECK=1 VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 VLLM_USE_DEEP_GEMM=1
+
+# ----------------------------------------------------------------
+# NVSHMEM / ibgda settings, taken from the llm-d wide-EP-lws guide
+# manifests. They apply only when an instance spans several nodes
+# (LWS_GROUP_SIZE > 1): the simple 1P+1D recipe avoids DeepEP,
+# NVSHMEM ibgda, and full-mesh RDMA, so exporting them there would be
+# misleading and could trigger ibgda code paths it does not need.
+# ----------------------------------------------------------------
+if (( LWS_GROUP_SIZE > 1 )); then
+    export NVIDIA_GDRCOPY=enabled
+    export NVSHMEM_IB_ENABLE_IBGDA=true
+    export NVSHMEM_REMOTE_TRANSPORT=ibgda
+    export NVSHMEM_SYMMETRIC_SIZE=16G
+    export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME="${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-$DEFAULT_IFACE}"
+fi
+
+# ----------------------------------------------------------------
+# Start vLLM (every node, prefill or decode)
+#
+# Flags split into:
+#   * COMMON_ARGS - always passed.
+#   * multi-node DP flags - appended to COMMON_ARGS only when an instance
+#     spans more than one node (LWS_GROUP_SIZE > 1, i.e. wide-EP topology);
+#     --data-parallel-hybrid-lb and the cross-process DP coordination flags
+#     are wrong when DP is contained inside one engine process.
+#   * ROLE_EXTRA_ARGS - recipe string, word-split unquoted (no quote parsing).
+# ----------------------------------------------------------------
+KV_TRANSFER_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load_failure_policy":"fail"}'
+
+COMMON_ARGS=(
+    --port "$VLLM_PORT"
+    --served-model-name "$MODEL_NAME"
+    --trust-remote-code
+    --api-server-count 1
+    --disable-access-log-for-endpoints=/health,/metrics
+    --enable-expert-parallel
+    --tensor-parallel-size "$TP_SIZE"
+    --data-parallel-size "$DP_SIZE"
+    --kv_transfer_config "$KV_TRANSFER_CONFIG"
+    --moe-backend deep_gemm
+)
+
+if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then
+    COMMON_ARGS+=(
+        --data-parallel-hybrid-lb
+        --data-parallel-size-local "$DP_SIZE_LOCAL"
+        --data-parallel-address "$DP_ADDR"
+        --data-parallel-rpc-port 5555
+        --data-parallel-start-rank "$START_RANK"  # = LWS_WORKER_INDEX * DP_SIZE_LOCAL
+    )
+fi
+
+echo "Starting vLLM ($ROLE) DP=$DP_SIZE local=$DP_SIZE_LOCAL start_rank=$START_RANK group_size=$LWS_GROUP_SIZE"
+# shellcheck disable=SC2086
+vllm serve "$MODEL" "${COMMON_ARGS[@]}" $ROLE_EXTRA_ARGS \
+    > "$VLLM_LOG" 2>&1 &
+VLLM_PID=$!  # handed to wait_for_server_ready via --server-pid
+
+# Every rank waits for its own engine via wait_for_server_ready before
+# falling through. For wide-EP (LWS_GROUP_SIZE > 1) this keeps the bench
+# from starting before the worker-side DP shards have come up; on leaders
+# it also orders the pd-sidecar start below after the engine.
+wait_for_server_ready --port "$VLLM_PORT" --server-log "$VLLM_LOG" --server-pid "$VLLM_PID"
+echo "vLLM ready on rank $NODE_RANK ($ROLE worker_index=$LWS_WORKER_INDEX)"
+
+# Leaders (worker index 0) are the only ranks that front external traffic.
+if (( LWS_WORKER_INDEX == 0 )); then
+    # pd-sidecar on both leaders: EPP routes to decode's, which pulls KVs from prefill's.
+    SIDECAR_CONNECTOR="nixlv2"
+    SIDECAR_FLAGS=(
+        --port="$SIDECAR_PORT"
+        --vllm-port="$VLLM_PORT"
+        --kv-connector="$SIDECAR_CONNECTOR"
+        --secure-proxy=false
+    )
+    case "$ROLE" in
+        decode) SIDECAR_FLAGS+=(--enable-prefiller-sampling) ;;
+    esac
+    echo "Starting pd-sidecar ($ROLE leader): ${SIDECAR_FLAGS[*]}"
+    pd-sidecar "${SIDECAR_FLAGS[@]}" > "$SIDECAR_LOG" 2>&1 &
+    SIDECAR_PID=$!
+    wait_for_server_ready --port "$SIDECAR_PORT" --server-log "$SIDECAR_LOG" --server-pid "$SIDECAR_PID"
+    echo "pd-sidecar ready on $HOST_IP:$SIDECAR_PORT"
+fi
+
+# ----------------------------------------------------------------
+# Coordinator: decode leader runs EPP + Envoy + benchmark client.
+# ----------------------------------------------------------------
+if [[ "$ROLE" == "decode" && "$LWS_WORKER_INDEX" -eq 0 ]]; then
+
+    # Write endpoints.yaml. See benchmarks/multi_node/llm-d/README.md for
+    # the discovery contract.
+    # NOTE: endpoint 'namespace' must match EPP's --pool-namespace below
+    # (file-discovery filters endpoints by namespace; the schema default
+    # 'default' would otherwise drop every entry).
+    python3 - <<PY
+import os, yaml
+NS = 'inferencex'
+PATH = '/tmp/endpoints.yaml'
+def endpoint(role, ip_var):
+    return {'name': f'{role}-0',
+            'namespace': NS,
+            'address': os.environ[ip_var],
+            'port': '$SIDECAR_PORT',
+            'labels': {'llm-d.ai/role': role}}
+endpoints = [endpoint('prefill', 'PREFILL_LEADER_IP'),
+             endpoint('decode', 'DECODE_LEADER_IP')]
+# Context managers close (and flush) the file before it is read back.
+with open(PATH, 'w') as out:
+    yaml.safe_dump({'endpoints': endpoints}, out)
+print('endpoints.yaml:')
+with open(PATH) as src:
+    print(src.read())
+PY
+
+    # EPP config: the recipe file when present (NOTE(review): confirm EPP accepts its
+    # extra keys), else job.slurm's /etc/epp/config.yaml (benchmarks/llm-d/epp-config.yaml).
+    if [[ -n "${CONFIG_FILE:-}" && -f "/etc/llmd-recipes/${CONFIG_FILE:-}" ]]; then
+        EPP_CONFIG="/etc/llmd-recipes/$CONFIG_FILE"
+    else
+        EPP_CONFIG="/etc/epp/config.yaml"
+    fi
+    echo "EPP config: $EPP_CONFIG"
+
+    epp \
+        --pool-name=epp \
+        --pool-namespace=inferencex \
+        --config-file="$EPP_CONFIG" \
+        --grpc-port="$EPP_GRPC_PORT" \
+        --grpc-health-port="$EPP_HEALTH_PORT" \
+        --metrics-port="$EPP_METRICS_PORT" \
+        > "$EPP_LOG" 2>&1 &
+    EPP_PID=$!
+
+    # Wait for EPP to bind its gRPC port before starting Envoy. Envoy's
+    # ext_proc filter dials 127.0.0.1:$EPP_GRPC_PORT - if Envoy comes up
+    # first the early bench requests hit ext_proc connection errors.
+    # gRPC has no plain HTTP /health, so probe the TCP listener directly.
+    echo "Waiting for EPP on 127.0.0.1:$EPP_GRPC_PORT"
+    EPP_WAIT_DEADLINE=$(( $(date +%s) + 60 ))
+    until (echo > "/dev/tcp/127.0.0.1/$EPP_GRPC_PORT") 2>/dev/null; do
+        if ! kill -0 "$EPP_PID" 2>/dev/null; then
+            echo "ERROR: EPP died before binding $EPP_GRPC_PORT" >&2
+            exit 1
+        fi
+        if [[ "$(date +%s)" -ge "$EPP_WAIT_DEADLINE" ]]; then
+            echo "ERROR: EPP did not bind $EPP_GRPC_PORT within 60s" >&2
+            exit 1
+        fi
+        sleep 1
+    done
+    echo "EPP listening on $EPP_GRPC_PORT"
+
+    envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
+    ENVOY_PID=$!
+
+    wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG" --server-pid "$ENVOY_PID"
+
+    # Wait for the prefill leader's sidecar before starting the bench.
+    # wait_for_server_ready can only probe localhost, so poll the remote node
+    # directly; per-attempt curl timeouts keep a hung probe within the deadline.
+    echo "Waiting for prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT/health"
+    PREFILL_WAIT_DEADLINE=$(( $(date +%s) + 300 ))
+    until curl --output /dev/null --silent --fail --connect-timeout 5 --max-time 10 \
+            "http://$PREFILL_LEADER_IP:$SIDECAR_PORT/health"; do
+        if [[ "$(date +%s)" -ge "$PREFILL_WAIT_DEADLINE" ]]; then
+            echo "ERROR: prefill sidecar did not become ready within 5 min" >&2
+            exit 1
+        fi
+        sleep 5
+    done
+    echo "Prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT is ready"
+
+    # Sweep concurrency: BENCH_MAX_CONCURRENCY arrives from submit.sh as an
+    # 'x'-delimited list (e.g. "2048x1024x512"), one bench run per level
+    # (same shape as benchmarks/multi_node/amd_utils/bench.sh).
+    [[ -n "${BENCH_MAX_CONCURRENCY:-}" ]] || { echo "ERROR: BENCH_MAX_CONCURRENCY is empty; nothing to benchmark" >&2; exit 1; }
+    IFS='x' read -r -a CONCURRENCIES <<< "$BENCH_MAX_CONCURRENCY"
+    for max_concurrency in "${CONCURRENCIES[@]}"; do
+        num_prompts=$(( max_concurrency * BENCH_NUM_PROMPTS_MULTIPLIER ))
+        [[ "$num_prompts" -lt 16 ]] && num_prompts=16
+        # Bench against Envoy. EPP routes to decode (and decode sidecar
+        # pulls from prefill via NIXL).
+        run_benchmark_serving \
+            --model "$MODEL_NAME" \
+            --port "$ENVOY_PORT" \
+            --backend openai \
+            --input-len "$BENCH_INPUT_LEN" \
+            --output-len "$BENCH_OUTPUT_LEN" \
+            --random-range-ratio "$BENCH_RANDOM_RANGE_RATIO" \
+            --num-prompts "$num_prompts" \
+            --max-concurrency "$max_concurrency" \
+            --result-filename "${RESULT_FILENAME}_c${max_concurrency}" \
+            --result-dir "$BENCHMARK_LOGS_DIR/"
+    done
+
+    if [[ "${RUN_EVAL:-false}" == "true" ]]; then  # optional eval after the sweep
+        run_eval --framework lm-eval --port "$ENVOY_PORT"
+        append_lm_eval_summary
+    fi
+
+    # Signal job.slurm (running outside the container, where SLURM
+    # client tools are available) to scancel the allocation. The image
+    # does not bundle scancel, so calling it here would just trip
+    # set -e. Workers end server.sh in `wait`; without this signal
+    # they would hold the job until TIME_LIMIT.
+    touch "$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID"
+else
+    # Every rank but the decode leader: block on its background children (vLLM; plus pd-sidecar on the prefill leader).
+    wait
+fi
diff --git a/benchmarks/multi_node/llm-d/submit.sh b/benchmarks/multi_node/llm-d/submit.sh
new file mode 100755
index 000000000..663885426
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/submit.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+#
+# Submit a multi-node llm-d-vllm wide-EP P/D disagg benchmark job to SLURM.
+# Modeled after benchmarks/multi_node/amd_utils/submit.sh; prints JOB_ID on
+# stdout so the runner can poll for completion.
+#
+# Topology (matches the llm-d wide-EP guide reference):
+#   1 prefill instance with DP=PREFILL_NODES * GPUS_PER_NODE
+#   1 decode  instance with DP=DECODE_NODES  * GPUS_PER_NODE
+#   each instance spans PREFILL_NODES / DECODE_NODES nodes via vLLM
+#   --data-parallel-hybrid-lb. Total nodes = PREFILL_NODES + DECODE_NODES.
+
+set -o errexit -o nounset -o pipefail
+
+# Anchor paths at the repository root (three levels above this script)
+# rather than at the caller's $PWD, since the wrapper cd's into llm-d/
+# before invoking us.
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+
+# check_env NAME: abort with exit 1 unless variable NAME is set and non-empty.
+check_env() {
+    local var_name="$1"
+    [[ -n "${!var_name:-}" ]] && return 0
+    echo "Error: ${var_name} not set" >&2
+    exit 1
+}
+
+for required in SLURM_ACCOUNT SLURM_PARTITION TIME_LIMIT MODEL_PATH \
+                MODEL_NAME CONTAINER_IMAGE RUNNER_NAME; do
+    check_env "$required"
+done
+
+# Positional arguments; the first five are mandatory. ${N:?} turns a missing
+# argument into a usage error instead of a bare "unbound variable" abort.
+USAGE="usage: $0 PREFILL_NODES DECODE_NODES ISL OSL CONCURRENCIES [REQUEST_RATE] [RANDOM_RANGE_RATIO]"
+PREFILL_NODES=${1:?$USAGE}
+DECODE_NODES=${2:?$USAGE}
+ISL=${3:?$USAGE}
+OSL=${4:?$USAGE}
+CONCURRENCIES=${5:?$USAGE}
+REQUEST_RATE=${6:-inf}
+RANDOM_RANGE_RATIO=${7:-0.8}
+
+# Derived topology, exported for job.slurm.
+: "${GPUS_PER_NODE:=8}"
+NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
+export NUM_NODES PREFILL_NODES DECODE_NODES GPUS_PER_NODE MODEL_NAME
+export PREFILL_DP_SIZE=$((PREFILL_NODES * GPUS_PER_NODE))
+export DECODE_DP_SIZE=$((DECODE_NODES * GPUS_PER_NODE))
+export DOCKER_IMAGE_NAME="$CONTAINER_IMAGE"
+export MODEL_DIR="$MODEL_PATH"
+
+# Benchmark client parameters.
+export BENCH_INPUT_LEN="$ISL"
+export BENCH_OUTPUT_LEN="$OSL"
+export BENCH_MAX_CONCURRENCY="$CONCURRENCIES"
+export BENCH_REQUEST_RATE="$REQUEST_RATE"
+export BENCH_RANDOM_RANGE_RATIO="$RANDOM_RANGE_RATIO"
+export BENCH_NUM_PROMPTS_MULTIPLIER=10
+
+# Pass-through settings: keep the caller's value when it is non-empty,
+# otherwise export the default shown (possibly the empty string).
+: "${RUN_EVAL:=false}"; export RUN_EVAL
+: "${EVAL_ONLY:=false}"; export EVAL_ONLY
+: "${EVAL_CONC:=}"; export EVAL_CONC
+: "${FRAMEWORK:=llm-d-vllm}"; export FRAMEWORK
+: "${PRECISION:=}"; export PRECISION
+: "${MODEL_PREFIX:=}"; export MODEL_PREFIX
+: "${RUNNER_TYPE:=}"; export RUNNER_TYPE
+: "${RESULT_FILENAME:=}"; export RESULT_FILENAME
+: "${SPEC_DECODING:=none}"; export SPEC_DECODING
+: "${IS_MULTINODE:=true}"; export IS_MULTINODE
+: "${CONFIG_FILE:=}"; export CONFIG_FILE
+
+# Recipe may override SLURM time limit (longer topologies need more wall time).
+if [[ -n "$CONFIG_FILE" ]]; then
+    RECIPE_PATH="${REPO_ROOT}/benchmarks/multi_node/llm-d-recipes/${CONFIG_FILE}"
+    if [[ -f "$RECIPE_PATH" ]]; then
+        RECIPE_TIME=$(python3 -c "
+import yaml, sys
+r = yaml.load(open(sys.argv[1]), Loader=yaml.BaseLoader) or {}  # scalars stay strings: 1:30:00 is not read as base-60
+t = (r.get('slurm') or {}).get('time_limit') or ''
+print(t)
+" "$RECIPE_PATH" 2>/dev/null || true)
+        [[ -n "$RECIPE_TIME" ]] && TIME_LIMIT="$RECIPE_TIME"
+    fi
+fi
+
+export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+mkdir -p "$BENCHMARK_LOGS_DIR"
+# A failed sbatch is mapped to an empty JOB_ID so the check below reports it.
+JOB_ID=$(sbatch \
+    --parsable \
+    --exclusive \
+    -N "$NUM_NODES" \
+    -n "$NUM_NODES" \
+    --ntasks-per-node=1 \
+    --gres=gpu:"$GPUS_PER_NODE" \
+    --time "$TIME_LIMIT" \
+    --partition "$SLURM_PARTITION" \
+    --account "$SLURM_ACCOUNT" \
+    --job-name "$RUNNER_NAME" \
+    --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" \
+    --error  "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" \
+    "$(dirname "$0")/job.slurm") || JOB_ID=""
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Error: sbatch failed" >&2
+    exit 1
+fi
+# --parsable may print "jobid;cluster"; callers expect the bare job id.
+echo "${JOB_ID%%;*}"
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 572056956..1a948b41e 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -1,13 +1,96 @@
 #!/usr/bin/bash
 
-# System-specific configuration for H200 DGXC Slurm cluster
-SLURM_PARTITION="main"
-SLURM_ACCOUNT="sa-shared"
+# System-specific configuration for H200 DGXC Slurm cluster.
+# Exported so child processes (e.g. submit.sh invoked via nested bash)
+# inherit them.
+export SLURM_PARTITION="main"
+export SLURM_ACCOUNT="sa-shared"
 
 set -x
 
 if [[ "$IS_MULTINODE" == "true" ]]; then
 
+    # ------------------------------------------------------------------
+    # llm-d-vllm: InferenceX-owned multi-node path (no srt-slurm).
+    # Mirrors the AMD sglang-disagg dispatch shape: wrapper script ->
+    # benchmarks/multi_node/llm-d/submit.sh -> sbatch -> JOB_ID.
+    # ------------------------------------------------------------------
+    if [[ "$FRAMEWORK" == "llm-d-vllm" ]]; then
+        if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
+            export MODEL_PATH="/models/DeepSeek-R1-0528"
+            export MODEL_NAME="DeepSeek-R1-0528"
+        else
+            echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2
+            exit 1
+        fi
+
+        # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible); mirrors AMD path.
+        export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+        mkdir -p "$BENCHMARK_LOGS_DIR"
+
+        SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_h200_llm-d-vllm.sh"  # EXP_NAME cut at its first '_'
+        BENCH_SCRIPT="benchmarks/multi_node/${SCRIPT_NAME}"
+        if [[ ! -f "$BENCH_SCRIPT" ]]; then
+            echo "Error: llm-d wrapper not found: $BENCH_SCRIPT" >&2
+            exit 1
+        fi
+
+        JOB_ID=$(bash "$BENCH_SCRIPT")  # the wrapper's whole stdout is taken as the job id
+        if [[ -z "$JOB_ID" ]]; then
+            echo "Error: failed to submit llm-d job" >&2
+            exit 1
+        fi
+        echo "Submitted llm-d job: $JOB_ID"
+
+        LOG_FILE="${BENCHMARK_LOGS_DIR}/slurm_job-${JOB_ID}.out"
+
+        # Wait for the SLURM stdout file; bail out if the job leaves the queue first.
+        while ! ls "$LOG_FILE" &>/dev/null; do
+            if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+                echo "ERROR: job $JOB_ID failed before creating log file"
+                scontrol show job "$JOB_ID" || true
+                exit 1
+            fi
+            sleep 5
+        done
+
+        # Background poller exits once the job leaves squeue; tail --pid stops with it.
+        (
+            while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+                sleep 10
+            done
+        ) &
+        POLL_PID=$!
+
+        tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+        wait $POLL_PID
+
+        # Result collection (same shape as AMD path); NUL-delimited so odd paths survive.
+        while IFS= read -r -d '' result_file; do
+            file_name=$(basename "$result_file")
+            cp "$result_file" "$GITHUB_WORKSPACE/${file_name}"
+            echo "Copied result: $file_name"
+        done < <(find "${BENCHMARK_LOGS_DIR}" -name "${RESULT_FILENAME}*.json" -print0 2>/dev/null)
+
+        if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+            EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR" -type d -name eval_results 2>/dev/null | head -1)
+            if [[ -n "$EVAL_DIR" && -d "$EVAL_DIR" ]]; then
+                shopt -s nullglob
+                for eval_file in "$EVAL_DIR"/*; do
+                    [ -f "$eval_file" ] || continue
+                    cp "$eval_file" "$GITHUB_WORKSPACE/"
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                done
+                shopt -u nullglob
+            else
+                echo "WARNING: RUN_EVAL=true but no eval_results found under $BENCHMARK_LOGS_DIR"
+            fi
+        fi
+
+        scancel "$JOB_ID" 2>/dev/null || true
+        exit 0
+    fi
+
     # MODEL_PATH: Override with pre-downloaded paths on H200 runner
     # The yaml files specify HuggingFace model IDs for portability, but we use
     # local paths to avoid repeated downloading on the shared H200 cluster.
@@ -29,7 +112,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
             exit 1
         fi
     else
-        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang"
+        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, llm-d-vllm"
         exit 1
     fi