SemiAnalysisAI · ezrasilvera · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -11267,3 +11267,47 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           tp: 4
           ep: 4
           dp-attn: true
+
+
+# llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0).
+#
+# Simplest possible multi-node llm-d-vllm shape:
+#   1 prefill node (DP=8 EP=8 dp-attn) + 1 decode node (DP=8 EP=8 dp-attn).
+#   Total 2 H200 nodes. No DeepEP, no NVSHMEM ibgda, no full-mesh RDMA.
+#   KV transfer prefill -> decode via NIXL point-to-point.
+#
+# Apples-to-apples shape vs Dynamo's H200 1P+1D entries (which use
+# sglang or trt; this is the same topology but with vLLM and the llm-d
+# router).
+dsr1-fp8-h200-llm-d-vllm-simple:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0  # NOTE(review): presumably built from benchmarks/llm-d/Dockerfile — confirm
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode
+  precision: fp8
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true  # separate prefill and decode workers (P/D disagg)
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 4, 16, 64, 256 ]  # concurrency values handed to the benchmark
+        prefill:  # 1 prefill node: DP=8 EP=8 with dp-attn
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:  # presumably exported as env vars for the benchmark wrapper — confirm
+          - "PREFILL_NODES=1"
+          - "RANDOM_RANGE_RATIO=0.05"
+          - "CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml"  # recipe in benchmarks/multi_node/llm-d-recipes/
+        decode:  # 1 decode node: DP=8 EP=8 with dp-attn
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/llm-d/Dockerfile b/benchmarks/llm-d/Dockerfile
@@ -0,0 +1,34 @@
+# Combined image for the InferenceX llm-d-vllm framework.
+#
+# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP +
+# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top
+# so every node in a SLURM allocation can play any role (prefill, decode,
+# or coordinator) from a single image.
+#
+# Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT
+# baked in. They are mounted at runtime by job.slurm so config-only
+# iteration does not require an image rebuild. See
+# benchmarks/multi_node/llm-d/job.slurm for the expected mount layout.
+#
+# Every source image is a build arg so a release build can pin exact
+# versions or digests, e.g.
+#   docker build --build-arg EPP_IMAGE=<repo>@sha256:<digest> .
+# The defaults reproduce the original unpinned build. The two "-dev:main"
+# tags are mutable, so default builds are not reproducible over time.
+
+ARG BASE_IMAGE=ghcr.io/llm-d/llm-d-cuda:v0.7.0
+ARG EPP_IMAGE=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main
+ARG SIDECAR_IMAGE=ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main
+ARG ENVOY_IMAGE=envoyproxy/envoy:distroless-v1.33.2
+
+# COPY --from cannot reference a build arg directly, so each source image
+# gets a named stage that the final stage copies from.
+FROM ${EPP_IMAGE} AS epp
+FROM ${SIDECAR_IMAGE} AS sidecar
+FROM ${ENVOY_IMAGE} AS envoy
+
+FROM ${BASE_IMAGE}
+
+COPY --from=epp /app/epp /usr/local/bin/epp
+COPY --from=sidecar /app/pd-sidecar /usr/local/bin/pd-sidecar
+COPY --from=envoy /usr/local/bin/envoy /usr/local/bin/envoy
diff --git a/benchmarks/llm-d/README.md b/benchmarks/llm-d/README.md
@@ -0,0 +1,16 @@
+# llm-d-vllm framework artifacts
+
+This directory holds the static pieces of the `llm-d-vllm` benchmark
+framework: the image definition and the default configs mounted at runtime.
+
+| File | Purpose |
+|---|---|
+| `Dockerfile` | Combined image: vLLM (DeepEP-enabled), EPP, routing-sidecar, Envoy. One image, every node uses what its role requires. |
+| `epp-config.yaml` | Fallback EPP scheduling config. Used when no recipe overrides it via `CONFIG_FILE`. `disagg-profile-handler` with separate `prefill` and `decode` profiles (prefix-cache, queue, KV-cache-utilization and active-request scorers) + `max-score-picker` over the file-discovery endpoint set. |
+| `envoy.yaml` | Static Envoy: listener `:8080`, ext_proc to `127.0.0.1:9002`, ORIGINAL_DST cluster reading `x-gateway-destination-endpoint`. |
+
+The runtime pieces (per-node `server.sh`, the SLURM job script, recipe
+files, and the endpoint discovery mechanism) live under
+`benchmarks/multi_node/llm-d/` and `benchmarks/multi_node/llm-d-recipes/`.
+See the README in `benchmarks/multi_node/llm-d/` for the endpoints-file
+generation flow.
diff --git a/benchmarks/llm-d/envoy.yaml b/benchmarks/llm-d/envoy.yaml
@@ -0,0 +1,85 @@
+# Envoy front door for the llm-d-vllm framework.
+#
+# Listener  : 0.0.0.0:8080  (benchmark client target)
+# ext_proc  : EPP on 127.0.0.1:9002
+# Cluster   : ORIGINAL_DST, picks the address from the
+#             x-gateway-destination-endpoint header that EPP sets.
+
+static_resources:
+  listeners:
+    - name: main  # single front-door listener for benchmark traffic
+      address:
+        socket_address: { address: 0.0.0.0, port_value: 8080 }
+      filter_chains:
+        - filters:
+            - name: envoy.filters.network.http_connection_manager
+              typed_config:
+                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
+                stat_prefix: ingress_http
+                codec_type: AUTO
+                stream_idle_timeout: 0s  # 0s disables the per-stream idle timeout
+                request_timeout: 0s  # 0s disables the whole-request timeout
+                route_config:
+                  name: route
+                  virtual_hosts:
+                    - name: vh
+                      domains: ["*"]  # match every Host/authority
+                      routes:
+                        - match: { prefix: "/" }
+                          route:
+                            cluster: original_dst  # upstream address comes from the EPP-set header
+                            timeout: 0s  # 0s disables the route timeout
+                http_filters:
+                  - name: envoy.filters.http.ext_proc
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
+                      grpc_service:
+                        envoy_grpc:
+                          cluster_name: epp  # the STATIC cluster at 127.0.0.1:9002
+                        timeout: 10s
+                      # message_timeout caps how long Envoy will wait for any
+                      # one ext_proc message ack from EPP. Generation can take
+                      # many seconds; 1000s mirrors the upstream llm-d guide.
+                      message_timeout: 1000s
+                      # FULL_DUPLEX_STREAMED for both directions: the dev EPP
+                      # (ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main)
+                      # does not ack BUFFERED body mode and Envoy times out
+                      # with 504. Trailer modes also have to be SEND for the
+                      # request lifecycle to terminate cleanly.
+                      processing_mode:
+                        request_header_mode: SEND
+                        response_header_mode: SEND
+                        request_body_mode: FULL_DUPLEX_STREAMED
+                        response_body_mode: FULL_DUPLEX_STREAMED
+                        request_trailer_mode: SEND
+                        response_trailer_mode: SEND
+                  - name: envoy.filters.http.router  # router must be the last HTTP filter
+                    typed_config:
+                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
+  clusters:
+    - name: epp  # ext_proc target: the EPP on loopback
+      type: STATIC
+      connect_timeout: 1s
+      typed_extension_protocol_options:
+        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
+          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
+          explicit_http_config:
+            http2_protocol_options: {}  # gRPC to the EPP needs HTTP/2 upstream
+      load_assignment:
+        cluster_name: epp
+        endpoints:
+          - lb_endpoints:
+              - endpoint:
+                  address:
+                    socket_address: { address: 127.0.0.1, port_value: 9002 }
+    - name: original_dst  # per-request upstream chosen by the EPP
+      type: ORIGINAL_DST
+      lb_policy: CLUSTER_PROVIDED  # the lb_policy ORIGINAL_DST clusters use
+      connect_timeout: 5s
+      original_dst_lb_config:
+        use_http_header: true  # take the destination from a request header
+        http_header_name: x-gateway-destination-endpoint  # header set by the EPP
+
+admin:  # NOTE(review): admin API is bound to all interfaces — confirm the network is trusted or bind to 127.0.0.1
+  address:
+    socket_address: { address: 0.0.0.0, port_value: 9901 }
diff --git a/benchmarks/llm-d/epp-config.yaml b/benchmarks/llm-d/epp-config.yaml
@@ -0,0 +1,61 @@
+# Default EPP scheduling config (fallback when CONFIG_FILE is unset).
+#
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream.
+#
+# Single delta vs upstream: file-discovery. The upstream guide assumes
+# a Kubernetes control plane drives endpoint discovery; in our SLURM
+# setup the coordinator node writes /tmp/endpoints.yaml at job start
+# (see benchmarks/multi_node/llm-d/README.md) and EPP loads it via the
+# file-discovery plugin instead.
+
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  # Endpoint discovery (replaces upstream's K8s discovery).
+  - name: file-disc  # referenced by dataLayer.discovery.pluginRef
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml  # written by the coordinator node at job start
+      watchFile: false  # NOTE(review): presumably the file is read once, not watched — confirm
+
+  # P/D routing - identical to upstream pd-disaggregation guide.
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider  # the decider declared just above
+  - type: prefill-filter  # used by: prefill profile
+  - type: decode-filter  # used by: decode profile
+  - type: prefix-cache-scorer  # used by: prefill and decode profiles
+  - type: queue-scorer  # used by: prefill profile
+  - type: kv-cache-utilization-scorer  # used by: prefill profile
+  - type: active-request-scorer  # used by: decode profile
+  - type: max-score-picker  # used by: prefill and decode profiles
+
+schedulingProfiles:
+  - name: prefill  # one filter, three weighted scorers, one picker
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3  # highest weight in this profile
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode  # one filter, two weighted scorers, one picker
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3  # highest weight in this profile
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc  # the file-discovery plugin declared under plugins
diff --git a/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+#
+# Wrapper for the DSR1-FP8 H200 wide-EP llm-d-vllm benchmark.
+# Sets topology env (PREFILL_NODES, DECODE_NODES) and calls
+# benchmarks/multi_node/llm-d/submit.sh, which prints JOB_ID on stdout.
+# Same shape as benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh.
+
+set -euo pipefail
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    MODEL_PATH \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/llm-d" || exit 1
+
+export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+# Concurrency list passes through to bench server. Use 'x'-delimited form
+# (matches sglang-disagg wrapper convention).
+JOB_ID=$(bash ./submit.sh \
+    "$PREFILL_NODES" \
+    "$DECODE_NODES" \
+    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
+    "$RANDOM_RANGE_RATIO")
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
@@ -0,0 +1,101 @@
+# DeepSeek-R1-0528 fp8 on H200, simple 1P+1D P/D disagg.
+#
+# Phase 0 starting point - the simplest possible llm-d-vllm multi-node
+# config:
+#   1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   1 decode  node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
+#   total 2 H200 nodes / 16 GPUs.
+#
+# No DeepEP, no NVSHMEM ibgda, no full-mesh-RDMA requirement, no
+# cross-node MoE all-to-all. KV transfer between prefill and decode goes
+# through NIXL point-to-point. This mirrors the shape of the simplest
+# Dynamo H200 multi-node disagg entries (e.g. dsr1-fp8-h200-dynamo-sglang
+# 1P+1D EP=8) but with vLLM as the engine and llm-d as the router.
+#
+# Selected via additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml
+# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.
+
+# ---- EPP scheduling config ----
+# Mirrors the upstream llm-d well-lit-path P/D guide:
+#   guides/pd-disaggregation/router/pd-disaggregation.values.yaml
+# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
+# weights are unchanged from upstream. Single delta: file-discovery
+# replaces upstream's K8s endpoint discovery, since this benchmark runs
+# under SLURM. The coordinator node writes /tmp/endpoints.yaml at job
+# start (see benchmarks/multi_node/llm-d/README.md).
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  - name: file-disc  # referenced by dataLayer.discovery.pluginRef
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml  # written by the coordinator node at job start
+      watchFile: false  # NOTE(review): presumably the file is read once, not watched — confirm
+
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider  # the decider declared just above
+  - type: prefill-filter  # used by: prefill profile
+  - type: decode-filter  # used by: decode profile
+  - type: prefix-cache-scorer  # used by: prefill and decode profiles
+  - type: queue-scorer  # used by: prefill profile
+  - type: kv-cache-utilization-scorer  # used by: prefill profile
+  - type: active-request-scorer  # used by: decode profile
+  - type: max-score-picker  # used by: prefill and decode profiles
+
+schedulingProfiles:  # NOTE(review): prefix-cache-scorer has the top weight, yet both roles pass --no-enable-prefix-caching — confirm it still contributes
+  - name: prefill  # one filter, three weighted scorers, one picker
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3  # highest weight in this profile
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode  # one filter, two weighted scorers, one picker
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3  # highest weight in this profile
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc  # the file-discovery plugin declared under plugins
+
+# ---- Per-role vLLM flags ----
+# Common flags (--enable-expert-parallel, --tensor-parallel-size,
+# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
+# server.sh. The cross-node DP coordination flags
+# (--data-parallel-hybrid-lb, --data-parallel-size-local, etc.) are NOT
+# emitted because LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
+prefill:  # role-specific vLLM flags; the >- scalar folds the lines below into one space-separated string
+  extra-args: >-
+    --gpu-memory-utilization 0.85
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 32768
+    --max-num-seqs 16
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # empty mapping; presumably extra env vars for this role — confirm
+
+decode:  # differs from prefill in memory utilization, batched-token budget and max sequences
+  extra-args: >-
+    --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8
+    --max-num-batched-tokens 256
+    --max-num-seqs 256
+    --block-size 256
+    --no-enable-prefix-caching
+  env: {}  # empty mapping; presumably extra env vars for this role — confirm
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "04:00:00"  # NOTE(review): the wrapper script defaults TIME_LIMIT to 08:00:00 — confirm which value wins