Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11267,3 +11267,47 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
tp: 4
ep: 4
dp-attn: true


# llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0).
#
# Simplest possible multi-node llm-d-vllm shape:
# 1 prefill node (DP=8 EP=8 dp-attn) + 1 decode node (DP=8 EP=8 dp-attn).
# Total 2 H200 nodes. No DeepEP, no NVSHMEM ibgda, no full-mesh RDMA.
# KV transfer prefill -> decode via NIXL point-to-point.
#
# Apples-to-apples shape vs Dynamo's H200 1P+1D entries (which use
# sglang or trt; this is the same topology but with vLLM and the llm-d
# router).
dsr1-fp8-h200-llm-d-vllm-simple:
  # Container image and model under test.
  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0
  model: deepseek-ai/DeepSeek-R1-0528
  model-prefix: dsr1
  runner: h200-multinode
  precision: fp8
  framework: llm-d-vllm
  # Multi-node run with prefill and decode on separate workers.
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      - isl: 1024
        osl: 1024
        search-space:
          - spec-decoding: "none"
            conc-list: [1, 4, 16, 64, 256]
            # One prefill worker: tp=1, ep=8, with dp-attn enabled.
            prefill:
              num-worker: 1
              tp: 1
              ep: 8
              dp-attn: true
              # KEY=VALUE settings; CONFIG_FILE names the recipe under
              # benchmarks/multi_node/llm-d-recipes/.
              additional-settings:
                - "PREFILL_NODES=1"
                - "RANDOM_RANGE_RATIO=0.05"
                - "CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml"
            # One decode worker with the same parallelism as prefill.
            decode:
              num-worker: 1
              tp: 1
              ep: 8
              dp-attn: true
              additional-settings:
                - "DECODE_NODES=1"
22 changes: 22 additions & 0 deletions benchmarks/llm-d/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Combined image for the InferenceX llm-d-vllm framework.
#
# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP +
# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top
# so every node in a SLURM allocation can play any role (prefill, decode,
# or coordinator) from a single image.
#
# Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT
# baked in. They are mounted at runtime by job.slurm so config-only
# iteration does not require an image rebuild. See
# benchmarks/multi_node/llm-d/job.slurm for the expected mount layout.

# Start from the llm-d CUDA image at a pinned release tag.
FROM ghcr.io/llm-d/llm-d-cuda:v0.7.0

# Endpoint picker (EPP) binary, copied out of its dev image.
# NOTE(review): `:main` is presumably a moving tag, so a rebuild may pick up a
# different binary - consider pinning a release tag or digest.
COPY --from=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main /app/epp /usr/local/bin/epp

# P/D routing sidecar binary (the same `:main` caveat applies).
COPY --from=ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main /app/pd-sidecar /usr/local/bin/pd-sidecar

# Envoy binary from the version-pinned distroless image.
COPY --from=envoyproxy/envoy:distroless-v1.33.2 /usr/local/bin/envoy /usr/local/bin/envoy
16 changes: 16 additions & 0 deletions benchmarks/llm-d/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# llm-d-vllm framework artifacts

This directory holds the static pieces of the `llm-d-vllm` benchmark
framework: the image definition and the default configs mounted into it at runtime.

| File | Purpose |
|---|---|
| `Dockerfile` | Combined image: vLLM (DeepEP-enabled), EPP, routing-sidecar, Envoy. One image, every node uses what its role requires. |
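| `epp-config.yaml` | Fallback EPP scheduling config. Used when no recipe overrides it via `CONFIG_FILE`. `disagg-profile-handler` with prefill/decode profiles scored by `prefix-cache-scorer`, `queue-scorer`, `kv-cache-utilization-scorer` and `active-request-scorer`, picked by `max-score-picker`, over the file-discovery endpoint set. |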
| `envoy.yaml` | Static Envoy: listener `:8080`, ext_proc to `127.0.0.1:9002`, ORIGINAL_DST cluster reading `x-gateway-destination-endpoint`. |

The runtime pieces (per-node `server.sh`, the SLURM job script, recipe
files, and the endpoint discovery mechanism) live under
`benchmarks/multi_node/llm-d/` and `benchmarks/multi_node/llm-d-recipes/`.
See the README in `benchmarks/multi_node/llm-d/` for the endpoints-file
generation flow.
85 changes: 85 additions & 0 deletions benchmarks/llm-d/envoy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Envoy front door for the llm-d-vllm framework.
#
# Listener : 0.0.0.0:8080 (benchmark client target)
# ext_proc : EPP on 127.0.0.1:9002
# Cluster : ORIGINAL_DST, picks the address from the
# x-gateway-destination-endpoint header that EPP sets.

static_resources:
  listeners:
    # Single HTTP listener that the benchmark client connects to.
    - name: main
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8080
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: ingress_http
                codec_type: AUTO
                # 0s disables these timeouts so long generations are not cut off.
                stream_idle_timeout: 0s
                request_timeout: 0s
                route_config:
                  name: route
                  virtual_hosts:
                    - name: vh
                      domains: ["*"]
                      routes:
                        # Catch-all route; the upstream host is chosen per
                        # request by the original_dst cluster.
                        - match:
                            prefix: "/"
                          route:
                            cluster: original_dst
                            timeout: 0s
                http_filters:
                  # ext_proc hands every request to EPP (cluster `epp`).
                  - name: envoy.filters.http.ext_proc
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
                      grpc_service:
                        envoy_grpc:
                          cluster_name: epp
                        timeout: 10s
                      # Upper bound on how long Envoy waits for EPP to ack a
                      # single ext_proc message. Generation can take many
                      # seconds; 1000s mirrors the upstream llm-d guide.
                      message_timeout: 1000s
                      # Both body directions use FULL_DUPLEX_STREAMED: the dev
                      # EPP (ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main)
                      # does not ack BUFFERED body mode, and Envoy then times
                      # out with 504. Trailer modes must also be SEND for the
                      # request lifecycle to terminate cleanly.
                      processing_mode:
                        request_header_mode: SEND
                        response_header_mode: SEND
                        request_body_mode: FULL_DUPLEX_STREAMED
                        response_body_mode: FULL_DUPLEX_STREAMED
                        request_trailer_mode: SEND
                        response_trailer_mode: SEND
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
  clusters:
    # EPP ext_proc endpoint on the local host, spoken to over HTTP/2.
    - name: epp
      type: STATIC
      connect_timeout: 1s
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options: {}
      load_assignment:
        cluster_name: epp
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: 127.0.0.1
                      port_value: 9002
    # Upstream address is read from the x-gateway-destination-endpoint
    # request header rather than from a static endpoint list.
    - name: original_dst
      type: ORIGINAL_DST
      lb_policy: CLUSTER_PROVIDED
      connect_timeout: 5s
      original_dst_lb_config:
        use_http_header: true
        http_header_name: x-gateway-destination-endpoint

# Envoy admin endpoint.
# NOTE(review): bound to all interfaces; the admin API is unauthenticated, so
# consider 127.0.0.1 unless something off-node needs to reach port 9901.
admin:
  address:
    socket_address:
      address: 0.0.0.0
      port_value: 9901
61 changes: 61 additions & 0 deletions benchmarks/llm-d/epp-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Default EPP scheduling config (fallback when CONFIG_FILE is unset).
#
# Mirrors the upstream llm-d well-lit-path P/D guide:
# guides/pd-disaggregation/router/pd-disaggregation.values.yaml
# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
# weights are unchanged from upstream.
#
# Single delta vs upstream: file-discovery. The upstream guide assumes
# a Kubernetes control plane drives endpoint discovery; in our SLURM
# setup the coordinator node writes /tmp/endpoints.yaml at job start
# (see benchmarks/multi_node/llm-d/README.md) and EPP loads it via the
# file-discovery plugin instead.

apiVersion: llm-d.ai/v1alpha1
kind: EndpointPickerConfig

plugins:
  # Endpoint discovery from a static file (stands in for upstream's K8s
  # discovery). watchFile is off, so the file is not watched for changes.
  - name: file-disc
    type: file-discovery
    parameters:
      path: /tmp/endpoints.yaml
      watchFile: false

  # P/D routing plugins - same set as the upstream pd-disaggregation guide.
  # Plugins without an explicit name are referenced below by their type.
  - type: disagg-headers-handler
  - type: always-disagg-pd-decider
  - type: disagg-profile-handler
    parameters:
      deciderPluginName: always-disagg-pd-decider
  - type: prefill-filter
  - type: decode-filter
  - type: prefix-cache-scorer
  - type: queue-scorer
  - type: kv-cache-utilization-scorer
  - type: active-request-scorer
  - type: max-score-picker

schedulingProfiles:
  # Prefill profile: filter, three weighted scorers, then the picker.
  - name: prefill
    plugins:
      - pluginRef: prefill-filter
      - pluginRef: prefix-cache-scorer
        weight: 3
      - pluginRef: queue-scorer
        weight: 2
      - pluginRef: kv-cache-utilization-scorer
        weight: 2
      - pluginRef: max-score-picker
  # Decode profile: filter, two weighted scorers, then the picker.
  - name: decode
    plugins:
      - pluginRef: decode-filter
      - pluginRef: active-request-scorer
        weight: 2
      - pluginRef: prefix-cache-scorer
        weight: 3
      - pluginRef: max-score-picker

# Use the file-discovery plugin declared above as the endpoint source.
dataLayer:
  discovery:
    pluginRef: file-disc
48 changes: 48 additions & 0 deletions benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
#
# Wrapper for the DSR1-FP8 H200 llm-d-vllm multi-node benchmark.
#
# Validates the required environment, exports the variables used by the
# submission step, and calls benchmarks/multi_node/llm-d/submit.sh, which
# prints JOB_ID on stdout. That JOB_ID is echoed as this script's last line.
# Same shape as benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh.
#
# Required env: CONC_LIST, ISL, OSL, IMAGE, MODEL_PATH, MODEL_NAME,
#               PREFILL_NODES, DECODE_NODES, RANDOM_RANGE_RATIO,
#               GITHUB_WORKSPACE.
# Optional env: TIME_LIMIT (defaults to 08:00:00).

set -euo pipefail

source "$(dirname "$0")/../benchmark_lib.sh"

# MODEL_NAME and GITHUB_WORKSPACE are expanded below under `set -u`; validate
# them here so a missing value is reported by the shared checker instead of
# surfacing as a bare "unbound variable" abort.
check_env_vars \
    CONC_LIST \
    ISL \
    OSL \
    IMAGE \
    MODEL_PATH \
    MODEL_NAME \
    PREFILL_NODES \
    DECODE_NODES \
    RANDOM_RANGE_RATIO \
    GITHUB_WORKSPACE

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    # SLURMD_NODENAME may be absent even when SLURM_JOB_ID is set; default it
    # so `set -u` does not abort the script on a purely informational line.
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

set -x

cd "$GITHUB_WORKSPACE/benchmarks/multi_node/llm-d" || exit 1

# Exported for the submission step (presumably read by submit.sh - confirm).
export TIME_LIMIT="${TIME_LIMIT:-08:00:00}"
export MODEL_PATH
export MODEL_NAME
export CONTAINER_IMAGE="$IMAGE"

# Concurrency list passes through to bench server. Use 'x'-delimited form
# (matches sglang-disagg wrapper convention).
# The `|| { ... }` handler keeps `set -e` from exiting silently when
# submit.sh fails, and preserves its exit status.
JOB_ID=$(bash ./submit.sh \
    "$PREFILL_NODES" \
    "$DECODE_NODES" \
    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
    "$RANDOM_RANGE_RATIO") || {
    rc=$?
    echo "Failed to submit job: submit.sh exited with status $rc" >&2
    exit "$rc"
}

# submit.sh succeeded but printed nothing: still treat it as a failure.
if [[ -z "$JOB_ID" ]]; then
    echo "Failed to submit job" >&2
    exit 1
fi

echo "$JOB_ID"
101 changes: 101 additions & 0 deletions benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# DeepSeek-R1-0528 fp8 on H200, simple 1P+1D P/D disagg.
#
# Phase 0 starting point - the simplest possible llm-d-vllm multi-node
# config:
# 1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
# 1 decode node (DP=8 EP=8 dp-attn, intra-node EP over NVLink)
# total 2 H200 nodes / 16 GPUs.
#
# No DeepEP, no NVSHMEM ibgda, no full-mesh-RDMA requirement, no
# cross-node MoE all-to-all. KV transfer between prefill and decode goes
# through NIXL point-to-point. This mirrors the shape of the simplest
# Dynamo H200 multi-node disagg entries (e.g. dsr1-fp8-h200-dynamo-sglang
# 1P+1D EP=8) but with vLLM as the engine and llm-d as the router.
#
# Selected via additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml
# with PREFILL_NODES=1 DECODE_NODES=1 from the wrapper.

# ---- EPP scheduling config ----
# Mirrors the upstream llm-d well-lit-path P/D guide:
# guides/pd-disaggregation/router/pd-disaggregation.values.yaml
# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer
# weights are unchanged from upstream. Single delta: file-discovery
# replaces upstream's K8s endpoint discovery, since this benchmark runs
# under SLURM. The coordinator node writes /tmp/endpoints.yaml at job
# start (see benchmarks/multi_node/llm-d/README.md).
apiVersion: llm-d.ai/v1alpha1
kind: EndpointPickerConfig

plugins:
  # File-based endpoint discovery; the file is not watched for changes.
  - name: file-disc
    type: file-discovery
    parameters:
      path: /tmp/endpoints.yaml
      watchFile: false

  # P/D routing plugins. Those without an explicit name are referenced in
  # schedulingProfiles by their type.
  - type: disagg-headers-handler
  - type: always-disagg-pd-decider
  - type: disagg-profile-handler
    parameters:
      deciderPluginName: always-disagg-pd-decider
  - type: prefill-filter
  - type: decode-filter
  - type: prefix-cache-scorer
  - type: queue-scorer
  - type: kv-cache-utilization-scorer
  - type: active-request-scorer
  - type: max-score-picker

schedulingProfiles:
  # Prefill profile: filter, three weighted scorers, then the picker.
  - name: prefill
    plugins:
      - pluginRef: prefill-filter
      - pluginRef: prefix-cache-scorer
        weight: 3
      - pluginRef: queue-scorer
        weight: 2
      - pluginRef: kv-cache-utilization-scorer
        weight: 2
      - pluginRef: max-score-picker
  # Decode profile: filter, two weighted scorers, then the picker.
  - name: decode
    plugins:
      - pluginRef: decode-filter
      - pluginRef: active-request-scorer
        weight: 2
      - pluginRef: prefix-cache-scorer
        weight: 3
      - pluginRef: max-score-picker

# Endpoint source is the file-discovery plugin declared above.
dataLayer:
  discovery:
    pluginRef: file-disc

# ---- Per-role vLLM flags ----
# Common flags (--enable-expert-parallel, --tensor-parallel-size,
# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in
# server.sh. The cross-node DP coordination flags
# (--data-parallel-hybrid-lb, --data-parallel-size-local, etc.) are NOT
# emitted because LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1.
# Prefill role: extra vLLM CLI flags, folded into one space-separated string.
prefill:
  extra-args: >-
    --gpu-memory-utilization 0.85
    --kv-cache-dtype fp8
    --max-num-batched-tokens 32768
    --max-num-seqs 16
    --block-size 256
    --no-enable-prefix-caching
  # No role-specific environment overrides.
  env: {}

# Decode role: higher memory utilization, and a batched-token cap equal to
# the sequence cap (256).
decode:
  extra-args: >-
    --gpu-memory-utilization 0.90
    --kv-cache-dtype fp8
    --max-num-batched-tokens 256
    --max-num-seqs 256
    --block-size 256
    --no-enable-prefix-caching
  # No role-specific environment overrides.
  env: {}

# ---- SLURM resource directives ----
# NOTE(review): the wrapper script defaults TIME_LIMIT to 08:00:00; confirm
# which of the two values takes precedence at submission time.
slurm:
  time_limit: "04:00:00"
Loading