diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ad469b28e..675250e2b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11267,3 +11267,47 @@ minimaxm2.5-fp8-gb300-dynamo-vllm: tp: 4 ep: 4 dp-attn: true + + +# llm-d-vllm simple 1P+1D P/D disagg on H200 (Phase 0). +# +# Simplest possible multi-node llm-d-vllm shape: +# 1 prefill node (DP=8 EP=8 dp-attn) + 1 decode node (DP=8 EP=8 dp-attn). +# Total 2 H200 nodes. No DeepEP, no NVSHMEM ibgda, no full-mesh RDMA. +# KV transfer prefill -> decode via NIXL point-to-point. +# +# Apples-to-apples shape vs Dynamo's H200 1P+1D entries (which use +# sglang or trt; this is the same topology but with vLLM and the llm-d +# router). +dsr1-fp8-h200-llm-d-vllm-simple: + image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:v0.7.0 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200-multinode + precision: fp8 + framework: llm-d-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 4, 16, 64, 256 ] + prefill: + num-worker: 1 + tp: 1 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "RANDOM_RANGE_RATIO=0.05" + - "CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml" + decode: + num-worker: 1 + tp: 1 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_NODES=1" diff --git a/benchmarks/llm-d/Dockerfile b/benchmarks/llm-d/Dockerfile new file mode 100644 index 000000000..0e5228136 --- /dev/null +++ b/benchmarks/llm-d/Dockerfile @@ -0,0 +1,22 @@ +# Combined image for the InferenceX llm-d-vllm framework. +# +# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP + +# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top +# so every node in a SLURM allocation can play any role (prefill, decode, +# or coordinator) from a single image. 
+# +# Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT +# baked in. They are mounted at runtime by job.slurm so config-only +# iteration does not require an image rebuild. See +# benchmarks/multi_node/llm-d/job.slurm for the expected mount layout. + +FROM ghcr.io/llm-d/llm-d-cuda:v0.7.0 + +COPY --from=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main \ + /app/epp /usr/local/bin/epp + +COPY --from=ghcr.io/llm-d/llm-d-router-disagg-sidecar-dev:main \ + /app/pd-sidecar /usr/local/bin/pd-sidecar + +COPY --from=envoyproxy/envoy:distroless-v1.33.2 \ + /usr/local/bin/envoy /usr/local/bin/ diff --git a/benchmarks/llm-d/README.md b/benchmarks/llm-d/README.md new file mode 100644 index 000000000..cd6e0bf51 --- /dev/null +++ b/benchmarks/llm-d/README.md @@ -0,0 +1,16 @@ +# llm-d-vllm framework artifacts + +This directory holds the static pieces of the `llm-d-vllm` benchmark +framework: the image definition plus the default configs mounted at runtime. + +| File | Purpose | +|---|---| +| `Dockerfile` | Combined image: vLLM (DeepEP-enabled), EPP, routing-sidecar, Envoy. One image, every node uses what its role requires. | +| `epp-config.yaml` | Fallback EPP scheduling config, bind-mounted at `/etc/epp/config.yaml` by `job.slurm`. Used when no recipe overrides it via `CONFIG_FILE`. `disagg-profile-handler` + prefix-cache / queue / kv-cache-utilization / active-request scorers + `max-score-picker` over the file-discovery endpoint set. | +| `envoy.yaml` | Static Envoy, bind-mounted at `/etc/envoy/envoy.yaml` by `job.slurm`: listener `:8080`, ext_proc to `127.0.0.1:9002`, ORIGINAL_DST cluster reading `x-gateway-destination-endpoint`. | + +The runtime pieces (per-node `server.sh`, the SLURM job script, recipe +files, and the endpoint discovery mechanism) live under +`benchmarks/multi_node/llm-d/` and `benchmarks/multi_node/llm-d-recipes/`. +See the README in `benchmarks/multi_node/llm-d/` for the endpoints-file +generation flow.
diff --git a/benchmarks/llm-d/envoy.yaml b/benchmarks/llm-d/envoy.yaml new file mode 100644 index 000000000..20bbe60a6 --- /dev/null +++ b/benchmarks/llm-d/envoy.yaml @@ -0,0 +1,85 @@ +# Envoy front door for the llm-d-vllm framework. +# +# Listener : 0.0.0.0:8080 (benchmark client target) +# ext_proc : EPP on 127.0.0.1:9002 +# Cluster : ORIGINAL_DST, picks the address from the +# x-gateway-destination-endpoint header that EPP sets. + +static_resources: + listeners: + - name: main + address: + socket_address: { address: 0.0.0.0, port_value: 8080 } + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: ingress_http + codec_type: AUTO + stream_idle_timeout: 0s + request_timeout: 0s + route_config: + name: route + virtual_hosts: + - name: vh + domains: ["*"] + routes: + - match: { prefix: "/" } + route: + cluster: original_dst + timeout: 0s + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: + cluster_name: epp + timeout: 10s + # message_timeout caps how long Envoy will wait for any + # one ext_proc message ack from EPP. Generation can take + # many seconds; 1000s mirrors the upstream llm-d guide. + message_timeout: 1000s + # FULL_DUPLEX_STREAMED for both directions: the dev EPP + # (ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main) + # does not ack BUFFERED body mode and Envoy times out + # with 504. Trailer modes also have to be SEND for the + # request lifecycle to terminate cleanly. 
+ processing_mode: + request_header_mode: SEND + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + clusters: + - name: epp + type: STATIC + connect_timeout: 1s + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: {} + load_assignment: + cluster_name: epp + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: { address: 127.0.0.1, port_value: 9002 } + - name: original_dst + type: ORIGINAL_DST + lb_policy: CLUSTER_PROVIDED + connect_timeout: 5s + original_dst_lb_config: + use_http_header: true + http_header_name: x-gateway-destination-endpoint + +admin: + address: + socket_address: { address: 127.0.0.1, port_value: 9901 } diff --git a/benchmarks/llm-d/epp-config.yaml b/benchmarks/llm-d/epp-config.yaml new file mode 100644 index 000000000..3ff0eea87 --- /dev/null +++ b/benchmarks/llm-d/epp-config.yaml @@ -0,0 +1,61 @@ +# Default EPP scheduling config (fallback when CONFIG_FILE is unset). +# +# Mirrors the upstream llm-d well-lit-path P/D guide: +# guides/pd-disaggregation/router/pd-disaggregation.values.yaml +# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer +# weights are unchanged from upstream. +# +# Single delta vs upstream: file-discovery. The upstream guide assumes +# a Kubernetes control plane drives endpoint discovery; in our SLURM +# setup the coordinator node writes /tmp/endpoints.yaml at job start +# (see benchmarks/multi_node/llm-d/README.md) and EPP loads it via the +# file-discovery plugin instead.
+ +apiVersion: llm-d.ai/v1alpha1 +kind: EndpointPickerConfig + +plugins: + # Endpoint discovery (replaces upstream's K8s discovery). + - name: file-disc + type: file-discovery + parameters: + path: /tmp/endpoints.yaml + watchFile: false + + # P/D routing - identical to upstream pd-disaggregation guide. + - type: disagg-headers-handler + - type: always-disagg-pd-decider + - type: disagg-profile-handler + parameters: + deciderPluginName: always-disagg-pd-decider + - type: prefill-filter + - type: decode-filter + - type: prefix-cache-scorer + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: active-request-scorer + - type: max-score-picker + +schedulingProfiles: + - name: prefill + plugins: + - pluginRef: prefill-filter + - pluginRef: prefix-cache-scorer + weight: 3 + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: max-score-picker + - name: decode + plugins: + - pluginRef: decode-filter + - pluginRef: active-request-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 + - pluginRef: max-score-picker + +dataLayer: + discovery: + pluginRef: file-disc diff --git a/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh new file mode 100755 index 000000000..61978c199 --- /dev/null +++ b/benchmarks/multi_node/dsr1_fp8_h200_llm-d-vllm.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# +# Wrapper for the DSR1-FP8 H200 wide-EP llm-d-vllm benchmark. +# Requires the topology env (PREFILL_NODES, DECODE_NODES) and forwards it to +# benchmarks/multi_node/llm-d/submit.sh, which prints JOB_ID on stdout. +# Same shape as benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh.
+ +set -euo pipefail + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + MODEL_PATH \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-$(hostname)}" +fi + +set -x + +cd "${GITHUB_WORKSPACE:?GITHUB_WORKSPACE must be set}/benchmarks/multi_node/llm-d" || exit 1 + +export TIME_LIMIT="${TIME_LIMIT:-08:00:00}" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME="${MODEL_NAME:?MODEL_NAME must be set}" +export CONTAINER_IMAGE=$IMAGE + +# Concurrency list passes through to bench server. Use 'x'-delimited form +# (matches sglang-disagg wrapper convention). +JOB_ID=$(bash ./submit.sh \ + "$PREFILL_NODES" \ + "$DECODE_NODES" \ + "$ISL" "$OSL" "${CONC_LIST// /x}" inf \ + "$RANDOM_RANGE_RATIO") + +if [[ -z "$JOB_ID" ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml new file mode 100644 index 000000000..e3ca415b6 --- /dev/null +++ b/benchmarks/multi_node/llm-d-recipes/dsr1-fp8-h200-1p1d-simple.yaml @@ -0,0 +1,101 @@ +# DeepSeek-R1-0528 fp8 on H200, simple 1P+1D P/D disagg. +# +# Phase 0 starting point - the simplest possible llm-d-vllm multi-node +# config: +# 1 prefill node (DP=8 EP=8 dp-attn, intra-node EP over NVLink) +# 1 decode node (DP=8 EP=8 dp-attn, intra-node EP over NVLink) +# total 2 H200 nodes / 16 GPUs. +# +# No DeepEP, no NVSHMEM ibgda, no full-mesh-RDMA requirement, no +# cross-node MoE all-to-all. KV transfer between prefill and decode goes +# through NIXL point-to-point. This mirrors the shape of the simplest +# Dynamo H200 multi-node disagg entries (e.g. dsr1-fp8-h200-dynamo-sglang +# 1P+1D EP=8) but with vLLM as the engine and llm-d as the router. +# +# Selected via additional-settings: CONFIG_FILE=dsr1-fp8-h200-1p1d-simple.yaml +# with PREFILL_NODES=1 DECODE_NODES=1 set alongside it in the master config.
+ +# ---- EPP scheduling config ---- +# Mirrors the upstream llm-d well-lit-path P/D guide: +# guides/pd-disaggregation/router/pd-disaggregation.values.yaml +# in github.com/llm-d/llm-d. Plugins, scheduling profiles, and scorer +# weights are unchanged from upstream. Single delta: file-discovery +# replaces upstream's K8s endpoint discovery, since this benchmark runs +# under SLURM. The coordinator node writes /tmp/endpoints.yaml at job +# start (see benchmarks/multi_node/llm-d/README.md). +apiVersion: llm-d.ai/v1alpha1 +kind: EndpointPickerConfig + +plugins: + - name: file-disc + type: file-discovery + parameters: + path: /tmp/endpoints.yaml + watchFile: false + + - type: disagg-headers-handler + - type: always-disagg-pd-decider + - type: disagg-profile-handler + parameters: + deciderPluginName: always-disagg-pd-decider + - type: prefill-filter + - type: decode-filter + - type: prefix-cache-scorer + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: active-request-scorer + - type: max-score-picker + +schedulingProfiles: + - name: prefill + plugins: + - pluginRef: prefill-filter + - pluginRef: prefix-cache-scorer + weight: 3 + - pluginRef: queue-scorer + weight: 2 + - pluginRef: kv-cache-utilization-scorer + weight: 2 + - pluginRef: max-score-picker + - name: decode + plugins: + - pluginRef: decode-filter + - pluginRef: active-request-scorer + weight: 2 + - pluginRef: prefix-cache-scorer + weight: 3 + - pluginRef: max-score-picker + +dataLayer: + discovery: + pluginRef: file-disc + +# ---- Per-role vLLM flags ---- +# Common flags (--enable-expert-parallel, --tensor-parallel-size, +# --data-parallel-size, --kv_transfer_config, --moe-backend) are set in +# server.sh. The cross-node DP coordination flags +# (--data-parallel-hybrid-lb, --data-parallel-size-local, etc.) are NOT +# emitted because LWS_GROUP_SIZE = PREFILL_NODES = DECODE_NODES = 1. 
+prefill: + extra-args: >- + --gpu-memory-utilization 0.85 + --kv-cache-dtype fp8 + --max-num-batched-tokens 32768 + --max-num-seqs 16 + --block-size 256 + --no-enable-prefix-caching + env: {} + +decode: + extra-args: >- + --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 + --max-num-batched-tokens 256 + --max-num-seqs 256 + --block-size 256 + --no-enable-prefix-caching + env: {} + +# ---- SLURM resource directives ---- +slurm: + time_limit: "04:00:00" diff --git a/benchmarks/multi_node/llm-d/README.md b/benchmarks/multi_node/llm-d/README.md new file mode 100644 index 000000000..b57333dea --- /dev/null +++ b/benchmarks/multi_node/llm-d/README.md @@ -0,0 +1,133 @@ +# llm-d-vllm multi-node SLURM scaffolding + +This directory holds the SLURM-side orchestration for the `llm-d-vllm` +benchmark framework. It mirrors the AMD `sglang-disagg` pattern under +`benchmarks/multi_node/amd_utils/` (NOT the Dynamo / srt-slurm pattern): +InferenceX itself owns the SLURM job, no vendor multi-node tool involved. + +| File | Role | +|---|---| +| `submit.sh` | sbatch wrapper. Validates env, exports tuning vars, returns `JOB_ID`. May read `slurm.time_limit` from the recipe to override `TIME_LIMIT`. | +| `job.slurm` | sbatch entrypoint. Allocates `PREFILL_NODES + DECODE_NODES` nodes, derives per-node IPs, runs one Docker container per node via `srun`, threads role assignment env into each, and `scancel`s the job once the coordinator's `.bench_done` marker appears. | +| `server.sh` | Per-node entry. Reads `NODE_RANK = SLURM_PROCID`, picks role, starts vLLM (with the wide-EP / DeepEP / NIXL flag set from the llm-d wide-EP-lws guide), starts the pd-sidecar on each leader, and on the decode leader additionally writes `endpoints.yaml`, starts EPP + Envoy, runs `benchmark_serving.py`, and touches the `.bench_done` marker that tells `job.slurm` to `scancel` the job (the image has no SLURM client tools). | + +## Topology + +For a run with `xP` prefill nodes and `yD` decode nodes, total nodes = `xP + yD`. +There is **no dedicated coordinator node**.
The decode leader doubles as +the coordinator (EPP + Envoy + bench), exactly like the AMD path's +"decode rank 0" coordinator role. + +| Rank | Role | +|---|---| +| `0` | prefill leader (`LWS_WORKER_INDEX=0`, DP rank 0) + pd-sidecar | +| `1 .. xP-1` | prefill workers | +| `xP` | decode leader + pd-sidecar + EPP + Envoy + benchmark client | +| `xP+1 .. xP+yD-1` | decode workers | + +Each instance (prefill or decode) is one vLLM engine spanning multiple +nodes via `--data-parallel-hybrid-lb`. With `xP=2, yD=2, +GPUS_PER_NODE=8` you get DP=16 prefill + DP=16 decode (the wide-EP +reference). Per-rank split: `--data-parallel-size 16 +--data-parallel-size-local 8 --data-parallel-start-rank +$((LWS_WORKER_INDEX * 8))`. + +## How `endpoints.yaml` is generated (file-discovery contract) + +The EPP runs in **no-Kubernetes mode**, using the `file-discovery` plugin +from `llm-d-inference-scheduler` (branch `filediscovery-4`). At startup +it reads `/tmp/endpoints.yaml`; the file lists every backend the EPP can +route to, with role labels. + +The file is generated at runtime by `server.sh` on the decode leader +(rank `PREFILL_NODES`). Because all node IPs are only known after +`sbatch` allocates the job, the file cannot be baked into the image and +is not part of the repo. + +Generation flow: + +1. `submit.sh` calls `sbatch -N (xP+yD)`. `sbatch` allocates nodes. +2. `job.slurm` resolves each node's IP via `srun ip route get 1.1.1.1`, + slices them into `PREFILL_LEADER_IP` (= IPS[0]) and `DECODE_LEADER_IP` + (= IPS[PREFILL_NODES]), and passes both into the container as env + vars. +3. On the decode leader, `server.sh` writes `/tmp/endpoints.yaml` + inside the container with one entry per leader: + + ```yaml + endpoints: + - name: prefill-0 + address: <PREFILL_LEADER_IP> + port: "8000" # pd-sidecar port + labels: + llm-d.ai/role: prefill + - name: decode-0 + address: <DECODE_LEADER_IP> + port: "8000" + labels: + llm-d.ai/role: decode + ``` + +4.
The EPP (started immediately after) loads the file via + `dataLayer.discovery.pluginRef: file-disc` (see + `benchmarks/llm-d/epp-config.yaml`). The plugin enumerates the + endpoints into the EPP datastore before the EPP starts serving + `ext_proc`, so Envoy never gets a request before discovery is ready. +5. The `disagg-profile-handler` in the EPP config uses `prefill-filter` + and `decode-filter` to pick the right backend per request phase, + matching on the `llm-d.ai/role` label. + +### Why one entry per *leader* (not per node) + +In the wide-EP guide each instance is a single vLLM engine that spans +multiple nodes via `--data-parallel-hybrid-lb`. With hybrid-lb, the +leader pod (`LWS_WORKER_INDEX=0`) accepts external traffic and +distributes it internally across the local DP ranks; in our LWS-free +SLURM mapping, the prefill-leader and decode-leader are the only nodes +addressable from outside. Adding an entry per worker would cause EPP to +route directly to a worker, bypassing the engine's internal load +balancing. + +If we later want to expose all pods of an instance (the alternative +hybrid-lb interpretation: external LB across nodes too), we can extend +the loop in `server.sh` to emit one entry per `IPS[i]` in the prefill +range and one per `IPS[i]` in the decode range, all carrying the same +role label. EPP then picks among them via its scorers and `max-score-picker`. + +### Live reload + +`watchFile: false` is set in `epp-config.yaml`. Endpoints are static for the +job lifetime - no reason to pay for `fsnotify` here. Set `watchFile: +true` (and rewrite `/tmp/endpoints.yaml` from the coordinator) only if +you want to drain or add an instance mid-run. + +### Validation rules (enforced by the plugin) + +- `address` must be a literal IPv4 address (no IPv6, no hostnames). +- `port` is a string in `1..65535`. +- File capped at 1 MiB. +- Names must be unique within their namespace (we use the default + namespace, so they must be globally unique in the file).
+ +The IPs we collect from `ip route get 1.1.1.1` are always IPv4 on the +H200 / B200 cluster's primary fabric; if you point at a different +interface and it returns an IPv6 address, EPP will reject the file at +startup. + +## Recipe files + +`benchmarks/multi_node/llm-d-recipes/<name>.yaml` is selected via +`CONFIG_FILE=<name>.yaml` in the master config's `additional-settings`. +Each recipe carries: + +- top-level `plugins:` / `schedulingProfiles:` / `dataLayer:` - fed into + the EPP via `--config-file`. Lets you change routing strategy without + rebuilding the image. +- `prefill:` / `decode:` blocks with `extra-args` (appended to the vLLM + launch command on each node of that role) and `env` (exported before + vLLM starts). +- `slurm.time_limit` - overrides `TIME_LIMIT` for that recipe. + +When `CONFIG_FILE` is unset or the file is missing, the EPP falls back to +`/etc/epp/config.yaml` (bind-mounted by `job.slurm` from `benchmarks/llm-d/epp-config.yaml`), +and vLLM runs with no extra flags beyond the wide-EP common set in `server.sh`. diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm new file mode 100644 index 000000000..46a026ced --- /dev/null +++ b/benchmarks/multi_node/llm-d/job.slurm @@ -0,0 +1,147 @@ +#!/bin/bash +#SBATCH --job-name=llm-d-bench +#SBATCH --ntasks-per-node=1 +# --output, --error, -N, -n, --time set by submit.sh +# +# Allocates PREFILL_NODES + DECODE_NODES nodes, derives per-node IPs, then +# srun-runs server.sh inside one Docker container per node. NODE_RANK +# (= SLURM_PROCID) drives role selection inside server.sh. + +set -euo pipefail + +echo "=== llm-d job start ===" +echo "UTC: $(TZ=UTC date '+%Y-%m-%d %H:%M:%S %Z')" + +# Repo root (benchmarks/multi_node/llm-d/job.slurm -> ../../..) +DI_REPO_DIR=$(cd "$(dirname "$0")/../../.."
&& pwd) +export DI_REPO_DIR + +ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +TOTAL_NODES=$(echo "$ALL_NODES" | wc -l) +echo "Allocated nodes ($TOTAL_NODES): $(echo "$ALL_NODES" | tr '\n' ' ')" + +if [[ "$TOTAL_NODES" -ne "$NUM_NODES" ]]; then + echo "Error: SLURM allocated $TOTAL_NODES nodes, expected $NUM_NODES" >&2 + exit 1 +fi + +# Per-node IPs in rank order. Take the token after "src" (its column shifts when the route has no "via" hop). +IPS=() +for NODE in $ALL_NODES; do + IP=$(srun --nodes=1 --ntasks=1 --nodelist="$NODE" \ + bash -c 'ip route get 1.1.1.1 | awk "{for (i = 1; i < NF; i++) if (\$i == \"src\") {print \$(i + 1); exit}}"') + IPS+=("${IP:?could not resolve a source IP for node $NODE}") +done +echo "Node IPs: ${IPS[*]}" + +# Rank slicing: +# prefill leader = rank 0 +# prefill workers = ranks 1 .. PREFILL_NODES-1 +# decode leader = rank PREFILL_NODES (also coordinator: EPP + Envoy + bench) +# decode workers = ranks PREFILL_NODES+1 .. NUM_NODES-1 +PREFILL_LEADER_IP="${IPS[0]}" +DECODE_LEADER_IP="${IPS[$PREFILL_NODES]}" + +# DP leader addresses for vLLM --data-parallel-address (rank 0 of each instance). +PREFILL_DP_ADDR="$PREFILL_LEADER_IP" +DECODE_DP_ADDR="$DECODE_LEADER_IP" + +ALL_IP_LIST=$(IFS=,; echo "${IPS[*]}") + +SANITIZED_USER=$(printf '%s' "${USER:-runner}" | tr -c 'a-zA-Z0-9_.-' '_') +DOCKER_CONT_NAME="llmd_bench_${SANITIZED_USER}_${SLURM_JOB_ID}" +export DOCKER_CONT_NAME +export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" +DOCKER_MOUNT_PATH="/workspace" + +cleanup() { + echo "[${SLURM_JOB_ID}] cleanup on $(hostname)" + [[ -n "${WATCHER_PID:-}" ]] && kill "$WATCHER_PID" 2>/dev/null || true +} +trap cleanup INT TERM HUP EXIT + +# Coordinator-done watcher. server.sh on the decode coordinator writes +# this marker after the bench finishes; we then scancel the allocation +# from outside the container (the image has no SLURM client tools). +# Without this, workers `wait` on local vLLM forever and the job runs +# to TIME_LIMIT. +BENCH_DONE_MARKER="$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID" +rm -f "$BENCH_DONE_MARKER" +( + while [[ !
-f "$BENCH_DONE_MARKER" ]]; do sleep 5; done + echo "[${SLURM_JOB_ID}] coordinator finished; scancel'ing job" + scancel "$SLURM_JOB_ID" 2>/dev/null || true +) & +WATCHER_PID=$! + +# One docker run per node, one task per node. server.sh dispatches by NODE_RANK. +srun \ + --kill-on-bad-exit=1 \ + --signal=TERM@30 \ + --unbuffered \ + bash -lc " +set -euo pipefail +echo \"Rank \$SLURM_PROCID on \$(hostname)\" + +sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f || true + +exec sudo docker run --rm \ + --init \ + --stop-timeout 10 \ + --network host \ + --ipc host \ + --gpus all \ + --ulimit memlock=-1 --ulimit stack=67108864 \ + --shm-size 32G \ + --cap-add SYS_PTRACE --cap-add IPC_LOCK --cap-add SYS_RAWIO \ + --device /dev/infiniband \ + --security-opt seccomp=unconfined \ + --privileged \ + -v ${MODEL_DIR}:/models:ro \ + -v ${BENCHMARK_LOGS_DIR}:/benchmark_logs \ + -v ${DI_REPO_DIR}:${DOCKER_MOUNT_PATH} \ + -v ${DI_REPO_DIR}/benchmarks/multi_node/llm-d-recipes:/etc/llmd-recipes:ro \ + -v ${DI_REPO_DIR}/benchmarks/llm-d/epp-config.yaml:/etc/epp/config.yaml:ro \ + -v ${DI_REPO_DIR}/benchmarks/llm-d/envoy.yaml:/etc/envoy/envoy.yaml:ro \ + -e SLURM_JOB_ID=\$SLURM_JOB_ID \ + -e NODE_RANK=\$SLURM_PROCID \ + -e NUM_NODES=$NUM_NODES \ + -e PREFILL_NODES=$PREFILL_NODES \ + -e DECODE_NODES=$DECODE_NODES \ + -e ALL_IPS=$ALL_IP_LIST \ + -e PREFILL_LEADER_IP=$PREFILL_LEADER_IP \ + -e DECODE_LEADER_IP=$DECODE_LEADER_IP \ + -e PREFILL_DP_ADDR=$PREFILL_DP_ADDR \ + -e DECODE_DP_ADDR=$DECODE_DP_ADDR \ + -e MODEL_DIR=/models \ + -e MODEL_NAME=$MODEL_NAME \ + -e GPUS_PER_NODE=$GPUS_PER_NODE \ + -e PREFILL_DP_SIZE=$PREFILL_DP_SIZE \ + -e DECODE_DP_SIZE=$DECODE_DP_SIZE \ + -e BENCH_INPUT_LEN=$BENCH_INPUT_LEN \ + -e BENCH_OUTPUT_LEN=$BENCH_OUTPUT_LEN \ + -e BENCH_MAX_CONCURRENCY=$BENCH_MAX_CONCURRENCY \ + -e BENCH_REQUEST_RATE=$BENCH_REQUEST_RATE \ + -e BENCH_RANDOM_RANGE_RATIO=$BENCH_RANDOM_RANGE_RATIO \ + -e 
BENCH_NUM_PROMPTS_MULTIPLIER=$BENCH_NUM_PROMPTS_MULTIPLIER \ + -e BENCHMARK_LOGS_DIR=/benchmark_logs \ + -e RUN_EVAL=$RUN_EVAL \ + -e EVAL_ONLY=$EVAL_ONLY \ + -e EVAL_CONC=$EVAL_CONC \ + -e FRAMEWORK=$FRAMEWORK \ + -e PRECISION=$PRECISION \ + -e MODEL_PREFIX=$MODEL_PREFIX \ + -e RUNNER_TYPE=$RUNNER_TYPE \ + -e RESULT_FILENAME=$RESULT_FILENAME \ + -e SPEC_DECODING=$SPEC_DECODING \ + -e IS_MULTINODE=$IS_MULTINODE \ + -e CONFIG_FILE=$CONFIG_FILE \ + --name \"${DOCKER_CONT_NAME}_\$SLURM_PROCID\" \ + \"\$DOCKER_IMAGE_NAME\" bash -lc ' + set -o pipefail + ${DOCKER_MOUNT_PATH}/benchmarks/multi_node/llm-d/server.sh \ + 2>&1 | tee /benchmark_logs/slurm_job-'\"\$SLURM_JOB_ID\"'_rank_'\"\$SLURM_PROCID\"'.log + ' +" + +srun bash -c "sudo docker ps -aq --filter name=\"^${DOCKER_CONT_NAME}_\" | xargs -r sudo docker rm -f" || true diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh new file mode 100755 index 000000000..b5c264be6 --- /dev/null +++ b/benchmarks/multi_node/llm-d/server.sh @@ -0,0 +1,322 @@ +#!/usr/bin/env bash +# +# Per-node entrypoint for the llm-d-vllm wide-EP P/D disagg benchmark. +# NODE_RANK is set by srun (= $SLURM_PROCID) in job.slurm. +# +# Roles: +# Rank 0 -> prefill leader (DP rank 0) +# Ranks 1 .. PREFILL_NODES-1 -> prefill workers +# Rank PREFILL_NODES -> decode leader (DP rank 0) + pd-sidecar +# + EPP + Envoy + benchmark client +# (the coordinator, like AMD's decode-0) +# Ranks PREFILL_NODES+1 .. -> decode workers +# +# Each "instance" (prefill or decode) is a single vLLM engine spanning +# PREFILL_NODES (or DECODE_NODES) nodes via --data-parallel-hybrid-lb. The +# leader pod accepts external traffic; workers handle their local DP ranks. 
+ +set -euo pipefail + +source /workspace/benchmarks/benchmark_lib.sh + +NODE_RANK="${NODE_RANK:-${SLURM_PROCID:-0}}" +PREFILL_NODES="${PREFILL_NODES:-1}" +DECODE_NODES="${DECODE_NODES:-1}" +GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +VLLM_PORT=8200 +SIDECAR_PORT=8000 +ENVOY_PORT=8080 +EPP_GRPC_PORT=9002 +EPP_HEALTH_PORT=9003 +EPP_METRICS_PORT=9090 + +# Filesystem path to the weights inside the container. job.slurm mounts +# the host model directory at /models and sets MODEL_DIR=/models, so the +# weights live directly under MODEL_DIR. MODEL_NAME is the OpenAI-API +# served name passed via --served-model-name; it is not part of the +# filesystem path. +MODEL="${MODEL_DIR}" +HOST_IP=$(ip route get 1.1.1.1 | awk '{for (i = 1; i < NF; i++) if ($i == "src") {print $(i + 1); exit}}') +# Default NIC for NCCL / Gloo / NVSHMEM bootstrap. Pulled from the same +# default route HOST_IP came from so the iface and the IP stay +# consistent across clusters where the routed NIC is not eth0. +DEFAULT_IFACE=$(ip -o -4 route show to default | awk '{for (i = 1; i < NF; i++) if ($i == "dev") {print $(i + 1); exit}}') +DEFAULT_IFACE="${DEFAULT_IFACE:-eth0}" + +VLLM_LOG="/benchmark_logs/vllm_rank${NODE_RANK}.log" +SIDECAR_LOG="/benchmark_logs/sidecar_rank${NODE_RANK}.log" +EPP_LOG="/benchmark_logs/epp.log" +ENVOY_LOG="/benchmark_logs/envoy.log" + +echo "=== rank=$NODE_RANK host=$HOST_IP model=$MODEL ===" + +# ---------------------------------------------------------------- +# Role assignment +# ---------------------------------------------------------------- +if [[ "$NODE_RANK" -lt "$PREFILL_NODES" ]]; then + ROLE="prefill" + DP_SIZE="$PREFILL_DP_SIZE" + DP_ADDR="$PREFILL_DP_ADDR" + LWS_WORKER_INDEX="$NODE_RANK" + LWS_GROUP_SIZE="$PREFILL_NODES" +elif [[ "$NODE_RANK" -lt $((PREFILL_NODES + DECODE_NODES)) ]]; then + ROLE="decode" + DP_SIZE="$DECODE_DP_SIZE" + DP_ADDR="$DECODE_DP_ADDR" + LWS_WORKER_INDEX=$((NODE_RANK - PREFILL_NODES)) + LWS_GROUP_SIZE="$DECODE_NODES" +else + echo "ERROR: NODE_RANK=$NODE_RANK out of range" >&2 + exit 1 +fi + +DP_SIZE_LOCAL="$GPUS_PER_NODE"
+START_RANK=$((LWS_WORKER_INDEX * DP_SIZE_LOCAL)) +TP_SIZE=1 + +echo "ROLE=$ROLE DP_SIZE=$DP_SIZE DP_ADDR=$DP_ADDR LWS_WORKER_INDEX=$LWS_WORKER_INDEX START_RANK=$START_RANK" + +# ---------------------------------------------------------------- +# Read role-specific extra-args and env from the recipe file. +# ---------------------------------------------------------------- +ROLE_EXTRA_ARGS="" +if [[ -n "${CONFIG_FILE:-}" ]]; then + RECIPE_PATH="/etc/llmd-recipes/${CONFIG_FILE}" + if [[ -f "$RECIPE_PATH" ]]; then + echo "Loading $ROLE recipe from $RECIPE_PATH" + eval "$(python3 - <&2 + fi +fi + +# ---------------------------------------------------------------- +# Multi-node DP / NIXL P/D env: needed in any topology. +# ---------------------------------------------------------------- +export GLOO_SOCKET_IFNAME=${GLOO_SOCKET_IFNAME:-$DEFAULT_IFACE} +export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-$DEFAULT_IFACE} +export VLLM_SKIP_P2P_CHECK=1 +export VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1 +export VLLM_USE_DEEP_GEMM=1 +export VLLM_NIXL_SIDE_CHANNEL_HOST="$HOST_IP" +export VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} + +# ---------------------------------------------------------------- +# Wide-EP NVSHMEM / ibgda env (from the llm-d wide-EP-lws guide +# manifests). Gated on LWS_GROUP_SIZE > 1 - the simple 1P+1D recipe +# explicitly avoids DeepEP, NVSHMEM ibgda, and full-mesh RDMA, so +# leaving these set on a single-node-per-role topology is misleading +# and could trigger ibgda code paths it does not need. 
+# ---------------------------------------------------------------- +if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then + export NVIDIA_GDRCOPY=enabled + export NVSHMEM_REMOTE_TRANSPORT=ibgda + export NVSHMEM_IB_ENABLE_IBGDA=true + export NVSHMEM_SYMMETRIC_SIZE=16G + export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME=${NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME:-$DEFAULT_IFACE} +fi + +# ---------------------------------------------------------------- +# Start vLLM (every node, prefill or decode) +# +# Flags split into: +# * COMMON_ARGS - always passed. +# * MULTINODE_DP_ARGS - only when an instance spans more than one node +# (LWS_GROUP_SIZE > 1, i.e. wide-EP topology). vLLM's +# --data-parallel-hybrid-lb and the cross-process DP coordination +# flags are wrong for the single-node-per-instance case where DP is +# contained inside one engine process. +# ---------------------------------------------------------------- +KV_TRANSFER_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both","kv_load_failure_policy":"fail"}' + +COMMON_ARGS=( + --port "$VLLM_PORT" + --served-model-name "$MODEL_NAME" + --trust-remote-code + --api-server-count 1 + --disable-access-log-for-endpoints=/health,/metrics + --enable-expert-parallel + --tensor-parallel-size "$TP_SIZE" + --data-parallel-size "$DP_SIZE" + --kv_transfer_config "$KV_TRANSFER_CONFIG" + --moe-backend deep_gemm +) + +if [[ "$LWS_GROUP_SIZE" -gt 1 ]]; then + COMMON_ARGS+=( + --data-parallel-hybrid-lb + --data-parallel-size-local "$DP_SIZE_LOCAL" + --data-parallel-address "$DP_ADDR" + --data-parallel-rpc-port 5555 + --data-parallel-start-rank "$START_RANK" + ) +fi + +echo "Starting vLLM ($ROLE) DP=$DP_SIZE local=$DP_SIZE_LOCAL start_rank=$START_RANK group_size=$LWS_GROUP_SIZE" +# shellcheck disable=SC2086 +vllm serve "$MODEL" "${COMMON_ARGS[@]}" $ROLE_EXTRA_ARGS \ + > "$VLLM_LOG" 2>&1 & +VLLM_PID=$! + +# Every rank waits for its own engine to bind /health before falling +# through. 
For wide-EP (LWS_GROUP_SIZE > 1) this prevents the bench +# from starting before the worker-side DP shards have come up; for the +# single-node case it is a no-op extra check. +wait_for_server_ready --port "$VLLM_PORT" --server-log "$VLLM_LOG" --server-pid "$VLLM_PID" +echo "vLLM ready on rank $NODE_RANK ($ROLE worker_index=$LWS_WORKER_INDEX)" + +# Only the leader of each instance accepts external requests on $VLLM_PORT. +if [[ "$LWS_WORKER_INDEX" -eq 0 ]]; then + # ------------------------------------------------------------ + # Start pd-sidecar on each leader (prefill leader and decode leader). + # The decode-side sidecar is what EPP routes to; the prefill-side + # sidecar is the target the decode sidecar pulls KVs from. + # ------------------------------------------------------------ + SIDECAR_CONNECTOR="nixlv2" + SIDECAR_FLAGS=(--port="$SIDECAR_PORT" --vllm-port="$VLLM_PORT" + --kv-connector="$SIDECAR_CONNECTOR" --secure-proxy=false) + if [[ "$ROLE" == "decode" ]]; then + SIDECAR_FLAGS+=(--enable-prefiller-sampling) + fi + echo "Starting pd-sidecar ($ROLE leader): ${SIDECAR_FLAGS[*]}" + pd-sidecar "${SIDECAR_FLAGS[@]}" > "$SIDECAR_LOG" 2>&1 & + SIDECAR_PID=$! + wait_for_server_ready --port "$SIDECAR_PORT" --server-log "$SIDECAR_LOG" --server-pid "$SIDECAR_PID" + echo "pd-sidecar ready on $HOST_IP:$SIDECAR_PORT" +fi + +# ---------------------------------------------------------------- +# Coordinator: decode leader runs EPP + Envoy + benchmark client. +# ---------------------------------------------------------------- +if [[ "$ROLE" == "decode" && "$LWS_WORKER_INDEX" -eq 0 ]]; then + + # Write endpoints.yaml. See benchmarks/multi_node/llm-d/README.md for + # the discovery contract. + # NOTE: endpoint 'namespace' must match EPP's --pool-namespace below + # (file-discovery filters endpoints by namespace; the schema default + # 'default' would otherwise drop every entry). + python3 - < "$EPP_LOG" 2>&1 & + EPP_PID=$! 
+
+    # Wait for EPP to bind its gRPC port before starting Envoy. Envoy's
+    # ext_proc filter dials 127.0.0.1:$EPP_GRPC_PORT - if Envoy comes up
+    # first the early bench requests hit ext_proc connection errors.
+    # gRPC has no plain HTTP /health, so probe the TCP listener directly.
+    # The loop aborts as soon as the EPP process is gone, or after 60 s.
+    echo "Waiting for EPP on 127.0.0.1:$EPP_GRPC_PORT"
+    EPP_WAIT_DEADLINE=$(( $(date +%s) + 60 ))
+    until (echo > "/dev/tcp/127.0.0.1/$EPP_GRPC_PORT") 2>/dev/null; do
+        if ! kill -0 "$EPP_PID" 2>/dev/null; then
+            echo "ERROR: EPP died before binding $EPP_GRPC_PORT" >&2
+            exit 1
+        fi
+        if [[ "$(date +%s)" -ge "$EPP_WAIT_DEADLINE" ]]; then
+            echo "ERROR: EPP did not bind $EPP_GRPC_PORT within 60s" >&2
+            exit 1
+        fi
+        sleep 1
+    done
+    echo "EPP listening on $EPP_GRPC_PORT"
+
+    # Envoy runs in the background; its output goes to $ENVOY_LOG.
+    envoy -c /etc/envoy/envoy.yaml > "$ENVOY_LOG" 2>&1 &
+    ENVOY_PID=$!
+
+    wait_for_server_ready --port "$ENVOY_PORT" --server-log "$ENVOY_LOG" --server-pid "$ENVOY_PID"
+
+    # Wait for the prefill leader's sidecar before starting the bench.
+    # wait_for_server_ready can only probe localhost; the prefill leader
+    # is on a different node, so poll directly with a deadline.
+    #
+    # Every probe is bounded with --connect-timeout / --max-time: the
+    # deadline is only evaluated between attempts, so an unbounded curl
+    # against an unreachable or hung peer could otherwise block well past
+    # the 5 minute budget.
+    echo "Waiting for prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT/health"
+    PREFILL_WAIT_DEADLINE=$(( $(date +%s) + 300 ))
+    until curl --output /dev/null --silent --fail \
+            --connect-timeout 5 --max-time 10 \
+            "http://$PREFILL_LEADER_IP:$SIDECAR_PORT/health"; do
+        if [[ "$(date +%s)" -ge "$PREFILL_WAIT_DEADLINE" ]]; then
+            echo "ERROR: prefill sidecar did not become ready within 5 min" >&2
+            exit 1
+        fi
+        sleep 5
+    done
+    echo "Prefill sidecar at $PREFILL_LEADER_IP:$SIDECAR_PORT is ready"
+
+    # Sweep concurrency. BENCH_MAX_CONCURRENCY arrives from submit.sh as
+    # an 'x'-delimited list (e.g. "2048x1024x512"); the runner / sweep
+    # configs expect one bench run per level. Same shape as
+    # benchmarks/multi_node/amd_utils/bench.sh.
+    IFS='x' read -r -a CONCURRENCIES <<< "$BENCH_MAX_CONCURRENCY"
+    for max_concurrency in "${CONCURRENCIES[@]}"; do
+        # Prompt count scales with the concurrency level, floored at 16.
+        num_prompts=$(( max_concurrency * BENCH_NUM_PROMPTS_MULTIPLIER ))
+        if (( num_prompts < 16 )); then
+            num_prompts=16
+        fi
+
+        # The client targets Envoy; EPP picks the decode endpoint, whose
+        # sidecar pulls the KV cache from prefill via NIXL.
+        bench_args=(
+            --model "$MODEL_NAME"
+            --port "$ENVOY_PORT"
+            --backend openai
+            --input-len "$BENCH_INPUT_LEN"
+            --output-len "$BENCH_OUTPUT_LEN"
+            --random-range-ratio "$BENCH_RANDOM_RANGE_RATIO"
+            --num-prompts "$num_prompts"
+            --max-concurrency "$max_concurrency"
+            --result-filename "${RESULT_FILENAME}_c${max_concurrency}"
+            --result-dir "$BENCHMARK_LOGS_DIR/"
+        )
+        run_benchmark_serving "${bench_args[@]}"
+    done
+
+    # Optional accuracy evaluation, run through the same Envoy port.
+    case "${RUN_EVAL:-false}" in
+        true)
+            run_eval --framework lm-eval --port "$ENVOY_PORT"
+            append_lm_eval_summary
+            ;;
+    esac
+
+    # Drop a marker file for job.slurm, which runs outside the container
+    # where the SLURM client tools live, so it can scancel the
+    # allocation. scancel is not bundled in the image, so invoking it
+    # here would only trip set -e. Without the marker the workers, which
+    # finish server.sh in `wait`, would hold the job until TIME_LIMIT.
+    touch "$BENCHMARK_LOGS_DIR/.bench_done.$SLURM_JOB_ID"
+else
+    # Everyone but the decode leader (prefill workers, decode workers,
+    # prefill leader) only has to keep vLLM running.
+    wait
+fi
diff --git a/benchmarks/multi_node/llm-d/submit.sh b/benchmarks/multi_node/llm-d/submit.sh
new file mode 100755
index 000000000..663885426
--- /dev/null
+++ b/benchmarks/multi_node/llm-d/submit.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+#
+# Submit a multi-node llm-d-vllm wide-EP P/D disagg benchmark job to SLURM.
+# Modeled after benchmarks/multi_node/amd_utils/submit.sh; prints JOB_ID on
+# stdout so the runner can poll for completion.
+#
+# Topology (matches the llm-d wide-EP guide reference):
+#   1 prefill instance with DP=PREFILL_NODES * GPUS_PER_NODE
+#   1 decode  instance with DP=DECODE_NODES  * GPUS_PER_NODE
+#   each instance spans PREFILL_NODES / DECODE_NODES nodes via vLLM
+#   --data-parallel-hybrid-lb. Total nodes = PREFILL_NODES + DECODE_NODES.
+#
+# Usage:
+#   submit.sh PREFILL_NODES DECODE_NODES ISL OSL CONCURRENCIES \
+#             [REQUEST_RATE=inf] [RANDOM_RANGE_RATIO=0.8]
+#
+# Required environment: SLURM_ACCOUNT, SLURM_PARTITION, TIME_LIMIT,
+# MODEL_PATH, MODEL_NAME, CONTAINER_IMAGE, RUNNER_NAME.
+
+set -euo pipefail
+
+# Repo root resolved from this script's location, so paths below are
+# independent of the caller's $PWD (the wrapper cd's into llm-d/ before
+# invoking this script).
+REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+
+# check_env NAME
+#   Exit with status 1 and a message on stderr when the variable called
+#   NAME is unset or empty.
+check_env() {
+    local name="$1"
+    if [[ -z "${!name:-}" ]]; then
+        echo "Error: ${name} not set" >&2
+        exit 1
+    fi
+}
+
+check_env SLURM_ACCOUNT
+check_env SLURM_PARTITION
+check_env TIME_LIMIT
+check_env MODEL_PATH
+check_env MODEL_NAME
+check_env CONTAINER_IMAGE
+check_env RUNNER_NAME
+
+# Fail with a usage message instead of set -u's bare "unbound variable"
+# error when a required positional argument is missing.
+if [[ $# -lt 5 ]]; then
+    echo "Usage: $0 PREFILL_NODES DECODE_NODES ISL OSL CONCURRENCIES [REQUEST_RATE] [RANDOM_RANGE_RATIO]" >&2
+    exit 1
+fi
+
+PREFILL_NODES=$1
+DECODE_NODES=$2
+ISL=$3
+OSL=$4
+CONCURRENCIES=$5
+REQUEST_RATE=${6:-inf}
+RANDOM_RANGE_RATIO=${7:-0.8}
+
+NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
+GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+
+# Everything below is exported so that it reaches the sbatch'ed job.
+export DOCKER_IMAGE_NAME=$CONTAINER_IMAGE
+export MODEL_DIR=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export NUM_NODES=$NUM_NODES
+export PREFILL_NODES=$PREFILL_NODES
+export DECODE_NODES=$DECODE_NODES
+export GPUS_PER_NODE=$GPUS_PER_NODE
+export PREFILL_DP_SIZE=$((PREFILL_NODES * GPUS_PER_NODE))
+export DECODE_DP_SIZE=$((DECODE_NODES * GPUS_PER_NODE))
+export BENCH_INPUT_LEN=$ISL
+export BENCH_OUTPUT_LEN=$OSL
+export BENCH_MAX_CONCURRENCY=$CONCURRENCIES
+export BENCH_REQUEST_RATE=$REQUEST_RATE
+export BENCH_RANDOM_RANGE_RATIO=$RANDOM_RANGE_RATIO
+export BENCH_NUM_PROMPTS_MULTIPLIER=10
+
+# Optional settings: pass through the caller's value or a default.
+export RUN_EVAL="${RUN_EVAL:-false}"
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+export EVAL_CONC="${EVAL_CONC:-}"
+export FRAMEWORK="${FRAMEWORK:-llm-d-vllm}"
+export PRECISION="${PRECISION:-}"
+export MODEL_PREFIX="${MODEL_PREFIX:-}"
+export RUNNER_TYPE="${RUNNER_TYPE:-}"
+export RESULT_FILENAME="${RESULT_FILENAME:-}"
+export SPEC_DECODING="${SPEC_DECODING:-none}"
+export IS_MULTINODE="${IS_MULTINODE:-true}"
+export CONFIG_FILE="${CONFIG_FILE:-}"
+
+# Recipe may override SLURM time limit (longer topologies need more wall time).
+# The lookup is best-effort: a missing recipe, missing key or parse
+# error leaves TIME_LIMIT untouched.
+#   * The path is handed to Python as argv[1] instead of being spliced
+#     into the source, so quotes in the path cannot break or inject code.
+#   * BaseLoader keeps every scalar a string. With safe_load an unquoted
+#     value such as 12:00:00 is resolved as a base-60 integer and would
+#     reach sbatch as a plain number of minutes.
+#   * An explicit null is treated like an absent key.
+if [[ -n "$CONFIG_FILE" ]]; then
+    RECIPE_PATH="${REPO_ROOT}/benchmarks/multi_node/llm-d-recipes/${CONFIG_FILE}"
+    if [[ -f "$RECIPE_PATH" ]]; then
+        RECIPE_TIME=$(python3 -c '
+import sys, yaml
+with open(sys.argv[1]) as fh:
+    recipe = yaml.load(fh, Loader=yaml.BaseLoader) or {}
+limit = (recipe.get("slurm") or {}).get("time_limit") or ""
+print("" if limit in ("null", "Null", "NULL", "~") else limit)
+' "$RECIPE_PATH" 2>/dev/null || true)
+        if [[ -n "$RECIPE_TIME" ]]; then
+            TIME_LIMIT="$RECIPE_TIME"
+        fi
+    fi
+fi
+
+export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
+mkdir -p "$BENCHMARK_LOGS_DIR"
+
+# One task per node across NUM_NODES exclusive nodes. The "|| JOB_ID="
+# guard keeps set -e from ending the script before the error message
+# below can be printed when sbatch itself fails.
+JOB_ID=$(sbatch \
+    --parsable \
+    --exclusive \
+    -N "$NUM_NODES" \
+    -n "$NUM_NODES" \
+    --ntasks-per-node=1 \
+    --gres=gpu:"$GPUS_PER_NODE" \
+    --time "$TIME_LIMIT" \
+    --partition "$SLURM_PARTITION" \
+    --account "$SLURM_ACCOUNT" \
+    --job-name "$RUNNER_NAME" \
+    --output "${BENCHMARK_LOGS_DIR}/slurm_job-%j.out" \
+    --error "${BENCHMARK_LOGS_DIR}/slurm_job-%j.err" \
+    "$(dirname "$0")/job.slurm") || JOB_ID=""
+
+# --parsable prints "jobid[;cluster]"; callers use the value as a bare
+# job id, so drop the optional cluster suffix.
+JOB_ID=${JOB_ID%%;*}
+
+if [[ -z "$JOB_ID" ]]; then
+    echo "Error: sbatch failed" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 572056956..1a948b41e 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -1,13 +1,96 @@
 #!/usr/bin/bash
 
-# System-specific configuration for H200 DGXC Slurm cluster
-SLURM_PARTITION="main"
-SLURM_ACCOUNT="sa-shared"
+# System-specific configuration for H200 DGXC Slurm cluster.
+# Exported so child processes (e.g. submit.sh invoked via nested bash)
+# inherit them.
+export SLURM_PARTITION="main"
+export SLURM_ACCOUNT="sa-shared"
 
 set -x
 
 if [[ "$IS_MULTINODE" == "true" ]]; then
 
+    # ------------------------------------------------------------------
+    # llm-d-vllm: InferenceX-owned multi-node path (no srt-slurm).
+    # Mirrors the AMD sglang-disagg dispatch shape: wrapper script ->
+    # benchmarks/multi_node/llm-d/submit.sh -> sbatch -> JOB_ID.
+    # ------------------------------------------------------------------
+    if [[ "$FRAMEWORK" == "llm-d-vllm" ]]; then
+        # Map the (model prefix, precision) pair to the pre-downloaded
+        # model; only dsr1/fp8 is wired up for this framework.
+        if [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
+            export MODEL_PATH="/models/DeepSeek-R1-0528"
+            export MODEL_NAME="DeepSeek-R1-0528"
+        else
+            echo "Unsupported MODEL_PREFIX/PRECISION for llm-d-vllm on H200: $MODEL_PREFIX/$PRECISION" >&2
+            exit 1
+        fi
+
+        # Logs go to BENCHMARK_LOGS_DIR (NFS-accessible); mirrors AMD path.
+        export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$GITHUB_WORKSPACE/benchmark_logs}"
+        mkdir -p "$BENCHMARK_LOGS_DIR"
+
+        # Wrapper name: the part of EXP_NAME before its first '_', then
+        # precision and the fixed h200/llm-d-vllm suffix.
+        SCRIPT_NAME="${EXP_NAME%%_*}_${PRECISION}_h200_llm-d-vllm.sh"
+        BENCH_SCRIPT="benchmarks/multi_node/${SCRIPT_NAME}"
+        if [[ ! -f "$BENCH_SCRIPT" ]]; then
+            echo "Error: llm-d wrapper not found: $BENCH_SCRIPT" >&2
+            exit 1
+        fi
+
+        # The wrapper's stdout is taken as the SLURM job id.
+        JOB_ID=$(bash "$BENCH_SCRIPT")
+        if [[ -z "$JOB_ID" ]]; then
+            echo "Error: failed to submit llm-d job" >&2
+            exit 1
+        fi
+        echo "Submitted llm-d job: $JOB_ID"
+
+        LOG_FILE="${BENCHMARK_LOGS_DIR}/slurm_job-${JOB_ID}.out"
+
+        # Wait for log file (also catch early failures).
+        until [[ -e "$LOG_FILE" ]]; do
+            if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+                # The job may have started and left the queue between
+                # the existence test above and this squeue call, so look
+                # for the log once more before declaring failure.
+                if [[ -e "$LOG_FILE" ]]; then
+                    break
+                fi
+                echo "ERROR: job $JOB_ID failed before creating log file" >&2
+                scontrol show job "$JOB_ID" || true
+                exit 1
+            fi
+            sleep 5
+        done
+
+        # Background poll, foreground tail. The subshell exits once the
+        # job has left the queue; tail's --pid makes it stop following
+        # when that subshell is gone.
+        (
+            while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+                sleep 10
+            done
+        ) &
+        POLL_PID=$!
+
+        tail -F -s 2 -n+1 "$LOG_FILE" --pid="$POLL_PID" 2>/dev/null
+        wait "$POLL_PID"
+
+        # Result collection: same shape as AMD path.
+        # Copy every result JSON whose name starts with RESULT_FILENAME
+        # into the workspace root. find emits NUL-delimited paths and
+        # read consumes them one by one, so paths containing whitespace
+        # or glob characters are not split or expanded.
+        copied_results=0
+        while IFS= read -r -d '' result_file; do
+            file_name=$(basename "$result_file")
+            cp "$result_file" "$GITHUB_WORKSPACE/${file_name}"
+            echo "Copied result: $file_name"
+            copied_results=$((copied_results + 1))
+        done < <(find "${BENCHMARK_LOGS_DIR}" -name "${RESULT_FILENAME}*.json" -print0 2>/dev/null)
+        # Same treatment as the eval branch below: warn, do not fail.
+        if [[ "$copied_results" -eq 0 ]]; then
+            echo "WARNING: no result files matching ${RESULT_FILENAME}*.json found under $BENCHMARK_LOGS_DIR"
+        fi
+
+        # Eval artifacts: copy the regular files from the first
+        # eval_results directory found under the logs dir.
+        if [[ "${RUN_EVAL:-false}" == "true" ]]; then
+            EVAL_DIR=$(find "$BENCHMARK_LOGS_DIR" -type d -name eval_results 2>/dev/null | head -1)
+            if [[ -n "$EVAL_DIR" && -d "$EVAL_DIR" ]]; then
+                # nullglob: an empty directory yields zero iterations
+                # rather than the literal pattern.
+                shopt -s nullglob
+                for eval_file in "$EVAL_DIR"/*; do
+                    [ -f "$eval_file" ] || continue
+                    cp "$eval_file" "$GITHUB_WORKSPACE/"
+                    echo "Copied eval artifact: $(basename "$eval_file")"
+                done
+                shopt -u nullglob
+            else
+                echo "WARNING: RUN_EVAL=true but no eval_results found under $BENCHMARK_LOGS_DIR"
+            fi
+        fi
+
+        # Best-effort cleanup; errors from scancel are ignored.
+        scancel "$JOB_ID" 2>/dev/null || true
+        exit 0
+    fi
+
     # MODEL_PATH: Override with pre-downloaded paths on H200 runner
     # The yaml files specify HuggingFace model IDs for portability, but we use
     # local paths to avoid repeated downloading on the shared H200 cluster.
@@ -29,7 +112,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
             exit 1
         fi
     else
-        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang"
+        echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, llm-d-vllm"
         exit 1
     fi