From f3b38b20370cffb4495b7ecc34f0d2be0082865b Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Wed, 3 Jun 2026 14:46:05 +0530 Subject: [PATCH 1/2] refactor: drop client-side DP-rank pinning for external-LB With external-LB data parallelism each DP rank is its own API server on its own port (the URL is the rank selector), so the client no longer needs the hybrid-LB `X-data-parallel-rank` header to pin a rollout to an internal DP shard. Remove the `dp_rank_count` client field + its auto-setup and the per-rank client expansion: one client per base URL, no rank header. The router (vllm-router or llm-d EPP) balances across the per-rank endpoints. This also fixes llm-d routing: the EPP forwards the header to the dp=1 backend, which rejected it ("data_parallel_rank N is out of range"). Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 1 + .../src/prime_rl/configs/rl.py | 5 +- .../src/prime_rl/configs/shared.py | 3 -- src/prime_rl/utils/client.py | 47 +++++++++---------- src/prime_rl/utils/elastic.py | 1 - tests/unit/utils/test_client.py | 14 ++++-- 6 files changed, 33 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9da33aa490..ad3dd08aa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Documenting **breaking** configuration changes — renamed, removed, or moved fields that require users to update existing configs. +- **`client.dp_rank_count` removed**: With external-LB data parallelism each DP rank is its own endpoint (one `client.base_url` per rank), so the client no longer pins a rollout to an internal DP shard via the `X-data-parallel-rank` header. The `[orchestrator.student.client] dp_rank_count` field (and its per-rank client expansion) is gone — the router load-balances across the per-rank endpoints. Existing configs setting `dp_rank_count` must drop it (`extra="forbid"` rejects it); there is nothing to migrate. (2026-06-03) - **`inference.kv_cache_offload.cpu_bytes` removed → discriminated `type` config**: The flat `[inference.kv_cache_offload]` block with a single `cpu_bytes` field is replaced by a backend-discriminated union with composable `cpu`/`disk` tiers. Migrate native CPU offload from `[inference.kv_cache_offload]\ncpu_bytes = N` to `[inference.kv_cache_offload]\ntype = "native"` plus `[inference.kv_cache_offload.cpu]\nnum_bytes = N`. A `type = "mooncake"` backend (per-node distributed store; multi-node/SLURM only) and an optional `[inference.kv_cache_offload.disk]\npath = "..."` tier (layered behind cpu) are also available. `extra="forbid"` rejects the old `cpu_bytes` key, so existing configs must migrate. (2026-06-02) - **Orchestrator async-pipeline rewrite** (collection of removals/renames). The orchestrator was rewritten to overlap train/eval rollouts on a shared concurrency limiter; several config fields were removed or renamed. - **`orchestrator.seed` removed**: was only consumed by the deleted buffer; no replacement. diff --git a/packages/prime-rl-configs/src/prime_rl/configs/rl.py b/packages/prime-rl-configs/src/prime_rl/configs/rl.py index 1af3436d1b..e95fd5207b 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/rl.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/rl.py @@ -600,16 +600,13 @@ def auto_setup_disaggregated_inference(self): def auto_setup_inference_client(self): """Auto-configure orchestrator student client from the inference server config. - For all modes, sets dp_rank_count from inference DP size. For SFT mode, - also sets base_url - rl/opd rely on the ClientConfig default + For SFT mode, sets base_url - rl/opd rely on the ClientConfig default (``["http://localhost:8000/v1"]``) which already matches the auto-launched student vLLM at inference.server.port = 8000. """ if self.inference is None: return self client = self.orchestrator.student.client - if "dp_rank_count" not in client.model_fields_set: - client.dp_rank_count = self.inference.data_parallel_size_local or self.inference.parallel.dp if self.orchestrator.training_mode == "sft" and "base_url" not in client.model_fields_set: host = self.inference.server.host or "localhost" port = self.inference.server.port diff --git a/packages/prime-rl-configs/src/prime_rl/configs/shared.py b/packages/prime-rl-configs/src/prime_rl/configs/shared.py index ff311f145d..b510a26f75 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/shared.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/shared.py @@ -127,9 +127,6 @@ class ClientConfig(BaseConfig): skip_model_check: bool = False """Skip checking that the model is available in the inference pool. Useful for external APIs or keys that do not expose ``/models``.""" - dp_rank_count: int = Field(1, ge=1) - """Number of data-parallel ranks behind each base URL. When > 1, each URL is expanded into ``dp_rank_count`` logical clients pinned via the ``X-data-parallel-rank`` header, so every request within a rollout hits the same DP engine and reuses KV cache. Auto-set from the inference config when using the RL entrypoint.""" - admin_base_url: list[str] | None = None """Separate base URLs for admin operations (weight updates, health checks). When set, admin clients bypass routers and hit each server directly — used in disaggregated P/D deployments where the router must not handle admin traffic.""" diff --git a/src/prime_rl/utils/client.py b/src/prime_rl/utils/client.py index b9ee8f4b9d..d5eb9b4a9f 100644 --- a/src/prime_rl/utils/client.py +++ b/src/prime_rl/utils/client.py @@ -17,15 +17,15 @@ from prime_rl.configs.shared import ClientConfig from prime_rl.utils.logger import get_logger -# Identity tuple used by ``select_train_client`` to key load counts. ``api_base_url`` -# distinguishes servers; ``X-data-parallel-rank`` distinguishes DP shards within a -# server, since the router uses that header to route to specific GPU ranks. -ClientIdentity = tuple[str, str | None] +# Identity used by ``select_train_client`` to key load counts. With external-LB +# data parallelism each DP rank is its own endpoint, so ``api_base_url`` alone +# uniquely identifies an inference target. +ClientIdentity = str def client_identity(client: vf.ClientConfig) -> ClientIdentity: """Stable identity for load balancing across inference clients.""" - return (client.api_base_url, client.extra_headers.get("X-data-parallel-rank")) + return client.api_base_url @runtime_checkable @@ -200,27 +200,24 @@ def setup_clients( k: v for k, v in ((k, os.getenv(v)) for k, v in client_config.headers_from_env.items()) if v is not None } for base_url in client_config.base_url: - for dp_rank in range(client_config.dp_rank_count): - headers = {**client_config.headers, **env_headers} - if client_config.dp_rank_count > 1: - headers["X-data-parallel-rank"] = str(dp_rank) - clients.append( - vf.ClientConfig( - client_idx=client_idx, - client_type=client_type, - api_base_url=base_url, - api_key_var=client_config.api_key_var, - timeout=client_config.timeout, - connect_timeout=client_config.connect_timeout, - max_connections=8192, - max_keepalive_connections=8192, - max_retries=10, - extra_headers=headers, - extra_headers_from_state=client_config.extra_headers_from_state, - **renderer_extra, - ) + headers = {**client_config.headers, **env_headers} + clients.append( + vf.ClientConfig( + client_idx=client_idx, + client_type=client_type, + api_base_url=base_url, + api_key_var=client_config.api_key_var, + timeout=client_config.timeout, + connect_timeout=client_config.connect_timeout, + max_connections=8192, + max_keepalive_connections=8192, + max_retries=10, + extra_headers=headers, + extra_headers_from_state=client_config.extra_headers_from_state, + **renderer_extra, ) - client_idx += 1 + ) + client_idx += 1 return clients diff --git a/src/prime_rl/utils/elastic.py b/src/prime_rl/utils/elastic.py index 951b3673c1..b221046078 100644 --- a/src/prime_rl/utils/elastic.py +++ b/src/prime_rl/utils/elastic.py @@ -197,7 +197,6 @@ def _rebuild_clients(self) -> None: api_key_var=self.client_config.api_key_var, headers=self.client_config.headers, headers_from_env=self.client_config.headers_from_env, - dp_rank_count=self.client_config.dp_rank_count, extra_headers_from_state=self.client_config.extra_headers_from_state, ) self._train_clients = ( diff --git a/tests/unit/utils/test_client.py b/tests/unit/utils/test_client.py index 69325e6c4a..8e7f79c3a1 100644 --- a/tests/unit/utils/test_client.py +++ b/tests/unit/utils/test_client.py @@ -49,14 +49,15 @@ def test_load_lora_adapter_succeeds_on_first_attempt(): ) -def test_setup_clients_assigns_renderer_and_dp_rank_headers(): +def test_setup_clients_creates_one_renderer_client_per_url(): from renderers import Qwen3VLRendererConfig + # External-LB: base_url is the list of per-rank endpoints (the URL is the rank + # selector), so each URL maps to exactly one client with no rank header. client_config = ClientConfig( - base_url=["http://worker-a:8000/v1"], + base_url=["http://worker-a:8000/v1", "http://worker-a:8001/v1"], api_key_var="PRIME_API_KEY", headers={"X-Test": "test"}, - dp_rank_count=2, extra_headers_from_state={"X-Session-ID": "session_id"}, ) @@ -70,8 +71,11 @@ def test_setup_clients_assigns_renderer_and_dp_rank_headers(): assert [client.client_type for client in clients] == ["renderer", "renderer"] assert [client.renderer_config for client in clients] == [renderer_settings, renderer_settings] assert [client.renderer_model_name for client in clients] == [None, None] - assert [client.api_base_url for client in clients] == ["http://worker-a:8000/v1"] * 2 - assert [client.extra_headers["X-data-parallel-rank"] for client in clients] == ["0", "1"] + assert [client.api_base_url for client in clients] == [ + "http://worker-a:8000/v1", + "http://worker-a:8001/v1", + ] + assert all("X-data-parallel-rank" not in client.extra_headers for client in clients) assert clients[0].extra_headers["X-Test"] == "test" assert clients[0].extra_headers_from_state == {"X-Session-ID": "session_id"} From 5d0f4eaf9b39347688e53e24e61354f21c77b275 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Wed, 3 Jun 2026 14:46:31 +0530 Subject: [PATCH 2/2] feat: llm-d (EPP+Envoy) router backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add llm-d as a second router backend alongside vllm-router, selected via `[inference.deployment.router] type = "llm-d"`. Built on the external-LB launch substrate — the per-rank vLLM engines launch identically; only the router control plane differs. - `LlmdRouterConfig`: discriminated-union member with `scorers` (base) + per-profile `prefill_scorer_overrides`/`decode_scorer_overrides`, `non_cached_tokens`, `decode_sidecar_port`, and a known-scorer validator. - `launch_router` helper gains an llm-d branch: renders per-replica EPP + Envoy + file-discovery endpoints and launches `epp` + `envoy` instead of vllm-router. Call sites stay router-agnostic. - pd-sidecar on each decode node (P/D) for remote-prefill orchestration + NIXL. - Entrypoints pass the `router` config object to the templates. - Reject `enable_return_routed_experts` with llm-d (breaks P/D, unverified for multi-node). - SLURM cleanup also clears stale `epp`/`envoy`/`pd-sidecar` processes. - Presets under `templates/llmd/` + `scripts/install_llmd.sh`; docs + install skill. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/inference.md | 21 ++++- .../src/prime_rl/configs/inference.py | 86 ++++++++++++++++++- .../src/prime_rl/configs/rl.py | 19 ++++ scripts/install_llmd.sh | 75 ++++++++++++++++ skills/install/SKILL.md | 10 +++ src/prime_rl/entrypoints/inference.py | 9 +- src/prime_rl/entrypoints/rl.py | 6 +- src/prime_rl/templates/_launch_router.sh.j2 | 57 +++++++++++- src/prime_rl/templates/inference.sbatch.j2 | 41 +++++++-- src/prime_rl/templates/llmd/endpoints.yaml.j2 | 41 +++++++++ src/prime_rl/templates/llmd/envoy.yaml.j2 | 69 +++++++++++++++ .../templates/llmd/epp_estimate.yaml.j2 | 34 ++++++++ src/prime_rl/templates/llmd/epp_pd.yaml.j2 | 56 ++++++++++++ .../templates/multi_node_rl.sbatch.j2 | 33 ++++++- 14 files changed, 526 insertions(+), 31 deletions(-) create mode 100755 scripts/install_llmd.sh create mode 100644 src/prime_rl/templates/llmd/endpoints.yaml.j2 create mode 100644 src/prime_rl/templates/llmd/envoy.yaml.j2 create mode 100644 src/prime_rl/templates/llmd/epp_estimate.yaml.j2 create mode 100644 src/prime_rl/templates/llmd/epp_pd.yaml.j2 diff --git a/docs/inference.md b/docs/inference.md index 0552c8f245..12edd50a29 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -30,7 +30,7 @@ We support 3 distinct deployment shapes: Most of the features are supported for all deployment shapes, with few exceptions. These exceptions are rejected on validation. -You can select the deployment shape with `InferenceDeploymentConfig` in your config file. This is a config-field that allows you to set the deployment shape, deployment-specific knobs such as `num_nodes`, `num_replicas`, `router_port`, `backend_port`, etc. +You can select the deployment shape with `InferenceDeploymentConfig` in your config file. This is a config-field that allows you to set the deployment shape, deployment-specific knobs such as `num_nodes`, `num_replicas`, `backend_port`, and a `[...deployment.router]` block, etc. ```toml [inference.deployment] @@ -102,7 +102,7 @@ tp = 2 dp = 4 ``` -This configuration will run 2 independent vLLM replicas, each with `tp=2` and `dp=4`. Routing will be handled by the `vllm-router` instance running on the same node as the 1st replica. We aim to support more advanced routing options, such as `llm-d` or `dynamo` in the future. You can read more about the supported routing options in the [router](#router) section. +This configuration will run 2 independent vLLM replicas, each with `tp=2` and `dp=4`. Routing is handled by a router instance running on the same node as the 1st replica — either `vllm-router` (default) or the upstream `llm-d` EPP+Envoy, selected via the `[...deployment.router]` block. You can read more about the supported routing options in the [router](#router) section. ### Wide-EP @@ -167,9 +167,22 @@ This will run 3 inference replicas, each running on 6 nodes. Each replica will r ## Router -We use our own fork of [vllm-router](https://github.com/PrimeIntellect-ai/router) as the request handler. We plan to support more advanced proxy options in the future. +Multi-node and disaggregated deployments front their vLLM backends with a router, configured via a discriminated `[...deployment.router]` block (`type = "vllm-router" | "llm-d"`): -Right now, router handles 2 most important things: +```toml +[inference.deployment.router] # or [deployment.router] for the standalone inference entrypoint +type = "llm-d" # "vllm-router" (default) or "llm-d" +# llm-d-only knobs (all optional): +scorers = { "prefix-cache-scorer" = 3.0, "active-request-scorer" = 2.0 } # base, applied to every profile +prefill_scorer_overrides = { "queue-scorer" = 2.0, "kv-cache-utilization-scorer" = 2.0 } # merged onto the P/D prefill profile +decode_scorer_overrides = {} # merged onto the P/D decode profile +non_cached_tokens = 16 # below this many non-cached prompt tokens, skip remote prefill (P/D) +``` + +- **`vllm-router`** (default) — our fork of [vllm-router](https://github.com/PrimeIntellect-ai/router). Knob: `policy`. +- **`llm-d`** — the upstream [llm-d](https://llm-d.ai) Endpoint Picker (EPP) + Envoy proxy. Routing combines **prefix-cache affinity** (grouped rollouts reuse a cached prefix and skip prefill) with the **`active-request-scorer`** — an in-flight load balancer that spreads requests across ranks immediately, unlike the metrics-scraped `queue-scorer` / `kv-cache-utilization-scorer` / `load-aware-scorer` (which lag and concentrate bursts of same-prefix requests). The scorer weights follow the upstream llm-d P/D guide; tune via `scorers` (base) + `prefill_scorer_overrides` / `decode_scorer_overrides` (per-profile, P/D). Does not support `enable_return_routed_experts` (router replay). + +Both backends support the 2 most important things: - Request routing - KV cache re-use and balanced routing - P/D disaggregation - handling the prefill and decode stages separately diff --git a/packages/prime-rl-configs/src/prime_rl/configs/inference.py b/packages/prime-rl-configs/src/prime_rl/configs/inference.py index 22e07c412e..e5d4b5eb00 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/inference.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/inference.py @@ -167,8 +167,26 @@ def to_connector_dict(self) -> dict[str, Any]: ] +# Known llm-d EPP scorer plugins (used to guard the ``scorers`` map against typos). +KNOWN_SCORERS = frozenset( + { + "prefix-cache-scorer", + "precise-prefix-cache-scorer", + "queue-scorer", + "kv-cache-utilization-scorer", + "active-request-scorer", + "load-aware-scorer", + "running-requests-size-scorer", + "token-load-scorer", + "latency-scorer", + "session-affinity-scorer", + "lora-affinity-scorer", + } +) + + class VllmRouterConfig(BaseConfig): - """PrimeIntellect vllm-router fronting the per-rank (external-LB) endpoints.""" + """PrimeIntellect vllm-router.""" type: Literal["vllm-router"] = "vllm-router" @@ -179,9 +197,57 @@ class VllmRouterConfig(BaseConfig): """Routing policy, e.g. ``consistent_hash`` or ``round_robin``.""" -# Discriminated on ``type`` so additional router backends can be added to the -# union (a single member needs no discriminator yet). -RouterConfig: TypeAlias = VllmRouterConfig +class LlmdRouterConfig(BaseConfig): + """llm-d router backend (EPP + Envoy).""" + + type: Literal["llm-d"] = "llm-d" + + port: int = 8000 + """Port the Envoy gateway listens on — becomes the client-facing router URL.""" + + scorers: dict[str, float] = { + "prefix-cache-scorer": 3.0, + "active-request-scorer": 2.0, + } + """EPP scorer name → weight, applied to every routing profile (before the per-profile P/D overrides). Defaults to prefix-cache affinity plus in-flight (active-request) load balancing. Unknown scorer names are rejected.""" + + prefill_scorer_overrides: dict[str, float] = { + "queue-scorer": 2.0, + "kv-cache-utilization-scorer": 2.0, + } + """P/D only: scorer → weight merged onto ``scorers`` for the prefill profile (a per-profile weight overrides the base).""" + + decode_scorer_overrides: dict[str, float] = {} + """P/D only: scorer → weight merged onto ``scorers`` for the decode profile (a per-profile weight overrides the base); empty by default.""" + + non_cached_tokens: int = 16 + """P/D only: requests with fewer than this many non-cached prompt tokens skip remote prefill and run decode-only.""" + + decode_sidecar_port: int = 8300 + """P/D only: port the decode-side llm-d sidecar listens on.""" + + @property + def prefill_scorers(self) -> dict[str, float]: + """Effective prefill-profile scorers: ``scorers`` merged with ``prefill_scorer_overrides``.""" + return {**self.scorers, **self.prefill_scorer_overrides} + + @property + def decode_scorers(self) -> dict[str, float]: + """Effective decode-profile scorers: ``scorers`` merged with ``decode_scorer_overrides``.""" + return {**self.scorers, **self.decode_scorer_overrides} + + @model_validator(mode="after") + def validate_scorers(self): + unknown = ( + set(self.scorers) | set(self.prefill_scorer_overrides) | set(self.decode_scorer_overrides) + ) - KNOWN_SCORERS + if unknown: + raise ValueError(f"Unknown llm-d scorer(s): {sorted(unknown)}. Known scorers: {sorted(KNOWN_SCORERS)}.") + return self + + +# Discriminated on ``type`` so the launch path can pick the router backend. +RouterConfig: TypeAlias = Annotated[VllmRouterConfig | LlmdRouterConfig, Field(discriminator="type")] class BaseInferenceDeploymentConfig(BaseConfig): @@ -366,6 +432,18 @@ def validate_multi_node_requires_slurm(self): raise ValueError("Must use SLURM for multi-node / disaggregated deployment.") return self + @model_validator(mode="after") + def validate_llmd_no_routed_experts(self): + """Reject routed-expert return with the llm-d router (breaks P/D, unverified for multi-node).""" + router = getattr(self.deployment, "router", None) + if router is not None and router.type == "llm-d" and self.enable_return_routed_experts: + raise ValueError( + "The llm-d router backend does not support routed-expert return " + "(enable_return_routed_experts): it breaks P/D and is unverified for multi-node. " + "Use router type 'vllm-router' for routed-expert runs." + ) + return self + @model_validator(mode="after") def auto_setup_kv_cache_offload(self): if self.kv_cache_offload is not None: diff --git a/packages/prime-rl-configs/src/prime_rl/configs/rl.py b/packages/prime-rl-configs/src/prime_rl/configs/rl.py index e95fd5207b..4349a00897 100644 --- a/packages/prime-rl-configs/src/prime_rl/configs/rl.py +++ b/packages/prime-rl-configs/src/prime_rl/configs/rl.py @@ -435,6 +435,25 @@ def auto_setup_router_replay(self): ) return self + @model_validator(mode="after") + def validate_llmd_no_routed_experts(self): + """Reject routed-expert return with the llm-d router (breaks P/D, unverified for multi-node). + + Runs after ``auto_setup_router_replay`` so it also catches the + ``trainer.enable_router_replay`` path, which sets the inference flag here + (after InferenceConfig's own validators, which therefore miss it). + """ + if self.inference is not None and self.inference.enable_return_routed_experts: + router = getattr(self.inference.deployment, "router", None) + if router is not None and router.type == "llm-d": + raise ValueError( + "The llm-d router backend does not support routed-expert return " + "(inference.enable_return_routed_experts / trainer.enable_router_replay): it " + "breaks P/D and is unverified for multi-node. Use router type 'vllm-router' " + "for router-replay runs." + ) + return self + @model_validator(mode="after") def validate_router_replay_without_kv_offload(self): if ( diff --git a/scripts/install_llmd.sh b/scripts/install_llmd.sh new file mode 100755 index 0000000000..3f13d018ed --- /dev/null +++ b/scripts/install_llmd.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Install the llm-d standalone (no-Kubernetes) routing binaries into +# third_party/llmd/bin: +# - epp : llm-d-router Endpoint Picker (the routing brain) +# - pd-sidecar : decode-side proxy for P/D disaggregation +# - envoy : the data-plane proxy that calls the EPP via ext_proc +# +# epp/pd-sidecar are built from a pinned llm-d-router commit. We currently build +# from a small fork that adds P/D disaggregation for vLLM's token-in +# /inference/v1/generate endpoint (prime-rl's renderer / TITO rollout path) — +# upstream's pd-sidecar only disaggregates the OpenAI endpoints, so token-in P/D +# silently runs decode-only. The fork is pending upstream PR +# llm-d/llm-d-router#1458; switch LLMD_ROUTER_REPO back to the upstream repo and +# bump LLMD_ROUTER_REF once it merges. The fork is branched off the upstream +# commit that added the EPP vllmhttp parser (PR #1248), which the renderer path +# also needs. +# +# System Go is not required: a Go toolchain is vendored under third_party/llmd/go +# and used to bootstrap; GOTOOLCHAIN=auto fetches the exact version the module +# pins. Envoy is pulled as a static binary from its release container image +# (docker is the only widely-available extraction path on bare-metal nodes). +set -euo pipefail + +LLMD_ROUTER_REPO="${LLMD_ROUTER_REPO:-https://github.com/S1ro1/llm-d-router.git}" +LLMD_ROUTER_REF="${LLMD_ROUTER_REF:-1ca4243ec84c657b4a5f507a1776d6c15a618d5b}" +GO_BOOTSTRAP_VERSION="${GO_BOOTSTRAP_VERSION:-1.23.4}" +ENVOY_VERSION="${ENVOY_VERSION:-1.36.0}" + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +PROJECT_DIR=$(cd "$SCRIPT_DIR/.." && pwd) +LLMD_DIR="$PROJECT_DIR/third_party/llmd" +BIN_DIR="$LLMD_DIR/bin" +GO_ROOT="$LLMD_DIR/go" +SRC_DIR="$LLMD_DIR/src" +mkdir -p "$BIN_DIR" + +# --- Go toolchain (vendored bootstrap; auto-upgrades to the module's version) --- +if [ ! -x "$GO_ROOT/bin/go" ]; then + echo "[install_llmd] downloading bootstrap Go $GO_BOOTSTRAP_VERSION" + rm -rf "$GO_ROOT" + curl -fsSL "https://go.dev/dl/go${GO_BOOTSTRAP_VERSION}.linux-amd64.tar.gz" | tar -xz -C "$LLMD_DIR" +fi +export GOROOT="$GO_ROOT" +export PATH="$GO_ROOT/bin:$PATH" +export GOTOOLCHAIN=auto +echo "[install_llmd] bootstrap $(go version)" + +# --- Fetch llm-d-router source at the pinned ref --- +echo "[install_llmd] fetching ${LLMD_ROUTER_REPO}@${LLMD_ROUTER_REF}" +if [ ! -d "$SRC_DIR/.git" ]; then + rm -rf "$SRC_DIR" + git clone --quiet "$LLMD_ROUTER_REPO" "$SRC_DIR" +fi +git -C "$SRC_DIR" remote set-url origin "$LLMD_ROUTER_REPO" +git -C "$SRC_DIR" fetch --quiet origin +git -C "$SRC_DIR" checkout --quiet "$LLMD_ROUTER_REF" + +# --- Build epp + pd-sidecar --- +echo "[install_llmd] building epp + pd-sidecar" +( cd "$SRC_DIR" && go build -o "$BIN_DIR/epp" ./cmd/epp && go build -o "$BIN_DIR/pd-sidecar" ./cmd/pd-sidecar ) + +# --- Envoy static binary (extract from the release image; keep if present) --- +if [ ! -x "$BIN_DIR/envoy" ]; then + echo "[install_llmd] extracting Envoy ${ENVOY_VERSION} from envoyproxy/envoy image" + cid=$(docker create "envoyproxy/envoy:v${ENVOY_VERSION}") + docker cp "${cid}:/usr/local/bin/envoy" "$BIN_DIR/envoy" + docker rm "$cid" >/dev/null + chmod +x "$BIN_DIR/envoy" +fi + +echo "[install_llmd] installed binaries in $BIN_DIR:" +"$BIN_DIR/epp" --version 2>&1 | head -1 || true +"$BIN_DIR/envoy" --version 2>&1 | head -1 || true +"$BIN_DIR/pd-sidecar" --help >/dev/null 2>&1 && echo " pd-sidecar OK" || true +echo "[install_llmd] done" diff --git a/skills/install/SKILL.md b/skills/install/SKILL.md index 3ad7e164b8..746683f55e 100644 --- a/skills/install/SKILL.md +++ b/skills/install/SKILL.md @@ -61,6 +61,16 @@ Flags: `--workspace DIR`, `--deepep-ref REF` (default `73b6ea4`), `--nvshmem-ver Verify: `uv run python -c 'import deep_ep; print(deep_ep.__file__)'`. +### llm-d router backend + +Multi-node / disaggregated deployments can route through the upstream llm-d Endpoint Picker instead of `vllm-router` (set `[...deployment.router] type = "llm-d"`). It needs three native binaries — install once: + +```bash +bash scripts/install_llmd.sh # builds epp + pd-sidecar from a pinned llm-d-router commit (vendored Go), fetches envoy +``` + +Binaries land in `third_party/llmd/bin/{epp,envoy,pd-sidecar}` (a shared path, so SLURM nodes see them). `epp` is pinned to the commit that includes the `vllmhttp-parser` (PR #1248) so prime-rl's renderer/TITO `/inference/v1/generate` path routes correctly. Override the pin with `LLMD_ROUTER_REF=`. The EPP + Envoy + endpoints configs are rendered from `templates/llmd/*.yaml.j2` (included into the SLURM script); only the per-node IPv4 addresses are filled in inline at launch time. + ## Key files - `pyproject.toml` — dependencies, extras, dependency groups diff --git a/src/prime_rl/entrypoints/inference.py b/src/prime_rl/entrypoints/inference.py index 5f36c4b1f1..965556923d 100644 --- a/src/prime_rl/entrypoints/inference.py +++ b/src/prime_rl/entrypoints/inference.py @@ -47,7 +47,7 @@ def write_slurm_script(config: InferenceConfig, config_path: Path, script_path: dp_per_node=dp_per_node, num_nodes=getattr(config.deployment, "num_nodes", 1), port=config.server.port, - disaggregated=is_disaggregated, + is_disaggregated=is_disaggregated, kv_offload=offload is not None, kv_offload_mooncake=is_mooncake, kv_offload_cpu_bytes=int(offload.cpu.num_bytes) if is_mooncake else 0, @@ -65,8 +65,7 @@ def write_slurm_script(config: InferenceConfig, config_path: Path, script_path: num_decode_replicas=config.deployment.num_decode_replicas, prefill_port=config.deployment.prefill_port, decode_port=config.deployment.decode_port, - router_port=config.deployment.router.port, - router_policy=config.deployment.router.policy, + router=config.deployment.router, data_parallel_rpc_port=config.data_parallel_rpc_port, use_deep_gemm=config.use_deep_gemm, prefill_env_overrides=config.deployment.prefill_env_overrides, @@ -74,11 +73,11 @@ def write_slurm_script(config: InferenceConfig, config_path: Path, script_path: ) elif is_multi_node: template_vars.update( - router_port=config.deployment.router.port, + router=config.deployment.router, backend_port=config.deployment.backend_port, - router_policy=config.deployment.router.policy, data_parallel_rpc_port=config.data_parallel_rpc_port, enable_expert_parallel=config.enable_expert_parallel, + infer_nodes_per_replica=config.deployment.num_nodes, ) script = template.render(**template_vars) diff --git a/src/prime_rl/entrypoints/rl.py b/src/prime_rl/entrypoints/rl.py index 0ccdd7c53b..83c3e42134 100644 --- a/src/prime_rl/entrypoints/rl.py +++ b/src/prime_rl/entrypoints/rl.py @@ -12,6 +12,7 @@ import pynvml import tomli_w +from prime_rl.configs.inference import VllmRouterConfig from prime_rl.configs.rl import RLConfig from prime_rl.utils.config import cli from prime_rl.utils.logger import get_logger, setup_logger @@ -371,7 +372,7 @@ def write_slurm_script(config: RLConfig, config_dir: Path, script_path: Path) -> num_prefill_replicas=infer_deploy.num_prefill_replicas, num_decode_replicas=infer_deploy.num_decode_replicas, gpus_per_node=config.deployment.gpus_per_node, - router_port=infer_deploy.router.port, + router=infer_deploy.router, prefill_port=infer_deploy.prefill_port, decode_port=infer_deploy.decode_port, inference_tp=config.inference.parallel.tp, @@ -396,7 +397,8 @@ def write_slurm_script(config: RLConfig, config_dir: Path, script_path: Path) -> nodes_per_infer_replica=config.deployment.num_infer_nodes, num_infer_replicas=config.deployment.num_infer_replicas, gpus_per_node=config.deployment.gpus_per_node, - router_port=config.inference.deployment.router.port if config.inference else 8000, + router=config.inference.deployment.router if config.inference else VllmRouterConfig(), + infer_nodes_per_replica=config.deployment.num_infer_nodes, backend_port=config.inference.deployment.backend_port if config.inference else 8100, inference_tp=config.inference.parallel.tp if config.inference else 1, inference_enable_expert_parallel=config.inference.enable_expert_parallel if config.inference else False, diff --git a/src/prime_rl/templates/_launch_router.sh.j2 b/src/prime_rl/templates/_launch_router.sh.j2 index f9f2ef926f..716a56bddd 100644 --- a/src/prime_rl/templates/_launch_router.sh.j2 +++ b/src/prime_rl/templates/_launch_router.sh.j2 @@ -1,9 +1,57 @@ {# Shared router launch helper, included once per template and called by the - replica-head rank. vllm-router today; an llm-d (EPP+Envoy) branch can slot in - here so the call sites stay router-agnostic. #} -# launch_router [replica_idx] + replica-head rank. The router backend (vllm-router vs llm-d EPP+Envoy) is + chosen here via router.type, so the call sites stay router-agnostic. + node_base/node_count locate this replica's nodes in HOSTNAMES for llm-d + endpoint discovery (disagg uses NUM_PREFILL_NODES/NUM_DECODE_NODES). #} +# launch_router [node_count] launch_router() { - local mode="$1" worker_args="$2" port="$3" policy="$4" log="$5" replica="${6:-}" + local mode="$1" worker_args="$2" port="$3" policy="$4" log="$5" replica="${6:-0}" node_base="${7:-0}" node_count="${8:-0}" +{%- if router.type == "llm-d" %} + echo "Starting llm-d router (EPP+Envoy) on $LOCAL_IP:$port for replica $replica" | tee "$log" + export PATH="$PROJECT_DIR/third_party/llmd/bin:$PATH" + local LLMD_DIR="$OUTPUT_DIR/logs/inference/llmd_${replica}" + mkdir -p "$LLMD_DIR" + read -ra HOSTNAMES <<< "$HOSTNAMES_STR" +{%- if is_disaggregated %} +{%- set ep_disaggregated = true %} +{%- set ep_num_prefill = num_prefill_nodes %} +{%- set ep_num_decode = num_decode_nodes %} +{%- set ep_dp = dp_per_node %} +{%- set ep_prefill_port = prefill_port %} +{%- set ep_sidecar_port = router.decode_sidecar_port %} + cat > "$LLMD_DIR/epp.yaml" < "$LLMD_DIR/endpoints.yaml" < "$LLMD_DIR/epp.yaml" < "$LLMD_DIR/endpoints.yaml" < "$LLMD_DIR/envoy.yaml" <> "$log" 2>&1 & + envoy -c "$LLMD_DIR/envoy.yaml" >> "$log" 2>&1 & +{%- else %} local tag="vllm-router"; [ "$mode" = pd ] && tag="vllm-router (PD)" echo "Starting $tag on $LOCAL_IP:$port${replica:+ for replica $replica}" | tee "$log" local -a cmd=(vllm-router --policy "$policy" --host 0.0.0.0 --port "$port" @@ -14,4 +62,5 @@ launch_router() { cmd+=(--worker-urls $worker_args) fi "${cmd[@]}" >> "$log" 2>&1 & +{%- endif %} } diff --git a/src/prime_rl/templates/inference.sbatch.j2 b/src/prime_rl/templates/inference.sbatch.j2 index efabcfa8fe..abcadf5b28 100755 --- a/src/prime_rl/templates/inference.sbatch.j2 +++ b/src/prime_rl/templates/inference.sbatch.j2 @@ -28,7 +28,7 @@ set -e # Configs export NUM_NODES={{ num_nodes }} export GPUS_PER_NODE={{ gpus_per_node }} -{% if disaggregated -%} +{% if is_disaggregated -%} export NUM_PREFILL_NODES={{ num_prefill_nodes }} export NUM_DECODE_NODES={{ num_decode_nodes }} export NUM_PREFILL_REPLICAS={{ num_prefill_replicas }} @@ -38,10 +38,13 @@ export NODES_PER_PREFILL_REPLICA=$((NUM_PREFILL_NODES / NUM_PREFILL_REPLICAS)) export NODES_PER_DECODE_REPLICA=$((NUM_DECODE_NODES / NUM_DECODE_REPLICAS)) export PREFILL_PORT={{ prefill_port }} export DECODE_PORT={{ decode_port }} -export ROUTER_PORT={{ router_port }} +export ROUTER_PORT={{ router.port }} export RPC_PORT={{ data_parallel_rpc_port }} +{%- if router.type == "llm-d" %} +export DECODE_SIDECAR_PORT={{ router.decode_sidecar_port }} +{%- endif %} {%- elif num_nodes > 1 %} -export ROUTER_PORT={{ router_port }} +export ROUTER_PORT={{ router.port }} export BACKEND_PORT={{ backend_port }} export RPC_PORT={{ data_parallel_rpc_port }} {%- endif %} @@ -69,7 +72,7 @@ uv sync --all-extras # Print inference URLs export HOSTNAMES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) export HOSTNAMES_STR="${HOSTNAMES[*]}" -{% if disaggregated -%} +{% if is_disaggregated -%} # Build per-replica router URLs INFER_URLS="" ALL_ROUTER_ARGS="" @@ -131,6 +134,10 @@ srun bash -s <<'CLEANUP_SH' pkill -9 -f "[p]ython.*prime_rl" 2>/dev/null pkill -9 -f "[t]orchrun" 2>/dev/null pkill -9 -f "[v]llm-router" 2>/dev/null + # llm-d router procs (EPP + Envoy + decode pd-sidecar) left over from a prior run. + pkill -9 -f "[e]pp.*pool-name" 2>/dev/null + pkill -9 -f "[e]nvoy.*envoy.yaml" 2>/dev/null + pkill -9 -f "[p]d-sidecar" 2>/dev/null pkill -9 -f "[v]llm" 2>/dev/null pkill -9 -f "[p]rime_rl" 2>/dev/null # Also match prctl-set process names (kernel comm) like "vllm::router" @@ -143,7 +150,7 @@ srun bash -s <<'CLEANUP_SH' pkill -9 -x mooncake_client 2>/dev/null sleep 2 rm -rf /dev/shm/vllm-* /dev/shm/vllm_* /tmp/vllm-* /tmp/vllm_* /tmp/torch-* /tmp/torchelastic_* 2>/dev/null - procs=$(ps -eo comm,args | grep -E "python|torchrun|vllm|vllm::|mooncake_" | grep -v grep | wc -l) + procs=$(ps -eo comm,args | grep -E "python|torchrun|vllm|vllm::|mooncake_|[e]pp |envoy|pd-sidecar" | grep -v grep | wc -l) gpu=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "{s+=\$1} END {print s}") echo "[node-cleanup] $(hostname) procs=$procs gpu_mem=${gpu}MiB" CLEANUP_SH @@ -178,7 +185,7 @@ srun --kill-on-bad-exit=1 bash -c ' {% include "_mooncake_store.sh.j2" %} {%- endif %} -{% if disaggregated -%} +{% if is_disaggregated -%} # NVSHMEM libs for DeepEP — ensure the rpath target exists on all nodes if [ ! -d /tmp/deepep_build/nvshmem/lib ] && [ -d "$OUTPUT_DIR/nvshmem/lib" ]; then mkdir -p /tmp/deepep_build/nvshmem @@ -252,8 +259,26 @@ srun --kill-on-bad-exit=1 bash -c ' # Start the router on the first prefill node of each replica (external LB across per-rank endpoints) if [ "$RANK_IN_REPLICA" -eq 0 ]; then REPLICA_ROUTER_ARGS=$(echo "$ALL_ROUTER_ARGS" | cut -d"|" -f$((REPLICA_IDX + 1))) - launch_router pd "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" "{{ router_policy }}" "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" + launch_router pd "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" "{{ router.policy }}" "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" "$REPLICA_BASE" + fi +{%- if router.type == "llm-d" %} + # pd-sidecar on each decode node: EPP routes decode -> sidecar (DECODE_SIDECAR_PORT + # + rank) -> vLLM decode (DECODE_PORT + rank) after orchestrating remote prefill. + if [ "$ROLE" = "decode" ] && [ "$ROLE_RANK" -eq 0 ]; then + SIDECAR_LOG="$OUTPUT_DIR/logs/inference/sidecar_${INFER_NODE_RANK}.log" + echo "Starting pd-sidecar on $LOCAL_IP:$DECODE_SIDECAR_PORT(+rank) -> vLLM $DECODE_PORT(+rank)" | tee $SIDECAR_LOG + export PATH="$PROJECT_DIR/third_party/llmd/bin:$PATH" + pd-sidecar \ + --port $DECODE_SIDECAR_PORT \ + --vllm-port $DECODE_PORT \ + --kv-connector nixlv2 \ + --secure-proxy=false \ + --enable-ssrf-protection=false \ + --inference-pool prime-rl \ + --data-parallel-size {{ dp_per_node }} \ + >> $SIDECAR_LOG 2>&1 & fi +{%- endif %} # External-LB: one vLLM API server per local DP rank (PORT + k), each pinned to its TP # slice of GPUs with a unique NIXL side-channel port. Each role is its own external-LB DP @@ -277,7 +302,7 @@ srun --kill-on-bad-exit=1 bash -c ' # Start the router on node 0 (balances across per-rank endpoints — no intra-node DP header) if [ "$INFER_NODE_RANK" -eq 0 ]; then - launch_router regular "$ROUTER_ARGS" "$ROUTER_PORT" "{{ router_policy }}" "$OUTPUT_DIR/logs/inference/router.log" + launch_router regular "$ROUTER_ARGS" "$ROUTER_PORT" "{{ router.policy }}" "$OUTPUT_DIR/logs/inference/router.log" 0 0 "$NUM_NODES" fi # External-LB: one vLLM API server per DP rank on this node (BACKEND_PORT + dp_local), diff --git a/src/prime_rl/templates/llmd/endpoints.yaml.j2 b/src/prime_rl/templates/llmd/endpoints.yaml.j2 new file mode 100644 index 0000000000..1f3bec6214 --- /dev/null +++ b/src/prime_rl/templates/llmd/endpoints.yaml.j2 @@ -0,0 +1,41 @@ +{# llm-d file-discovery endpoints: one entry per DP rank. Rendered (via include) + into the sbatch; the per-node address is the __LLMD_ADDR___ placeholder, + filled with each SLURM host's IPv4 inline in the sbatch (file-discovery rejects + hostnames). Node ordering matches the sbatch's host loop: prefill then decode. + The including sbatch sets ep_* via {% set %}. #} +endpoints: +{%- if ep_disaggregated %} +{%- for node in range(ep_num_prefill) %} +{%- for rank in range(ep_dp) %} + - name: prefill-{{ node }}-rank-{{ rank }} + address: __LLMD_ADDR_{{ node }}__ + port: "{{ ep_prefill_port + rank }}" + namespace: default + labels: + llm-d.ai/pool: prime-rl + llm-d.ai/role: prefill +{%- endfor %} +{%- endfor %} +{%- for node in range(ep_num_decode) %} +{%- for rank in range(ep_dp) %} + - name: decode-{{ node }}-rank-{{ rank }} + address: __LLMD_ADDR_{{ ep_num_prefill + node }}__ + port: "{{ ep_sidecar_port + rank }}" + namespace: default + labels: + llm-d.ai/pool: prime-rl + llm-d.ai/role: decode +{%- endfor %} +{%- endfor %} +{%- else %} +{%- for node in range(ep_num_nodes) %} +{%- for rank in range(ep_dp) %} + - name: backend-{{ node }}-rank-{{ rank }} + address: __LLMD_ADDR_{{ node }}__ + port: "{{ ep_backend_port + rank }}" + namespace: default + labels: + llm-d.ai/pool: prime-rl +{%- endfor %} +{%- endfor %} +{%- endif %} diff --git a/src/prime_rl/templates/llmd/envoy.yaml.j2 b/src/prime_rl/templates/llmd/envoy.yaml.j2 new file mode 100644 index 0000000000..9421977798 --- /dev/null +++ b/src/prime_rl/templates/llmd/envoy.yaml.j2 @@ -0,0 +1,69 @@ +{# Envoy data plane: EPP picks the backend (ext_proc) and sets + x-gateway-destination-endpoint; the ORIGINAL_DST cluster routes there. Both + /v1/ (MITO) and /inference/v1/ (TITO) go through the picker. EPP gRPC is on + loopback 9002; the gateway listens on the configured port. #} +admin: + address: { socket_address: { address: 127.0.0.1, port_value: 9901 } } +static_resources: + listeners: + - name: prime_rl + address: { socket_address: { address: 0.0.0.0, port_value: {{ router.port }} } } + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: prime_rl + codec_type: AUTO + stream_idle_timeout: 0s + request_timeout: 0s + route_config: + virtual_hosts: + - name: prime_rl + domains: ["*"] + routes: + - match: { prefix: "/v1/" } + route: { cluster: backend_cluster, timeout: 0s, idle_timeout: 86400s } + - match: { prefix: "/inference/v1/" } + route: { cluster: backend_cluster, timeout: 0s, idle_timeout: 86400s } + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: { cluster_name: epp_cluster } + timeout: 10s + processing_mode: + request_header_mode: SEND + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND + message_timeout: 1000s + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppress_envoy_headers: true + clusters: + - name: backend_cluster + type: ORIGINAL_DST + lb_policy: CLUSTER_PROVIDED + original_dst_lb_config: + use_http_header: true + http_header_name: x-gateway-destination-endpoint + connect_timeout: 5s + - name: epp_cluster + type: STRICT_DNS + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: {} + load_assignment: + cluster_name: epp_cluster + endpoints: + - lb_endpoints: + - endpoint: + address: { socket_address: { address: 127.0.0.1, port_value: 9002 } } + connect_timeout: 5s diff --git a/src/prime_rl/templates/llmd/epp_estimate.yaml.j2 b/src/prime_rl/templates/llmd/epp_estimate.yaml.j2 new file mode 100644 index 0000000000..27f041ff1b --- /dev/null +++ b/src/prime_rl/templates/llmd/epp_estimate.yaml.j2 @@ -0,0 +1,34 @@ +{# llm-d EPP config: single-pool (non-P/D) routing with approximate prefix-cache + scoring. The vllmhttp parser embeds the OpenAI parser and also handles + /inference/v1/generate (token_ids), so MITO + TITO both get prefix-cache + scoring. Included into the sbatch as a heredoc; $LLMD_DIR is the per-replica + config dir, expanded at launch time. #} +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - name: file-disc + type: file-discovery + parameters: + path: $LLMD_DIR/endpoints.yaml + watchFile: true + - name: vllmhttp-parser + type: vllmhttp-parser +{%- for scorer in router.scorers %} + - type: {{ scorer }} +{%- endfor %} + - type: max-score-picker + - type: single-profile-handler +schedulingProfiles: + - name: default + plugins: +{%- for scorer, weight in router.scorers.items() %} + - pluginRef: {{ scorer }} + weight: {{ weight }} +{%- endfor %} + - pluginRef: max-score-picker +requestHandler: + parser: + pluginRef: vllmhttp-parser +dataLayer: + discovery: + pluginRef: file-disc diff --git a/src/prime_rl/templates/llmd/epp_pd.yaml.j2 b/src/prime_rl/templates/llmd/epp_pd.yaml.j2 new file mode 100644 index 0000000000..909b531fcd --- /dev/null +++ b/src/prime_rl/templates/llmd/epp_pd.yaml.j2 @@ -0,0 +1,56 @@ +{# llm-d EPP config: prefill/decode disaggregation. disagg-profile-handler runs + the prefill profile when prefix-based-pd-decider sees >= nonCachedTokens + non-cached prompt tokens, else decode-only. Mirrors the upstream llm-d PD + guide: the prefill profile uses prefill_scorers (prefix + queue/kv) and the + decode profile uses scorers (prefix + active-request); prefill-filter / + decode-filter select endpoints by llm-d.ai/role label. Included into the + sbatch as a heredoc; $LLMD_DIR is the per-replica config dir, expanded at + launch time. #} +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: + - name: file-disc + type: file-discovery + parameters: + path: $LLMD_DIR/endpoints.yaml + watchFile: true + - name: vllmhttp-parser + type: vllmhttp-parser + - name: pd-decider + type: prefix-based-pd-decider + parameters: + nonCachedTokens: {{ router.non_cached_tokens }} + - name: profile-handler + type: disagg-profile-handler + parameters: + deciders: + prefill: pd-decider + - type: prefill-filter + - type: decode-filter +{%- for scorer in ((router.prefill_scorers.keys() | list) + (router.decode_scorers.keys() | list)) | unique %} + - type: {{ scorer }} +{%- endfor %} + - type: max-score-picker +schedulingProfiles: + - name: prefill + plugins: + - pluginRef: prefill-filter +{%- for scorer, weight in router.prefill_scorers.items() %} + - pluginRef: {{ scorer }} + weight: {{ weight }} +{%- endfor %} + - pluginRef: max-score-picker + - name: decode + plugins: + - pluginRef: decode-filter +{%- for scorer, weight in router.decode_scorers.items() %} + - pluginRef: {{ scorer }} + weight: {{ weight }} +{%- endfor %} + - pluginRef: max-score-picker +requestHandler: + parser: + pluginRef: vllmhttp-parser +dataLayer: + discovery: + pluginRef: file-disc diff --git a/src/prime_rl/templates/multi_node_rl.sbatch.j2 b/src/prime_rl/templates/multi_node_rl.sbatch.j2 index 37f67be726..3c45079267 100755 --- a/src/prime_rl/templates/multi_node_rl.sbatch.j2 +++ b/src/prime_rl/templates/multi_node_rl.sbatch.j2 @@ -32,7 +32,7 @@ export NODES_PER_INFER_REPLICA={{ nodes_per_infer_replica }} export NUM_INFER_REPLICAS={{ num_infer_replicas }} export GPUS_PER_NODE={{ gpus_per_node }} {% if num_infer_nodes > 0 -%} -export ROUTER_PORT={{ router_port }} +export ROUTER_PORT={{ router.port }} export INFERENCE_TP={{ inference_tp }} export INFERENCE_DP_LOCAL=$((GPUS_PER_NODE / INFERENCE_TP)) export INFERENCE_DATA_PARALLEL_RPC_PORT={{ inference_data_parallel_rpc_port }} @@ -45,6 +45,9 @@ export NODES_PER_PREFILL_REPLICA=$((NUM_PREFILL_NODES / NUM_PREFILL_REPLICAS)) export NODES_PER_DECODE_REPLICA=$((NUM_DECODE_NODES / NUM_DECODE_REPLICAS)) export PREFILL_PORT={{ prefill_port }} export DECODE_PORT={{ decode_port }} +{%- if router.type == "llm-d" %} +export DECODE_SIDECAR_PORT={{ router.decode_sidecar_port }} +{%- endif %} {%- else -%} export BACKEND_PORT={{ backend_port }} export INFERENCE_ENABLE_EXPERT_PARALLEL={{ "1" if inference_enable_expert_parallel else "0" }} @@ -165,6 +168,10 @@ srun bash -s <<'CLEANUP_SH' pkill -9 -f "[p]ython.*prime_rl" 2>/dev/null pkill -9 -f "[t]orchrun" 2>/dev/null pkill -9 -f "[v]llm-router" 2>/dev/null + # llm-d router procs (EPP + Envoy + decode pd-sidecar) left over from a prior run. + pkill -9 -f "[e]pp.*pool-name" 2>/dev/null + pkill -9 -f "[e]nvoy.*envoy.yaml" 2>/dev/null + pkill -9 -f "[p]d-sidecar" 2>/dev/null pkill -9 -f "[v]llm" 2>/dev/null pkill -9 -f "[p]rime_rl" 2>/dev/null # Also match prctl-set process names (kernel comm) like "vllm::router" @@ -177,7 +184,7 @@ srun bash -s <<'CLEANUP_SH' pkill -9 -x mooncake_client 2>/dev/null sleep 2 rm -rf /dev/shm/vllm-* /dev/shm/vllm_* /tmp/vllm-* /tmp/vllm_* /tmp/torch-* /tmp/torchelastic_* 2>/dev/null - procs=$(ps -eo comm,args | grep -E "python|torchrun|vllm|vllm::|mooncake_" | grep -v grep | wc -l) + procs=$(ps -eo comm,args | grep -E "python|torchrun|vllm|vllm::|mooncake_|[e]pp |envoy|pd-sidecar" | grep -v grep | wc -l) gpu=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | awk "{s+=\$1} END {print s}") echo "[node-cleanup] $(hostname) procs=$procs gpu_mem=${gpu}MiB" CLEANUP_SH @@ -288,8 +295,26 @@ if [ "$SLURM_PROCID" -lt "$NUM_INFER_NODES" ]; then # Start the router on the first node of each replica (PD mode, external LB across per-rank endpoints) if [ "$RANK_IN_REPLICA" -eq 0 ]; then REPLICA_ROUTER_ARGS=$(echo "$ALL_ROUTER_ARGS" | cut -d"|" -f$((REPLICA_IDX + 1))) - launch_router pd "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" consistent_hash "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" + launch_router pd "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" "{{ router.policy }}" "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" "$REPLICA_BASE" + fi +{%- if router.type == "llm-d" %} + # pd-sidecar on each decode node: EPP routes decode -> sidecar (DECODE_SIDECAR_PORT + # + rank) -> vLLM decode (DECODE_PORT + rank) after orchestrating remote prefill. + if [ "$ROLE" = "decode" ] && [ "$ROLE_RANK" -eq 0 ]; then + SIDECAR_LOG="$OUTPUT_DIR/logs/inference/sidecar_${INFER_NODE_RANK}.log" + echo "Starting pd-sidecar on $LOCAL_IP:$DECODE_SIDECAR_PORT(+rank) -> vLLM $DECODE_PORT(+rank)" | tee $SIDECAR_LOG + export PATH="$PROJECT_DIR/third_party/llmd/bin:$PATH" + pd-sidecar \ + --port $DECODE_SIDECAR_PORT \ + --vllm-port $DECODE_PORT \ + --kv-connector nixlv2 \ + --secure-proxy=false \ + --enable-ssrf-protection=false \ + --inference-pool prime-rl \ + --data-parallel-size $INFERENCE_DP_LOCAL \ + >> $SIDECAR_LOG 2>&1 & fi +{%- endif %} # External-LB: one vLLM API server per local DP rank (PORT + k), each pinned to its TP # slice of GPUs with a unique NIXL side-channel port. Each role is its own external-LB DP @@ -306,7 +331,7 @@ if [ "$SLURM_PROCID" -lt "$NUM_INFER_NODES" ]; then # Start the router on the first node of each replica if [ "$RANK_IN_REPLICA" -eq 0 ]; then REPLICA_ROUTER_ARGS=$(echo "$ALL_ROUTER_ARGS" | cut -d"|" -f$((REPLICA_IDX + 1))) - launch_router regular "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" consistent_hash "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" + launch_router regular "$REPLICA_ROUTER_ARGS" "$ROUTER_PORT" "{{ router.policy }}" "$OUTPUT_DIR/logs/inference/router_${REPLICA_IDX}.log" "$REPLICA_IDX" "$((REPLICA_IDX * NODES_PER_INFER_REPLICA))" "$NODES_PER_INFER_REPLICA" fi # External-LB: one vLLM API server per DP rank on this node (BACKEND_PORT + d),