Implement routed experts delta replay (with branched deltas)

samsja · S1ro1 · samsja · commit aff43a472f5e · 2026-05-26T18:53:02.000-07:00
Squashed from origin/r3-delta (tip 5c94833, which extends the earlier 3799bda with 'Support branched routed expert deltas' for cases where the routed-experts payload diverges across siblings in a group). Adapts delta replay to main's deferred routed-experts chunk concatenation: the first step's payload starts at 0; each extension step's payload starts at prefix_len - 1, so its row 0 fills the boundary token and the remaining rows are appended as the new suffix. Switches the vllm-router wheel pin from the release URL to a local path. Bumps the deps/verifiers gitlink to d39cc5876. Co-Authored-By: S1ro1 <matej.sirovatka@gmail.com>
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
@@ -18,12 +18,42 @@ ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=Etc/UTC
 RUN apt-get update && apt-get install -y --no-install-recommends --force-yes \
     build-essential \
+    autoconf \
+    automake \
+    libtool \
+    pkg-config \
+    ca-certificates \
     curl \
     sudo \
     git \
     ninja-build \
+    libnuma-dev \
+    libnl-3-dev \
+    libnl-route-3-dev \
+    libibverbs-dev \
+    librdmacm-dev \
     && apt-get clean autoclean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
+ARG UCX_VERSION=1.19.1
+RUN git clone --depth 1 --branch v${UCX_VERSION} https://github.com/openucx/ucx.git /tmp/ucx \
+    && cd /tmp/ucx \
+    && ./autogen.sh \
+    && ./configure \
+        --prefix=/opt/ucx \
+        --enable-shared \
+        --disable-static \
+        --disable-doxygen-doc \
+        --enable-optimizations \
+        --enable-cma \
+        --enable-devel-headers \
+        --enable-mt \
+        --with-verbs \
+        --with-cuda=/usr/local/cuda \
+        --with-ze=no \
+    && make -j"$(nproc)" \
+    && make install \
+    && rm -rf /tmp/ucx
+
 # Download the latest installer
 ADD https://astral.sh/uv/install.sh /uv-installer.sh
 
@@ -49,7 +79,7 @@ COPY examples /app/examples
 COPY benchmarks/scripts /app/benchmarks/scripts
 
 RUN --mount=type=cache,target=/app/.cache/uv \
-    uv sync --extra flash-attn --extra flash-attn-3 --extra flash-attn-cute --extra envs --extra gpt-oss --group mamba-ssm --locked --no-dev
+    uv sync --extra flash-attn --extra flash-attn-3 --extra flash-attn-cute --extra envs --extra gpt-oss --extra modelexpress --group mamba-ssm --locked --no-dev
 
 # arm64: build flash-attn from source, fix namespace conflicts, apply workarounds
 ARG TARGETARCH
@@ -74,8 +104,12 @@ RUN apt-get update && apt-get install -y \
     net-tools \
     curl \
     vim \
+    libnuma1 \
+    libnl-3-200 \
+    libnl-route-3-200 \
     libibverbs1 \
     ibverbs-providers \
+    librdmacm1 \
     && apt-get clean autoclean \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
@@ -96,6 +130,7 @@ ENV PATH="/usr/local/bin:$PATH"
 WORKDIR /app
 # Copy the application from the builder
 COPY --from=builder --chown=appuser:appuser /app /app
+COPY --from=builder /opt/ucx /opt/ucx
 
 # Copy and set up entrypoint script
 COPY --chown=appuser:appuser scripts/docker-entrypoint.sh /app/docker-entrypoint.sh
@@ -107,6 +142,8 @@ RUN rm /app/.venv/bin/python3.12 && ln -s /usr/local/bin/python /app/.venv/bin/p
 
 # Place executables in the environment at the front of the path
 ENV PATH="/app/.venv/bin:$PATH"
+ENV UCX_HOME=/opt/ucx
+ENV LD_LIBRARY_PATH="/opt/ucx/lib:/opt/ucx/lib/ucx${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
 
 # HuggingFace Hub timeouts (defaults are 10s which causes issues on slow networks)
 ENV HF_HUB_ETAG_TIMEOUT=500
diff --git a/deps/verifiers b/deps/verifiers
@@ -1 +1 @@
-Subproject commit f9c68eb28ccf0448d4573b7ca50f1163f81c5cfd
+Subproject commit d39cc5876a8595cb021746d67c7a088652e872e0
diff --git a/pyproject.toml b/pyproject.toml
@@ -84,10 +84,12 @@ envs = [
     "opencode-science",
     "opencode-swe",
     "reverse-text",
+    "rlm-swe",
     "science-env",
     "simpleqa-verified",
     "tau2-bench",
     "wiki-search",
+    "wordle",
 ]
 disagg = [
     "deep-ep ; platform_machine == 'x86_64'",
@@ -99,6 +101,9 @@ disagg = [
 gpt-oss = [
     "kernels",
 ]
+modelexpress = [
+    "modelexpress==0.3.0",
+]
 quack = [
     "quack-kernels>=0.4.1",
 ]
@@ -134,6 +139,7 @@ members = [
     "deps/verifiers/environments/math_python",
     "deps/verifiers/environments/reverse_text",
     "deps/verifiers/environments/wiki_search",
+    "deps/verifiers/environments/wordle",
     "deps/research-environments/environments/aime2024",
     "deps/research-environments/environments/aime2025",
     "deps/research-environments/environments/code_env",
@@ -155,6 +161,7 @@ members = [
     "deps/research-environments/environments/opencode_math",
     "deps/research-environments/environments/opencode_science",
     "deps/research-environments/environments/opencode_swe",
+    "deps/research-environments/environments/rlm_swe",
     "deps/research-environments/environments/science_env",
     "deps/research-environments/environments/simpleqa_verified",
     "deps/research-environments/environments/tau2_bench",
@@ -178,6 +185,22 @@ override-dependencies = [
     "openenv-core",
 ]
 
+# ModelExpress 0.3.0 declares protobuf<6 in its published metadata, but its
+# generated proto is compatible with protobuf 6. prime-sandboxes requires
+# protobuf>=6.31.1, so override the metadata to allow protobuf 6, capped at <7.
+[[tool.uv.dependency-metadata]]
+name = "modelexpress"
+version = "0.3.0"
+requires-dist = [
+    "grpcio>=1.66.2",
+    "huggingface_hub>=0.20.0",
+    "nixl[cu12]",
+    "numpy>=1.24.0",
+    "protobuf>=5.27.0,<7.0.0",
+    "pydantic>=2.0.0",
+    "torch>=2.6.0",
+]
+
 [tool.uv.exclude-newer-package]
 # we want latest vllm, remove next patch
 vllm = false
@@ -224,10 +247,12 @@ opencode-math = { workspace = true }
 opencode-science = { workspace = true }
 opencode-swe = { workspace = true }
 reverse-text = { workspace = true }
+rlm-swe = { workspace = true }
 science-env = { workspace = true }
 simpleqa-verified = { workspace = true }
 tau2-bench = { workspace = true }
 wiki-search = { workspace = true }
+wordle = { workspace = true }
 torch = { index = "pytorch-cu128" }
 torchvision = { index = "pytorch-cu128" }
 torchaudio = { index = "pytorch-cu128" }
@@ -236,7 +261,7 @@ dion = { git = "https://github.com/samsja/dion.git", rev = "d891eeb" }
 transformers = { git = "https://github.com/huggingface/transformers.git", rev = "c1c3424" }
 flash-attn-4 = { git = "https://github.com/Dao-AILab/flash-attention.git", subdirectory = "flash_attn/cute", rev = "96bd151" }
 prime-pydantic-config = { workspace = true }
-vllm-router = { url = "https://github.com/PrimeIntellect-ai/router/releases/download/v0.1.25/vllm_router-0.1.25-cp38-abi3-manylinux_2_28_x86_64.whl" }
+vllm-router = { path = "third_party/router/dist/vllm_router-0.1.25-cp38-abi3-linux_x86_64.whl" }
 vllm = [
     { url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/vllm-0.21.0+cu129.r42434.pr39568.a106aa6-cp38-abi3-manylinux_2_34_x86_64.whl", marker = "platform_machine == 'x86_64'" },
     { url = "https://github.com/vllm-project/vllm/releases/download/v0.21.0/vllm-0.21.0+cu129-cp38-abi3-manylinux_2_34_aarch64.whl", marker = "platform_machine == 'aarch64'" },
diff --git a/skills/configs/SKILL.md b/skills/configs/SKILL.md
@@ -70,6 +70,10 @@ For rollout debugging, enable trainer-side token export under `trainer.experimen
 
 Leave it unset for normal training. When enabled, it exports every sequence from each exporting rank.
 
+## RLM SWE harness args
+
+For `rlm_swe` / `rlm-swe` configs using the composable RLM harness, use current harness kwargs such as `rlm_max_turns`, `rlm_exec_timeout`, `rlm_max_depth`, `summarize_at_tokens`, `rlm_ref`, `local_checkout`, `append_to_system_prompt`, and `rlm_tools`. Do not use the stale `rlm_max_turns_in_context` key with the composable harness; it is not accepted by `rlm_harness`.
+
 ## Key files
 
 - `packages/prime-rl-configs/src/prime_rl/` — config classes under `configs/`; `utils/config.py` re-exports `BaseConfig` and `cli`
diff --git a/src/prime_rl/inference/vllm/routed_experts.py b/src/prime_rl/inference/vllm/routed_experts.py
@@ -8,7 +8,7 @@
 from vllm.outputs import RequestOutput
 
 
-def serialize_routed_experts(routed_experts: Any) -> dict[str, Any] | None:
+def serialize_routed_experts(routed_experts: Any, start: int = 0) -> dict[str, Any] | None:
     if routed_experts is None:
         return None
 
@@ -23,18 +23,20 @@ def serialize_routed_experts(routed_experts: Any) -> dict[str, Any] | None:
     return {
         "data": pybase64.b64encode(memoryview(compact)).decode("ascii"),
         "shape": list(compact.shape),
+        "start": start,
     }
 
 
 class RoutedExpertsCapture:
-    def __init__(self, generator: AsyncIterator[RequestOutput]):
+    def __init__(self, generator: AsyncIterator[RequestOutput], start: int = 0):
         self._generator = generator
+        self._start = start
         self.routed_experts: dict[int, dict[str, Any]] = {}
 
     async def __aiter__(self):
         async for request_output in self._generator:
             for output in request_output.outputs:
-                encoded = serialize_routed_experts(getattr(output, "routed_experts", None))
+                encoded = serialize_routed_experts(getattr(output, "routed_experts", None), start=self._start)
                 if encoded is not None:
                     self.routed_experts[output.index] = encoded
             yield request_output
diff --git a/src/prime_rl/inference/vllm/serving_tokens.py b/src/prime_rl/inference/vllm/serving_tokens.py
@@ -266,7 +266,11 @@ async def serve_tokens_full_generator(  # type: ignore[override]
         # experts surface in the JSON.
         capture: _GenerateRoutedExpertsCapture | None = None
         if self.model_config.enable_return_routed_experts:
-            capture = _GenerateRoutedExpertsCapture(result_generator)
+            start = request.sampling_params.routed_experts_prompt_start
+            capture = _GenerateRoutedExpertsCapture(
+                result_generator,
+                start=start,
+            )
             result_generator = capture
 
         response = await super().serve_tokens_full_generator(
diff --git a/src/prime_rl/orchestrator/trajectories.py b/src/prime_rl/orchestrator/trajectories.py
@@ -244,11 +244,13 @@ def prepare_step_tokens(step: vf.TrajectoryStep, step_idx: int) -> dict[str, Any
         if tokens is not None:
             routed_experts_payload = tokens.get("routed_experts")
             routed_experts = None
+            routed_experts_start = None
             if routed_experts_payload is not None:
                 decoded_routed_experts = pybase64.b64decode_as_bytearray(routed_experts_payload["data"])
                 routed_experts = np.frombuffer(decoded_routed_experts, dtype=np.uint8).reshape(
                     routed_experts_payload["shape"]
                 )
+                routed_experts_start = routed_experts_payload["start"]
 
             return {
                 "prompt_ids": list(tokens["prompt_ids"]),
@@ -257,6 +259,7 @@ def prepare_step_tokens(step: vf.TrajectoryStep, step_idx: int) -> dict[str, Any
                 "completion_mask": list(map(bool, tokens["completion_mask"])),
                 "completion_logprobs": list(tokens["completion_logprobs"]),
                 "routed_experts": routed_experts,
+                "routed_experts_start": routed_experts_start,
                 # Renderer-emitted multimodal sidecar (placeholders + per-item
                 # processed tensors). Populated when the rollout went through
                 # a multimodal-aware renderer (e.g. Qwen3VLRenderer); absent
@@ -277,6 +280,12 @@ def prepare_step_tokens(step: vf.TrajectoryStep, step_idx: int) -> dict[str, Any
     # Deferred routed_experts state per sample: O(N) chunk list concatenated
     # once at finalize, replacing the prior O(N²) per-extension unpack/repack.
     sample_routed_state: dict[int, dict[str, Any]] = {}
+    routed_prefix_states: dict[int, list[tuple[list[int], list[int], dict[str, Any]]]] = {}
+
+    # Track (prefix_tokens, sample, step_indices) per active sample. step_indices
+    # is the explicit list of prepared_steps positions merged into this sample —
+    # non-contiguous when other agents' steps interleave.
+    active_samples: list[tuple[list[int], TrainingSample, list[int]]] = []
 
     def make_sample(tokens: dict[str, Any]) -> TrainingSample:
         """Create a new TrainingSample from a trajectory step."""
@@ -306,9 +315,37 @@ def make_sample(tokens: dict[str, Any]) -> TrainingSample:
         # each extension is a no-op append rather than a destructive write.
         step_routed = tokens.get("routed_experts")
         if step_routed is not None:
+            routed_start = tokens["routed_experts_start"]
+            assert routed_start is not None
+            chunks: list[np.ndarray] = []
+            running_len = 0
+            if routed_start > 0:
+                source_len = routed_start + 1
+                source_state = None
+                for prompt_ids, completion_ids, candidate_state in routed_prefix_states[source_len]:
+                    prompt_len = len(prompt_ids)
+                    if (
+                        tokens["prompt_ids"][:prompt_len] == prompt_ids
+                        and tokens["prompt_ids"][prompt_len:source_len] == completion_ids
+                    ):
+                        source_state = candidate_state
+                        break
+                assert source_state is not None
+                assert source_state["running_len"] >= routed_start
+                remaining = routed_start
+                for chunk in source_state["chunks"]:
+                    if remaining == 0:
+                        break
+                    take = min(remaining, int(chunk.shape[0]))
+                    chunks.append(chunk[:take])
+                    remaining -= take
+                assert remaining == 0
+                running_len = routed_start
+            chunks.append(step_routed)
+            running_len += int(step_routed.shape[0])
             sample_routed_state[id(sample)] = {
-                "chunks": [step_routed],
-                "running_len": int(step_routed.shape[0]),
+                "chunks": chunks,
+                "running_len": running_len,
             }
         return sample
 
@@ -339,30 +376,31 @@ def extend_sample(
 
         step_routed = tokens.get("routed_experts")
         state = sample_routed_state.get(id(sample))
-        if step_routed is not None and state is not None:
-            # vLLM doesn't capture a routing decision for the *last* token of any
-            # request, so the previous step left no entry for token at index
-            # (prefix_len - 1). The next step's forward pass *did* process that
-            # token (as part of its prompt) and produced step_routed[prefix_len-1].
-            # Append that single boundary entry as its own chunk, then append the
-            # genuinely new entries from this step. No prior bytes touched.
-            if prefix_len > 0 and prefix_len <= step_routed.shape[0]:
-                boundary_chunk = step_routed[prefix_len - 1 : prefix_len]
+        if state is not None:
+            assert step_routed is not None
+        if step_routed is not None:
+            assert state is not None
+            assert tokens["routed_experts_start"] == prefix_len - 1
+            # Delta payloads start at prefix_len - 1. Row 0 fills the boundary
+            # token missing from the previous request; the rest is the new suffix.
+            if prefix_len > 0:
+                boundary_chunk = step_routed[:1]
                 state["chunks"].append(boundary_chunk)
                 state["running_len"] += 1
-            new_chunk = step_routed[prefix_len:]
+                step_routed = step_routed[1:]
+            new_chunk = step_routed
             state["chunks"].append(new_chunk)
             state["running_len"] += int(new_chunk.shape[0])
 
-    # Track (prefix_tokens, sample, step_indices) per active sample. step_indices
-    # is the explicit list of prepared_steps positions merged into this sample —
-    # non-contiguous when other agents' steps interleave.
-    active_samples: list[tuple[list[int], TrainingSample, list[int]]] = []
-
     first_tokens = prepared_steps[0]
     first_prefix = first_tokens["prompt_ids"] + first_tokens["completion_ids"]
     first_sample = make_sample(first_tokens)
     active_samples.append((first_prefix, first_sample, [0]))
+    first_routed_state = sample_routed_state.get(id(first_sample))
+    if first_routed_state is not None:
+        routed_prefix_states.setdefault(len(first_prefix), []).append(
+            (first_tokens["prompt_ids"], first_tokens["completion_ids"], first_routed_state)
+        )
 
     for step_idx, _step in enumerate(trajectory[1:], start=1):
         tokens = prepared_steps[step_idx]
@@ -379,11 +417,17 @@ def extend_sample(
             # Extension holds - merge into matched sample
             prefix_tokens, sample, step_indices = active_samples[matched_idx]
             extend_sample(sample, len(prefix_tokens), step_idx=step_idx)
+            new_prefix = tokens["prompt_ids"] + tokens["completion_ids"]
             active_samples[matched_idx] = (
-                tokens["prompt_ids"] + tokens["completion_ids"],
+                new_prefix,
                 sample,
                 step_indices + [step_idx],
             )
+            routed_state = sample_routed_state.get(id(sample))
+            if routed_state is not None:
+                routed_prefix_states.setdefault(len(new_prefix), []).append(
+                    (tokens["prompt_ids"], tokens["completion_ids"], routed_state)
+                )
         else:
             # No prefix matches - start a new sample
             logger.debug(
@@ -393,6 +437,11 @@ def extend_sample(
             new_prefix = tokens["prompt_ids"] + tokens["completion_ids"]
             sample = make_sample(tokens)
             active_samples.append((new_prefix, sample, [step_idx]))
+            routed_state = sample_routed_state.get(id(sample))
+            if routed_state is not None:
+                routed_prefix_states.setdefault(len(new_prefix), []).append(
+                    (tokens["prompt_ids"], tokens["completion_ids"], routed_state)
+                )
 
     # Finalize routed_experts for each sample. One concat per sample (O(N) byte
     # work) replaces the previous per-step unpack/concat/repack (O(N²)). The
diff --git a/tests/unit/inference/test_serving_tokens.py b/tests/unit/inference/test_serving_tokens.py
@@ -73,7 +73,7 @@ def test_serialize_routed_experts_uses_compact_raw_payload():
 
 
 def test_generate_response_post_process_replaces_upstream_routed_experts():
-    compact_routed_experts = {"data": "AQID", "shape": [1, 1, 3]}
+    compact_routed_experts = {"data": "AQID", "shape": [1, 1, 3], "start": 0}
     capture = _GenerateRoutedExpertsCapture(_empty_request_outputs())
     capture.routed_experts[0] = compact_routed_experts
     response = GenerateResponse(
diff --git a/tests/unit/orchestrator/test_trajectories.py b/tests/unit/orchestrator/test_trajectories.py
diff --git a/uv.lock b/uv.lock