Support interleaved rollouts with include_sub_llm_in_trajectory=True (#900)

snimu · claude · web-flow · commit 00c07a88ef03 · 2026-02-11T17:15:35.000-08:00
When sub-LLM trajectory steps are stored alongside main-model steps,
several base-class methods that read state["trajectory"][-1] would see
a sub-LLM step instead of the last main-model step.  This adds RLMEnv
overrides that filter by trajectory_id so that get_prompt_messages,
get_model_response (get_prompt_ids), max_turns_reached, and
no_tools_called always reference the correct step.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/tests/test_rlm_env.py b/tests/test_rlm_env.py
@@ -1338,14 +1338,15 @@ class TestSubLLMTrajectorySteps:
     async def test_include_sub_llm_in_trajectory_default(self, rlm_env):
         assert rlm_env.include_sub_llm_in_trajectory is False
 
-    def test_interleaved_disallowed_when_sub_llm_in_trajectory(self):
+    def test_interleaved_allowed_when_sub_llm_in_trajectory(self):
         dataset = make_dataset({})
-        with pytest.raises(ValueError, match="include_sub_llm_in_trajectory=True"):
-            build_env(
-                dataset,
-                include_sub_llm_in_trajectory=True,
-                interleaved_rollouts=True,
-            )
+        env = build_env(
+            dataset,
+            include_sub_llm_in_trajectory=True,
+            interleaved_rollouts=True,
+        )
+        assert env.include_sub_llm_in_trajectory is True
+        assert env.interleaved_rollouts is True
 
     @pytest.mark.asyncio
     async def test_sub_llm_steps_added_to_trajectory(self, rlm_env):
diff --git a/verifiers/envs/experimental/rlm_env.py b/verifiers/envs/experimental/rlm_env.py
@@ -2610,8 +2610,10 @@ class RLMEnv(vf.StatefulToolEnv):
         include_sub_llm_in_trajectory: Whether to include sub-LLM calls as trajectory steps.
                    When True, sub-LLM turns are added to the trajectory as TrajectoryStep
                    objects with tokens, enabling training on sub-LLM calls. Interleaved
-                   rollouts are not supported in this mode. When False (default), sub-LLM
-                   calls happen but are not stored.
+                   rollouts are supported in this mode; the environment ensures that
+                   get_prompt_messages, get_model_response, and stop conditions always
+                   reference the last main-model step rather than a sub-LLM step.
+                   When False (default), sub-LLM calls happen but are not stored.
         context_warning_threshold: Fraction of max_seq_len at which to warn the model
                    to finish (default: 0.80). Only active if max_seq_len is set.
         max_startup_wait_seconds: Maximum seconds to wait for worker startup (default: 120)
@@ -3768,11 +3770,6 @@ async def teardown_executor(self):
     # =========================================================================
 
     def set_interleaved_rollouts(self, interleaved_rollouts: bool) -> None:
-        if interleaved_rollouts and self.include_sub_llm_in_trajectory:
-            raise ValueError(
-                "RLMEnv does not support interleaved rollouts when "
-                "include_sub_llm_in_trajectory=True. Use branched rollouts instead."
-            )
         super().set_interleaved_rollouts(interleaved_rollouts)
 
     def update_tool_args(
@@ -3838,12 +3835,6 @@ async def setup_state(self, state: State, **kwargs) -> State:
                 "rlm_control_dir_remote", f"/tmp/rlm_{rollout_id}/rlm_control"
             )
 
-        if self.include_sub_llm_in_trajectory and self.interleaved_rollouts:
-            raise ValueError(
-                "RLMEnv does not support interleaved rollouts when "
-                "include_sub_llm_in_trajectory=True. Use branched rollouts instead."
-            )
-
         try:
             # 1. Setup interception and register rollout
             state = await self._setup_interception_and_register(state, rollout_id)
@@ -4202,6 +4193,14 @@ async def call_python_repl(self, code: str, state: Any) -> str:
             append_execution_time=True,
         )
 
+    def _last_main_trajectory_step(self, state: State) -> TrajectoryStep | None:
+        """Find the last trajectory step belonging to the main (root) model."""
+        main_id = state.get("trajectory_id")
+        for step in reversed(state.get("trajectory", [])):
+            if step.get("trajectory_id") == main_id:
+                return step
+        return None
+
     async def add_trajectory_step(self, state: State, trajectory_step: TrajectoryStep):
         update_rlm_metrics_from_step(state, trajectory_step)
         await super().add_trajectory_step(state, trajectory_step)
@@ -4282,8 +4281,15 @@ async def get_prompt_messages(self, state: State) -> Messages:
 
             return cast(Messages, messages)
         else:
-            # Subsequent turns: use parent implementation
-            return await super().get_prompt_messages(state)
+            # Subsequent turns: use last main trajectory step (skip sub-LLM steps)
+            last_main = self._last_main_trajectory_step(state)
+            if last_main is None:
+                return state["prompt"]
+            prev_turn_prompt = last_main["prompt"]
+            prev_turn_completion = last_main["completion"]
+            messages = concat_messages([prev_turn_prompt, prev_turn_completion])
+            env_response = await self.env_response(messages, state)
+            return concat_messages([messages, env_response])
 
     async def env_response(
         self, messages: Messages, state: State, **kwargs
@@ -4294,6 +4300,46 @@ async def env_response(
             state["final_env_response"] = tool_messages
         return tool_messages
 
+    async def get_model_response(  # type: ignore[override]
+        self, state: State, prompt: Messages, **kwargs: Any
+    ) -> ModelResponse:
+        """Ensure get_prompt_ids sees the last main trajectory step, not a sub-LLM step.
+
+        In interleaved mode, get_prompt_ids (called from super) reads
+        state["trajectory"][-1] to build token-level prompts.  After
+        env_response adds sub-LLM steps, trajectory[-1] may be a sub-LLM
+        step with incompatible tokens.  We temporarily move trailing sub-LLM
+        steps out of the trajectory for the duration of the super call.
+        """
+        if not (self.include_sub_llm_in_trajectory and self.interleaved_rollouts):
+            return await super().get_model_response(state, prompt, **kwargs)
+
+        trajectory = state.get("trajectory", [])
+        if not trajectory:
+            return await super().get_model_response(state, prompt, **kwargs)
+
+        main_id = state["trajectory_id"]
+        if trajectory[-1].get("trajectory_id") == main_id:
+            return await super().get_model_response(state, prompt, **kwargs)
+
+        # Find last main step and temporarily move trailing sub-LLM steps aside
+        last_main_idx = None
+        for i in range(len(trajectory) - 1, -1, -1):
+            if trajectory[i].get("trajectory_id") == main_id:
+                last_main_idx = i
+                break
+
+        if last_main_idx is None:
+            return await super().get_model_response(state, prompt, **kwargs)
+
+        trailing = trajectory[last_main_idx + 1 :]
+        del trajectory[last_main_idx + 1 :]
+        try:
+            result = await super().get_model_response(state, prompt, **kwargs)
+        finally:
+            trajectory.extend(trailing)
+        return result
+
     # =========================================================================
     # Stop Conditions
     # =========================================================================
@@ -4309,6 +4355,30 @@ async def answer_ready(self, state: State) -> bool:
         """Stop when model sets answer['ready'] = True."""
         return "final_answer" in state
 
+    @vf.stop
+    async def max_turns_reached(self, state: State) -> bool:
+        """Count only main-model trajectory steps, not sub-LLM steps."""
+        if self.max_turns <= 0:
+            return False
+        main_id = state.get("trajectory_id")
+        count = sum(
+            1 for s in state.get("trajectory", []) if s.get("trajectory_id") == main_id
+        )
+        return count >= self.max_turns
+
+    @vf.stop
+    async def no_tools_called(self, state: State) -> bool:
+        """Check last main-model completion for tool calls, ignoring sub-LLM steps."""
+        last_main = self._last_main_trajectory_step(state)
+        if last_main is None:
+            return False
+        last_message = cast(dict[str, Any], last_main["completion"][-1])
+        is_assistant = last_message.get("role") == "assistant"
+        no_tool_calls = (
+            "tool_calls" not in last_message or last_message["tool_calls"] is None
+        )
+        return is_assistant and no_tool_calls
+
     @vf.stop
     async def prompt_too_long(self, state: State) -> bool:
         """Stop when API returns overlong prompt error."""