fix(qwen35): emit images inline in tool-response content (#25)

eligotts · web-flow · commit 562db77effec · 2026-05-13T18:51:12.000+02:00
diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py
@@ -851,6 +851,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None:
                     emit_special=emit_special,
                     emit_text=emit_text,
                     emit_ids=emit_ids,
+                    emit_image=emit_image,
                 )
             elif msg.get("content") is not None:
                 # User / other content branches — images allowed.
@@ -1034,6 +1035,7 @@ def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
                     emit_special=emit_special,
                     emit_text=emit_text,
                     emit_ids=emit_ids,
+                    emit_image=emit_image,
                 )
             elif msg.get("content") is not None:
                 self._emit_content(
@@ -1246,19 +1248,32 @@ def _render_tool_body(
         emit_special,
         emit_text,
         emit_ids,
+        emit_image=None,
     ) -> None:
         """Emit tool-result body (after the role tag): ``## Return of {id}\\n``
         + content. No ``<|im_end|>``; caller emits that.
 
         The K2.5 template emits the ``## Return of …`` header unconditionally
         — when ``tool_call_id`` is missing the interpolation yields empty
         string and you get ``## Return of \\n``. We mirror that.
+
+        ``emit_image`` is threaded through to ``_emit_content`` so image parts
+        inside ``content`` lists (browser-agent screenshots, etc.) render as
+        ``<|media_begin|>image<|media_content|><|media_pad|><|media_end|>``
+        inline. Without it those parts are silently dropped.
         """
         tool_call_id = msg.get("tool_call_id") or ""
         emit_text(f"## Return of {tool_call_id}\n", msg_idx)
         content = msg.get("content")
         if content is not None:
-            self._emit_content(content, msg_idx, emit_special, emit_text, emit_ids)
+            self._emit_content(
+                content,
+                msg_idx,
+                emit_special,
+                emit_text,
+                emit_ids,
+                emit_image=emit_image,
+            )
 
     def _normalize_response_tokens(self, response: list[int]) -> list[int]:
         """Restore the synthetic ``<think>`` prefill if the sampler stripped it.
diff --git a/renderers/qwen35.py b/renderers/qwen35.py
@@ -439,9 +439,9 @@ def flush_buf() -> None:
                 self._render_tool(
                     messages,
                     i,
-                    content,
                     emit_special=emit_special,
                     emit_text=emit_text,
+                    emit_image=emit_image,
                 )
 
             else:
@@ -545,7 +545,7 @@ def emit_special(token_id: int, _msg_idx: int = -1) -> None:
         def emit_text(text: str, _msg_idx: int = -1) -> None:
             tokens.extend(self._encode(text))
 
-        def emit_image(part: dict[str, Any]) -> None:
+        def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
             _, out, n, h = self._process_image(part)
             emit_special(self._vision_start)
             offset = len(tokens)
@@ -619,9 +619,9 @@ def flush_buf() -> None:
                 self._render_tool(
                     new_messages,
                     i,
-                    content,
                     emit_special=emit_special,
                     emit_text=emit_text,
+                    emit_image=emit_image,
                 )
             else:
                 return None
@@ -799,24 +799,66 @@ def _render_tool(
         self,
         messages: list[Message],
         msg_idx: int,
-        content: str,
         *,
         emit_special,
         emit_text,
+        emit_image,
     ) -> None:
-        # Consecutive tool messages are grouped under a single <|im_start|>user block
+        # Consecutive tool messages share a single <|im_start|>user ... <|im_end|>
+        # envelope. Whether to open and close the envelope depends only on the
+        # neighbouring roles, never on the content type of this or any other
+        # tool message — keep this predicate text/media-agnostic.
         prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
         next_is_tool = (
             msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
         )
+        raw_content = messages[msg_idx].get("content")
 
         if not prev_is_tool:
             emit_special(self._im_start, msg_idx)
             emit_text("user", msg_idx)
 
         emit_text("\n", msg_idx)
         emit_special(self._tool_response, msg_idx)
-        emit_text("\n" + content + "\n", msg_idx)
+
+        if self._content_has_media(raw_content):
+            # Mirror the chat template's ``render_content`` macro for list
+            # content: text segments BPE-encode together up to a media
+            # boundary, then each image emits ``<|vision_start|>`` + N×
+            # ``<|image_pad|>`` + ``<|vision_end|>`` inline.
+            # ``_content_has_media`` returns False unless content is a list,
+            # but the type checker can't follow that through the call.
+            assert isinstance(raw_content, list)
+            buf: list[str] = ["\n"]
+
+            def flush_buf() -> None:
+                if buf:
+                    emit_text("".join(buf), msg_idx)
+                    buf.clear()
+
+            for item in raw_content:
+                if isinstance(item, str):
+                    buf.append(item)
+                elif isinstance(item, dict):
+                    if _is_image_part(item):
+                        flush_buf()
+                        emit_image(item, msg_idx)
+                    elif _is_video_part(item):
+                        raise NotImplementedError(
+                            "Video parts are not yet supported by Qwen35Renderer."
+                        )
+                    elif "text" in item:
+                        buf.append(item["text"])
+                    else:
+                        raise ValueError(f"Unexpected content item: {item}")
+                else:
+                    raise ValueError(f"Unexpected content item: {item}")
+            flush_buf()
+            emit_text("\n", msg_idx)
+        else:
+            content = self._render_content(raw_content).strip()
+            emit_text("\n" + content + "\n", msg_idx)
+
         emit_special(self._tool_response_end, msg_idx)
 
         if not next_is_tool:
diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py
@@ -299,11 +299,142 @@ def _build_cases(make_part, image):
     ]
 
 
+def _build_tool_image_cases(make_part, image):
+    """Tool-message image scenarios. Targets renderers that emit image
+    placeholders inside ``<tool_response>`` blocks. Browser-agent style
+    trajectories produce post-action screenshots as tool responses, so
+    handling images here is load-bearing for that workload."""
+    return [
+        pytest.param(
+            [
+                {"role": "user", "content": "Take a screenshot."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {"name": "screenshot", "arguments": {}},
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "tool_call_id": "c1",
+                    "content": [
+                        {"type": "text", "text": "Screenshot captured."},
+                        make_part(image),
+                    ],
+                },
+            ],
+            False,
+            id="tool_response_with_image",
+        ),
+        pytest.param(
+            [
+                {"role": "user", "content": "Screenshot then describe."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {"name": "screenshot", "arguments": {}},
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "tool_call_id": "c1",
+                    "content": [
+                        {"type": "text", "text": "Done:"},
+                        make_part(image),
+                    ],
+                },
+                {"role": "assistant", "content": "A square."},
+                {"role": "user", "content": "Now show me the next page."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c2",
+                            "type": "function",
+                            "function": {"name": "screenshot", "arguments": {}},
+                        }
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "tool_call_id": "c2",
+                    "content": [
+                        {"type": "text", "text": "Next page:"},
+                        make_part(image),
+                    ],
+                },
+            ],
+            False,
+            id="multi_turn_tool_response_images",
+        ),
+        pytest.param(
+            [
+                {"role": "user", "content": "Run a few tools."},
+                {
+                    "role": "assistant",
+                    "content": "",
+                    "tool_calls": [
+                        {
+                            "id": "c1",
+                            "type": "function",
+                            "function": {"name": "ping", "arguments": {}},
+                        },
+                        {
+                            "id": "c2",
+                            "type": "function",
+                            "function": {"name": "screenshot", "arguments": {}},
+                        },
+                        {
+                            "id": "c3",
+                            "type": "function",
+                            "function": {"name": "ping", "arguments": {}},
+                        },
+                    ],
+                },
+                {"role": "tool", "tool_call_id": "c1", "content": "pong"},
+                {
+                    "role": "tool",
+                    "tool_call_id": "c2",
+                    "content": [
+                        {"type": "text", "text": "Screenshot:"},
+                        make_part(image),
+                    ],
+                },
+                {"role": "tool", "tool_call_id": "c3", "content": "pong"},
+            ],
+            False,
+            id="consecutive_tools_mixed_media",
+        ),
+    ]
+
+
 # ---------------------------------------------------------------------------
 # Tests.
 # ---------------------------------------------------------------------------
 
 
+def _supports_tool_message_images(renderer) -> bool:
+    """True iff this renderer emits image placeholders inside tool-response
+    content. Renderers without the feature silently drop image parts in tool
+    content; as they grow the feature they get added here and the test starts
+    asserting against them."""
+    from renderers.kimi_k25 import KimiK25Renderer
+    from renderers.qwen35 import Qwen35Renderer
+
+    return isinstance(renderer, (Qwen35Renderer, KimiK25Renderer))
+
+
 @pytest.mark.parametrize(
     "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
 )
@@ -513,6 +644,44 @@ def test_modality_registry_models_route_to_renderer():
         )
 
 
+@pytest.mark.parametrize(
+    "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
+)
+def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image):
+    """Tool-message image parity vs ``processor.apply_chat_template`` + ``processor(...)``.
+
+    Browser-agent SFT traces carry post-action screenshots as ``tool``
+    responses. Renderers that drop those image parts silently — historically
+    every Qwen-VL family renderer did — produce token streams that diverge
+    from the HF processor and lose most of the visual learning signal.
+    Skipped for renderers that haven't grown the feature yet; flips to a
+    real assertion as they do.
+    """
+    if modality != "image":
+        pytest.skip("Tool-response media path is image-only for now.")
+    if not _hf_snapshot_cached(mm_model_name):
+        pytest.skip(f"{mm_model_name}: HF snapshot not cached locally")
+
+    kit = _modality_kit(modality, mm_model_name)
+    tokenizer, processor, renderer = _load_processor_and_renderer(mm_model_name)
+
+    if not _supports_tool_message_images(renderer):
+        pytest.skip(
+            f"{type(renderer).__name__} does not yet emit images inside tool responses"
+        )
+
+    for case in _build_tool_image_cases(kit["make_part"], tiny_image):
+        messages, add_gp = case.values
+        ours = renderer.render_ids(messages, add_generation_prompt=add_gp)
+        theirs = kit["processor_input_ids"](processor, messages, add_gp)
+        assert ours == theirs, (
+            f"{mm_model_name} / tool / case={case.id}: "
+            f"renderer diverges from processor.\n"
+            f"  len(ours)={len(ours)} len(theirs)={len(theirs)}\n"
+            f"  ours[:60]={ours[:60]}\n  theirs[:60]={theirs[:60]}"
+        )
+
+
 def test_qwen3_vl_renderer_exposes_image_modality():
     """The flagship multimodal renderer is concretely Qwen3VLRenderer.