Skip to content

Commit 562db77

Browse files
authored
fix(qwen35): emit images inline in tool-response content (#25)
1 parent 4062040 commit 562db77

3 files changed

Lines changed: 233 additions & 7 deletions

File tree

renderers/kimi_k25.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None:
851851
emit_special=emit_special,
852852
emit_text=emit_text,
853853
emit_ids=emit_ids,
854+
emit_image=emit_image,
854855
)
855856
elif msg.get("content") is not None:
856857
# User / other content branches — images allowed.
@@ -1034,6 +1035,7 @@ def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
10341035
emit_special=emit_special,
10351036
emit_text=emit_text,
10361037
emit_ids=emit_ids,
1038+
emit_image=emit_image,
10371039
)
10381040
elif msg.get("content") is not None:
10391041
self._emit_content(
@@ -1246,19 +1248,32 @@ def _render_tool_body(
12461248
emit_special,
12471249
emit_text,
12481250
emit_ids,
1251+
emit_image=None,
12491252
) -> None:
12501253
"""Emit tool-result body (after the role tag): ``## Return of {id}\\n``
12511254
+ content. No ``<|im_end|>``; caller emits that.
12521255
12531256
The K2.5 template emits the ``## Return of …`` header unconditionally
12541257
— when ``tool_call_id`` is missing the interpolation yields empty
12551258
string and you get ``## Return of \\n``. We mirror that.
1259+
1260+
``emit_image`` is threaded through to ``_emit_content`` so image parts
1261+
inside ``content`` lists (browser-agent screenshots, etc.) render as
1262+
``<|media_begin|>image<|media_content|><|media_pad|><|media_end|>``
1263+
inline. Without it those parts are silently dropped.
12561264
"""
12571265
tool_call_id = msg.get("tool_call_id") or ""
12581266
emit_text(f"## Return of {tool_call_id}\n", msg_idx)
12591267
content = msg.get("content")
12601268
if content is not None:
1261-
self._emit_content(content, msg_idx, emit_special, emit_text, emit_ids)
1269+
self._emit_content(
1270+
content,
1271+
msg_idx,
1272+
emit_special,
1273+
emit_text,
1274+
emit_ids,
1275+
emit_image=emit_image,
1276+
)
12621277

12631278
def _normalize_response_tokens(self, response: list[int]) -> list[int]:
12641279
"""Restore the synthetic ``<think>`` prefill if the sampler stripped it.

renderers/qwen35.py

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,9 @@ def flush_buf() -> None:
439439
self._render_tool(
440440
messages,
441441
i,
442-
content,
443442
emit_special=emit_special,
444443
emit_text=emit_text,
444+
emit_image=emit_image,
445445
)
446446

447447
else:
@@ -545,7 +545,7 @@ def emit_special(token_id: int, _msg_idx: int = -1) -> None:
545545
def emit_text(text: str, _msg_idx: int = -1) -> None:
546546
tokens.extend(self._encode(text))
547547

548-
def emit_image(part: dict[str, Any]) -> None:
548+
def emit_image(part: dict[str, Any], _msg_idx: int = -1) -> None:
549549
_, out, n, h = self._process_image(part)
550550
emit_special(self._vision_start)
551551
offset = len(tokens)
@@ -619,9 +619,9 @@ def flush_buf() -> None:
619619
self._render_tool(
620620
new_messages,
621621
i,
622-
content,
623622
emit_special=emit_special,
624623
emit_text=emit_text,
624+
emit_image=emit_image,
625625
)
626626
else:
627627
return None
@@ -799,24 +799,66 @@ def _render_tool(
799799
self,
800800
messages: list[Message],
801801
msg_idx: int,
802-
content: str,
803802
*,
804803
emit_special,
805804
emit_text,
805+
emit_image,
806806
) -> None:
807-
# Consecutive tool messages are grouped under a single <|im_start|>user block
807+
# Consecutive tool messages share a single <|im_start|>user ... <|im_end|>
808+
# envelope. Whether to open and close the envelope depends only on the
809+
# neighbouring roles, never on the content type of this or any other
810+
# tool message — keep this predicate text/media-agnostic.
808811
prev_is_tool = msg_idx > 0 and messages[msg_idx - 1]["role"] == "tool"
809812
next_is_tool = (
810813
msg_idx + 1 < len(messages) and messages[msg_idx + 1]["role"] == "tool"
811814
)
815+
raw_content = messages[msg_idx].get("content")
812816

813817
if not prev_is_tool:
814818
emit_special(self._im_start, msg_idx)
815819
emit_text("user", msg_idx)
816820

817821
emit_text("\n", msg_idx)
818822
emit_special(self._tool_response, msg_idx)
819-
emit_text("\n" + content + "\n", msg_idx)
823+
824+
if self._content_has_media(raw_content):
825+
# Mirror the chat template's ``render_content`` macro for list
826+
# content: text segments BPE-encode together up to a media
827+
# boundary, then each image emits ``<|vision_start|>`` + N×
828+
# ``<|image_pad|>`` + ``<|vision_end|>`` inline.
829+
# ``_content_has_media`` returns False unless content is a list,
830+
# but the type checker can't follow that through the call.
831+
assert isinstance(raw_content, list)
832+
buf: list[str] = ["\n"]
833+
834+
def flush_buf() -> None:
835+
if buf:
836+
emit_text("".join(buf), msg_idx)
837+
buf.clear()
838+
839+
for item in raw_content:
840+
if isinstance(item, str):
841+
buf.append(item)
842+
elif isinstance(item, dict):
843+
if _is_image_part(item):
844+
flush_buf()
845+
emit_image(item, msg_idx)
846+
elif _is_video_part(item):
847+
raise NotImplementedError(
848+
"Video parts are not yet supported by Qwen35Renderer."
849+
)
850+
elif "text" in item:
851+
buf.append(item["text"])
852+
else:
853+
raise ValueError(f"Unexpected content item: {item}")
854+
else:
855+
raise ValueError(f"Unexpected content item: {item}")
856+
flush_buf()
857+
emit_text("\n", msg_idx)
858+
else:
859+
content = self._render_content(raw_content).strip()
860+
emit_text("\n" + content + "\n", msg_idx)
861+
820862
emit_special(self._tool_response_end, msg_idx)
821863

822864
if not next_is_tool:

tests/test_multimodal.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,11 +299,142 @@ def _build_cases(make_part, image):
299299
]
300300

301301

302+
def _build_tool_image_cases(make_part, image):
303+
"""Tool-message image scenarios. Targets renderers that emit image
304+
placeholders inside ``<tool_response>`` blocks. Browser-agent style
305+
trajectories produce post-action screenshots as tool responses, so
306+
handling images here is load-bearing for that workload."""
307+
return [
308+
pytest.param(
309+
[
310+
{"role": "user", "content": "Take a screenshot."},
311+
{
312+
"role": "assistant",
313+
"content": "",
314+
"tool_calls": [
315+
{
316+
"id": "c1",
317+
"type": "function",
318+
"function": {"name": "screenshot", "arguments": {}},
319+
}
320+
],
321+
},
322+
{
323+
"role": "tool",
324+
"tool_call_id": "c1",
325+
"content": [
326+
{"type": "text", "text": "Screenshot captured."},
327+
make_part(image),
328+
],
329+
},
330+
],
331+
False,
332+
id="tool_response_with_image",
333+
),
334+
pytest.param(
335+
[
336+
{"role": "user", "content": "Screenshot then describe."},
337+
{
338+
"role": "assistant",
339+
"content": "",
340+
"tool_calls": [
341+
{
342+
"id": "c1",
343+
"type": "function",
344+
"function": {"name": "screenshot", "arguments": {}},
345+
}
346+
],
347+
},
348+
{
349+
"role": "tool",
350+
"tool_call_id": "c1",
351+
"content": [
352+
{"type": "text", "text": "Done:"},
353+
make_part(image),
354+
],
355+
},
356+
{"role": "assistant", "content": "A square."},
357+
{"role": "user", "content": "Now show me the next page."},
358+
{
359+
"role": "assistant",
360+
"content": "",
361+
"tool_calls": [
362+
{
363+
"id": "c2",
364+
"type": "function",
365+
"function": {"name": "screenshot", "arguments": {}},
366+
}
367+
],
368+
},
369+
{
370+
"role": "tool",
371+
"tool_call_id": "c2",
372+
"content": [
373+
{"type": "text", "text": "Next page:"},
374+
make_part(image),
375+
],
376+
},
377+
],
378+
False,
379+
id="multi_turn_tool_response_images",
380+
),
381+
pytest.param(
382+
[
383+
{"role": "user", "content": "Run a few tools."},
384+
{
385+
"role": "assistant",
386+
"content": "",
387+
"tool_calls": [
388+
{
389+
"id": "c1",
390+
"type": "function",
391+
"function": {"name": "ping", "arguments": {}},
392+
},
393+
{
394+
"id": "c2",
395+
"type": "function",
396+
"function": {"name": "screenshot", "arguments": {}},
397+
},
398+
{
399+
"id": "c3",
400+
"type": "function",
401+
"function": {"name": "ping", "arguments": {}},
402+
},
403+
],
404+
},
405+
{"role": "tool", "tool_call_id": "c1", "content": "pong"},
406+
{
407+
"role": "tool",
408+
"tool_call_id": "c2",
409+
"content": [
410+
{"type": "text", "text": "Screenshot:"},
411+
make_part(image),
412+
],
413+
},
414+
{"role": "tool", "tool_call_id": "c3", "content": "pong"},
415+
],
416+
False,
417+
id="consecutive_tools_mixed_media",
418+
),
419+
]
420+
421+
302422
# ---------------------------------------------------------------------------
303423
# Tests.
304424
# ---------------------------------------------------------------------------
305425

306426

427+
def _supports_tool_message_images(renderer) -> bool:
428+
"""True iff this renderer emits image placeholders inside tool-response
429+
content. Renderers without the feature silently drop image parts in tool
430+
content; as they grow the feature they get added here and the test starts
431+
asserting against them."""
432+
from renderers.kimi_k25 import KimiK25Renderer
433+
from renderers.qwen35 import Qwen35Renderer
434+
435+
return isinstance(renderer, (Qwen35Renderer, KimiK25Renderer))
436+
437+
307438
@pytest.mark.parametrize(
308439
"mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
309440
)
@@ -513,6 +644,44 @@ def test_modality_registry_models_route_to_renderer():
513644
)
514645

515646

647+
@pytest.mark.parametrize(
648+
"mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES]
649+
)
650+
def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image):
651+
"""Tool-message image parity vs ``processor.apply_chat_template`` + ``processor(...)``.
652+
653+
Browser-agent SFT traces carry post-action screenshots as ``tool``
654+
responses. Renderers that drop those image parts silently — historically
655+
every Qwen-VL family renderer did — produce token streams that diverge
656+
from the HF processor and lose most of the visual learning signal.
657+
Skipped for renderers that haven't grown the feature yet; flips to a
658+
real assertion as they do.
659+
"""
660+
if modality != "image":
661+
pytest.skip("Tool-response media path is image-only for now.")
662+
if not _hf_snapshot_cached(mm_model_name):
663+
pytest.skip(f"{mm_model_name}: HF snapshot not cached locally")
664+
665+
kit = _modality_kit(modality, mm_model_name)
666+
tokenizer, processor, renderer = _load_processor_and_renderer(mm_model_name)
667+
668+
if not _supports_tool_message_images(renderer):
669+
pytest.skip(
670+
f"{type(renderer).__name__} does not yet emit images inside tool responses"
671+
)
672+
673+
for case in _build_tool_image_cases(kit["make_part"], tiny_image):
674+
messages, add_gp = case.values
675+
ours = renderer.render_ids(messages, add_generation_prompt=add_gp)
676+
theirs = kit["processor_input_ids"](processor, messages, add_gp)
677+
assert ours == theirs, (
678+
f"{mm_model_name} / tool / case={case.id}: "
679+
f"renderer diverges from processor.\n"
680+
f" len(ours)={len(ours)} len(theirs)={len(theirs)}\n"
681+
f" ours[:60]={ours[:60]}\n theirs[:60]={theirs[:60]}"
682+
)
683+
684+
516685
def test_qwen3_vl_renderer_exposes_image_modality():
517686
"""The flagship multimodal renderer is concretely Qwen3VLRenderer.
518687

0 commit comments

Comments
 (0)