@@ -299,11 +299,142 @@ def _build_cases(make_part, image):
299299 ]
300300
301301
def _build_tool_image_cases(make_part, image):
    """Build the tool-message image scenarios.

    These target renderers that emit image placeholders inside
    ``<tool_response>`` blocks. Browser-agent style trajectories return
    post-action screenshots as tool responses, so image handling on this
    path is load-bearing for that workload.

    Args:
        make_part: Callable invoked as ``make_part(image)``; its result is
            placed directly into a tool message's ``content`` list.
        image: Object forwarded unchanged to ``make_part``.

    Returns:
        A list of ``pytest.param`` entries, each carrying
        ``(messages, False)`` plus an ``id`` naming the scenario.
    """

    def user(text):
        return {"role": "user", "content": text}

    def call(call_id, tool_name):
        # A fresh ``arguments`` dict per call: no two tool calls share one.
        return {
            "id": call_id,
            "type": "function",
            "function": {"name": tool_name, "arguments": {}},
        }

    def assistant_calls(*calls):
        return {"role": "assistant", "content": "", "tool_calls": list(calls)}

    def tool_image(call_id, caption):
        # Text part first, then the media part produced by ``make_part``.
        return {
            "role": "tool",
            "tool_call_id": call_id,
            "content": [{"type": "text", "text": caption}, make_part(image)],
        }

    def tool_text(call_id, text):
        return {"role": "tool", "tool_call_id": call_id, "content": text}

    # One tool call whose response carries an image.
    single_response = [
        user("Take a screenshot."),
        assistant_calls(call("c1", "screenshot")),
        tool_image("c1", "Screenshot captured."),
    ]

    # Two screenshot round-trips separated by a plain assistant/user exchange.
    multi_turn = [
        user("Screenshot then describe."),
        assistant_calls(call("c1", "screenshot")),
        tool_image("c1", "Done:"),
        {"role": "assistant", "content": "A square."},
        user("Now show me the next page."),
        assistant_calls(call("c2", "screenshot")),
        tool_image("c2", "Next page:"),
    ]

    # Three back-to-back tool responses; only the middle one has an image.
    mixed_media = [
        user("Run a few tools."),
        assistant_calls(
            call("c1", "ping"),
            call("c2", "screenshot"),
            call("c3", "ping"),
        ),
        tool_text("c1", "pong"),
        tool_image("c2", "Screenshot:"),
        tool_text("c3", "pong"),
    ]

    return [
        pytest.param(single_response, False, id="tool_response_with_image"),
        pytest.param(multi_turn, False, id="multi_turn_tool_response_images"),
        pytest.param(mixed_media, False, id="consecutive_tools_mixed_media"),
    ]
420+
421+
302422# ---------------------------------------------------------------------------
303423# Tests.
304424# ---------------------------------------------------------------------------
305425
306426
def _supports_tool_message_images(renderer) -> bool:
    """Report whether ``renderer`` emits image placeholders in tool responses.

    Renderers without the feature silently drop image parts found in tool
    content. As a renderer grows the feature, its class is added to the tuple
    below and the tool-image parity test starts asserting against it.

    Args:
        renderer: Renderer instance to classify.

    Returns:
        ``True`` when ``renderer`` is an instance of one of the listed
        renderer classes, ``False`` otherwise.
    """
    # Imported at call time, as in the original, not at module import.
    from renderers.kimi_k25 import KimiK25Renderer
    from renderers.qwen35 import Qwen35Renderer

    tool_image_capable = (Qwen35Renderer, KimiK25Renderer)
    return isinstance(renderer, tool_image_capable)
436+
437+
307438@pytest .mark .parametrize (
308439 "mm_model_name,modality" , _CASES , ids = [f"{ m } |{ mo } " for m , mo in _CASES ]
309440)
@@ -513,6 +644,44 @@ def test_modality_registry_models_route_to_renderer():
513644 )
514645
515646
@pytest.mark.parametrize(
    "mm_model_name,modality",
    _CASES,
    ids=[f"{model} |{kind} " for model, kind in _CASES],
)
def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image):
    """Check tool-message image token parity against the HF processor.

    The reference is ``processor.apply_chat_template`` + ``processor(...)``.
    Browser-agent SFT traces carry post-action screenshots as ``tool``
    responses. A renderer that silently drops those image parts — historically
    every Qwen-VL family renderer did — yields a token stream that diverges
    from the HF processor and loses most of the visual learning signal.

    The test is skipped for non-image modalities, for models whose HF snapshot
    is not cached locally, and for renderers that do not yet support images in
    tool responses; it becomes a real assertion once they do.
    """
    if modality != "image":
        pytest.skip("Tool-response media path is image-only for now.")
    if not _hf_snapshot_cached(mm_model_name):
        pytest.skip(f"{mm_model_name} : HF snapshot not cached locally")

    modality_kit = _modality_kit(modality, mm_model_name)
    _, processor, renderer = _load_processor_and_renderer(mm_model_name)

    if not _supports_tool_message_images(renderer):
        renderer_name = type(renderer).__name__
        pytest.skip(
            f"{renderer_name} does not yet emit images inside tool responses"
        )

    scenarios = _build_tool_image_cases(modality_kit["make_part"], tiny_image)
    for scenario in scenarios:
        conversation, with_generation_prompt = scenario.values
        rendered_ids = renderer.render_ids(
            conversation, add_generation_prompt=with_generation_prompt
        )
        reference_ids = modality_kit["processor_input_ids"](
            processor, conversation, with_generation_prompt
        )
        # The message is only built on failure, so the slices below are not
        # evaluated for matching streams.
        assert rendered_ids == reference_ids, (
            f"{mm_model_name} / tool / case={scenario.id} : "
            f"renderer diverges from processor.\n "
            f" len(ours)={len(rendered_ids)} len(theirs)={len(reference_ids)} \n "
            f" ours[:60]={rendered_ids[:60]} \n theirs[:60]={reference_ids[:60]} "
        )
683+
684+
516685def test_qwen3_vl_renderer_exposes_image_modality ():
517686 """The flagship multimodal renderer is concretely Qwen3VLRenderer.
518687
0 commit comments