Skip to content

Commit 9c91a54

Browse files
abrichr and claude
authored
fix: use list input format for Outlines multimodal generation (#205)
TransformersMultiModal.format_input is a singledispatch method that only accepts `list` and `Chat` types. A `dict` raises TypeError.

Correct format: [prompt_text, outlines.Image(pil_image)]
Wrong format: {"text": prompt, "images": [image]}

Also fixes PIL .format being dropped by .convert("RGB") — outlines.Image requires .format to be set, so it is restored after conversion.

New test: test_outlines_multimodal_input_format verifies:
- list is a registered dispatch type (dict is NOT)
- outlines.Image wraps PIL images correctly
- This test would have caught both the dict and format bugs

36/36 tests pass in 0.10s.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 1a0bebc commit 9c91a54

2 files changed

Lines changed: 55 additions & 5 deletions

File tree

openadapt_evals/training/standalone/trainer.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,11 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
200200
logger.info("Stuck at step %d", step_idx)
201201
break
202202

203-
image = Image.open(io.BytesIO(screenshot)).convert("RGB")
203+
image = Image.open(io.BytesIO(screenshot))
204+
if image.mode != "RGB":
205+
image = image.convert("RGB")
206+
# .convert() drops .format; restore it for outlines.Image
207+
image.format = "PNG"
204208
messages = build_agent_messages(instruction, include_image=True)
205209
if hasattr(self._processor, "apply_chat_template"):
206210
text_input = self._processor.apply_chat_template(
@@ -215,10 +219,13 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
215219
else None
216220
)
217221
if outlines_gen is not None:
218-
# Outlines v1.2 Generator API: handles tokenization,
219-
# generation, and decoding internally. For multimodal
220-
# models, pass a dict with "text" + image keys.
221-
model_input = {"text": text_input, "images": [image]}
222+
# Outlines v1.2 Generator API for multimodal models.
223+
# TransformersMultiModal.format_input dispatches on type:
224+
# list → [prompt_text, Image(pil), ...]
225+
# Chat → Chat([Message(...)])
226+
# A dict is NOT accepted (raises TypeError).
227+
import outlines
228+
model_input = [text_input, outlines.Image(image)]
222229
decoded = outlines_gen(
223230
model_input,
224231
max_new_tokens=self._config.max_new_tokens,

tests/test_standalone_trainer.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,49 @@ def test_outlines_generator_api_contract(self) -> None:
203203
for p in params_call
204204
), f"SteerableGenerator.__call__ doesn't accept **kwargs: {sig_call}"
205205

206+
def test_outlines_multimodal_input_format(self) -> None:
207+
"""Verify outlines TransformersMultiModal accepts list input, not dict.
208+
209+
This is THE test that catches the input format bug. The trainer
210+
must pass [prompt, outlines.Image(pil)] not {"text": ..., "images": ...}.
211+
212+
TransformersMultiModalTypeAdapter.format_input is a singledispatch
213+
that only accepts `list` and `Chat` types. A `dict` raises TypeError.
214+
"""
215+
try:
216+
import outlines
217+
from outlines.models.transformers import TransformersMultiModalTypeAdapter
218+
except ImportError:
219+
pytest.skip("outlines not installed")
220+
221+
# Verify list is a registered dispatch type by checking the
222+
# class-level dispatcher registry (singledispatchmethod stores
223+
# it on the descriptor, not the bound method).
224+
fmt = TransformersMultiModalTypeAdapter.__dict__["format_input"]
225+
registry = fmt.dispatcher.registry
226+
registered_types = set(registry.keys())
227+
assert list in registered_types, (
228+
f"list not registered in format_input dispatch: {registered_types}. "
229+
f"The trainer passes [prompt, Image(pil)] — this type must be accepted."
230+
)
231+
assert dict not in registered_types, (
232+
"dict is registered in format_input — if this changes, the trainer's "
233+
"input format can be simplified back to a dict."
234+
)
235+
236+
# Verify outlines.Image exists and wraps PIL images
237+
assert hasattr(outlines, "Image"), "outlines.Image not found"
238+
from PIL import Image as PILImage
239+
import io
240+
test_img = PILImage.new("RGB", (10, 10))
241+
# outlines.Image requires .format to be set (loaded from file)
242+
buf = io.BytesIO()
243+
test_img.save(buf, format="PNG")
244+
buf.seek(0)
245+
test_img_with_format = PILImage.open(buf)
246+
wrapped = outlines.Image(test_img_with_format)
247+
assert wrapped is not None
248+
206249
def test_false_sentinel_not_confused_with_none(self) -> None:
207250
"""Regression: False sentinel must return None, not be treated as uninitialized."""
208251
config = TrainingConfig(constrained_decoding=True)

0 commit comments

Comments
 (0)