Commit 88e4c09

abrichr and claude committed
fix(modal): fix inference container image and multi-modal message handling
- Pin transformers==4.57.3 (matches local, has Qwen3-VL support)
- Add torchvision dependency (required by AutoVideoProcessor)
- Add fallback: AutoModelForVision2Seq -> Qwen2_5_VLForConditionalGeneration
- Add fallback: AutoProcessor -> Qwen2_5_VLProcessor
- Reconstruct multi-modal messages with {"type": "image"} placeholders for proper vision token generation in apply_chat_template
- Rename container_idle_timeout -> scaledown_window (Modal API update)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e42ed75 · commit 88e4c09

1 file changed

Lines changed: 45 additions & 9 deletions


openadapt_ml/cloud/modal_cloud.py

@@ -289,11 +289,13 @@ def _build_inference_app(

     inference_image = modal.Image.debian_slim(python_version="3.12").pip_install(
         "torch",
-        "transformers",
+        "torchvision",
+        "transformers==4.57.3",
         "peft",
         "accelerate",
         "pillow",
         "qwen-vl-utils",
+        "av",
     )

     vol = volume
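
The pin to transformers==4.57.3 matches the local environment and brings Qwen3-VL support, while torchvision satisfies AutoVideoProcessor's import-time requirement. A minimal sketch for sanity-checking the pinned stack inside a Modal container, assuming Modal's current App/function API (the app name and helper function are illustrative, not part of this commit):

import modal

app = modal.App("inference-image-check")  # illustrative name

# Trimmed version of the image definition from the hunk above.
image = modal.Image.debian_slim(python_version="3.12").pip_install(
    "torch",
    "torchvision",
    "transformers==4.57.3",
)

@app.function(image=image)
def check_versions() -> dict:
    # Imports live inside the function so they resolve in the remote container.
    import torch
    import torchvision
    import transformers

    return {
        "torch": torch.__version__,
        "torchvision": torchvision.__version__,
        "transformers": transformers.__version__,
    }

Calling check_versions.remote() from a local entrypoint returns the resolved versions, confirming the container matches the pin before any model is loaded.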
@@ -306,7 +308,7 @@ def _build_inference_app(
         volumes={VOLUME_MOUNT: vol},
         timeout=300,
         serialized=True,
-        container_idle_timeout=600,
+        scaledown_window=600,
     )
     def infer(
         messages_json: str,
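
Modal renamed the idle-timeout keyword; the old name is what this hunk retires. A stripped-down sketch of the renamed parameter in isolation (app name and function body are placeholders):

import modal

app = modal.App("scaledown-demo")  # placeholder name

@app.function(
    timeout=300,
    scaledown_window=600,  # formerly container_idle_timeout on older Modal releases
)
def echo(messages_json: str) -> str:
    # The container stays warm for 600s after the last call before scaling
    # down, so repeated inferences reuse the cached model.
    return messages_json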
@@ -329,16 +331,27 @@ def infer(

         import torch
         from PIL import Image as _Image
-        from transformers import AutoModelForVision2Seq, AutoProcessor
+        from transformers import AutoProcessor

         # Load model (cached in container memory across calls)
         if not hasattr(infer, "_model"):
             print(f"Loading base model: {_base}")
-            infer._model = AutoModelForVision2Seq.from_pretrained(
-                _base,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-            )
+            try:
+                from transformers import AutoModelForVision2Seq
+
+                infer._model = AutoModelForVision2Seq.from_pretrained(
+                    _base,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                )
+            except (ImportError, ValueError):
+                from transformers import Qwen2_5_VLForConditionalGeneration
+
+                infer._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                    _base,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                )

             if _adapter:
                 from peft import PeftModel
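
The try/except prefers the generic auto-class and drops to the explicit Qwen2.5-VL class only when the installed transformers cannot resolve it (ImportError when the symbol is missing, ValueError when the config does not map to the auto-class). The same pattern factored into a helper, as a sketch (the function name is illustrative):

import torch

def load_vl_model(base: str):
    """Load a vision-language model, preferring the generic auto-class."""
    kwargs = dict(torch_dtype=torch.bfloat16, device_map="auto")
    try:
        from transformers import AutoModelForVision2Seq

        return AutoModelForVision2Seq.from_pretrained(base, **kwargs)
    except (ImportError, ValueError):
        # Fall back to the explicit class when the auto-mapping fails.
        from transformers import Qwen2_5_VLForConditionalGeneration

        return Qwen2_5_VLForConditionalGeneration.from_pretrained(base, **kwargs)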
@@ -347,7 +360,13 @@ def infer(
                 vol.reload()
                 infer._model = PeftModel.from_pretrained(infer._model, _adapter)

-            infer._processor = AutoProcessor.from_pretrained(_base)
+            try:
+                infer._processor = AutoProcessor.from_pretrained(_base)
+            except TypeError:
+                # Fallback for transformers versions with video processor bug
+                from transformers import Qwen2_5_VLProcessor
+
+                infer._processor = Qwen2_5_VLProcessor.from_pretrained(_base)
             print("Model ready for inference")

         messages = _json.loads(messages_json)
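
Both loads sit behind the hasattr guard, so the model and processor are built once per warm container and reused across calls. A self-contained sketch of that caching idiom (names are illustrative):

def handler(x: int) -> int:
    # Attributes on the function object persist for the life of the process,
    # so the expensive setup runs once per warm container.
    if not hasattr(handler, "_state"):
        print("one-time setup")
        handler._state = {"offset": 42}  # stand-in for model/processor loading
    return x + handler._state["offset"]

print(handler(1))  # prints "one-time setup", then 43
print(handler(2))  # setup is skipped, prints 44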
@@ -358,6 +377,23 @@ def infer(
             img_bytes = _base64.b64decode(image_base64)
             image = _Image.open(_BytesIO(img_bytes)).convert("RGB")

+        # Reconstruct multi-modal messages for the processor.
+        # The agent sends flattened text messages (image dicts stripped),
+        # but apply_chat_template needs {"type": "image"} placeholders
+        # to generate <|image_pad|> tokens for the vision encoder.
+        if image is not None:
+            for msg in messages:
+                if msg["role"] == "user":
+                    text_content = msg["content"]
+                    # Replace <image> tag in text with proper multi-modal format
+                    if "<image>" in text_content:
+                        text_content = text_content.replace("<image>\n", "").replace("<image>", "")
+                    msg["content"] = [
+                        {"type": "image"},
+                        {"type": "text", "text": text_content},
+                    ]
+                    break
+
         # Build inputs using the processor's chat template
         text = infer._processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
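
The reconstruction can be exercised without a GPU or model download. A standalone sketch of the same transformation; the processor call at the end is commented out, and the checkpoint name in it is an assumption rather than something this commit specifies:

messages = [
    {"role": "user", "content": "<image>\nClick the Submit button."},
]

# Strip the inline <image> tag and rebuild the first user turn as a
# multi-modal content list, mirroring the hunk above.
for msg in messages:
    if msg["role"] == "user":
        text = msg["content"].replace("<image>\n", "").replace("<image>", "")
        msg["content"] = [
            {"type": "image"},
            {"type": "text", "text": text},
        ]
        break

print(messages)
# [{'role': 'user', 'content': [{'type': 'image'},
#                               {'type': 'text', 'text': 'Click the Submit button.'}]}]

# With a processor loaded, apply_chat_template then wraps the turn in vision
# tokens (<|vision_start|><|image_pad|><|vision_end|>) for the encoder:
# from transformers import AutoProcessor
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")  # assumed checkpoint
# prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)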
