@@ -289,11 +289,13 @@ def _build_inference_app(
 
     inference_image = modal.Image.debian_slim(python_version="3.12").pip_install(
         "torch",
-        "transformers",
+        "torchvision",
+        "transformers==4.57.3",
         "peft",
         "accelerate",
         "pillow",
         "qwen-vl-utils",
+        "av",
     )
 
     vol = volume
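
The dependency changes pin transformers to 4.57.3 and pull in torchvision plus av (PyAV), covering the image and video decoding paths that qwen-vl-utils can exercise at runtime. A minimal sanity-check sketch under those pins; check_inference_deps is a hypothetical helper, not part of this diff:

    # Hypothetical helper: fail fast inside the container if the pinned
    # stack did not install cleanly. Mirrors the pip_install list above.
    def check_inference_deps() -> None:
        import av           # PyAV, the video decoding backend for qwen-vl-utils
        import torchvision  # image transforms installed alongside torch
        import transformers

        assert transformers.__version__ == "4.57.3", transformers.__version__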
@@ -306,7 +308,7 @@ def _build_inference_app(
         volumes={VOLUME_MOUNT: vol},
         timeout=300,
         serialized=True,
-        container_idle_timeout=600,
+        scaledown_window=600,
     )
     def infer(
         messages_json: str,
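
scaledown_window is Modal's newer name for container_idle_timeout: the number of seconds a warm container waits for further inputs before shutting down. A sketch of the renamed parameter in isolation, assuming a current Modal SDK; the app name and GPU type are illustrative, not taken from this diff:

    import modal

    app = modal.App("inference-sketch")  # hypothetical app name

    @app.function(
        gpu="A10G",            # assumption: the GPU type is not shown in this diff
        timeout=300,
        scaledown_window=600,  # formerly container_idle_timeout; same idle semantics
    )
    def ping() -> str:
        return "ok"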
@@ -329,16 +331,27 @@ def infer(
 
         import torch
         from PIL import Image as _Image
-        from transformers import AutoModelForVision2Seq, AutoProcessor
+        from transformers import AutoProcessor
 
         # Load model (cached in container memory across calls)
         if not hasattr(infer, "_model"):
             print(f"Loading base model: {_base}")
-            infer._model = AutoModelForVision2Seq.from_pretrained(
-                _base,
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-            )
+            try:
+                from transformers import AutoModelForVision2Seq
+
+                infer._model = AutoModelForVision2Seq.from_pretrained(
+                    _base,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                )
+            except (ImportError, ValueError):
+                from transformers import Qwen2_5_VLForConditionalGeneration
+
+                infer._model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                    _base,
+                    torch_dtype=torch.bfloat16,
+                    device_map="auto",
+                )
 
         if _adapter:
             from peft import PeftModel
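
The try/except distinguishes two failure modes: older transformers builds that do not export AutoModelForVision2Seq at all (ImportError), and builds whose auto-class mapping no longer covers the Qwen2.5-VL config (ValueError). Either way, the concrete Qwen2_5_VLForConditionalGeneration class is the stable fallback. The same pattern in isolation, as a hedged sketch; the checkpoint id is an assumption, since the diff never shows what _base resolves to:

    import torch

    def load_vlm(model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"):  # illustrative id
        try:
            # Generic auto-class; raises ValueError when the config type is
            # absent from its model mapping
            from transformers import AutoModelForVision2Seq
            return AutoModelForVision2Seq.from_pretrained(
                model_id, torch_dtype=torch.bfloat16, device_map="auto"
            )
        except (ImportError, ValueError):
            # Concrete class, stable across recent transformers releases
            from transformers import Qwen2_5_VLForConditionalGeneration
            return Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_id, torch_dtype=torch.bfloat16, device_map="auto"
            )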
@@ -347,7 +360,13 @@ def infer(
             vol.reload()
             infer._model = PeftModel.from_pretrained(infer._model, _adapter)
 
-        infer._processor = AutoProcessor.from_pretrained(_base)
+        try:
+            infer._processor = AutoProcessor.from_pretrained(_base)
+        except TypeError:
+            # Fallback for transformers versions with video processor bug
+            from transformers import Qwen2_5_VLProcessor
+
+            infer._processor = Qwen2_5_VLProcessor.from_pretrained(_base)
         print("Model ready for inference")
 
         messages = _json.loads(messages_json)
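
The TypeError catch exists because some transformers releases crash while auto-constructing Qwen2.5-VL's video processor; instantiating Qwen2_5_VLProcessor directly bypasses that resolution path. The same fallback in isolation, a sketch assuming the pinned stack above:

    def load_processor(model_id: str):
        from transformers import AutoProcessor
        try:
            return AutoProcessor.from_pretrained(model_id)
        except TypeError:
            # Auto-resolution can fail while building the video processor on
            # some releases; the concrete class sidesteps it
            from transformers import Qwen2_5_VLProcessor
            return Qwen2_5_VLProcessor.from_pretrained(model_id)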
@@ -358,6 +377,23 @@ def infer(
             img_bytes = _base64.b64decode(image_base64)
             image = _Image.open(_BytesIO(img_bytes)).convert("RGB")
 
+        # Reconstruct multi-modal messages for the processor.
+        # The agent sends flattened text messages (image dicts stripped),
+        # but apply_chat_template needs {"type": "image"} placeholders
+        # to generate <|image_pad|> tokens for the vision encoder.
+        if image is not None:
+            for msg in messages:
+                if msg["role"] == "user":
+                    text_content = msg["content"]
+                    # Replace <image> tag in text with proper multi-modal format
+                    if "<image>" in text_content:
+                        text_content = text_content.replace("<image>\n", "").replace("<image>", "")
+                    msg["content"] = [
+                        {"type": "image"},
+                        {"type": "text", "text": text_content},
+                    ]
+                    break
+
         # Build inputs using the processor's chat template
         text = infer._processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
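
Concretely, the reconstruction rewrites the first user message from a flat string into the structured content that Qwen's chat template expands into vision tokens. A before/after sketch; the prompt text is illustrative:

    # Before: flattened text with an inline <image> tag (what the agent sends)
    flat = {"role": "user", "content": "<image>\nWhat is in this picture?"}

    # After: structured content; apply_chat_template renders {"type": "image"}
    # as <|vision_start|><|image_pad|><|vision_end|> so the vision encoder's
    # features can be spliced in at the pad positions
    structured = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is in this picture?"},
        ],
    }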