nv-auto-deploy
diff --git a/tensorrt_llm/_torch/auto_deploy/llm.py b/tensorrt_llm/_torch/auto_deploy/llm.py
Lines changed: 10 additions & 2 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py
Lines changed: 3 additions & 0 deletions
@@ -46,17 +46,25 @@ def __call__(
             # multi_modal_data should not be present in the messages field
             assert "multi_modal_data" not in inputs, f"unexpected multi_modal_data key in {inputs=}"
 
+            # Normalize message content to list-of-dicts format for multimodal
+            # processors (e.g., Llama4) that expect {"type": "text", "text": "..."}
+            # instead of plain strings when tokenize=True.
+            messages = inputs["messages"]
+            for msg in messages:
+                if isinstance(msg.get("content"), str):
+                    msg["content"] = [{"type": "text", "text": msg["content"]}]
+
             # TODO: we don't really need this but it makes for a good sanity check. Consider
             # removing this in the future if we need to speed things up.
             prompt = self.processor.apply_chat_template(
-                inputs["messages"],
+                messages,
                 add_generation_prompt=True,
                 tokenize=False,
             )
             inputs["prompt"] = prompt
 
             all_args = self.processor.apply_chat_template(
-                inputs["messages"],
+                messages,
                 add_generation_prompt=True,
                 tokenize=True,
                 return_dict=True,
 
@@ -15,6 +15,7 @@
 from .modeling_internlm3 import InternLM3ForCausalLM
 from .modeling_kimi_k2 import KimiK2ForCausalLM, KimiK25ForConditionalGeneration
 from .modeling_llama3 import Llama3ForCausalLM
+from .modeling_llama4 import Llama4ForCausalLM, Llama4ForConditionalGeneration
 from .modeling_minimax_m2 import MiniMaxM2ForCausalLM
 from .modeling_mistral import MistralForCausalLM
 from .modeling_mistral3 import Mistral3ForConditionalGeneration, Mistral3TextForCausalLM
@@ -49,6 +50,8 @@
     "KimiK2ForCausalLM",
     "KimiK25ForConditionalGeneration",
     "Llama3ForCausalLM",
+    "Llama4ForCausalLM",
+    "Llama4ForConditionalGeneration",
     "MiniMaxM2ForCausalLM",
     "MistralForCausalLM",
     "Mistral3ForConditionalGeneration",