[None][fix] Use simple shard + BMM and fix chat template for GPT-OSS

lucaslie · lucaslie · commit 132675111cba · 2026-03-12T20:30:30.000-07:00
- Use simple_shard_only + bmm sharding per reviewer feedback (uses
  all_gather for functional multi-GPU support)
- Guard multimodal content-to-list conversion in llm.py with
  hasattr(processor, "image_processor") to fix TypeError in
  text-only model chat templates (e.g., GPT-OSS)

Signed-off-by: Lucas Liebenwein <lliebenwein@nvidia.com>
Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/llm.py b/tensorrt_llm/_torch/auto_deploy/llm.py
@@ -49,10 +49,14 @@ def __call__(
             # Normalize message content to list-of-dicts format for multimodal
             # processors (e.g., Llama4) that expect {"type": "text", "text": "..."}
             # instead of plain strings when tokenize=True.
+            # Only apply for multimodal processors that need it; text-only models
+            # (e.g., GPT-OSS) have chat templates that expect plain string content.
             messages = inputs["messages"]
-            for msg in messages:
-                if isinstance(msg.get("content"), str):
-                    msg["content"] = [{"type": "text", "text": msg["content"]}]
+            is_multimodal = hasattr(self.processor, "image_processor")
+            if is_multimodal:
+                for msg in messages:
+                    if isinstance(msg.get("content"), str):
+                        msg["content"] = [{"type": "text", "text": msg["content"]}]
 
             # TODO: we don't really need this but it makes for a good sanity check. Consider
             # removing this in the future if we need to speed things up.