[#15613][fix] Gemma4 multimodal: fix vision TP and xgrammar startup crashes (#15566)

Thachnh · web-flow · commit 0425801b33ab · 2026-06-26T00:38:42.000-04:00
Signed-off-by: Thach Nguyen <thach@deepinfra.com>
diff --git a/tensorrt_llm/_torch/models/modeling_gemma4_vision.py b/tensorrt_llm/_torch/models/modeling_gemma4_vision.py
@@ -949,8 +949,12 @@ def _pad_attention_head_dim(self, weights: Dict[str, torch.Tensor]) -> Dict[str,
 
         hf_hd = first_attn.hf_head_dim
         padded_hd = first_attn.head_dim
-        nh = first_attn.num_heads
-        nkv = first_attn.num_key_value_heads
+        # first_attn.num_heads / num_key_value_heads are already divided by
+        # tp_size, but these weights are still unsharded here, so read the full
+        # head counts from the vision config (no-op at tp1).
+        vc = first_attn.vision_config
+        nh = vc.num_attention_heads
+        nkv = getattr(vc, "num_key_value_heads", nh)
         pad_w = padded_hd - hf_hd
 
         # HF keys at this point have already had ``.linear.`` stripped if the
diff --git a/tensorrt_llm/_torch/models/modeling_gemma4mm.py b/tensorrt_llm/_torch/models/modeling_gemma4mm.py
@@ -768,6 +768,10 @@ def post_config(self):
     def infer_max_seq_len(self) -> int:
         return self.llm.infer_max_seq_len()
 
+    @property
+    def vocab_size_padded(self) -> int:
+        return self.llm.vocab_size_padded
+
     @property
     def multimodal_data_device_paths(self) -> List[str]:
         """Dotted paths in ``multimodal_data`` that the engine should ship to