Commit 2afd94e

fix(gemma4): Gemma4 packing, attention mask, and MoE routing fixes (#2116)
* propagate image_position_ids through VLM neat packing
* propagate mm_token_type_ids through VLM neat packing
* fix(models): convert 4D bool attention mask to additive format for eager attention
  The packed collater emits a 4D block-causal bool mask. Eager attention adds this directly to attn_weights (0/1 instead of 0/-inf), so no positions are masked — the model sees across sequence boundaries and future tokens. Also fixes _derive_padding_mask, which was applying logical_not to all mask shapes; for 4D masks the pad positions come from the diagonal.
* fix(gemma4): select top-k experts from router_probs not expert_scores
  Consistent with HF Gemma4Router which applies top-k on softmax probabilities, not raw logits.
* linter
* add tests
* cleanup
* fix(vlm): add configurable Gemma4 thinking-prefix injection and packed example config
* fix(gemma4): handle EP_SHARD mesh in state dict adapter checkpoint load
* update model in example
* linter

Signed-off-by: shruthan <shrutha.radhakrishna@servicenow.com>
1 parent 4767694 commit 2afd94e

11 files changed

Lines changed: 445 additions & 34 deletions

examples/vlm_finetune/gemma4/gemma4_26b_a4b_moe_packing.yaml

Lines changed: 107 additions & 0 deletions

@@ -0,0 +1,107 @@
# Configuration for fine-tuning Gemma 4 26B-A4B MoE (128 experts) with MedPix dataset and sequence packing
# Requires 8 GPUs (FSDP2 + EP=8, 16 experts per GPU)
# torchrun --nproc-per-node=8 examples/vlm_finetune/finetune.py -c examples/vlm_finetune/gemma4/gemma4_26b_a4b_moe_packing.yaml

recipe: FinetuneRecipeForVLM

step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 500
  val_every_steps: 500
  num_epochs: 2

dist_env:
  backend: nccl
  timeout_minutes: 60

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: google/gemma-4-26B-A4B-it
  torch_dtype: torch.bfloat16
  trust_remote_code: true
  attn_implementation: eager
  backend:
    _target_: nemo_automodel.components.models.common.BackendConfig
    attn: te
    linear: te
    rms_norm: te
    rope_fusion: true
    dispatcher: deepep
    fake_balanced_gate: false
    enable_hf_state_dict_adapter: true
    enable_fsdp_optimizations: true
  text_config:
    # 26B-A4B does not use kv_shared layers (only used in 2B, 4B), hence use_cache: false.
    use_cache: false

processor:
  padding_side: right

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/gemma4_26b_a4b_moe_packing/
  model_save_format: torch_save
  save_consolidated: false

distributed:
  strategy: fsdp2
  dp_size: none
  tp_size: 1
  cp_size: 1
  ep_size: 8
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: train[:1000]

packed_sequence:
  pretokenize: true
  max_length: 3072
  pack_size: 3072
  packing_ratio: 0.9
  drop_long_samples: true
  post_tokenize_hook_fn: nemo_automodel.components.datasets.vlm.collate_fns.gemma4_inject_thinking_prefix

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 4
  persistent_workers: true
  pin_memory: true

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: validation[:500]

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.gemma4_prefix_collate_fn

optimizer:
  _target_: torch.optim.AdamW
  lr: 2e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_audio_tower: true
  freeze_language_model: false

# wandb:
#   project: <your-project>
#   entity: <your-entity>
#   name: <your-run-name>
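The header comment's parallelism claim (128 experts with ep_size: 8, so 16 experts per GPU) and the packing settings can be sanity-checked offline. A minimal sketch, assuming PyYAML is installed, that the config path matches the torchrun comment above, and that the key nesting is as shown:

import yaml

# Hypothetical offline check of the example config; the expert count comes from
# the header comment, not from a key in the YAML itself.
cfg_path = "examples/vlm_finetune/gemma4/gemma4_26b_a4b_moe_packing.yaml"
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

num_experts = 128  # from the header comment
ep_size = cfg["distributed"]["ep_size"]
print("experts per rank:", num_experts // ep_size)                # 16
print("pack_size:", cfg["packed_sequence"]["pack_size"])          # 3072
print("hook:", cfg["packed_sequence"]["post_tokenize_hook_fn"])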

nemo_automodel/components/datasets/vlm/collate_fns.py

Lines changed: 28 additions & 10 deletions
@@ -1496,6 +1496,12 @@ def _pad_1d(tensor, pad_value, target_len):
     labels = torch.stack([_pad_1d(x["labels"], LABEL_PAD, max_len) for x in batch])
     attention_mask = torch.stack([_pad_1d(x["attention_mask"], 0, max_len) for x in batch])
 
+    def _get_mm_token_type_ids(item):
+        v = item.get("mm_token_type_ids")
+        return v if v is not None else torch.zeros(0, dtype=torch.long)
+
+    mm_token_type_ids = torch.stack([_pad_1d(_get_mm_token_type_ids(x), 0, max_len) for x in batch])
+
     if use_flash:
         # Keep indexed [B, S] mask for flash_attn_varlen_func.
         # The patched _get_unpad_data will extract per-document cu_seqlens.

@@ -1526,6 +1532,7 @@ def _pad_mrope(pos, target_len):
             "labels": labels,
             "position_ids": position_ids,
             "attention_mask": attention_mask_out,
+            "mm_token_type_ids": mm_token_type_ids,
         }
 
     # Store indexed attention mask for loss functions that need per-sample

@@ -1541,7 +1548,7 @@ def _pad_mrope(pos, target_len):
         if tensors:
             result[key] = torch.cat(tensors, dim=0).to(torch.bfloat16)
 
-    for key in ("image_grid_thw", "video_grid_thw", "second_per_grid_ts"):
+    for key in ("image_grid_thw", "image_position_ids", "video_grid_thw", "second_per_grid_ts"):
         tensors = [x[key] for x in batch if key in x and x[key] is not None]
         if tensors:
             result[key] = torch.cat(tensors, dim=0)

@@ -1804,13 +1811,6 @@ def _inject_thinking_prefix_tokens(
 ) -> Dict[str, torch.Tensor]:
     """Insert ``<|channel>thought\\n<channel|>`` tokens after every ``<|turn>model\\n`` marker.
 
-    Gemma4 31B / 26B-A4B MoE instruction-tuned models always emit a thinking-
-    channel prefix before the actual response. When this prefix is absent from
-    training sequences the model predicts ``<|channel>`` but the label says
-    answer text, inflating initial loss to ~9. Injecting the prefix (masked
-    as -100 in labels) lets the model see its expected pattern and brings
-    initial loss down to ~3.
-
     Modifies ``input_ids``, ``attention_mask``, and ``mm_token_type_ids``
     (if present). Additionally, any other 2-D integer tensor whose second
     dimension matches ``input_ids`` is extended with zeros so that sequence

@@ -1885,6 +1885,25 @@ def _inject_thinking_prefix_tokens(
     return batch
 
 
+def gemma4_inject_thinking_prefix(
+    batch: Dict[str, torch.Tensor],
+    processor,
+) -> Dict[str, torch.Tensor]:
+    """Inject Gemma4's thinking-channel prefix after every assistant turn marker.
+
+    Gemma4 31B / 26B-A4B MoE instruction-tuned models always emit a thinking-
+    channel prefix before the actual response. When this prefix is absent from
+    training sequences the model predicts ``<|channel>`` but the label says
+    answer text, inflating initial loss to ~9. Injecting the prefix (masked
+    as -100 in labels) lets the model see its expected pattern and brings
+    initial loss down to ~3.
+
+    Safe no-op for non-Gemma4 tokenizers.
+    """
+    tokenizer = getattr(processor, "tokenizer", processor)
+    return _inject_thinking_prefix_tokens(batch, tokenizer)
+
+
 def gemma4_prefix_collate_fn(
     examples: Sequence[Dict[str, Any]],
     processor,

@@ -1900,8 +1919,7 @@ def gemma4_prefix_collate_fn(
     """
 
     def _inject(batch, proc):
-        tokenizer = getattr(proc, "tokenizer", proc)
-        batch = _inject_thinking_prefix_tokens(batch, tokenizer)
+        batch = gemma4_inject_thinking_prefix(batch, proc)
         if max_length is not None and batch["input_ids"].size(1) > max_length:
             for key in list(batch.keys()):
                 v = batch[key]
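The docstring above explains the thinking-prefix fix at the token level: insert the prefix after each assistant-turn marker and mask those positions with -100 so they carry no loss. A minimal, self-contained sketch of that idea; the marker and prefix ids are hypothetical placeholders, not Gemma4's real vocabulary, and this is not the _inject_thinking_prefix_tokens implementation itself:

import torch

IGNORE_INDEX = -100

def inject_prefix(input_ids, labels, marker_id, prefix_ids):
    """Insert prefix_ids after every marker_id token; mask the prefix in labels."""
    out_ids, out_labels = [], []
    for tok, lab in zip(input_ids.tolist(), labels.tolist()):
        out_ids.append(tok)
        out_labels.append(lab)
        if tok == marker_id:
            out_ids.extend(prefix_ids)
            out_labels.extend([IGNORE_INDEX] * len(prefix_ids))  # prefix carries no loss
    return torch.tensor(out_ids), torch.tensor(out_labels)

# Hypothetical ids: 7 marks the start of a model turn, [11, 12] is the "thinking" prefix.
ids = torch.tensor([1, 2, 7, 30, 31])
labs = torch.tensor([-100, -100, -100, 30, 31])
new_ids, new_labs = inject_prefix(ids, labs, marker_id=7, prefix_ids=[11, 12])
print(new_ids.tolist())   # [1, 2, 7, 11, 12, 30, 31]
print(new_labs.tolist())  # [-100, -100, -100, -100, -100, 30, 31]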

nemo_automodel/components/datasets/vlm/datasets.py

Lines changed: 12 additions & 1 deletion
@@ -922,12 +922,21 @@ class PreTokenizedDatasetWrapper(torch.utils.data.Dataset):
     ``pixel_values_videos``, ``video_grid_thw``).
     """
 
-    def __init__(self, dataset, processor, max_length=None, max_retries=10, truncate=False):
+    def __init__(
+        self,
+        dataset,
+        processor,
+        max_length=None,
+        max_retries=10,
+        truncate=False,
+        post_tokenize_hook=None,
+    ):
         self.dataset = dataset
         self.processor = processor
         self.max_length = max_length
         self.truncate = truncate
         self.max_retries = max_retries
+        self.post_tokenize_hook = post_tokenize_hook
         # Compatibility attributes expected by build_dataloader
         self.preload_media = False

@@ -998,6 +1007,8 @@ def __getitem__(self, idx):
             processor_kwargs["video_metadata"] = [video_metadata]
 
         result = self.processor(**processor_kwargs)
+        if self.post_tokenize_hook is not None:
+            result = self.post_tokenize_hook(result, self.processor)
 
         input_ids = result["input_ids"][0]  # (seq_len,)
         seq_len = input_ids.shape[0]
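The hook is called as post_tokenize_hook(result, self.processor) right after the processor runs, so any callable with that signature that returns the (possibly modified) result will work. A hedged usage sketch; log_length_hook, my_dataset, and my_processor are illustrative placeholders, and the keyword wiring assumes the constructor shown in this diff:

# Sketch of a custom post-tokenize hook with the same call signature used above.
def log_length_hook(result, processor):
    seq_len = result["input_ids"].shape[1]
    print(f"tokenized sequence length: {seq_len}")
    return result

# Hypothetical wiring; `my_dataset` and `my_processor` are placeholders.
# wrapper = PreTokenizedDatasetWrapper(
#     my_dataset,
#     my_processor,
#     max_length=3072,
#     post_tokenize_hook=log_length_hook,
# )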

nemo_automodel/components/datasets/vlm/neat_packing_vlm.py

Lines changed: 24 additions & 1 deletion
@@ -47,7 +47,14 @@
 
 logger = logging.getLogger(__name__)
 
-MEDIA_KEYS = ("pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts")
+MEDIA_KEYS = (
+    "pixel_values",
+    "image_grid_thw",
+    "image_position_ids",
+    "pixel_values_videos",
+    "video_grid_thw",
+    "second_per_grid_ts",
+)
 
 
 # ---------------------------------------------------------------------------

@@ -302,6 +309,10 @@ def _shift_sample(sample: dict, has_mrope: bool = False) -> dict:
     out["labels"] = sample["labels"][1:]
     out["attention_mask"] = sample["attention_mask"][:-1]
 
+    if (mm_ttids := sample.get("mm_token_type_ids")) is not None:
+        mm_ttids = torch.as_tensor(mm_ttids)
+        out["mm_token_type_ids"] = mm_ttids[0, :-1] if mm_ttids.ndim == 2 else mm_ttids[:-1]
+
     if has_mrope and "position_ids" in sample and sample["position_ids"] is not None:
         out["position_ids"] = sample["position_ids"][:, :-1]

@@ -321,11 +332,13 @@ def _build_packed_vlm_sample(
     all_input_ids: list[int] = []
     all_labels: list[int] = []
     all_attention_mask: list[int] = []
+    all_mm_token_type_ids: list[int] = []
     all_position_ids_1d: list[int] = []
     mrope_position_ids_list: list[torch.Tensor] = []
 
     pixel_values_list: list[torch.Tensor] = []
     image_grid_thw_list: list[torch.Tensor] = []
+    image_position_ids_list: list[torch.Tensor] = []
     pixel_values_videos_list: list[torch.Tensor] = []
     video_grid_thw_list: list[torch.Tensor] = []
     second_per_grid_ts_list: list[torch.Tensor] = []

@@ -345,6 +358,12 @@ def _build_packed_vlm_sample(
         all_labels.extend(labs)
         all_attention_mask.extend([seq_idx] * seq_len)
 
+        mm_ttids = sample.get("mm_token_type_ids")
+        if mm_ttids is not None:
+            all_mm_token_type_ids.extend(mm_ttids.tolist() if isinstance(mm_ttids, torch.Tensor) else mm_ttids)
+        else:
+            all_mm_token_type_ids.extend([0] * seq_len)
+
         if has_mrope and "position_ids" in sample:
             mrope_position_ids_list.append(sample["position_ids"])
         else:

@@ -355,6 +374,8 @@ def _build_packed_vlm_sample(
         if "image_grid_thw" in sample and sample["image_grid_thw"] is not None:
             n_images += sample["image_grid_thw"].shape[0]
             image_grid_thw_list.append(sample["image_grid_thw"])
+        if "image_position_ids" in sample and sample["image_position_ids"] is not None:
+            image_position_ids_list.append(sample["image_position_ids"])
         if "pixel_values_videos" in sample and sample["pixel_values_videos"] is not None:
             pixel_values_videos_list.append(sample["pixel_values_videos"])
         if "video_grid_thw" in sample and sample["video_grid_thw"] is not None:

@@ -368,6 +389,7 @@ def _build_packed_vlm_sample(
         "input_ids": torch.tensor(all_input_ids, dtype=torch.long),
         "labels": torch.tensor(all_labels, dtype=torch.long),
         "attention_mask": torch.tensor(all_attention_mask, dtype=torch.long),
+        "mm_token_type_ids": torch.tensor(all_mm_token_type_ids, dtype=torch.long),
         "n_images": n_images,
         "n_videos": n_videos,
     }

@@ -379,6 +401,7 @@ def _build_packed_vlm_sample(
 
     packed["pixel_values"] = torch.cat(pixel_values_list, dim=0) if pixel_values_list else None
     packed["image_grid_thw"] = torch.cat(image_grid_thw_list, dim=0) if image_grid_thw_list else None
+    packed["image_position_ids"] = torch.cat(image_position_ids_list, dim=0) if image_position_ids_list else None
     packed["pixel_values_videos"] = torch.cat(pixel_values_videos_list, dim=0) if pixel_values_videos_list else None
     packed["video_grid_thw"] = torch.cat(video_grid_thw_list, dim=0) if video_grid_thw_list else None
     packed["second_per_grid_ts"] = torch.cat(second_per_grid_ts_list, dim=0) if second_per_grid_ts_list else None

nemo_automodel/components/models/gemma4_moe/model.py

Lines changed: 25 additions & 4 deletions
@@ -114,9 +114,7 @@ def forward(self, x, token_mask=None, cp_mesh=None):
         expert_scores = self.proj(x_norm)
         router_probs = F.softmax(expert_scores, dim=-1)
 
-        # Top-k on raw scores (matching HF Gemma4Router behaviour)
-        _, indices = torch.topk(expert_scores, k=self.topk, dim=-1)
-        weights = router_probs.gather(-1, indices)
+        weights, indices = torch.topk(router_probs, k=self.topk, dim=-1)
         weights = weights / weights.sum(dim=-1, keepdim=True).clamp(min=1e-20)
         return weights, indices, None
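Since softmax is monotonic within a row, top-k on router_probs normally selects the same experts as top-k on expert_scores; the change mainly aligns the op order and returned weights with HF Gemma4Router, as the commit message notes. A standalone sketch comparing the two paths on random logits (shapes are made up for illustration):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 16)  # (tokens, experts); hypothetical shapes
probs = F.softmax(logits, dim=-1)
topk = 2

# Old path: indices from raw logits, weights gathered from probabilities.
_, idx_old = torch.topk(logits, k=topk, dim=-1)
w_old = probs.gather(-1, idx_old)

# New path (top-k on softmax probabilities, per the commit message).
w_new, idx_new = torch.topk(probs, k=topk, dim=-1)

print(torch.equal(idx_old, idx_new))  # True here: softmax preserves per-row ordering
print(torch.allclose(w_old / w_old.sum(-1, keepdim=True),
                     w_new / w_new.sum(-1, keepdim=True)))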

@@ -264,6 +262,26 @@ def forward(
         return x
 
 
+def _convert_bool_4d_mask_to_additive(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    """Convert a 4D bool allowed-mask to HF additive format (0.0 allowed, -inf masked)."""
+    if attention_mask.ndim != 4 or attention_mask.dtype != torch.bool:
+        return attention_mask
+    additive = torch.zeros(attention_mask.shape, dtype=dtype, device=attention_mask.device)
+    return additive.masked_fill(~attention_mask, torch.finfo(dtype).min)
+
+
+def _derive_padding_mask(attention_mask: torch.Tensor) -> torch.Tensor:
+    """Derive 2D padding mask (True = pad) from 1D, 2D, or 4D attention mask."""
+    if attention_mask.ndim == 2:
+        return attention_mask == 0
+    if attention_mask.ndim == 4:
+        diagonal = torch.diagonal(attention_mask[:, 0], dim1=-2, dim2=-1)
+        if attention_mask.dtype == torch.bool:
+            return diagonal.logical_not()
+        return diagonal != 0
+    return attention_mask.bool().logical_not()
+
+
 # ---------------------------------------------------------------------------
 # Text model backend
 # ---------------------------------------------------------------------------
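To see why the additive conversion matters for eager attention: adding a 0/1 bool mask to attention scores masks nothing, while the additive form pushes disallowed positions to -inf before the softmax; the padding mask can then be read off the diagonal of the 4D mask, because a real token is always allowed to attend to itself. A minimal sketch mirroring the helpers above using only plain torch:

import torch

dtype = torch.float32
# 4D "allowed" bool mask [batch=1, heads=1, q=3, k=3]: block-causal, last token is padding.
allowed = torch.tensor([[[[True,  False, False],
                          [True,  True,  False],
                          [False, False, False]]]])

scores = torch.zeros(1, 1, 3, 3)

# Wrong: adding the bool mask directly leaves every position visible (0 vs 1, never -inf).
wrong = (scores + allowed.to(dtype)).softmax(-1)
print(wrong[0, 0, 0])  # all three keys still get weight: nothing was masked

# Right: convert to additive form first, as _convert_bool_4d_mask_to_additive does.
additive = torch.zeros_like(scores).masked_fill(~allowed, torch.finfo(dtype).min)
right = (scores + additive).softmax(-1)
print(right[0, 0, 0])  # only the first key gets weight for query 0

# Padding mask from the diagonal: a non-pad token can always attend to itself.
pad = torch.diagonal(allowed[:, 0], dim1=-2, dim2=-1).logical_not()
print(pad)  # tensor([[False, False,  True]])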
@@ -356,7 +374,10 @@ def forward(
             position_ids = cache_position.unsqueeze(0)
 
         if padding_mask is None and attention_mask is not None:
-            padding_mask = attention_mask.bool().logical_not()
+            padding_mask = _derive_padding_mask(attention_mask)
+
+        if attention_mask is not None:
+            attention_mask = _convert_bool_4d_mask_to_additive(attention_mask, inputs_embeds.dtype)
 
         hidden_states = inputs_embeds
