NVIDIA-NeMo
diff --git a/examples/vlm_finetune/nemotron_omni/nemotron_omni_v3_cord_v2_ep8cp2.yaml b/examples/vlm_finetune/nemotron_omni/nemotron_omni_v3_cord_v2_ep8cp2.yaml
Lines changed: 99 additions & 0 deletions
diff --git a/nemo_automodel/_transformers/capabilities.py b/nemo_automodel/_transformers/capabilities.py
Lines changed: 14 additions & 7 deletions
diff --git a/nemo_automodel/components/distributed/cp_utils.py b/nemo_automodel/components/distributed/cp_utils.py
Lines changed: 65 additions & 6 deletions
@@ -0,0 +1,99 @@
+# NemotronOmni v3 (Reasoning) fine-tuning on MedPix-VQA (despite the cord_v2 file name) -- ep8cp2 (EP=8, CP=2 test)
+#
+# Run:
+#   cd automodel-omni
+#   automodel examples/vlm_finetune/nemotron_omni/nemotron_omni_v3_cord_v2_ep8cp2.yaml --nproc-per-node 8
+
+recipe: FinetuneRecipeForVLM
+
+step_scheduler:
+  global_batch_size: 8
+  local_batch_size: 1
+  ckpt_every_steps: 100000
+  val_every_steps: 100000
+  max_steps: 100
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+rng:
+  _target_: nemo_automodel.components.training.rng.StatefulRNG
+  seed: 1234
+  ranked: true
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
+  pretrained_model_name_or_path: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16
+  trust_remote_code: true
+  torch_dtype: torch.bfloat16
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: te
+    linear: te
+    rms_norm: torch_fp32
+    rope_fusion: false
+    dispatcher: deepep
+    fake_balanced_gate: false
+    enable_hf_state_dict_adapter: true
+
+processor:
+  _target_: transformers.AutoProcessor.from_pretrained
+  pretrained_model_name_or_path: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16
+  trust_remote_code: true
+
+checkpoint:
+  enabled: false
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 2
+  pp_size: 1
+  ep_size: 8
+  sequence_parallel: false
+
+freeze_config:
+  freeze_embeddings: true
+  freeze_vision_tower: true
+  freeze_audio_tower: true
+  freeze_language_model: false
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: train
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 0
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.nemotron_omni_collate_fn
+    max_length: 4096
+  drop_last: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
+  path_or_dataset: mmoukouba/MedPix-VQA
+  split: validation
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  num_workers: 1
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.vlm.collate_fns.nemotron_omni_collate_fn
+    max_length: 4096
+
+wandb:
+  entity: Nemo-automodel
+  project: huiyingl_workspace
+  name: nomni_v3_medpix_ep8cp2_te
+
+optimizer:
+  _target_: torch.optim.AdamW
+  lr: 1e-4
+  weight_decay: 0.01
+  betas: [0.9, 0.95]
@@ -96,15 +96,22 @@ def _is_hybrid(model: "nn.Module") -> bool:
 
     Detected via config attributes used by NemotronH (``layers_block_type``)
     and HF hybrid models (``hybrid_override_pattern``, ``is_hybrid_model``).
+    For VLM wrappers, also inspect the inner ``language_model``'s config.
     """
-    config = getattr(model, "config", None)
-    if config is None:
-        return False
-    for attr in ("layers_block_type", "hybrid_override_pattern"):
-        pattern = getattr(config, attr, None)
-        if pattern and any(str(c).upper() == "M" for c in pattern):
+    candidates = [getattr(model, "config", None)]
+    inner = getattr(model, "language_model", None)
+    if inner is not None:
+        candidates.append(getattr(inner, "config", None))
+    for config in candidates:
+        if config is None:
+            continue
+        for attr in ("layers_block_type", "hybrid_override_pattern"):
+            pattern = getattr(config, attr, None)
+            if pattern and any(str(c).upper() == "M" for c in pattern):
+                return True
+        if getattr(config, "is_hybrid_model", False) is True:
             return True
-    return getattr(config, "is_hybrid_model", False) is True
+    return False
 
 
 class ModelSupports:
 
@@ -271,11 +271,23 @@ def _get_mesh_size(mesh):
     # so that SDPA handles causal masking internally.
     batch.pop("attention_mask", None)
 
+    # Determine the primary sequence tensor: inputs_embeds (VLM with CP, where
+    # multimodal token replacement happened pre-shard) or input_ids (standard LLM).
+    has_inputs_embeds = "inputs_embeds" in batch
+    has_input_ids = "input_ids" in batch
+    assert has_inputs_embeds ^ has_input_ids, (
+        "make_cp_batch_and_ctx requires exactly one of 'inputs_embeds' or 'input_ids' in batch"
+    )
+    if has_inputs_embeds:
+        primary_seq_tensor = batch["inputs_embeds"]
+    else:
+        primary_seq_tensor = batch["input_ids"]
+    seq_len = primary_seq_tensor.shape[1]
+
     # Skip 1D injection if position_ids already in batch (e.g. mRoPE pre-computed)
     if "position_ids" not in batch and (_get_mesh_size(cp_mesh) > 1 or _get_mesh_size(tp_mesh) > 1):
-        batch["position_ids"] = torch.arange(0, batch["input_ids"].shape[1]).unsqueeze(0).to(batch["input_ids"].device)
+        batch["position_ids"] = torch.arange(0, seq_len).unsqueeze(0).to(primary_seq_tensor.device)
 
-    input_ids = batch["input_ids"]
     position_ids = batch["position_ids"]
 
     # Determine correct seq dim for CP sharding
@@ -284,12 +296,19 @@ def _get_mesh_size(mesh):
 
     labels = batch["labels"]
 
-    # Collect all available tensors for context parallel
-    cp_buffers = [input_ids, labels, position_ids]
+    # Collect all available tensors for context parallel.  We track each
+    # cp_buffer's batch key (when sourced from ``batch``) so the padding pass
+    # below can pick the semantically-correct fill sentinel and mirror the
+    # padded tensor back into ``batch``.  ``loss_mask`` is passed as an arg
+    # (not in batch) so it has no key.
+    primary_key = "inputs_embeds" if has_inputs_embeds else "input_ids"
+    cp_buffers = [primary_seq_tensor, labels, position_ids]
+    # inputs_embeds is [B, S, H] → seq_dim=1; input_ids is [B, S] → seq_dim=1
     cp_seq_dims = [1, 1, pos_seq_dim]
-    cp_no_restore_buffers = {input_ids, labels}
+    cp_no_restore_buffers = {primary_seq_tensor, labels}
+    batch_buffer_keys: dict[int, str] = {0: primary_key, 1: "labels", 2: "position_ids"}
 
-    # Add loss_mask if available
+    # Add loss_mask if available (passed as arg, not in batch -> no key)
     if loss_mask is not None:
         cp_buffers.append(loss_mask)
         cp_seq_dims.append(1)
@@ -298,10 +317,50 @@ def _get_mesh_size(mesh):
     # Add padding_mask if available in batch
     if "padding_mask" in batch:
         padding_mask = batch["padding_mask"]
+        batch_buffer_keys[len(cp_buffers)] = "padding_mask"
         cp_buffers.append(padding_mask)
         cp_seq_dims.append(1)
         cp_no_restore_buffers.add(padding_mask)
 
+    # Pad sequence length to be divisible by 2 * cp_size (required by
+    # context_parallel load balancing). The inputs_embeds path can hit
+    # arbitrary seq lengths from the VLM collator, so we pad here rather
+    # than relying on dataset-side padding.
+    #
+    # Per-buffer pad sentinels: each tensor's "ignore" value is semantic, not
+    # dtype-derived.  ``labels``/``padding_mask``/``attention_mask`` are all
+    # int/bool but have different ignore conventions.  Falling through to 0
+    # for ``padding_mask`` (== False == "real token") would tell the MoE
+    # router to route the cp-pad slots to experts -- silently wasting capacity
+    # and skewing load-balance loss.
+    PAD_FILL = {
+        "labels": -100,  # CE ignore_index
+        "padding_mask": True,  # bool: True == "this position is pad, ignore"
+        "attention_mask": False,  # HF: 0 == "this position is pad, ignore"
+        # everything else (input_ids, position_ids, ...) -> 0
+    }
+    cp_divisor = cp_mesh.size() * 2
+    if seq_len % cp_divisor != 0:
+        pad_len = cp_divisor - (seq_len % cp_divisor)
+        new_no_restore = set()
+        for i, (buf, dim) in enumerate(zip(cp_buffers, cp_seq_dims)):
+            pad_shape = list(buf.shape)
+            pad_shape[dim] = pad_len
+            if buf.dtype.is_floating_point:
+                pad_val = torch.zeros(pad_shape, dtype=buf.dtype, device=buf.device)
+            else:
+                fill_val = PAD_FILL.get(batch_buffer_keys.get(i), 0)
+                pad_val = torch.full(pad_shape, fill_val, dtype=buf.dtype, device=buf.device)
+            old_buf = buf
+            cp_buffers[i] = torch.cat([buf, pad_val], dim=dim)
+            if old_buf in cp_no_restore_buffers:
+                new_no_restore.add(cp_buffers[i])
+        cp_no_restore_buffers = new_no_restore
+        # Mirror every batch-sourced cp_buffer back into ``batch`` so any
+        # downstream consumer reading from the dict sees the padded shape.
+        for idx, key in batch_buffer_keys.items():
+            batch[key] = cp_buffers[idx]
+
     cp_ctx = create_context_parallel_ctx(
         cp_mesh=cp_mesh,
         cp_buffers=cp_buffers,