NVIDIA-NeMo
diff --git a/‎dfm/src/automodel/flow_matching/flow_matching_pipeline.py‎
Lines changed: 28 additions & 15 deletions b/‎dfm/src/automodel/flow_matching/flow_matching_pipeline.py‎
Lines changed: 28 additions & 15 deletions
diff --git a/‎dfm/src/automodel/recipes/train.py‎
Lines changed: 1 addition & 1 deletion b/‎dfm/src/automodel/recipes/train.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dfm/src/megatron/model/dit/dit_layer_spec.py‎
Lines changed: 1 addition & 0 deletions b/‎dfm/src/megatron/model/dit/dit_layer_spec.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dfm/src/megatron/model/wan/flow_matching/flow_matching_pipeline_wan.py‎
Lines changed: 161 additions & 0 deletions b/‎dfm/src/megatron/model/wan/flow_matching/flow_matching_pipeline_wan.py‎
Lines changed: 161 additions & 0 deletions
@@ -104,7 +104,7 @@ class FlowMatchingPipeline:
         )
 
         # Training step
-        loss, metrics = pipeline.step(model, batch, device, dtype, global_step)
+        weighted_loss, average_weighted_loss, loss_mask, metrics = pipeline.step(model, batch, device, dtype, global_step)
     """
 
     def __init__(
@@ -262,6 +262,7 @@ def compute_loss(
         model_pred: torch.Tensor,
         target: torch.Tensor,
         sigma: torch.Tensor,
+        batch: Dict[str, Any],
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Compute flow matching loss with optional weighting.
@@ -279,6 +280,7 @@ def compute_loss(
             loss_weight: Applied weights
         """
         loss = nn.functional.mse_loss(model_pred.float(), target.float(), reduction="none")
+        loss_mask = batch["loss_mask"] if "loss_mask" in batch else None
 
         if self.use_loss_weighting:
             loss_weight = 1.0 + self.flow_shift * sigma
@@ -288,17 +290,19 @@ def compute_loss(
 
         loss_weight = loss_weight.to(model_pred.device)
 
-        unweighted_loss = loss.mean()
-        weighted_loss = (loss * loss_weight).mean()
+        unweighted_loss = loss
+        weighted_loss = loss * loss_weight
+        average_unweighted_loss = unweighted_loss.mean()
+        average_weighted_loss = weighted_loss.mean()
 
-        return weighted_loss, unweighted_loss, loss_weight
+        return weighted_loss, average_weighted_loss, unweighted_loss, average_unweighted_loss, loss_weight, loss_mask
 
     def step(
         self,
         model: nn.Module,
         batch: Dict[str, Any],
-        device: torch.device,
-        dtype: torch.dtype,
+        device: torch.device = torch.device("cuda"),
+        dtype: torch.dtype = torch.bfloat16,
         global_step: int = 0,
     ) -> Tuple[torch.Tensor, Dict[str, Any]]:
         """
@@ -398,26 +402,35 @@ def step(
         # ====================================================================
         # Loss Computation
         # ====================================================================
-        weighted_loss, unweighted_loss, loss_weight = self.compute_loss(model_pred, target, sigma)
+        weighted_loss, average_weighted_loss, unweighted_loss, average_unweighted_loss, loss_weight, loss_mask = (
+            self.compute_loss(model_pred, target, sigma, batch)
+        )
 
         # Safety check
-        if torch.isnan(weighted_loss) or weighted_loss > 100:
-            logger.error(f"[ERROR] Loss explosion! Loss={weighted_loss.item():.3f}")
-            raise ValueError(f"Loss exploded: {weighted_loss.item()}")
+        if torch.isnan(average_weighted_loss) or average_weighted_loss > 100:
+            logger.error(f"[ERROR] Loss explosion! Loss={average_weighted_loss.item():.3f}")
+            raise ValueError(f"Loss exploded: {average_weighted_loss.item()}")
 
         # Logging
         if detailed_log or debug_mode:
-            self._log_loss_detailed(global_step, model_pred, target, loss_weight, unweighted_loss, weighted_loss)
+            self._log_loss_detailed(
+                global_step,
+                model_pred,
+                target,
+                loss_weight,
+                average_unweighted_loss,
+                average_weighted_loss,
+            )
         elif summary_log:
             logger.info(
-                f"[STEP {global_step}] Loss: {weighted_loss.item():.6f} | "
+                f"[STEP {global_step}] Loss: {average_weighted_loss.item():.6f} | "
                 f"w=[{loss_weight.min():.2f},{loss_weight.max():.2f}]"
             )
 
         # Collect metrics
         metrics = {
-            "loss": weighted_loss.item(),
-            "unweighted_loss": unweighted_loss.item(),
+            "loss": average_weighted_loss.item(),
+            "unweighted_loss": average_unweighted_loss.item(),
             "sigma_min": sigma.min().item(),
             "sigma_max": sigma.max().item(),
             "sigma_mean": sigma.mean().item(),
@@ -432,7 +445,7 @@ def step(
             "data_type": data_type,
         }
 
-        return weighted_loss, metrics
+        return weighted_loss, average_weighted_loss, loss_mask, metrics
 
     def _log_detailed(
         self,
 
@@ -382,7 +382,7 @@ def run_train_validation_loop(self):
                 micro_losses = []
                 for micro_batch in batch_group:
                     try:
-                        loss, metrics = self.flow_matching_pipeline.step(
+                        _, loss, _, metrics = self.flow_matching_pipeline.step(
                             model=self.model,
                             batch=micro_batch,
                             device=self.device,
 
@@ -184,6 +184,7 @@ def forward(
         sequence_len_offset=None,
         inference_context=None,
         rotary_pos_cos_sin=None,
+        **kwargs,
     ):
         timestep_emb = attention_mask
 
 
@@ -0,0 +1,161 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Tuple
+
+import torch
+import torch.nn as nn
+from megatron.core import parallel_state
+
+from dfm.src.automodel.flow_matching.adapters.base import FlowMatchingContext, ModelAdapter
+from dfm.src.automodel.flow_matching.flow_matching_pipeline import FlowMatchingPipeline
+from dfm.src.megatron.model.wan.utils import thd_split_inputs_cp
+
+
+class WanAdapter(ModelAdapter):
+    """
+    Model adapter for Wan model (Megatron version).
+
+    Handles mapping of standard FlowMatchingContext to Wan specific inputs.
+    """
+
+    def prepare_inputs(self, context: FlowMatchingContext) -> Dict[str, Any]:
+        """
+        Build the keyword inputs consumed by ``forward`` from a flow matching context.
+
+        Args:
+            context: Flow matching context. This method reads ``noisy_latents``
+                and ``timesteps`` from it, plus the batch entries
+                ``grid_sizes``, ``context_embeddings`` and ``packed_seq_params``.
+
+        Returns:
+            Dictionary with the keys ``noisy_latents``, ``grid_sizes``,
+            ``timesteps``, ``context_embeddings`` and ``packed_seq_params``.
+        """
+        batch = context.batch
+        grid_sizes = batch["grid_sizes"]
+        packed_seq_params = batch["packed_seq_params"]
+
+        # Transpose back to have shape "sbhd"
+        # (before we reshaped to "bshd" to be compatible with flow matching pipeline)
+        noisy_latents = context.noisy_latents.transpose(0, 1)
+
+        # ========================================================================
+        # Cast model inputs to bf16
+        # ========================================================================
+
+        noisy_latents = noisy_latents.to(torch.bfloat16)
+        context_embeddings = batch["context_embeddings"].to(torch.bfloat16)
+
+        # NOTE: timesteps are currently cast to bf16 like the other inputs.
+        # TODO: investigate the effect of bf16 timesteps on embedding precision;
+        # keeping them in fp32 (timesteps.float()) may be required.
+        timesteps = context.timesteps.to(torch.bfloat16)
+
+        # ========================================================================
+        # Split across context parallelism
+        # ========================================================================
+
+        if parallel_state.get_context_parallel_world_size() > 1:
+            cp_group = parallel_state.get_context_parallel_group()
+            noisy_latents = thd_split_inputs_cp(
+                noisy_latents,
+                packed_seq_params["self_attention"].cu_seqlens_q_padded,
+                cp_group,
+            )
+            # TODO (pmannan): Disable CP for CrossAttention as KV context is small.
+            # We don't need to split context embeddings across context parallelism
+            # if we disable context parallelism for cross-attention
+            context_embeddings = thd_split_inputs_cp(
+                context_embeddings,
+                packed_seq_params["cross_attention"].cu_seqlens_kv_padded,
+                cp_group,
+            )
+
+        return {
+            "noisy_latents": noisy_latents,
+            "grid_sizes": grid_sizes,
+            "timesteps": timesteps,
+            "context_embeddings": context_embeddings,
+            "packed_seq_params": packed_seq_params,
+        }
+
+    def forward(self, model: nn.Module, inputs: Dict[str, Any]) -> torch.Tensor:
+        """
+        Execute forward pass for Wan model.
+
+        Args:
+            model: Wan model
+            inputs: Dictionary from prepare_inputs()
+
+        Returns:
+            Model prediction tensor, after ``post_process_prediction`` has been
+            applied to the raw model output.
+        """
+
+        model_pred = model(
+            x=inputs["noisy_latents"],
+            grid_sizes=inputs["grid_sizes"],
+            t=inputs["timesteps"],
+            context=inputs["context_embeddings"],
+            packed_seq_params=inputs["packed_seq_params"],
+        )
+        return self.post_process_prediction(model_pred)
+
+
+class WanFlowMatchingPipeline(FlowMatchingPipeline):
+    """
+    Wan-specific Flow Matching pipeline (Megatron version).
+
+    This pipeline extends the standard FlowMatchingPipeline with:
+    1. A fixed "t2v" task type
+    2. Context Parallelism (CP) splitting of the target and the loss mask
+       before the loss is computed
+    """
+
+    def determine_task_type(self, data_type: str) -> str:
+        """Return the task type; always "t2v", regardless of ``data_type``."""
+        return "t2v"
+
+    def compute_loss(
+        self,
+        model_pred: torch.Tensor,
+        target: torch.Tensor,
+        sigma: torch.Tensor,
+        batch: Dict[str, Any],
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        """
+        Compute the flow matching loss on the context-parallel shard of the target.
+
+        The target is transposed and, when context parallelism is enabled,
+        split together with the loss mask before delegating to the base
+        implementation.
+
+        Args:
+            model_pred: Model prediction, passed through unchanged.
+            target: Flow matching target; transposed on its first two
+                dimensions here.
+            sigma: Noise levels, passed through unchanged.
+            batch: Must contain ``loss_mask`` and ``packed_seq_params``.
+                The caller's dictionary is not modified.
+
+        Returns:
+            The base ``compute_loss`` result: ``(weighted_loss,
+            average_weighted_loss, unweighted_loss, average_unweighted_loss,
+            loss_weight, loss_mask)``, where ``loss_mask`` is the (possibly
+            split) mask.
+        """
+        loss_mask = batch["loss_mask"]
+        packed_seq_params = batch["packed_seq_params"]
+
+        # Transpose back to have shape "sbhd"
+        # (before we reshaped to "bshd" to be compatible with flow matching pipeline)
+        target = target.transpose(0, 1)
+
+        # ========================================================================
+        # Split across context parallelism
+        # ========================================================================
+
+        if parallel_state.get_context_parallel_world_size() > 1:
+            cp_group = parallel_state.get_context_parallel_group()
+            cu_seqlens = packed_seq_params["self_attention"].cu_seqlens_q_padded
+            target = thd_split_inputs_cp(target, cu_seqlens, cp_group)
+            loss_mask = thd_split_inputs_cp(loss_mask, cu_seqlens, cp_group)
+
+        # Hand the split mask to the base class through a shallow copy so the
+        # caller's batch keeps the full mask; writing it back in place would
+        # make a second call on the same batch split an already-split mask.
+        local_batch = {**batch, "loss_mask": loss_mask}
+        return super().compute_loss(model_pred, target, sigma, local_batch)