Add support for the fastgen precomputed dataset, update the negative prompt, and remove the student's 4-step generation in favor of just calling the fastgen approach.

sajadn · sajadn · commit bc1699da9b7b · 2026-01-19T11:38:46.000-08:00
Signed-off-by: sajadn <snorouzi@nvidia.com>
diff --git a/dfm/src/megatron/data/wan/wan_energon_datamodule.py b/dfm/src/megatron/data/wan/wan_energon_datamodule.py
@@ -17,31 +17,35 @@
 from dataclasses import dataclass
 
 from megatron.bridge.data.utils import DatasetBuildContext
-from torch import int_repr
 
 from dfm.src.megatron.data.common.diffusion_energon_datamodule import DiffusionDataModule, DiffusionDataModuleConfig
+from dfm.src.megatron.data.wan.wan_latent_taskencoder import WanLatentTaskEncoder
 from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder
 
 
 @dataclass(kw_only=True)
 class WanDataModuleConfig(DiffusionDataModuleConfig):
-    path: str
-    seq_length: int
-    packing_buffer_size: int
-    micro_batch_size: int
-    global_batch_size: int
-    num_workers: int_repr
-    dataloader_type: str = "external"
+    # Only define new fields here; inherited fields come from DiffusionDataModuleConfig
+    use_fastgen_dataset: bool = False  # Flag to determine which task encoder to use
 
     def __post_init__(self):
+        # Instantiate the appropriate task encoder based on the flag
+        if self.use_fastgen_dataset:
+            task_encoder = WanLatentTaskEncoder(
+                seq_length=self.task_encoder_seq_length,
+                packing_buffer_size=self.packing_buffer_size,
+            )
+        else:
+            task_encoder = WanTaskEncoder(
+                seq_length=self.task_encoder_seq_length,
+                packing_buffer_size=self.packing_buffer_size,
+            )
+
         self.dataset = DiffusionDataModule(
             path=self.path,
             seq_length=self.seq_length,
             packing_buffer_size=self.packing_buffer_size,
-            task_encoder=WanTaskEncoder(
-                seq_length=self.task_encoder_seq_length,  # Use task_encoder_seq_length for packing
-                packing_buffer_size=self.packing_buffer_size,
-            ),
+            task_encoder=task_encoder,
             micro_batch_size=self.micro_batch_size,
             global_batch_size=self.global_batch_size,
             num_workers=self.num_workers,
diff --git a/dfm/src/megatron/data/wan/wan_latent_taskencoder.py b/dfm/src/megatron/data/wan/wan_latent_taskencoder.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=C0115,C0116,C0301
+
+"""
+Task encoder for WAN dataset with precomputed latents.
+
+This module provides WanLatentTaskEncoder which handles precomputed VAE-encoded
+video latents and text embeddings. It differs from WanTaskEncoder in that it
+expects latents that are already VAE-encoded rather than raw video files.
+
+Expected Energon dataset structure per sample:
+  - latent.pth: RGB video latents (precomputed, already VAE-encoded) [C, T, H, W]
+  - txt_emb.pth: Text embeddings (already padded to [512, dim])
+  - depth_latent.pth: Depth latents (optional)
+  - json: Metadata (resolution, fps, etc.)
+
+The cook function maps these to the format expected by the parent WanTaskEncoder:
+  - latent.pth -> pth (video latents)
+  - txt_emb.pth -> pickle (text embeddings)
+"""
+
+from megatron.energon.task_encoder.cooking import Cooker, basic_sample_keys
+
+from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder
+
+
+def cook_latent(sample: dict) -> dict:
+    """
+    Cook function for precomputed latent samples.
+
+    Renames the precomputed-latent sample keys to the "pth" / "pickle" / "json"
+    keys and passes through whatever basic_sample_keys(sample) returns.
+
+    Args:
+        sample (dict): Raw sample from Energon dataset containing:
+            - "latent.pth": RGB video latents (required; KeyError if missing)
+            - "txt_emb.pth": Text embeddings (required; KeyError if missing)
+            - "depth_latent.pth" (optional): Depth latents; NOT forwarded here
+            - "json" (optional): Metadata; replaced by {} when absent
+
+    Returns:
+        dict: The basic sample keys plus the remapped entries:
+            - "pth": Video latent tensor (from latent.pth)
+            - "pickle": Text embeddings (from txt_emb.pth)
+            - "json": Metadata
+    """
+    return dict(
+        **basic_sample_keys(sample),
+        pth=sample["latent.pth"],  # Map latent.pth -> pth
+        pickle=sample["txt_emb.pth"],  # Map txt_emb.pth -> pickle
+        json=sample.get("json", {}),  # Metadata is optional; fall back to an empty dict
+    )
+
+
+class WanLatentTaskEncoder(WanTaskEncoder):
+    """
+    Task encoder for WAN dataset with precomputed latents.
+
+    This class inherits from WanTaskEncoder and only overrides the cook function
+    to handle the different file naming convention used for precomputed latents:
+      - latent.pth (precomputed VAE-encoded video) instead of raw video in pth
+      - txt_emb.pth (pre-encoded text embeddings) instead of pickle
+
+    All other processing is handled by the parent class:
+      - Patchifying video latents
+      - Grid size calculation
+      - Text embedding padding to 512 tokens
+      - Context parallelism padding
+      - Sequence packing
+
+    Attributes:
+        use_depth_latent (bool): Flag stored on the instance by __init__.
+            NOTE(review): cook_latent does not forward "depth_latent.pth" and
+            nothing in this module reads the flag, so it is currently inert.
+
+    Example usage:
+        task_encoder = WanLatentTaskEncoder(
+            seq_length=500,
+            packing_buffer_size=100,
+            patch_spatial=2,
+            patch_temporal=1,
+            use_depth_latent=False,  # Set to True if needed
+        )
+    """
+
+    cookers = [
+        Cooker(cook_latent),  # Remap precomputed-latent keys to pth / pickle / json
+    ]
+
+    def __init__(
+        self,
+        *args,
+        use_depth_latent: bool = False,
+        **kwargs,
+    ):
+        """
+        Initialize the WanLatentTaskEncoder.
+
+        Args:
+            use_depth_latent (bool): Keyword-only flag saved as
+                self.use_depth_latent; defaults to False.
+            *args: Additional positional arguments passed to parent WanTaskEncoder.
+            **kwargs: Additional keyword arguments passed to parent WanTaskEncoder.
+                Common kwargs include:
+                - seq_length (int): Maximum sequence length
+                - packing_buffer_size (int): Buffer size for sequence packing
+                - patch_spatial (int): Spatial patch size (default: 2)
+                - patch_temporal (int): Temporal patch size (default: 1)
+        """
+        super().__init__(*args, **kwargs)
+        self.use_depth_latent = use_depth_latent
+        # All other initialization (patchifying, grid calculation, etc.)
+        # is handled by the parent WanTaskEncoder class
diff --git a/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py b/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py
@@ -164,7 +164,8 @@ def __init__(
 
         if dist.is_initialized():
             dist.barrier()
-        self.model.to(self.device)
+        # Move model to device and convert to correct dtype in one call
+        self.model.to(device=self.device, dtype=self.param_dtype)
 
         self.sample_neg_prompt = inference_cfg.english_sample_neg_prompt
 
@@ -317,11 +318,7 @@ def _decode_latents(self, latents, sample=True):
             latents.device, latents.dtype
         )
         latents = latents / latents_std + latents_mean
-        videos = self.vae.decode(latents)
-        if sample:
-            videos = videos.sample()
-        else:
-            videos = videos[0].clip_(-1.0, 1.0)
+        videos = self.vae.decode(latents).sample
         return videos
 
     def generate(
diff --git a/dfm/src/megatron/model/wan/wan_model.py b/dfm/src/megatron/model/wan/wan_model.py
@@ -350,3 +350,27 @@ def _set_embedder_weights_replica_id(
             replica_id=replica_id,
             allow_shape_mismatch=False,
         )
+
+    def load_state_dict(self, state_dict, strict=True):
+        """Load state dict with automatic handling of 'module.' prefix mismatch.
+
+        If any key starts with 'module.' (e.g. checkpoints saved from a DistributedDataParallel
+        wrapper), one leading 'module.' is stripped from every such key before loading.
+
+        Args:
+            state_dict (dict): The state dictionary to load (not mutated; a new dict is built)
+            strict (bool): Whether to strictly enforce that the keys match
+
+        Returns:
+            Whatever the parent load_state_dict returns (presumably missing/unexpected keys)
+        """
+        # Only the incoming keys are inspected; the model's own key names are not compared
+        has_module_prefix = any(k.startswith("module.") for k in state_dict.keys())
+        if has_module_prefix:
+            new_state_dict = {}
+            for key, value in state_dict.items():
+                new_key = key.replace("module.", "", 1) if key.startswith("module.") else key
+                new_state_dict[new_key] = value
+            state_dict = new_state_dict
+
+        return super().load_state_dict(state_dict, strict=strict)
diff --git a/dfm/src/megatron/model/wan_dmd/wan_dmd_step.py b/dfm/src/megatron/model/wan_dmd/wan_dmd_step.py
@@ -32,7 +32,6 @@
 from megatron.core.utils import get_model_config, unwrap_model
 
 import wandb
-from dfm.src.fastgen.fastgen.methods.model import FastGenModel
 from dfm.src.megatron.model.wan.flow_matching.flow_inference_pipeline import FlowInferencePipeline
 from dfm.src.megatron.model.wan.inference import SIZE_CONFIGS
 from dfm.src.megatron.model.wan.wan_step import wan_data_step
@@ -128,11 +127,11 @@ def __init__(
     def _get_neg_condition(self, unwrapped_model):
         """
         Get the negative condition embedding, computing and caching it on first call.
-        The negative condition is the embedding of an empty string "".
+        The negative condition uses the prompt from self.inference_cfg.english_sample_neg_prompt.
         """
         if self._neg_condition is None:
             logger.info("Computing and caching negative condition embedding...")
-            neg_prompt = [""]
+            neg_prompt = [self.inference_cfg.english_sample_neg_prompt]
             neg_condition = unwrapped_model.get_text_encoder().encode(neg_prompt, precision=torch.bfloat16)
             self._neg_condition = neg_condition.transpose(0, 1).contiguous()
             logger.info(f"Negative condition cached with shape: {self._neg_condition.shape}")
@@ -176,7 +175,7 @@ def on_train_start(self, student, teacher, fake_score, state: GlobalState):
 
     def on_validation_start(self, single_step_outputs, batch, student, teacher, state: GlobalState):
         """
-        Generate validation videos from teacher (50 steps) and student (1 step).
+        Generate validation videos from teacher (50 steps) and student (N steps based on config).
         Logs videos to Weights & Biases.
         """
         if self._inference_pipeline is None:
@@ -187,8 +186,10 @@ def on_validation_start(self, single_step_outputs, batch, student, teacher, stat
         torch.cuda.empty_cache()
 
         # Create pipeline with teacher model (we'll swap for student later)
-
         gen_latent = single_step_outputs["gen_rand"]
+        if callable(gen_latent):
+            logger.info("gen_rand is callable (multi-step generation), invoking it to get latents...")
+            gen_latent = gen_latent()
         with torch.no_grad():
             gen_videos = self._inference_pipeline._decode_latents(gen_latent, sample=False)
             fps = self.inference_cfg.sample_fps
@@ -205,10 +206,11 @@ def on_validation_start(self, single_step_outputs, batch, student, teacher, stat
             prompt = "The video captures a series of images showing a group of children seated in an outdoor setting, possibly at a sports event. The children are dressed in casual attire, with one wearing a red top and another in a white top with a rainbow design. The background is filled with other spectators, some of whom are wearing baseball caps. The lighting suggests it's either late afternoon or early evening, and the atmosphere appears to be casual and relaxed."
 
         print("prompt", prompt)
+        student_steps = student.config.student_sample_steps
         self._log_videos_to_wandb(
             videos=gen_videos,
-            video_name="student_prediction",
-            caption=f"Student (1 step): {prompt}",
+            video_name=f"student_{student_steps}step_prediction",
+            caption=f"{prompt}",
             fps=fps,
             state=state,
         )
@@ -218,50 +220,13 @@ def on_validation_start(self, single_step_outputs, batch, student, teacher, stat
         gc.collect()
         torch.cuda.empty_cache()
 
-        student_steps = 4
-        input_rand = single_step_outputs.get("input_rand", None)
-        logger.info(f"Generating validation video from student with {student_steps} steps using generator_fn...")
-
-        # Get condition from batch
-        condition = batch.get("context_embeddings", None)
-        # Extract prompt for caption
-
-        with torch.no_grad():
-            # Wrap student to adapt interface for FastGenModel.generator_fn
-            wrapped_student = MegatronFastGenInferenceWrapper(student, batch)
-            # Use FastGenModel.generator_fn directly
-            student_4step_latents = FastGenModel.generator_fn(
-                net=wrapped_student,
-                noise=input_rand,  # [B, C, T, H, W] unit Gaussian
-                condition=condition,
-                student_sample_steps=student_steps,
-                student_sample_type="sde",  # stochastic sampling
-            )
-
-            # Decode latents to video
-            student_4step_videos = self._inference_pipeline._decode_latents(student_4step_latents, sample=False)
-            self._log_videos_to_wandb(
-                videos=student_4step_videos,
-                video_name="student_4step_prediction",
-                caption=f"Student ({student_steps} steps): {prompt}",
-                fps=fps,
-                state=state,
-            )
-
-            del student_4step_videos, student_4step_latents
-            gc.collect()
-            torch.cuda.empty_cache()
-
         # Generation parameters
         size_key = "832*480"
         size = SIZE_CONFIGS[size_key]
         frame_num = 81
         shift = 5.0
         guide_scale = 5.0
-
         seed = parallel_state.get_data_parallel_rank()
-
-        # Get the same initial noise that was used by the student
         # input_rand is the unit Gaussian noise (input_student / max_sigma)
         input_rand = single_step_outputs.get("input_rand", None)
         if input_rand is not None:
diff --git a/dfm/src/megatron/recipes/wan/wan_dmd.py b/dfm/src/megatron/recipes/wan/wan_dmd.py
@@ -179,6 +179,7 @@ def wan_dmd_config(
     test_data_path: Optional[List[str]] = None,
     per_split_data_args_path: Optional[str] = None,
     mock: bool = False,
+    use_fastgen_dataset: bool = False,
     # Model configuration
     tensor_parallelism: int = 1,
     pipeline_parallelism: int = 1,
@@ -212,6 +213,8 @@ def wan_dmd_config(
         test_data_path (Optional[List[str]]): List of test data paths.
         per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration.
         mock (bool): Whether to use mock data. If True, ignores data_paths.
+        use_fastgen_dataset (bool): Whether to use WanLatentTaskEncoder for precomputed latents (True)
+            or WanTaskEncoder for raw data (False). Defaults to False.
         tensor_parallelism (int): Degree of tensor model parallelism.
         pipeline_parallelism (int): Degree of pipeline model parallelism.
         pipeline_parallelism_dtype (Optional[torch.dtype]): Data type for pipeline parallelism.
@@ -295,6 +298,7 @@ def wan_dmd_config(
             num_workers=10,
             task_encoder_seq_length=None,
             packing_buffer_size=40,  # 131,072 = 2^17 tokens, each 5 secs of 832*480 is about 45k tokens
+            use_fastgen_dataset=use_fastgen_dataset,  # Pass flag instead of instance
         )
 
     # Config Container
diff --git a/examples/megatron/recipes/wan/wan_dmd.py b/examples/megatron/recipes/wan/wan_dmd.py
@@ -93,6 +93,11 @@ def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]:
         default="finetune",
         help="Set training mode, 'pretrain' or 'finetune'.",
     )
+    parser.add_argument(
+        "--use-fastgen-dataset",
+        action="store_true",
+        help="Use WanLatentTaskEncoder for precomputed latents instead of WanTaskEncoder.",
+    )
     parser.add_argument(
         "--config-file",
         type=str,
@@ -138,7 +143,11 @@ def main() -> None:
     logger.info("------------------------------------------------------------------")
 
     # Load base configuration from the recipe as a Python dataclass
-    cfg: ConfigContainer = wan_dmd_config(mock=args.mock, training_mode=args.training_mode)
+    cfg: ConfigContainer = wan_dmd_config(
+        mock=args.mock,
+        training_mode=args.training_mode,
+        use_fastgen_dataset=args.use_fastgen_dataset,
+    )
     logger.info("Loaded base configuration")
 
     # Print configuration on rank 0
diff --git a/run_dit/dmd_distill_energon_latent.sh b/run_dit/dmd_distill_energon_latent.sh