minor fix: connector device placement, local tokenizer lookup, and training-loop cleanups

mxinO · mxinO · commit 215056b8d272 · 2026-03-20T00:00:30.000-07:00
Signed-off-by: Meng Xin <mxin@nvidia.com>
diff --git a/examples/diffusers/distillation/src/models/ltx2/pipeline.py b/examples/diffusers/distillation/src/models/ltx2/pipeline.py
@@ -184,9 +184,10 @@ def unload_text_encoder(self) -> None:
             del te.feature_extractor_linear
             te.feature_extractor_linear = None
         te.tokenizer = None
-        # Keep connectors on GPU (they're small)
-        te.embeddings_connector.to("cuda")
-        te.audio_embeddings_connector.to("cuda")
+        # Keep connectors on current GPU (they're small)
+        device = torch.device("cuda", torch.cuda.current_device())
+        te.embeddings_connector.to(device)
+        te.audio_embeddings_connector.to(device)
         free_gpu_memory()
         logger.info("Text encoder unloaded (connectors kept for training/inference)")
 
diff --git a/examples/diffusers/distillation/src/models/wan/adapter.py b/examples/diffusers/distillation/src/models/wan/adapter.py
@@ -81,8 +81,8 @@ def prepare_inputs(
         )
 
     def forward_model(self, model: nn.Module, inputs: BackboneInputs) -> Tensor:
-        # WanModel's internal norms promote to float32; autocast keeps
-        # linear ops in bf16 to match the model weights.
+        # WanModel norms promote to float32 internally; autocast keeps
+        # matmuls in bf16 to match the original Wan inference code.
         with torch.amp.autocast("cuda", dtype=torch.bfloat16):
             output_list = model(**inputs.forward_kwargs)
         return torch.stack(output_list)
diff --git a/examples/diffusers/distillation/src/models/wan/pipeline.py b/examples/diffusers/distillation/src/models/wan/pipeline.py
@@ -66,12 +66,18 @@ def load_components(self, model_config, device: str, dtype: torch.dtype) -> None
         self._config = self._var["config"]()
 
         t5_path = os.path.join(path, self._config.t5_checkpoint)
+        # Prefer local tokenizer dir (avoids HuggingFace network calls).
+        # Wan ships tokenizer files under <model_root>/google/umt5-xxl/.
+        tokenizer_path = self._config.t5_tokenizer
+        local_tokenizer = os.path.join(path, tokenizer_path)
+        if os.path.isdir(local_tokenizer):
+            tokenizer_path = local_tokenizer
         self._text_encoder = T5EncoderModel(
             text_len=self._config.text_len,
             dtype=dtype,
             device=torch.device("cpu"),
             checkpoint_path=t5_path,
-            tokenizer_path=self._config.t5_tokenizer,
+            tokenizer_path=tokenizer_path,
         )
 
         vae_mod = importlib.import_module(self._var["vae_module"])
diff --git a/examples/diffusers/distillation/src/trainer.py b/examples/diffusers/distillation/src/trainer.py
@@ -102,6 +102,7 @@ def __init__(
         self._inference_pipeline = inference_pipeline
 
         self._global_step = 0
+        self._data_epoch = 0
         self._wandb_run = None
 
         set_seed(config.seed)
@@ -671,30 +672,6 @@ def _training_step(self, batch: dict[str, Tensor]) -> Tensor:
 
         return total_loss
 
-    def _compute_distillation_loss(
-        self, student_pred: Tensor, teacher_pred: Tensor, loss_mask: Tensor
-    ) -> Tensor:
-        loss_type = self._config.distillation.distillation_loss_type
-
-        if loss_type == "mse":
-            loss = torch.nn.functional.mse_loss(student_pred, teacher_pred, reduction="none")
-        elif loss_type == "cosine":
-            s_flat = student_pred.flatten(start_dim=2)
-            t_flat = teacher_pred.flatten(start_dim=2)
-            cos_sim = torch.nn.functional.cosine_similarity(s_flat, t_flat, dim=-1)
-            loss = 1.0 - cos_sim  # [B, T]
-        else:
-            raise ValueError(f"Unknown distillation loss type: {loss_type}")
-
-        if loss_mask is not None and loss_mask.numel() > 0:
-            # Expand mask to match loss dimensions
-            while loss_mask.dim() < loss.dim():
-                loss_mask = loss_mask.unsqueeze(-1)
-            mask = loss_mask.float()
-            loss = loss.mul(mask).div(mask.mean())
-
-        return loss.mean()
-
     def _compute_layer_distillation_loss(self) -> Tensor:
         """Compute distillation loss across hooked intermediate layers."""
         assert self._student_extractor is not None
@@ -1078,10 +1055,12 @@ def train(self) -> dict:
                 f"batch_size={cfg.optimization.batch_size}"
             )
 
+        start_micro = self._global_step * grad_accum
+        total_micro = total_steps * grad_accum
         pbar = tqdm(
-            range(self._global_step, total_steps * grad_accum),
-            initial=self._global_step * grad_accum,
-            total=total_steps * grad_accum,
+            range(start_micro, total_micro),
+            initial=start_micro,
+            total=total_micro,
             desc="Training",
             disable=not _is_global_rank0(),
         )
@@ -1091,6 +1070,10 @@ def train(self) -> dict:
             try:
                 batch = next(data_iter)
             except StopIteration:
+                self._data_epoch += 1
+                sampler = getattr(self._dataloader, "sampler", None)
+                if sampler is not None and hasattr(sampler, "set_epoch"):
+                    sampler.set_epoch(self._data_epoch)
                 data_iter = iter(self._dataloader)
                 batch = next(data_iter)