[bugfix]: route Wan negative prompt encoding through TextEncoderLoader

alexzms · alexzms · commit c9cf85e35276 · 2026-04-26T06:42:21.000Z
Addresses review feedback on #1178: the previous fix loaded the negative-prompt encoder via transformers' T5EncoderModel, but Wan's text_encoder is UMT5, which uses a per-layer relative position bias rather than a shared one. Loading UMT5 weights into a T5 architecture silently produces wrong embeddings for the negative prompt, so training-time CFG diverges from inference. Switch to TextEncoderLoader so that the encoder class is resolved from pipeline_config (UMT5EncoderModel for Wan), and take the postprocess_text function from pipeline_config instead of importing it by name. This preserves the fix for the original SP deadlock (every rank encodes independently; no full WanPipeline construction, no NCCL collectives) while staying inside the existing prompt-encoding abstraction. text_encoder_cpu_offload is forced off for this short-lived load to avoid initializing an FSDP device mesh, which would reintroduce collectives.
diff --git a/fastvideo/train/models/wan/wan.py b/fastvideo/train/models/wan/wan.py
@@ -339,37 +339,53 @@ def ensure_negative_conditioning(self) -> None:
         device = self.device
         dtype = self._get_training_dtype()
 
-        # Every rank encodes the negative prompt independently.
-        # This avoids NCCL collectives that would deadlock when
-        # only a subset of ranks creates an inference pipeline.
+        # Every rank encodes the negative prompt independently. This avoids
+        # the NCCL deadlock that occurred when only rank 0 constructed the
+        # full inference pipeline. We go through TextEncoderLoader so the
+        # encoder class is resolved from pipeline_config (i.e. UMT5 for Wan,
+        # not vanilla T5) and the same tokenizer / postprocess_text used at
+        # inference time are reused.
         import os
 
-        from transformers import AutoTokenizer, T5EncoderModel
+        from transformers import AutoTokenizer
 
-        from fastvideo.configs.pipelines.wan import (
-            t5_postprocess_text, )
-        from fastvideo.utils import PRECISION_TO_TYPE, maybe_download_model
+        from fastvideo.models.loader.component_loader import TextEncoderLoader
+        from fastvideo.train.utils.moduleloader import make_inference_args
+        from fastvideo.utils import maybe_download_model
 
         model_path = maybe_download_model(tc.model_path)
 
         sampling_param = SamplingParam.from_pretrained(model_path)
         negative_prompt = sampling_param.negative_prompt
 
         encoder_config = tc.pipeline_config.text_encoder_configs[0]
+        postprocess_text = tc.pipeline_config.postprocess_text_funcs[0]
         tok_kwargs = dict(encoder_config.tokenizer_kwargs)
 
-        text_enc_dtype = PRECISION_TO_TYPE[tc.pipeline_config.text_encoder_precisions[0]]
-        tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_path, "tokenizer"))
-        text_encoder = T5EncoderModel.from_pretrained(
+        inference_args = make_inference_args(tc, model_path=model_path)
+        # The negative-prompt encoder is small and only used once at startup;
+        # keep it on-device and skip CPU offload to avoid initializing FSDP
+        # device meshes (which would re-introduce collective communication).
+        inference_args.text_encoder_cpu_offload = False
+
+        loader = TextEncoderLoader()
+        text_encoder = loader.load(
             os.path.join(model_path, "text_encoder"),
-            torch_dtype=text_enc_dtype,
+            inference_args,
         ).to(device).eval()
+        tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_path, "tokenizer"))
 
-        with torch.no_grad():
+        with torch.no_grad(), set_forward_context(current_timestep=0, attn_metadata=None):
             text_inputs = tokenizer(negative_prompt, **tok_kwargs).to(device)
-            outputs = text_encoder(**text_inputs)
+            outputs = text_encoder(
+                input_ids=text_inputs.input_ids,
+                attention_mask=text_inputs.attention_mask,
+            )
+            # postprocess_text reads outputs.attention_mask; the FastVideo
+            # encoders already set it, but be explicit to match the inference
+            # path (where TextEncodingStage assigns it).
             outputs.attention_mask = text_inputs["attention_mask"]
-            neg_embeds = t5_postprocess_text(outputs).to(device=device, dtype=dtype)
+            neg_embeds = postprocess_text(outputs).to(device=device, dtype=dtype)
             neg_mask = text_inputs["attention_mask"].to(device=device, dtype=dtype)
 
         del text_encoder, tokenizer