[feat]: add LongCat bidirectional finetuning support (hao-ai-lab#1244)

aryan5v · alexzms · mergify[bot] · web-flow · commit f633e30ebb0e · 2026-05-09T12:20:43.000-07:00
Co-authored-by: Aryan Kumar &lt;aryan5v@users.noreply.github.com&gt;
Co-authored-by: alexzms &lt;3036648523@qq.com&gt;
Co-authored-by: mergify[bot] &lt;37929162+mergify[bot]@users.noreply.github.com&gt;
diff --git a/examples/train/configs/fine_tuning/longcat/t2v.yaml b/examples/train/configs/fine_tuning/longcat/t2v.yaml
@@ -0,0 +1,71 @@
+# LongCat-Video T2V 13.6B bidirectional finetune.
+
+models:
+  student:
+    _target_: fastvideo.train.models.longcat.LongCatModel
+    init_from: FastVideo/LongCat-Video-T2V-Diffusers
+    trainable: true
+
+method:
+  _target_: fastvideo.train.methods.fine_tuning.finetune.FineTuneMethod
+
+training:
+  model_path: FastVideo/LongCat-Video-T2V-Diffusers
+
+  distributed:
+    num_gpus: 8
+    sp_size: 1
+    tp_size: 1
+    hsdp_replicate_dim: 1
+    hsdp_shard_dim: 8
+
+  data:
+    data_path: data/LongCat-Syn
+    dataloader_num_workers: 4
+    train_batch_size: 1
+    training_cfg_rate: 0.0
+    seed: 1000
+    num_latent_t: 20
+    num_height: 480
+    num_width: 848
+    num_frames: 77
+
+  optimizer:
+    learning_rate: 1.0e-6
+    betas: [0.9, 0.999]
+    weight_decay: 0.01
+    lr_scheduler: constant
+    lr_warmup_steps: 0
+
+  loop:
+    max_train_steps: 4000
+    gradient_accumulation_steps: 1
+
+  checkpoint:
+    output_dir: outputs/longcat_finetune
+    training_state_checkpointing_steps: 1000
+    checkpoints_total_limit: 3
+
+  tracker:
+    project_name: fastvideo
+    run_name: longcat_finetune
+
+  model:
+    enable_gradient_checkpointing_type: full
+
+callbacks:
+  grad_clip:
+    _target_: fastvideo.train.callbacks.grad_clip.GradNormClipCallback
+    max_grad_norm: 1.0
+  validation:
+    _target_: fastvideo.train.callbacks.validation.ValidationCallback
+    pipeline_target: fastvideo.pipelines.basic.longcat.longcat_pipeline.LongCatPipeline
+    dataset_file: data/validation_prompts.json
+    every_steps: 100
+    sampling_steps: [50]
+    guidance_scale: 5.0
+
+pipeline:
+  # Match the released LongCat scheduler config. flow_shift=0.0 collapses
+  # FlowMatch training timesteps to zero in FastVideo's scheduler.
+  flow_shift: 12.0
diff --git a/fastvideo/models/dits/longcat.py b/fastvideo/models/dits/longcat.py
@@ -206,6 +206,17 @@ def forward(
                 encoder_attention_mask = encoder_attention_mask.squeeze(1).squeeze(1)
             elif len(encoder_attention_mask.shape) == 3:
                 encoder_attention_mask = encoder_attention_mask.squeeze(1)
+
+            seq_len = int(y.shape[1])
+            mask_len = int(encoder_attention_mask.shape[1])
+            if mask_len < seq_len:
+                encoder_attention_mask = F.pad(
+                    encoder_attention_mask,
+                    (0, seq_len - mask_len),
+                    value=0,
+                )
+            elif mask_len > seq_len:
+                encoder_attention_mask = encoder_attention_mask[:, :seq_len]
             
             # Zero out padded tokens if requested
             if self.text_tokens_zero_pad:
diff --git a/fastvideo/pipelines/stages/longcat_denoising.py b/fastvideo/pipelines/stages/longcat_denoising.py
@@ -70,13 +70,14 @@ def forward(
                 pipeline.add_module("transformer", self.transformer)
             fastvideo_args.model_loaded["transformer"] = True
 
-        # Get transformer dtype
-        if hasattr(self.transformer, 'module'):
-            transformer_dtype = next(self.transformer.module.parameters()).dtype
-        else:
-            transformer_dtype = next(self.transformer.parameters()).dtype
-
-        target_dtype = transformer_dtype
+        # Inference dtype. We hardcode bf16 (matching the WanDenoisingStage
+        # pattern) rather than reading transformer.parameters().dtype: when
+        # a model is loaded with default_dtype=fp32 but its FSDP-wrapped
+        # submodules compute in bf16, the parameter-dtype heuristic
+        # mismatches the Conv3d weight/bias dtype and the patch_embed
+        # forward fails with "Input type (float) and bias type
+        # (c10::BFloat16) should be the same".
+        target_dtype = torch.bfloat16
         autocast_enabled = (target_dtype != torch.float32) and not fastvideo_args.disable_autocast
 
         # Extract batch parameters
diff --git a/fastvideo/tests/transformers/test_cosmos.py b/fastvideo/tests/transformers/test_cosmos.py
@@ -22,9 +22,24 @@
 os.environ["MASTER_PORT"] = "29504"
 
 BASE_MODEL_PATH = "nvidia/Cosmos-Predict2-2B-Video2World"
-MODEL_PATH = maybe_download_model(BASE_MODEL_PATH,
-                                  local_dir=os.path.join(
-                                      'data', BASE_MODEL_PATH))
+
+
+def _resolve_model_path() -> str:
+    try:
+        return maybe_download_model(
+            BASE_MODEL_PATH,
+            local_dir=os.path.join("data", BASE_MODEL_PATH),
+        )
+    except ValueError as exc:
+        pytest.skip(
+            "Skipping Cosmos transformer test because the configured "
+            "HuggingFace token cannot access the gated Cosmos weights: "
+            f"{exc}",
+            allow_module_level=True,
+        )
+
+
+MODEL_PATH = _resolve_model_path()
 TRANSFORMER_PATH = os.path.join(MODEL_PATH, "transformer")
 
 
@@ -131,4 +146,4 @@ def test_cosmos2_transformer():
     logger.info("Mean Diff: %s", mean_diff.item())
     assert max_diff < 1e-1, f"Maximum difference between outputs: {max_diff.item()}"
     # mean diff
-    assert mean_diff < 1e-2, f"Mean difference between outputs: {mean_diff.item()}"
+    assert mean_diff < 1e-2, f"Mean difference between outputs: {mean_diff.item()}"
diff --git a/fastvideo/train/models/longcat/__init__.py b/fastvideo/train/models/longcat/__init__.py
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+"""LongCat model plugin package."""
+
+from fastvideo.train.models.longcat.longcat import (
+    LongCatModel as LongCatModel, )
diff --git a/fastvideo/train/models/longcat/longcat.py b/fastvideo/train/models/longcat/longcat.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+"""LongCat model plugin (per-role instance)."""
+
+from __future__ import annotations
+
+from typing import Any, Literal, TYPE_CHECKING
+
+import torch
+
+from fastvideo.pipelines import TrainingBatch
+from fastvideo.train.models.wan.wan import WanModel
+
+if TYPE_CHECKING:
+    from fastvideo.train.utils.training_config import TrainingConfig
+
+
+class LongCatModel(WanModel):
+    """LongCat per-role model for training and distillation."""
+
+    _transformer_cls_name: str = "LongCatTransformer3DModel"
+
+    @staticmethod
+    def _validate_flow_shift(flow_shift: float | None) -> float:
+        if flow_shift is None:
+            return 12.0
+
+        validated = float(flow_shift)
+        if validated == 0.0:
+            raise ValueError("LongCat training does not support flow_shift=0.0 because "
+                             "it collapses FlowMatch training timesteps. Use 12.0 to "
+                             "match the released LongCat scheduler config.")
+        return validated
+
+    def __init__(
+        self,
+        *,
+        init_from: str,
+        training_config: TrainingConfig,
+        trainable: bool = True,
+        disable_custom_init_weights: bool = False,
+        flow_shift: float = 12.0,
+        enable_gradient_checkpointing_type: str | None = None,
+        transformer_override_safetensor: str | None = None,
+    ) -> None:
+        super().__init__(
+            init_from=init_from,
+            training_config=training_config,
+            trainable=trainable,
+            disable_custom_init_weights=disable_custom_init_weights,
+            flow_shift=self._validate_flow_shift(flow_shift),
+            enable_gradient_checkpointing_type=enable_gradient_checkpointing_type,
+            transformer_override_safetensor=transformer_override_safetensor,
+        )
+
+    def _init_timestep_mechanics(self) -> None:
+        assert self.training_config is not None
+        tc = self.training_config
+        flow_shift = getattr(tc.pipeline_config, "flow_shift", None)  # type: ignore[union-attr]
+        self.timestep_shift = self._validate_flow_shift(flow_shift)
+        self.num_train_timestep = int(self.noise_scheduler.num_train_timesteps)
+        self.min_timestep = 0
+        self.max_timestep = self.num_train_timestep
+
+    def _build_attention_metadata(self, training_batch: TrainingBatch) -> TrainingBatch:
+        training_batch.attn_metadata = None
+        return training_batch
+
+    def _build_distill_input_kwargs(
+        self,
+        noise_input: torch.Tensor,
+        timestep: torch.Tensor,
+        text_dict: dict[str, torch.Tensor] | None,
+    ) -> dict[str, Any]:
+        if text_dict is None:
+            raise ValueError("text_dict cannot be None for LongCat distillation")
+
+        batch_size = int(noise_input.shape[0])
+        if timestep.ndim == 0:
+            timestep = timestep.view(1).expand(batch_size)
+        elif timestep.ndim == 1 and int(timestep.shape[0]) == 1 and batch_size > 1:
+            timestep = timestep.expand(batch_size)
+
+        return {
+            "hidden_states": noise_input.permute(0, 2, 1, 3, 4),
+            "encoder_hidden_states": text_dict["encoder_hidden_states"],
+            "encoder_attention_mask": text_dict["encoder_attention_mask"],
+            "timestep": timestep,
+        }
+
+    def predict_noise(
+        self,
+        noisy_latents: torch.Tensor,
+        timestep: torch.Tensor,
+        batch: TrainingBatch,
+        *,
+        conditional: bool,
+        cfg_uncond: dict[str, Any] | None = None,
+        attn_kind: Literal["dense", "vsa"] = "dense",
+    ) -> torch.Tensor:
+        """Adapt LongCat's sign convention to FineTuneMethod's target.
+
+        ``LongCatTransformer3DModel`` is pretrained to output the
+        ``clean - noise`` direction; ``LongCatDenoisingStage`` (the
+        bidirectional inference pipeline) explicitly negates the
+        transformer output before handing it to
+        ``FlowMatchEulerDiscreteScheduler.step``. Training methods on
+        the other hand (``FineTuneMethod``,
+        ``DiffusionForcingSFTMethod``) target ``noise - clean``
+        directly (the standard rectified-flow velocity Wan uses).
+
+        Without the negation here, the loss MSE pushes the transformer
+        toward ``noise - clean``, flipping its native output sign over
+        training. Inference then applies its own negation on top, so
+        the scheduler receives the wrong direction and produces noise
+        even while the training loss is dropping. Verified empirically
+        on a 100-step LongCat overfit run: step 0 generated meaningful
+        video, step 100 was pure noise despite low loss.
+
+        Negating in ``predict_noise`` keeps the transformer's
+        pretrained sign convention intact while presenting the
+        training methods with a Wan-compatible
+        ``pred ≈ noise - clean`` for MSE.
+        """
+        pred = super().predict_noise(
+            noisy_latents,
+            timestep,
+            batch,
+            conditional=conditional,
+            cfg_uncond=cfg_uncond,
+            attn_kind=attn_kind,
+        )
+        return -pred
diff --git a/fastvideo/training/trackers.py b/fastvideo/training/trackers.py
@@ -17,11 +17,45 @@
 from typing import Any
 from collections.abc import Iterable, Iterator
 
+import torch
+
 from fastvideo.logger import init_logger
 
 logger = init_logger(__name__)
 
 
+def _sanitize_wandb_config(value: Any) -> Any:
+    """Best-effort conversion of nested config objects to W&B-safe values."""
+    if value is None or isinstance(value, str | int | float | bool):
+        return value
+    if isinstance(value, Enum):
+        return value.value
+    if isinstance(value, pathlib.Path):
+        return str(value)
+    if isinstance(value, dict):
+        return {str(k): _sanitize_wandb_config(v) for k, v in value.items()}
+    if isinstance(value, list | tuple | set):
+        return [_sanitize_wandb_config(v) for v in value]
+    if isinstance(value, torch.dtype):
+        return str(value)
+    if isinstance(value, torch.Tensor):
+        tensor = value.detach().cpu()
+        if tensor.dtype == torch.bfloat16:
+            tensor = tensor.to(dtype=torch.float32)
+        if tensor.ndim == 0 or tensor.numel() == 1:
+            return tensor.item()
+        if tensor.numel() <= 256:
+            return tensor.tolist()
+        return {
+            "_type": "tensor_summary",
+            "shape": list(tensor.shape),
+            "dtype": str(tensor.dtype),
+        }
+    if callable(value):
+        return getattr(value, "__name__", repr(value))
+    return repr(value)
+
+
 @dataclass
 class Timer:
     """Simple timer utility used by the trackers."""
@@ -143,7 +177,7 @@ def __init__(
         self._run = wandb.init(
             project=experiment_name,
             dir=log_dir,
-            config=config,
+            config=(_sanitize_wandb_config(config) if config is not None else None),
             name=run_name,
         )
         logger.info("Initialized Weights & Biases tracker")