hao-ai-lab
diff --git a/fastvideo/models/dits/ltx2.py b/fastvideo/models/dits/ltx2.py
Lines changed: 234 additions & 49 deletions
diff --git a/fastvideo/models/encoders/gemma.py b/fastvideo/models/encoders/gemma.py
Lines changed: 13 additions & 6 deletions
diff --git a/fastvideo/models/loader/component_loader.py b/fastvideo/models/loader/component_loader.py
Lines changed: 50 additions & 15 deletions
diff --git a/fastvideo/models/loader/fsdp_load.py b/fastvideo/models/loader/fsdp_load.py
Lines changed: 40 additions & 0 deletions
diff --git a/fastvideo/models/upsamplers/__init__.py b/fastvideo/models/upsamplers/__init__.py
Lines changed: 23 additions & 0 deletions
@@ -361,6 +361,10 @@ def named_parameters(self, prefix: str = "", recurse: bool = True):
                 continue
             yield name, param
 
+    def prepare_for_compile(self) -> None:
+        # Load Gemma outside Dynamo so torch.compile does not trace HF file-system checks.
+        _ = self.gemma_model
+
     @property
     def gemma_model(self) -> Gemma3ForConditionalGeneration:
         if self._gemma_model is None:
@@ -517,18 +521,21 @@ def forward(
             attention_mask = torch.ones_like(input_ids)
 
         model = self.gemma_model
-        orig_device = model.device
-        model.to(device=get_local_torch_device())
-        # input_ids = input_ids.to(device=model.device)
-        # attention_mask = attention_mask.to(device=model.device)
+        target_device = get_local_torch_device()
+        # Do not invoke model.to() inside the compiled forward path:
+        # _parse_to returns a non-Tensor torch.device, which Dynamo cannot
+        # trace under fullgraph=True. The model is already moved to device
+        # when first loaded (see gemma_model property + prepare_for_compile),
+        # so this branch is normally not taken and model.to() is never traced.
+        if model.device != target_device:
+            model.to(device=target_device)
         outputs = model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             output_hidden_states=True,
             return_dict=True,
         )
-        model.to(device=orig_device)
-        
+
         encoded_inputs = self._run_feature_extractor(
             outputs.hidden_states,
             attention_mask,
 
@@ -99,6 +99,12 @@ def for_module_type(
             # NumberConditioners; not a pure text encoder, so it gets
             # its own loader.
             "conditioner": (ConditionerLoader, "fastvideo"),
+            # LTX-2 spatial / temporal upsamplers — share the
+            # UpsamplerLoader path with the upsampler/upsampler_2 keys
+            # so the SR pipeline picks up real weights instead of the
+            # generic config-only loader.
+            "spatial_upsampler": (UpsamplerLoader, "diffusers"),
+            "temporal_upsampler": (UpsamplerLoader, "diffusers"),
         }
 
         if module_type in module_loaders:
@@ -1058,7 +1064,7 @@ def load(self, model_path: str, fastvideo_args: FastVideoArgs):
 
 
 class UpsamplerLoader(ComponentLoader):
-    """Loader for upsamplers."""
+    """Loader for upsamplers (incl. LTX-2 spatial/temporal upsamplers)."""
 
     def load(self, model_path: str, fastvideo_args: FastVideoArgs):
         """Load the upsampler based on the model path, and inference args."""
@@ -1068,36 +1074,65 @@ def load(self, model_path: str, fastvideo_args: FastVideoArgs):
         if class_name is None:
             raise ValueError(
                 "Model config does not contain a _class_name attribute. "
-                "Only diffusers format is supported."
-            )
-
-        try:
-            upsampler_cfg = deepcopy(fastvideo_args.pipeline_config.upsampler_config[0])
-            upsampler_cfg.update_model_config(config_dict)
-        except Exception as e:
-            upsampler_cfg = deepcopy(fastvideo_args.pipeline_config.upsampler_config[1])
-            upsampler_cfg.update_model_config(config_dict)
+                "Only diffusers format is supported.")
+
+        # The base PipelineConfig declares ``upsampler_config`` as a
+        # single ``UpsamplerConfig`` instance, but Hunyuan15 narrows it
+        # to a tuple of two configs (one per SR target). We only treat
+        # the attribute as a multi-config when it actually is one;
+        # otherwise the LTX-2 branch below handles the single-class
+        # path that takes the diffusers config dict directly.
+        upsampler_config_attr = getattr(fastvideo_args.pipeline_config,
+                                        "upsampler_config", None)
+        if isinstance(upsampler_config_attr, list | tuple):
+            try:
+                upsampler_cfg = deepcopy(upsampler_config_attr[0])
+                upsampler_cfg.update_model_config(config_dict)
+            except Exception:
+                upsampler_cfg = deepcopy(upsampler_config_attr[1])
+                upsampler_cfg.update_model_config(config_dict)
+        elif class_name == "LTX2LatentUpsampler":
+            # LTX-2 pipeline_config does not declare upsampler_config; the
+            # `LTX2LatentUpsampler` wrapper takes the raw diffusers config
+            # dict directly via LatentUpsamplerConfigurator.
+            upsampler_cfg = deepcopy(config_dict)
+        else:
+            raise AttributeError(
+                "pipeline_config.upsampler_config is missing; cannot build "
+                f"upsampler config for class {class_name}")
 
         model_cls, _ = ModelRegistry.resolve_model_cls(class_name)
         model = model_cls(upsampler_cfg)
 
         target_device = get_local_torch_device()
-        model = model.to(target_device, dtype=PRECISION_TO_TYPE[fastvideo_args.pipeline_config.upsampler_precision])
+        upsampler_precision = getattr(fastvideo_args.pipeline_config,
+                                      "upsampler_precision", "bf16")
+        model = model.to(target_device,
+                         dtype=PRECISION_TO_TYPE[upsampler_precision])
 
-        # Find all safetensors files
         safetensors_list = glob.glob(
             os.path.join(str(model_path), "*.safetensors"))
         if not safetensors_list:
             raise ValueError(f"No safetensors files found in {model_path}")
-        
+
         if len(safetensors_list) == 1:
             loaded = safetensors_load_file(safetensors_list[0])
         else:
             loaded = {}
             for sf_file in safetensors_list:
                 loaded.update(safetensors_load_file(sf_file))
-        
-        model.load_state_dict(loaded, strict=True)
+
+        # The LTX-2 latent upsampler wrapper exposes the actual conv
+        # stack at ``self.model``; checkpoint state_dicts may be saved
+        # without the ``model.`` prefix when the inner module was
+        # serialised directly. Strip / forward as needed so both layouts
+        # load cleanly.
+        target_module = getattr(model, "model", model)
+        if loaded and all(k.startswith("model.") for k in loaded):
+            stripped = {k[len("model."):]: v for k, v in loaded.items()}
+            target_module.load_state_dict(stripped, strict=True)
+        else:
+            target_module.load_state_dict(loaded, strict=True)
 
         return model.eval()
 
 
@@ -28,6 +28,37 @@
 logger = init_logger(__name__)
 
 
+def _maybe_convert_model_to_nvfp4(model: nn.Module) -> None:
+    """Quantize NVFP4-tagged linear layers in-place after weights are loaded.
+
+    Walks the module tree once, looking for layers whose ``quant_method``
+    is an :class:`NVFP4QuantizeMethod` (attached at construction time by
+    :meth:`NVFP4Config.get_quant_method`). When at least one such layer
+    exists, calls :func:`convert_model_to_nvfp4` to register the
+    ``_nvfp4_weight*`` / ``_nvfp4_alpha`` / ``_weight_global_sf`` buffers
+    on each targeted layer.
+
+    The walk returns on the first NVFP4 layer found so non-NVFP4 callers
+    pay only an ``isinstance`` check per module. flashinfer is imported
+    lazily inside :func:`convert_model_to_nvfp4` so this helper is a
+    no-op on hosts without the NVFP4 backend.
+    """
+    # Defer the import to call time: nvfp4_config imports heavy diffusers /
+    # torch.distributed symbols at module-load time, and a top-level
+    # import would make every importer of this module pay that cost
+    # regardless of whether NVFP4 is wired.
+    from fastvideo.layers.quantization.nvfp4_config import (
+        NVFP4QuantizeMethod, convert_model_to_nvfp4,
+    )
+
+    for mod in model.modules():
+        if isinstance(getattr(mod, "quant_method", None),
+                      NVFP4QuantizeMethod):
+            logger.info("Converting loaded model weights for NVFP4 linear layers")
+            convert_model_to_nvfp4(model)
+            return
+
+
 # TODO(PY): move this to utils elsewhere
 @contextlib.contextmanager
 def set_default_dtype(dtype: torch.dtype) -> Generator[None, None, None]:
@@ -158,6 +189,15 @@ def maybe_load_fsdp_model(
         if isinstance(p, torch.nn.Parameter):
             p.requires_grad = False
 
+    # NVFP4 weight prequantization. We detect by the registered
+    # ``quant_method`` on linear layers rather than by a separate flag —
+    # construction-time ``NVFP4Config.get_quant_method`` already attached
+    # ``NVFP4QuantizeMethod`` to every targeted layer, so the loader's
+    # responsibility is just to materialize the per-layer nvfp4 weight /
+    # scale buffers from the freshly-loaded bf16 weights. No-op when
+    # ``flashinfer`` is not installed (lazy import inside the helper).
+    _maybe_convert_model_to_nvfp4(model)
+
     compile_in_loader = enable_torch_compile and training_mode
     if compile_in_loader:
         compile_kwargs = torch_compile_kwargs or {}
 
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from fastvideo.models.upsamplers.ltx2_upsampler import (
+    BlurDownsample,
+    LTX2LatentUpsampler,
+    LatentUpsampler,
+    LatentUpsamplerConfigurator,
+    PixelShuffleND,
+    ResBlock,
+    SpatialRationalResampler,
+    upsample_video,
+)
+
+__all__ = [
+    "BlurDownsample",
+    "LTX2LatentUpsampler",
+    "LatentUpsampler",
+    "LatentUpsamplerConfigurator",
+    "PixelShuffleND",
+    "ResBlock",
+    "SpatialRationalResampler",
+    "upsample_video",
+]