
Commit 26ae8da

[2/3] Implicit Gemm NVFP4 (#1227)
### What does this PR do?

**Type of change:** new feature

- Add Conv3D implicit GEMM kernel with BF16 WMMA tensor cores and fused NVFP4 activation quantization for video diffusion VAE layers
- Integrate into `_QuantConv3d` via `QuantModuleRegistry` — automatically dispatched when NVFP4 quantization is applied to `nn.Conv3d`
- Move kernel from `experimental/conv/` to `modelopt/torch/kernels/conv/`; move tests to `tests/gpu/torch/quantization/kernels/`

### Testing

- Added test cases to measure the difference between cuDNN and our CUDA implicit GEMM kernel
- Added an NVFP4 fake quantization test using CUDA code

### Before your PR is "*Ready for review*"

Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`?: ✅
- Did you write any new necessary tests?: ✅
- Did you update the [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅

## Summary by CodeRabbit

* **New Features**
  * Per-backbone quantization/export in a single run with per-backbone checkpoints and backbone-aware quant filters
  * Configurable NVFP4 block size via CLI/config; improved NVFP4 Conv3D inference path and Wan 2.2 quantization support
* **Bug Fixes**
  * Video-model calibration now respects extra params and forces video decoding during calibration
* **Documentation**
  * Added comprehensive Conv3D implicit-GEMM kernel documentation; removed experimental Conv3D prototype docs/benchmark
* **Tests**
  * New Wan 2.2 quantization/export tests and expanded Conv3D/FP4 kernel test coverage

---------

Signed-off-by: Jingyu Xin <jingyux@nvidia.com>
1 parent c20f9c4 commit 26ae8da

23 files changed

Lines changed: 1458 additions & 522 deletions
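The feature described above plugs into the standard ModelOpt PTQ flow. A minimal usage sketch follows; the toy module, shapes, and calibration loop are illustrative, while `mtq.quantize` and `NVFP4_DEFAULT_CFG` are the stock ModelOpt PTQ entry points, and the implicit GEMM dispatch is the behavior this PR claims for `nn.Conv3d` under NVFP4:

```python
import torch
import modelopt.torch.quantization as mtq

# Toy VAE-like block built from 3D convolutions (stand-in for the Wan 2.2 VAE).
model = torch.nn.Sequential(
    torch.nn.Conv3d(16, 32, kernel_size=3, padding=1),
    torch.nn.SiLU(),
    torch.nn.Conv3d(32, 16, kernel_size=3, padding=1),
).cuda().to(torch.bfloat16)

def forward_loop(m):
    # Calibration over a few representative video latents.
    for _ in range(8):
        m(torch.randn(1, 16, 4, 32, 32, device="cuda", dtype=torch.bfloat16))

# PTQ with NVFP4: nn.Conv3d layers become _QuantConv3d via the QuantModuleRegistry;
# at inference time they should route through the implicit GEMM CUDA kernel
# instead of cuDNN (grouped conv and training mode fall back to cuDNN).
model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)

model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 16, 4, 32, 32, device="cuda", dtype=torch.bfloat16))
```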


.github/codecov.yml

Lines changed: 0 additions & 12 deletions
```diff
@@ -11,15 +11,3 @@ coverage:
         target: auto
         threshold: 1% # Allow atmost 1% coverage drop from main branch.
     patch: false
-
-# Exclude GPU-only Triton kernel files from ALL codecov calculations (project
-# and patch checks, all flags). Rationale: these files are dominated by
-# @triton.jit kernel bodies that CPU unit tests cannot exercise. GPU tests
-# cover them end-to-end (see tests/gpu/torch/sparsity/attention_sparsity/) but
-# the `gpu`-flag upload may race with the PR status check, so relying on flag
-# combination alone leaves the project check flaky. Dropping these files here
-# makes the check deterministic — local `pytest --cov` and GPU runs still
-# measure them; only the codecov PR status ignores them.
-ignore:
-  - "modelopt/torch/kernels/triton_fa.py"
-  - "modelopt/torch/kernels/hf_triton_attention.py"
```

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -16,6 +16,7 @@ Changelog
 - Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip>`_ for more details.
 - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution.
 - Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml>`_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml>`_ for usage.
+- Add implicit GEMM CUDA kernel for Conv3D with fused NVFP4 fake quantization (``modelopt.torch.quantization.src.conv``). When NVFP4 quantization is applied to an ``nn.Conv3d`` layer via ModelOpt PTQ, the implicit GEMM path is used automatically instead of cuDNN. Uses BF16 WMMA tensor cores (SM80+) with FP32 accumulation and in-kernel FP4 (E2M1) activation quantization. Grouped convolution (``groups > 1``) falls back to the default cuDNN path. Inference only — training mode falls back to cuDNN with a warning.
 
 **Backward Breaking Changes**
 
```
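The changelog entry above describes a dispatch rule rather than a public API. Below is a rough sketch of that decision with a hypothetical helper name; the actual check lives inside `_QuantConv3d`:

```python
import warnings
import torch

def use_implicit_gemm_path(conv: torch.nn.Conv3d) -> bool:
    """Mirror the documented dispatch rule for the NVFP4 implicit GEMM path."""
    if conv.groups > 1:          # grouped convolution stays on cuDNN
        return False
    if conv.training:            # inference only: fall back to cuDNN with a warning
        warnings.warn("NVFP4 implicit GEMM is inference-only; falling back to cuDNN.")
        return False
    if not torch.cuda.is_available():
        return False
    major, _ = torch.cuda.get_device_capability()
    return major >= 8            # BF16 WMMA tensor cores require SM80+

print(use_implicit_gemm_path(torch.nn.Conv3d(8, 8, 3).eval()))
print(use_implicit_gemm_path(torch.nn.Conv3d(8, 8, 3, groups=8).eval()))  # always False
```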

examples/diffusers/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,20 @@ python quantize.py \
117117
--hf-ckpt-dir ./hf_ckpt
118118
```
119119

120+
#### Wan 2.2 VAE NVFP4 (Conv3D Implicit GEMM)
121+
122+
The Wan 2.2 VAE (`AutoencoderKLWan`, shared between the 5B and 14B pipelines) is built from 3D convolutions. When quantizing the VAE with NVFP4, the `Conv3d` layers are automatically dispatched through a custom BF16 WMMA implicit-GEMM kernel with fused FP4 activation quantization. Requires SM80+ (Ampere or newer). See [`modelopt/torch/quantization/src/conv/README.md`](../../modelopt/torch/quantization/src/conv/README.md) for kernel details.
123+
124+
```sh
125+
python quantize.py \
126+
--model {wan2.2-t2v-14b|wan2.2-t2v-5b} \
127+
--backbone vae \
128+
--format fp4 --quant-algo max --collect-method default \
129+
--model-dtype BFloat16 --trt-high-precision-dtype BFloat16 \
130+
--batch-size 1 --calib-size 32 --n-steps 30 \
131+
--quantized-torch-ckpt-save-path ./wan22_vae_fp4.pt
132+
```
133+
120134
#### [LTX-2](https://github.com/Lightricks/LTX-2) FP4
121135

122136
> [!WARNING]
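For context on the "fused FP4 activation quantization" mentioned in the README addition, here is a simplified sketch of NVFP4-style (E2M1) fake quantization for a single 16-element block. It ignores the FP8 (E4M3) rounding of the per-block scale and the global FP32 scale that the real fused kernel applies:

```python
import torch

# Non-negative magnitudes representable in FP4 E2M1; signs are handled separately.
E2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def fake_quant_block(x: torch.Tensor) -> torch.Tensor:
    # Scale so the block max maps to the largest E2M1 magnitude (6.0),
    # snap each value to the nearest E2M1 code, then rescale back.
    scale = x.abs().max() / 6.0
    if scale == 0:
        return x
    mags = (x / scale).abs().unsqueeze(-1)
    snapped = E2M1[(mags - E2M1).abs().argmin(dim=-1)]
    return snapped * x.sign() * scale

x = torch.randn(16)          # one 16-element block, NVFP4's block size
print(fake_quant_block(x))
```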

examples/diffusers/quantization/calibration.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -108,11 +108,12 @@ def run_calibration(self, batched_prompts: list[list[str]]) -> None:
     def _run_wan_video_calibration(
         self, prompt_batch: list[str], extra_args: dict[str, Any]
     ) -> None:
+        extra_params = self.pipeline_manager.config.extra_params
         kwargs = {}
         kwargs["negative_prompt"] = extra_args["negative_prompt"]
-        kwargs["height"] = extra_args["height"]
-        kwargs["width"] = extra_args["width"]
-        kwargs["num_frames"] = extra_args["num_frames"]
+        kwargs["height"] = extra_params.get("height", extra_args["height"])
+        kwargs["width"] = extra_params.get("width", extra_args["width"])
+        kwargs["num_frames"] = extra_params.get("num_frames", extra_args["num_frames"])
         kwargs["guidance_scale"] = extra_args["guidance_scale"]
         if "guidance_scale_2" in extra_args:
             kwargs["guidance_scale_2"] = extra_args["guidance_scale_2"]
@@ -154,7 +155,11 @@ def _run_ltx2_calibration(self, prompt_batch: list[str], extra_args: dict[str, A
             "images": extra_params.get("images", []),
             "tiling_config": extra_params.get("tiling_config", TilingConfig.default()),
         }
-        self.pipe(prompt=prompt, **kwargs)
+        decoded_video, decoded_audio = self.pipe(prompt=prompt, **kwargs)
+        # vae_decode_video returns a lazy generator — consume it so the
+        # video decoder's forward() actually runs during calibration.
+        for _ in decoded_video:
+            pass
 
     def _run_ltx_video_calibration(
         self, prompt_batch: list[str], extra_args: dict[str, Any]
```
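The `for _ in decoded_video: pass` addition matters because generator-based decoding defers the forward pass until iteration, so calibration observers would otherwise see nothing. A toy illustration with stand-in names (not the LTX-2 API):

```python
calls = {"n": 0}

def decode_frames(latents):
    # Lazy generator: nothing inside runs until the caller iterates over it.
    for latent in latents:
        calls["n"] += 1      # stands in for the video decoder's forward() pass
        yield latent

frames = decode_frames([1, 2, 3])
print(calls["n"])            # 0 -> no forward passes yet, nothing to calibrate

for _ in frames:             # consuming the generator triggers the deferred work
    pass
print(calls["n"])            # 3 -> the decoder would now have been observed
```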

examples/diffusers/quantization/models_utils.py

Lines changed: 23 additions & 22 deletions
```diff
@@ -33,7 +33,9 @@
 from utils import (
     filter_func_default,
     filter_func_flux_dev,
+    filter_func_ltx2_vae,
     filter_func_ltx_video,
+    filter_func_wan_vae,
     filter_func_wan_video,
 )
 
@@ -54,31 +56,30 @@ class ModelType(str, Enum):
     WAN22_T2V_5b = "wan2.2-t2v-5b"
 
 
-def get_model_filter_func(model_type: ModelType) -> Callable[[str], bool]:
-    """
-    Get the appropriate filter function for a given model type.
+_FILTER_FUNC_MAP: dict[ModelType, Callable[[str], bool]] = {
+    ModelType.FLUX_DEV: filter_func_flux_dev,
+    ModelType.FLUX2_DEV: filter_func_flux_dev,
+    ModelType.LTX_VIDEO_DEV: filter_func_ltx_video,
+    ModelType.LTX2: filter_func_ltx_video,
+    ModelType.WAN22_T2V_14b: filter_func_wan_video,
+    ModelType.WAN22_T2V_5b: filter_func_wan_video,
+}
 
-    Args:
-        model_type: The model type enum
+_VAE_FILTER_FUNC_MAP: dict[tuple[ModelType, str], Callable[[str], bool]] = {
+    (ModelType.LTX2, "video_decoder"): filter_func_ltx2_vae,
+    (ModelType.WAN22_T2V_14b, "vae"): filter_func_wan_vae,
+    (ModelType.WAN22_T2V_5b, "vae"): filter_func_wan_vae,
+}
 
-    Returns:
-        A filter function appropriate for the model type
-    """
-    filter_func_map = {
-        ModelType.FLUX_DEV: filter_func_flux_dev,
-        ModelType.FLUX_SCHNELL: filter_func_default,
-        ModelType.FLUX2_DEV: filter_func_flux_dev,
-        ModelType.SDXL_BASE: filter_func_default,
-        ModelType.SDXL_TURBO: filter_func_default,
-        ModelType.SD3_MEDIUM: filter_func_default,
-        ModelType.SD35_MEDIUM: filter_func_default,
-        ModelType.LTX_VIDEO_DEV: filter_func_ltx_video,
-        ModelType.LTX2: filter_func_ltx_video,
-        ModelType.WAN22_T2V_14b: filter_func_wan_video,
-        ModelType.WAN22_T2V_5b: filter_func_wan_video,
-    }
 
-    return filter_func_map.get(model_type, filter_func_default)
+def get_model_filter_func(
+    model_type: ModelType, backbone_name: str = "transformer"
+) -> Callable[[str], bool]:
+    """Get the appropriate filter function for a given model type and backbone."""
+    vae_func = _VAE_FILTER_FUNC_MAP.get((model_type, backbone_name))
+    if vae_func is not None:
+        return vae_func
+    return _FILTER_FUNC_MAP.get(model_type, filter_func_default)
 
 
 # Model registry with HuggingFace model IDs
```
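A quick illustration of how the backbone-aware lookup above resolves, assuming the enum members and filter functions shown in this file (comments indicate the expected result):

```python
from models_utils import ModelType, get_model_filter_func

# Transformer backbones resolve through the per-model map; VAE/video-decoder
# backbones get their dedicated filters; anything else falls back to default.
get_model_filter_func(ModelType.WAN22_T2V_14b)                         # filter_func_wan_video
get_model_filter_func(ModelType.WAN22_T2V_14b, backbone_name="vae")    # filter_func_wan_vae
get_model_filter_func(ModelType.LTX2, backbone_name="video_decoder")   # filter_func_ltx2_vae
get_model_filter_func(ModelType.FLUX_SCHNELL)                          # filter_func_default (fallback)
```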

examples/diffusers/quantization/pipeline_manager.py

Lines changed: 41 additions & 44 deletions
```diff
@@ -42,6 +42,7 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):
         self.pipe: Any | None = None
         self.pipe_upsample: LTXLatentUpsamplePipeline | None = None  # For LTX-Video upsampling
         self._transformer: torch.nn.Module | None = None
+        self._video_decoder: torch.nn.Module | None = None
 
     @staticmethod
     def create_pipeline_from(
@@ -58,23 +59,20 @@ def create_pipeline_from(
         Raises:
             ValueError: If model type is unsupported
         """
-        try:
-            pipeline_cls = MODEL_PIPELINE[model_type]
-            if pipeline_cls is None:
-                raise ValueError(f"Model type {model_type.value} does not use diffusers pipelines.")
-            model_id = (
-                MODEL_REGISTRY[model_type] if override_model_path is None else override_model_path
-            )
-            pipe = pipeline_cls.from_pretrained(
-                model_id,
-                torch_dtype=torch_dtype,
-                use_safetensors=True,
-                **MODEL_DEFAULTS[model_type].get("from_pretrained_extra_args", {}),
-            )
-            pipe.set_progress_bar_config(disable=True)
-            return pipe
-        except Exception as e:
-            raise e
+        pipeline_cls = MODEL_PIPELINE[model_type]
+        if pipeline_cls is None:
+            raise ValueError(f"Model type {model_type.value} does not use diffusers pipelines.")
+        model_id = (
+            MODEL_REGISTRY[model_type] if override_model_path is None else override_model_path
+        )
+        pipe = pipeline_cls.from_pretrained(
+            model_id,
+            torch_dtype=torch_dtype,
+            use_safetensors=True,
+            **MODEL_DEFAULTS[model_type].get("from_pretrained_extra_args", {}),
+        )
+        pipe.set_progress_bar_config(disable=True)
+        return pipe
 
     def create_pipeline(self) -> Any:
         """
@@ -157,42 +155,32 @@ def setup_device(self) -> None:
             self.logger.info("Enabling VAE tiling for LTX-Video")
             self.pipe.vae.enable_tiling()
 
-    def get_backbone(self) -> torch.nn.Module:
-        """
-        Get the backbone model (transformer or UNet).
-
-        Returns:
-            Backbone model module
-        """
-        if not self.pipe:
-            raise RuntimeError("Pipeline not created. Call create_pipeline() first.")
-
-        backbone_pairs = list(self.iter_backbones())
-        if len(backbone_pairs) == 1:
-            return backbone_pairs[0][1]
-        return torch.nn.ModuleList([module for _, module in backbone_pairs])
-
     def iter_backbones(self) -> Iterator[tuple[str, torch.nn.Module]]:
         """
-        Yield backbone modules by name, based on a backbone spec.
-
-        Yields:
-            (backbone_name, module) pairs
+        Yield (backbone_name, module) pairs.
         """
         if not self.pipe:
             raise RuntimeError("Pipeline not created. Call create_pipeline() first.")
 
         names = list(self.config.backbone)
+        if not names:
+            raise RuntimeError("No backbone names provided.")
 
         if self.config.model_type == ModelType.LTX2:
-            self._ensure_ltx2_transformer_cached()
-            name = names[0] if names else "transformer"
-            yield name, self._transformer
+            for name in names:
+                if name == "video_decoder":
+                    self._ensure_ltx2_video_decoder_cached()
+                    yield name, self._video_decoder
+                elif name == "transformer":
+                    self._ensure_ltx2_transformer_cached()
+                    yield name, self._transformer
+                else:
+                    raise ValueError(
+                        f"Unsupported LTX-2 backbone name '{name}'. "
+                        "Expected 'transformer' or 'video_decoder'."
+                    )
             return
 
-        if not names:
-            raise RuntimeError("No backbone names provided.")
-
         for name in names:
             module = getattr(self.pipe, name, None)
             if module is None:
@@ -207,6 +195,16 @@ def _ensure_ltx2_transformer_cached(self) -> None:
             self.pipe.stage_1_model_ledger.transformer = lambda: transformer
             self._transformer = transformer
 
+    def _ensure_ltx2_video_decoder_cached(self) -> None:
+        if not self.pipe:
+            raise RuntimeError("Pipeline not created. Call create_pipeline() first.")
+        if self._video_decoder is None:
+            video_decoder = self.pipe.stage_1_model_ledger.video_decoder()
+            # Cache it so subsequent calls return the same (quantized) instance
+            self.pipe.stage_1_model_ledger.video_decoder = lambda: video_decoder
+            self.pipe.stage_2_model_ledger.video_decoder = lambda: video_decoder
+            self._video_decoder = video_decoder
+
     def _create_ltx2_pipeline(self) -> Any:
         params = dict(self.config.extra_params)
         checkpoint_path = params.pop("checkpoint_path", None)
@@ -261,7 +259,6 @@ def _create_ltx2_pipeline(self) -> Any:
         return TI2VidTwoStagesPipeline(**pipeline_kwargs)
 
     def print_quant_summary(self):
-        backbone_pairs = list(self.iter_backbones())
-        for name, backbone in backbone_pairs:
+        for name, backbone in self.iter_backbones():
             self.logger.info(f"{name} quantization info:")
             mtq.print_quant_summary(backbone)
```
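The `_ensure_ltx2_video_decoder_cached` change relies on replacing the ledger's factory with a lambda that returns a cached instance, so both pipeline stages see the same quantized decoder. A generic illustration of that pattern (the `Ledger` class here is a stand-in, not the LTX-2 ledger):

```python
class Ledger:
    def __init__(self):
        # Default factory builds a fresh decoder on every call.
        self.video_decoder = lambda: object()

stage_1, stage_2 = Ledger(), Ledger()

decoder = stage_1.video_decoder()           # build once (this is what gets quantized)
stage_1.video_decoder = lambda: decoder     # both stages now return the same
stage_2.video_decoder = lambda: decoder     # already-quantized instance
assert stage_1.video_decoder() is stage_2.video_decoder() is decoder
```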
