
Commit 928d417

Add FP8 MHA quantization support for HuggingFace ViT
Enables TensorRT attention-v2 fusion for HuggingFace ViT (and similar transformer vision models) when exported to ONNX with FP8 Q/DQ.

- fp8_exporter: rewrite the attention-scaling Mul and the K Transpose to the Q side so DQ feeds MatMul directly, pre-transpose weight constants, and insert FP8 Q/DQ on Softmax outputs for MHA-v2 fusion. The scale dtype now matches the graph's float dtype to keep strongly-typed builds consistent.
- onnx/utils: fold the Cast(FP16<->FP32) nodes that convert_float_to_float16 inserts around Q/DQ by rewriting scale initializers to FP16, so TRT fuses DQ into the downstream GEMM/MatMul kernel.
- torch/quantization/export_onnx: keep the FP8 Q/DQ scale in the native input dtype so no Cast is injected between the graph and Q/DQ.
- torch/quantization/nn: register nn.LayerNorm in QuantModuleRegistry so LayerNorm output quantizers are honored.
- torch/quantization/plugins/huggingface: skip attention wrappers whose children are also "*Attention" to avoid double-patching eager_attention_forward (e.g. ViTAttention vs ViTSelfAttention).

Example: examples/torch_onnx/vit_mha_quantization.py shows a ViT FP8 config (extends FP8_DEFAULT_CFG with a LayerNorm output quantizer, disabled input quantizers on LayerNorm-followed layers, and *_bmm_quantizer entries) plus an accuracy and TRT-latency comparison against an FP16 baseline.

Measured on ViT-base-patch16-224 (RTX 6000 Ada, batch=1):
- Top-1 / top-5 on 5k ImageNet-val: 81.16% / 95.50% (FP16) vs 80.96% / 95.44% (torch FP8), deltas of -0.20% / -0.06%
- TRT latency: 0.721 ms (FP16) vs 0.646 ms (torch FP8), a 1.12x speedup

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
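The Q-side rewrite is numerically safe because scalar multiplication commutes through MatMul: scaling Q before Q @ K^T gives the same logits as scaling the product afterward, which is what lets the exporter move the Mul so DQ can feed the MatMul directly. A minimal numpy sketch of that equivalence (shapes and names are illustrative, not taken from the exporter):

```python
import numpy as np

rng = np.random.default_rng(0)
q = rng.standard_normal((4, 8)).astype(np.float32)  # toy Q: 4 tokens, head_dim=8
k = rng.standard_normal((4, 8)).astype(np.float32)  # toy K
scale = 8 ** -0.5  # 1/sqrt(head_dim), the usual attention scale

# Original graph shape: scale applied to the logits after Q @ K^T.
logits_post = (q @ k.T) * scale

# Rewritten graph shape: scale folded into Q before the MatMul.
logits_pre = (q * scale) @ k.T

assert np.allclose(logits_post, logits_pre, atol=1e-6)
```

The same argument covers moving the K Transpose to the Q side: both rewrites only relocate operations that commute with the MatMul, so the fused graph computes the same attention logits.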
1 parent 010b220 commit 928d417

10 files changed

Lines changed: 488 additions & 42 deletions

File tree

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ Changelog
 - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution.
 - Add performant layerwise calibration for large models that don't fit on GPU (e.g. DeepSeek-R1, Kimi-K2). See `modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml>`_ for usage. Layerwise calibration also supports PTQ with intermediate progress saving — useful when long PTQ runs get hit with Slurm timeouts. See `modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/general/ptq/nvfp4_default-none_kv_gptq.yaml>`_ for usage.
 - Add implicit GEMM CUDA kernel for Conv3D with fused NVFP4 fake quantization (``modelopt.torch.quantization.src.conv``). When NVFP4 quantization is applied to an ``nn.Conv3d`` layer via ModelOpt PTQ, the implicit GEMM path is used automatically instead of cuDNN. Uses BF16 WMMA tensor cores (SM80+) with FP32 accumulation and in-kernel FP4 (E2M1) activation quantization. Grouped convolution (``groups > 1``) falls back to the default cuDNN path. Inference only — training mode falls back to cuDNN with a warning.
+- Add FP8 MHA quantization support for vision transformers. Adds an attention-aware ONNX post-processing pass (scale Mul / K-transpose move before Q, Q/DQ insertion on softmax output) in :class:`FP8QuantExporter <modelopt.onnx.export.fp8_exporter.FP8QuantExporter>`, per-instance nested-attention-wrapper skipping in the HF plugin, and ``nn.LayerNorm`` registration in ``QuantModuleRegistry`` so BMM input quantizers and LayerNorm output quantizers defined in FP8_DEFAULT_CFG are honored end-to-end. See `examples/torch_onnx/torch_quant_to_onnx.py <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/torch_onnx/torch_quant_to_onnx.py>`_ for the general timm-model quantize→ONNX workflow.

 **Backward Breaking Changes**

examples/torch_onnx/torch_quant_to_onnx.py

Lines changed: 103 additions & 1 deletion
@@ -88,6 +88,36 @@
     },
 ]

+# FP8 MHA-aware config entries: quantize LayerNorm output and Softmax output so TRT can
+# fuse Q/DQ into the attention MatMul kernels. LayerNorm output QDQ is shared across all
+# downstream Q/K/V/FC consumers; Softmax output QDQ is required for MHA-v2 fusion on the
+# attn@V MatMul. Relies on ``torch.nn.LayerNorm`` and ``torch.nn.Softmax`` being registered
+# in ``QuantModuleRegistry`` (see ``modelopt/torch/quantization/nn/modules``).
+_FP8_MHA_OVERRIDE: list = [
+    {
+        "parent_class": "nn.LayerNorm",
+        "quantizer_name": "*output_quantizer",
+        "cfg": {"num_bits": (4, 3), "axis": None},
+    },
+    {
+        "parent_class": "nn.LayerNorm",
+        "quantizer_name": "*input_quantizer",
+        "enable": False,
+    },
+    {
+        "parent_class": "nn.Softmax",
+        "quantizer_name": "*output_quantizer",
+        "cfg": {"num_bits": (4, 3), "axis": None},
+    },
+    {
+        # Pre-softmax Q/DQ can't fuse into the Q@K^T MatMul (no TRT kernel for
+        # MatMul→Softmax fusion through Q/DQ) and just adds overhead.
+        "parent_class": "nn.Softmax",
+        "quantizer_name": "*input_quantizer",
+        "enable": False,
+    },
+]
+
 # Auto-quantize format configs that use block quantization and need Conv2d overrides for TRT.
 # TRT DynamicQuantize requires 2D/3D input, but Conv2d operates on 4D tensors.
 _NEEDS_FP8_CONV_OVERRIDE: set[str] = {
@@ -102,11 +132,14 @@ def get_quant_config(quantize_mode):
     """Get quantization config, overriding Conv2d for TRT compatibility.

     TensorRT only supports FP8 and INT8 for Conv layers.
+    - For FP8: add MHA-aware LayerNorm/Softmax output quantizers for transformer fusion.
     - For MXFP8, NVFP4: override Conv2d to FP8
     - For INT4_AWQ: override Conv2d to INT8
     """
     config: dict = copy.deepcopy(QUANT_CONFIG_DICT[quantize_mode])
-    if quantize_mode in ("mxfp8", "nvfp4"):
+    if quantize_mode == "fp8":
+        config["quant_cfg"].extend(_FP8_MHA_OVERRIDE)
+    elif quantize_mode in ("mxfp8", "nvfp4"):
         warnings.warn(
             f"TensorRT only supports FP8/INT8 for Conv layers. "
             f"Overriding Conv2d quantization to FP8 for '{quantize_mode}' mode."
@@ -121,6 +154,67 @@ def get_quant_config(quantize_mode):
     return config


+def _inject_softmax_modules(model):
+    """Replace timm vision-transformer ``F.softmax`` with ``nn.Softmax`` submodules.
+
+    Timm's ``Attention.forward`` calls ``attn.softmax(dim=-1)`` (``F.softmax``), which
+    exposes no module for quantization. We disable ``fused_attn`` (so the non-fused
+    path runs) and attach a ``self.softmax = nn.Softmax(dim=-1)`` child, then rebind
+    ``forward`` to call that submodule. Combined with ``nn.Softmax`` registered in
+    ``QuantModuleRegistry``, this means a standard ``mtq.quantize`` pass will add
+    and calibrate the softmax output quantizer.
+
+    Returns the count of patched attention modules.
+    """
+    try:
+        from timm.models.vision_transformer import Attention as _VitAttention
+    except ImportError:
+        return 0
+
+    patched = 0
+    for _, module in model.named_modules():
+        if not isinstance(module, _VitAttention):
+            continue
+        module.fused_attn = False
+        if not isinstance(getattr(module, "softmax", None), torch.nn.Softmax):
+            module.softmax = torch.nn.Softmax(dim=-1)
+        module.forward = _vit_attention_forward.__get__(module, type(module))
+        patched += 1
+    return patched
+
+
+def _vit_attention_forward(self, x, attn_mask=None, is_causal=False):
+    """Replacement for timm ``Attention.forward`` that routes softmax through ``self.softmax``.
+
+    Mirrors the non-fused branch of upstream timm's implementation for the case without
+    masking/causal (the default for image classifiers). ``self.softmax`` is a real
+    ``nn.Softmax`` module, so its output_quantizer is honored during quantization.
+    """
+    B, N, C = x.shape
+    qkv = (
+        self.qkv(x)
+        .reshape(B, N, 3, self.num_heads, self.head_dim)
+        .permute(2, 0, 3, 1, 4)
+    )
+    q, k, v = qkv.unbind(0)
+    q, k = self.q_norm(q), self.k_norm(k)
+
+    q = q * self.scale
+    attn = q @ k.transpose(-2, -1)
+    if attn_mask is not None:
+        attn = attn + attn_mask
+    attn = self.softmax(attn)
+    attn = self.attn_drop(attn)
+    x = attn @ v
+
+    x = x.transpose(1, 2).reshape(B, N, self.attn_dim)
+    if hasattr(self, "norm"):
+        x = self.norm(x)
+    x = self.proj(x)
+    x = self.proj_drop(x)
+    return x
+
+
 def filter_func(name):
     """Filter function to exclude certain layers from quantization.
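The `_vit_attention_forward.__get__(module, type(module))` expression in the hunk above uses the descriptor protocol to bind a plain function as a method of one specific instance, so only the patched attention modules change behavior while the class and any unpatched instances keep the original `forward`. A self-contained sketch of the technique (toy class, not timm's Attention):

```python
class Attention:
    """Toy stand-in for timm's Attention (illustrative only)."""

    def forward(self, x):
        return f"original:{x}"


def patched_forward(self, x):
    # A plain function; __get__ turns it into a bound method of one instance,
    # so it can read per-instance state such as self.tag.
    return f"{self.tag}:{x}"


m = Attention()
m.tag = "patched"
# Same rebinding pattern as module.forward = _vit_attention_forward.__get__(module, type(module)):
# the attribute is set on the instance, shadowing the class-level forward for m only.
m.forward = patched_forward.__get__(m, type(m))
```

Setting `m.forward` on the instance shadows the class attribute during lookup, which is why a later `mtq.quantize` pass sees the patched path only on the modules `_inject_softmax_modules` touched.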
@@ -458,6 +552,14 @@ def main():
     # Conv2d layers are overridden to FP8 (for TRT compatibility), those FP8
     # quantizers require calibration data.
     config = get_quant_config(args.quantize_mode)
+
+    if args.quantize_mode == "fp8":
+        # Swap timm Attention's internal F.softmax for an nn.Softmax submodule so
+        # the output_quantizer declared in _FP8_MHA_OVERRIDE picks it up.
+        n_patched = _inject_softmax_modules(model)
+        if n_patched:
+            print(f"Patched {n_patched} timm Attention modules for softmax output quantization")
+
     data_loader = load_calibration_data(
         model,
         args.calibration_data_size,
