Commit f6f62a3
Add FP8 MHA quantization support for HuggingFace ViT
Enables TensorRT attention-v2 fusion for HuggingFace ViT (and similar transformer vision models) when exported to ONNX with FP8 Q/DQ.

- fp8_exporter: rewrite the attention-scaling Mul and the K Transpose to the Q-side so DQ feeds MatMul directly, pre-transpose weight constants, and insert FP8 Q/DQ on Softmax outputs for MHA-v2 fusion. The scale dtype now matches the graph's float dtype to keep strongly-typed builds consistent.
- onnx/utils: fold the Cast(FP16<->FP32) nodes that convert_float_to_float16 inserts around Q/DQ by rewriting scale initializers to FP16, so TRT fuses DQ into the downstream GEMM/MatMul kernel.
- torch/quantization/export_onnx: keep the FP8 Q/DQ scale in the native input dtype so no Cast is injected between the graph and Q/DQ.
- torch/quantization/nn: register nn.LayerNorm in QuantModuleRegistry so LayerNorm output quantizers are honored.
- torch/quantization/plugins/huggingface: skip attention wrappers whose children are also "*Attention" to avoid double-patching eager_attention_forward (e.g. ViTAttention vs ViTSelfAttention).

Example: examples/torch_onnx/vit_mha_quantization.py shows a ViT FP8 config (extends FP8_DEFAULT_CFG with a LayerNorm output quantizer, disabled input quantizers on LayerNorm-followed layers, and *_bmm_quantizer entries) plus an accuracy and TRT-latency comparison against an FP16 baseline.

Measured on ViT-base-patch16-224 (RTX 6000 Ada, batch=1):

- Top-1 / top-5 on 5k ImageNet-val: 81.16% / 95.50% (FP16) vs 80.96% / 95.44% (torch FP8), a 0.20% / 0.06% drop
- TRT latency: 0.721 ms (FP16) vs 0.646 ms (torch FP8), a 1.12x speedup

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
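The Softmax-output Q/DQ mentioned above can use a fixed, data-independent scale: softmax probabilities lie in [0, 1], and FP8 E4M3's largest finite value is 448, so a scale of 1/448 maps the full output range onto the FP8 range with no calibration. A minimal sketch of the resulting quant/dequant error bound (plain Python; the helper below is a crude stand-in for illustration, not the exporter's actual FP8 kernel, which emits ONNX QuantizeLinear/DequantizeLinear nodes):

```python
import math

FP8_E4M3_MAX = 448.0  # largest finite FP8 E4M3 value

def quantize_dequantize_fp8(x: float, scale: float) -> float:
    """Crude FP8 E4M3 quant/dequant model: divide by scale, saturate to the
    FP8 range, round the mantissa to 1 implicit + 3 stored bits, rescale.
    Ignores subnormals, which only matter for negligible softmax tails."""
    q = x / scale
    q = max(-FP8_E4M3_MAX, min(FP8_E4M3_MAX, q))  # saturate
    if q == 0.0:
        return 0.0
    m, e = math.frexp(q)        # q = m * 2**e with 0.5 <= |m| < 1
    m = round(m * 16) / 16      # keep 4 significand bits
    return math.ldexp(m, e) * scale

scale = 1.0 / FP8_E4M3_MAX  # softmax outputs in [0, 1] -> quantized range [0, 448]
for p in (0.0, 1e-3, 0.25, 0.5, 1.0):
    err = abs(quantize_dequantize_fp8(p, scale) - p)
    # 3 mantissa bits give at most ~6.25% relative error for normal values
    assert err <= p / 16 + 1e-12
```

Because the scale is a constant, the exporter can insert these Q/DQ pairs as a pure graph rewrite with no calibration pass over data.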
1 parent 010b220 commit f6f62a3

13 files changed

Lines changed: 733 additions & 42 deletions

File tree

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
@@ -1,6 +1,13 @@
 Changelog
 =========

+0.45 (2026-06-xx)
+^^^^^^^^^^^^^^^^^
+
+**New Features**
+
+- Add FP8 MHA quantization support for vision transformers. Adds an attention-aware ONNX post-processing pass (scale Mul / K-transpose move before Q, Q→DQ insertion on softmax output) in :class:`FP8QuantExporter <modelopt.onnx.export.fp8_exporter.FP8QuantExporter>`, per-instance nested-attention-wrapper skipping in the HF plugin, and ``nn.LayerNorm`` registration in ``QuantModuleRegistry`` so BMM input quantizers and LayerNorm output quantizers defined in FP8_DEFAULT_CFG are honored end-to-end. See `examples/torch_onnx/torch_quant_to_onnx.py <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/torch_onnx/torch_quant_to_onnx.py>`_ for the general timm-model quantize→ONNX workflow.
+
 0.44 (2026-05-xx)
 ^^^^^^^^^^^^^^^^^
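The ``nn.LayerNorm`` registration matters because only module classes present in the registry get wrapped with quantizers; config entries targeting an unregistered class are silently ignored. A standalone sketch of that registry pattern (the names QuantModuleRegistry and QuantLayerNorm mirror modelopt's, but this is an illustration of the mechanism, not the library's implementation):

```python
class QuantModuleRegistry:
    """Maps module-class names to quantized wrapper classes."""
    _registry: dict = {}

    @classmethod
    def register(cls, name: str):
        def deco(quant_cls: type) -> type:
            cls._registry[name] = quant_cls
            return quant_cls
        return deco

    @classmethod
    def convert(cls, name: str, module: object) -> object:
        quant_cls = cls._registry.get(name)
        # Unregistered classes pass through untouched, so their quantizer
        # config entries never take effect.
        return quant_cls(module) if quant_cls else module

@QuantModuleRegistry.register("nn.LayerNorm")
class QuantLayerNorm:
    def __init__(self, inner: object):
        self.inner = inner
        self.output_quantizer_enabled = True  # now honors *output_quantizer cfg

wrapped = QuantModuleRegistry.convert("nn.LayerNorm", object())
assert isinstance(wrapped, QuantLayerNorm)
```

With the registration in place, the ``*output_quantizer`` entries for LayerNorm in the FP8 config are applied instead of being dropped.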

examples/torch_onnx/torch_quant_to_onnx.py

Lines changed: 23 additions & 1 deletion
@@ -88,6 +88,22 @@
     },
 ]

+# FP8 MHA-aware config entries: quantize LayerNorm output so TRT can fuse the shared
+# Q/DQ across all downstream Q/K/V/FC consumers. Softmax-output Q/DQ is handled by the
+# FP8 ONNX exporter's post-processing pass (fixed 1/448 scale, data-independent).
+_FP8_MHA_OVERRIDE: list = [
+    {
+        "parent_class": "nn.LayerNorm",
+        "quantizer_name": "*output_quantizer",
+        "cfg": {"num_bits": (4, 3), "axis": None},
+    },
+    {
+        "parent_class": "nn.LayerNorm",
+        "quantizer_name": "*input_quantizer",
+        "enable": False,
+    },
+]
+
 # Auto-quantize format configs that use block quantization and need Conv2d overrides for TRT.
 # TRT DynamicQuantize requires 2D/3D input, but Conv2d operates on 4D tensors.
 _NEEDS_FP8_CONV_OVERRIDE: set[str] = {
@@ -102,11 +118,16 @@ def get_quant_config(quantize_mode):
     """Get quantization config, overriding Conv2d for TRT compatibility.

     TensorRT only supports FP8 and INT8 for Conv layers.
+    - For FP8: add MHA-aware LayerNorm output quantizer so TRT fuses shared Q/DQ into
+      downstream attention matmuls. Softmax-output Q/DQ is inserted by the FP8 ONNX
+      exporter's post-processing (fixed 1/448 scale, no calibration needed).
     - For MXFP8, NVFP4: override Conv2d to FP8
     - For INT4_AWQ: override Conv2d to INT8
     """
     config: dict = copy.deepcopy(QUANT_CONFIG_DICT[quantize_mode])
-    if quantize_mode in ("mxfp8", "nvfp4"):
+    if quantize_mode == "fp8":
+        config["quant_cfg"].extend(_FP8_MHA_OVERRIDE)
+    elif quantize_mode in ("mxfp8", "nvfp4"):
         warnings.warn(
             f"TensorRT only supports FP8/INT8 for Conv layers. "
             f"Overriding Conv2d quantization to FP8 for '{quantize_mode}' mode."
@@ -458,6 +479,7 @@ def main():
     # Conv2d layers are overridden to FP8 (for TRT compatibility), those FP8
     # quantizers require calibration data.
     config = get_quant_config(args.quantize_mode)
+
     data_loader = load_calibration_data(
         model,
         args.calibration_data_size,
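The override pattern in get_quant_config is append-then-refine: the base config's wildcard entries stay in place and the MHA-specific entries are added after them, while deepcopy keeps the shared base dict untouched. A standalone sketch of just that logic (the base QUANT_CONFIG_DICT here is a minimal hypothetical stand-in, not modelopt's real table):

```python
import copy

# MHA-aware entries, mirroring the diff above.
_FP8_MHA_OVERRIDE = [
    {"parent_class": "nn.LayerNorm", "quantizer_name": "*output_quantizer",
     "cfg": {"num_bits": (4, 3), "axis": None}},
    {"parent_class": "nn.LayerNorm", "quantizer_name": "*input_quantizer",
     "enable": False},
]

# Hypothetical minimal base config; the real script reads modelopt's
# QUANT_CONFIG_DICT instead.
QUANT_CONFIG_DICT = {
    "fp8": {"quant_cfg": [{"quantizer_name": "*weight_quantizer"}]},
}

def get_quant_config(quantize_mode: str) -> dict:
    # deepcopy so per-call overrides never mutate the shared base table
    config = copy.deepcopy(QUANT_CONFIG_DICT[quantize_mode])
    if quantize_mode == "fp8":
        # Append, don't replace: later entries refine earlier wildcard matches.
        config["quant_cfg"].extend(_FP8_MHA_OVERRIDE)
    return config

cfg = get_quant_config("fp8")
assert len(cfg["quant_cfg"]) == 3
assert len(QUANT_CONFIG_DICT["fp8"]["quant_cfg"]) == 1  # base left intact
```

Appending rather than editing the base entries keeps the FP8 path identical to FP8_DEFAULT_CFG everywhere the MHA overrides do not match.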
