Skip to content
3 changes: 2 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ NVIDIA Model Optimizer Changelog
- Improve ``auto_quantize`` checkpoint/resume: calibration state is now saved and restored across runs, avoiding redundant calibration when resuming a search.
- Add support for Nemotron-3 (NemotronHForCausalLM) model quantization and for NemotronH MoE experts in ``auto_quantize`` grouping and scoring rules.
- Add support for block-granular RHT for non-power-of-2 dimensions.

- Replace modelopt FP8 QDQ nodes with native ONNX QDQ nodes.

**Misc**

- Migrated project metadata from ``setup.py`` to a fully declarative ``pyproject.toml``.
Expand Down
14 changes: 7 additions & 7 deletions modelopt/onnx/autocast/graphsanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def remove_disconnected_outputs(self) -> None:
"""Remove disconnected outputs from the model."""
tensors_to_remove = []
for tensor in self.model.graph.output:
if not utils.get_producer_nodes(self.model, tensor.name):
if not onnx_utils.get_producer_nodes(self.model, tensor.name):
tensors_to_remove.append(tensor)
logger.debug(f"Found disconnected output: {tensor.name}")

Expand Down Expand Up @@ -279,7 +279,7 @@ def _match_layernorm_pattern(self, mean_node: onnx.NodeProto) -> dict | None:
# Find variance computation branch
pow_nodes = [
n
for n in utils.get_consumer_nodes(self.model, sub_node.output[0])
for n in onnx_utils.get_consumer_nodes(self.model, sub_node.output[0])
if n.op_type == "Pow"
]
if len(pow_nodes) != 1:
Expand All @@ -303,8 +303,8 @@ def _match_layernorm_pattern(self, mean_node: onnx.NodeProto) -> dict | None:

# Find Div node
# Find the Div node that consumes both sqrt and sub outputs
sqrt_consumers = utils.get_consumer_nodes(self.model, sqrt_node.output[0])
sub_consumers = utils.get_consumer_nodes(self.model, sub_node.output[0])
sqrt_consumers = onnx_utils.get_consumer_nodes(self.model, sqrt_node.output[0])
sub_consumers = onnx_utils.get_consumer_nodes(self.model, sub_node.output[0])

div_nodes = [n for n in sqrt_consumers if n in sub_consumers and n.op_type == "Div"]
if len(div_nodes) != 1:
Expand Down Expand Up @@ -342,14 +342,14 @@ def _match_layernorm_pattern(self, mean_node: onnx.NodeProto) -> dict | None:
div_node,
]

consumers = utils.get_consumer_nodes(self.model, div_node.output[0])
consumers = onnx_utils.get_consumer_nodes(self.model, div_node.output[0])
if len(consumers) == 1 and consumers[0].op_type == "Mul":
mul_node = consumers[0]
scale = self._get_initializer_value(mul_node.input[1], return_array=True)
final_node = mul_node
nodes_to_remove.append(mul_node)

consumers = utils.get_consumer_nodes(self.model, mul_node.output[0])
consumers = onnx_utils.get_consumer_nodes(self.model, mul_node.output[0])
if len(consumers) == 1 and consumers[0].op_type == "Add":
add_node = consumers[0]
bias = self._get_initializer_value(add_node.input[1], return_array=True)
Expand Down Expand Up @@ -457,7 +457,7 @@ def _create_layernorm_node(self, pattern: dict) -> onnx.NodeProto:

def _find_insertion_point(self, input_name: str) -> int:
"""Find the correct insertion point for the new LayerNorm node."""
producer_nodes = utils.get_producer_nodes(self.model, input_name)
producer_nodes = onnx_utils.get_producer_nodes(self.model, input_name)
if not producer_nodes:
return 0

Expand Down
221 changes: 20 additions & 201 deletions modelopt/onnx/autocast/precisionconverter.py

Large diffs are not rendered by default.

47 changes: 2 additions & 45 deletions modelopt/onnx/autocast/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import onnx

import modelopt.onnx.utils as onnx_utils
from modelopt.onnx.utils import get_opset_version


Expand Down Expand Up @@ -60,32 +61,6 @@ def setup_mappings(model: onnx.ModelProto) -> tuple[dict, dict, dict]:
return value_info_map, initializer_map, node_to_init_map


def get_consumer_nodes(model: onnx.ModelProto, tensor_name: str) -> list[onnx.NodeProto]:
"""Get all consumer nodes for a given tensor name.

Args:
model: The ONNX model to search.
tensor_name: Name of the tensor to find consumers for.

Returns:
list[onnx.NodeProto]: List of nodes that consume the tensor.
"""
return [n for n in model.graph.node if tensor_name in n.input]


def get_producer_nodes(model: onnx.ModelProto, tensor_name: str) -> list[onnx.NodeProto]:
"""Get all producer nodes for a given tensor name.

Args:
model: The ONNX model to search.
tensor_name: Name of the tensor to find producers for.

Returns:
list[onnx.NodeProto]: List of nodes that produce the tensor.
"""
return [n for n in model.graph.node if tensor_name in n.output]


def get_unique_consumer_node(model: onnx.ModelProto, tensor_name: str) -> onnx.NodeProto:
"""Get a single consumer node and raise exception if there are multiple consumers.

Expand All @@ -99,30 +74,12 @@ def get_unique_consumer_node(model: onnx.ModelProto, tensor_name: str) -> onnx.N
Raises:
Exception: If there is not exactly one consumer node.
"""
consumers = get_consumer_nodes(model, tensor_name)
consumers = onnx_utils.get_consumer_nodes(model, tensor_name)
if len(consumers) != 1:
raise Exception(f"Expected single consumer for {tensor_name}, found {len(consumers)}")
return consumers[0]


def get_cast_to_type(cast_node: onnx.NodeProto) -> int:
"""Get the target type from a Cast node.

Args:
cast_node: The Cast node to extract type from.

Returns:
int: The target type value from the Cast node's 'to' attribute.

Raises:
ValueError: If the Cast node does not have a 'to' attribute.
"""
for attr in cast_node.attribute:
if attr.name == "to":
return attr.i
raise ValueError("Cast node does not have 'to' attribute")


def walk_subgraphs_recursive(
graph: onnx.GraphProto,
callback: Callable,
Expand Down
57 changes: 50 additions & 7 deletions modelopt/onnx/export/fp8_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import torch
from onnx_graphsurgeon.ir.tensor import LazyValues

from modelopt.onnx.logging_config import logger

from .base_exporter import ONNXQuantExporter


Expand All @@ -45,13 +47,13 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
Even though modelopt supports FP8 onnx export, the weights are represented in fp32 + QDQ.
Weight storage is therefore very inefficient. In this function,
Q nodes will get removed from the weights and have only DQ nodes with those converted FP8
weights in the output model.
weights in the output model. TRT custom ops are converted to native ONNX DequantizeLinear.

Parameters:
onnx_model: ONNX model with FP32/FP16 weights and QDQ nodes.
onnx_model: ONNX model with FP32/FP16 weights and TRT_FP8 QDQ nodes.

Returns:
ONNX model with FP8 weights and only DQ nodes for weights (QDQ preserved for activations).
ONNX model with FP8 weights and native ONNX DQ nodes for weights (QDQ preserved for activations).
"""
start_time = time.time()
print("Replacing all (fp32 weights + fp8 QDQ) with (fp8 weights + DQ)...")
Expand All @@ -62,7 +64,7 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:

for node in graph.nodes:
if node.op == "TRT_FP8QuantizeLinear":
# Should not remove input QDQ
# Should not remove input QDQ (only process weight quantization)
if not isinstance(node.inputs[0], gs.Constant):
continue

Expand All @@ -88,7 +90,7 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values)

node.outputs.clear()
# DQ Op is separated out
# Convert TRT DQ to native ONNX DequantizeLinear with FP8 weights
dq_op.inputs[0] = onnx_weights_fp8
dq_op.op = "DequantizeLinear"
dq_op.outputs[0].dtype = dq_op.inputs[1].dtype
Expand All @@ -101,5 +103,46 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:

@staticmethod
def post_process(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
"""Post-processes the ONNX model for FP8 quantization."""
return onnx_model
"""Post-processes the ONNX model for FP8 quantization.

Converts TRT_FP8 QDQ ops to native ONNX QuantizeLinear/DequantizeLinear:
Comment thread
gcunhase marked this conversation as resolved.
- TRT_FP8QuantizeLinear -> QuantizeLinear with FP8E4M3FN zero_point and saturate=1
- TRT_FP8DequantizeLinear -> DequantizeLinear

Args:
onnx_model: The ONNX model containing TRT_FP8 quantization nodes.

Returns:
The post-processed ONNX model with native ONNX quantization ops.
"""
logger.info("Post-processing FP8 quantized model")
graph = gs.import_onnx(onnx_model)

# Convert TRT_FP8QuantizeLinear to native QuantizeLinear
for node in graph.nodes:
if node.op == "TRT_FP8QuantizeLinear":
node.op = "QuantizeLinear"
# Add FP8 zero_point if not present
if len(node.inputs) == 2:
# Create FP8 zero point constant
zp_tensor = onnx.TensorProto()
zp_tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
zp_tensor.dims.extend([1]) # 1-element tensor
zp_tensor.raw_data = b"\x00" # Zero in FP8
zp_values = LazyValues(zp_tensor)
zero_point = gs.Constant(node.name + "_zero_point", zp_values)
node.inputs.append(zero_point)
Comment thread
gcunhase marked this conversation as resolved.
Comment on lines +127 to +134
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a guaranteed-unique tensor name for the injected zero point.

node.name is optional in ONNX, so node.name + "_zero_point" can collapse to the same tensor name for multiple unnamed TRT FP8 Q nodes. That can make the exported graph invalid due to duplicate tensor names.

🛠️ Safer naming
-                    zero_point = gs.Constant(node.name + "_zero_point", zp_values)
+                    zero_point = gs.Constant(f"{node.outputs[0].name}_zero_point", zp_values)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/onnx/export/fp8_exporter.py` around lines 127 - 134, The injected
zero-point Constant currently uses node.name which may be empty and cause
duplicate tensor names; change the naming for the Constant created from
zp_tensor/zp_values/zero_point to a guaranteed-unique string (e.g., combine
node.name when present with a unique suffix such as a uuid4 or the node's memory
id or an incrementing counter, or use an ONNX/graph helper that returns a unique
name) so each FP8 zero-point Constant has a distinct tensor name even for
unnamed TRT FP8 Q nodes.

# Add saturate attribute for FP8
node.attrs["saturate"] = 1
logger.debug(f"Converted {node.name} from TRT_FP8QuantizeLinear to QuantizeLinear")

# Convert TRT_FP8DequantizeLinear to native DequantizeLinear
for node in graph.nodes:
if node.op == "TRT_FP8DequantizeLinear":
node.op = "DequantizeLinear"
logger.debug(
f"Converted {node.name} from TRT_FP8DequantizeLinear to DequantizeLinear"
)

graph.cleanup().toposort()
Comment on lines +121 to +147
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

Using the official ONNX operator and type documentation, what is the minimum ai.onnx opset that supports FLOAT8E4M3FN with QuantizeLinear and DequantizeLinear, including the saturate attribute on QuantizeLinear?

💡 Result:

Minimum ai.onnx opset: 19.

  • QuantizeLinear first supports tensor(float8e4m3fn) (via its T2 type constraint) in since_version = 19, and this is also the version that defines the saturate attribute (applying to float8 quantization). [1]
  • DequantizeLinear first supports tensor(float8e4m3fn) (via its T1 type constraint) in since_version = 19. [2]

Sources:
[1] QuantizeLinear (ONNX operator doc, version 19) (onnx.ai)
[2] DequantizeLinear (ONNX operator doc, version 19) (onnx.ai)

Citations:


🏁 Script executed:

# Check the fp8_exporter.py file to understand the full context
# Look for the function containing this code and any opset checks
fd "fp8_exporter.py" -x wc -l {} \;

Repository: NVIDIA/Model-Optimizer

Length of output: 153


🏁 Script executed:

# Read the file to understand the function context and any opset validation
cat -n modelopt/onnx/export/fp8_exporter.py | head -200

Repository: NVIDIA/Model-Optimizer

Length of output: 7470


🏁 Script executed:

# Search for opset-related code and checks in the file
rg -n "opset|onnx_opset" modelopt/onnx/export/fp8_exporter.py -B 2 -A 2

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Find the base exporter to understand the parent class
fd "base_exporter.py" modelopt/onnx/export

Repository: NVIDIA/Model-Optimizer

Length of output: 103


🏁 Script executed:

# Search for calls to post_process and get_onnx_bytes_and_metadata
rg -n "post_process|get_onnx_bytes_and_metadata" modelopt/onnx/export --type py

Repository: NVIDIA/Model-Optimizer

Length of output: 942


🏁 Script executed:

# Check if there's opset validation in the base exporter
cat -n modelopt/onnx/export/base_exporter.py | head -300

Repository: NVIDIA/Model-Optimizer

Length of output: 2365


🏁 Script executed:

# Search for get_onnx_bytes_and_metadata function
rg -n "get_onnx_bytes_and_metadata|onnx_opset" modelopt/onnx --type py -B 2 -A 5

Repository: NVIDIA/Model-Optimizer

Length of output: 823


🏁 Script executed:

# Look for where FP8QuantExporter is instantiated/used
rg -n "FP8QuantExporter" modelopt/onnx --type py -B 2 -A 3

Repository: NVIDIA/Model-Optimizer

Length of output: 1792


🏁 Script executed:

# Check the quantizer file that likely calls the exporter
fd "quantizer.py" modelopt/onnx -x wc -l {} \;

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Search more broadly for onnx_opset and opset parameter usage
rg -n "onnx_opset|opset_version" modelopt/onnx --type py

Repository: NVIDIA/Model-Optimizer

Length of output: 3244


🏁 Script executed:

# Look for quantizer or export related files that handle opset
fd "quantizer|export" modelopt/onnx --type py | head -20

Repository: NVIDIA/Model-Optimizer

Length of output: 235


🏁 Script executed:

# Check the quantizer base class
rg -n "class.*Quantizer" modelopt/onnx --type py -A 10 | head -50

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at the ort_patching.py file around the FP8 opset check
cat -n modelopt/onnx/quantization/ort_patching.py | sed -n '220,250p'

Repository: NVIDIA/Model-Optimizer

Length of output: 1445


🏁 Script executed:

# Check where _check_opset_version is called and in what context
rg -n "_check_opset_version|check_opset_version" modelopt/onnx/quantization --type py -B 3 -A 3

Repository: NVIDIA/Model-Optimizer

Length of output: 1253


🏁 Script executed:

# Look at the quantize.py file to understand the full flow
cat -n modelopt/onnx/quantization/quantize.py | sed -n '140,185p'

Repository: NVIDIA/Model-Optimizer

Length of output: 2716


🏁 Script executed:

# Search for get_onnx_bytes_and_metadata function
rg -n "get_onnx_bytes_and_metadata" modelopt/onnx

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at the quantize_mode constants to understand FP8 mode handling
rg -n "QDQ_PRECISION_MIN_OPSET|fp8" modelopt/onnx/quantization/quantize.py -B 2 -A 2

Repository: NVIDIA/Model-Optimizer

Length of output: 3436


🏁 Script executed:

# Check if there's a separate export entry point that may bypass opset checks
rg -n "def.*export|get_onnx_bytes" modelopt/onnx --type py | head -30

Repository: NVIDIA/Model-Optimizer

Length of output: 473


🏁 Script executed:

# Search for quantize_fp8 function
rg -n "def quantize_fp8|def quantize" modelopt/onnx/quantization/fp8.py -A 10 | head -50

Repository: NVIDIA/Model-Optimizer

Length of output: 604


🏁 Script executed:

# Look for where process_model is called
rg -n "process_model" modelopt/onnx --type py -B 2 -A 2

Repository: NVIDIA/Model-Optimizer

Length of output: 453


🏁 Script executed:

# Check the fp8.py file to see if it calls FP8QuantExporter.post_process
cat -n modelopt/onnx/quantization/fp8.py | head -150

Repository: NVIDIA/Model-Optimizer

Length of output: 7314


🏁 Script executed:

# Continue reading fp8.py to see where FP8QuantExporter is used
cat -n modelopt/onnx/quantization/fp8.py | sed -n '150,250p'

Repository: NVIDIA/Model-Optimizer

Length of output: 5087


🏁 Script executed:

# Search for where FP8QuantExporter.process_model is called
rg -n "\.process_model|FP8QuantExporter" modelopt/onnx --type py -B 2 -A 2

Repository: NVIDIA/Model-Optimizer

Length of output: 1639


🏁 Script executed:

# Check for any test files or examples that use FP8 export
fd "test.*fp8|fp8.*test" modelopt --type py

Repository: NVIDIA/Model-Optimizer

Length of output: 235


🏁 Script executed:

# Search for where process_model is called with FP8QuantExporter
rg -n "process_model\|FP8QuantExporter" modelopt/onnx

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look in llm_export_utils since it showed up in opset search
cat -n modelopt/onnx/llm_export_utils/export_utils.py | sed -n '155,170p'

Repository: NVIDIA/Model-Optimizer

Length of output: 379


🏁 Script executed:

# Check the entire quantize function end to see if it calls process_model or post_process
cat -n modelopt/onnx/quantization/fp8.py | tail -100

Repository: NVIDIA/Model-Optimizer

Length of output: 5585


🏁 Script executed:

# Search for get_onnx_bytes_and_metadata more carefully
rg -n "get_onnx_bytes_and_metadata\|get_onnx_bytes"

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look for uses of FP8QuantExporter more comprehensively
rg -rn "FP8QuantExporter" modelopt/

Repository: NVIDIA/Model-Optimizer

Length of output: 667


🏁 Script executed:

# Check if there's an export function that uses the exporter classes
rg -n "def export\|def get_onnx" modelopt/onnx -B 2 -A 5 | head -80

Repository: NVIDIA/Model-Optimizer

Length of output: 48


🏁 Script executed:

# Look at the torch_onnx.py to see how exporters are used
cat -n modelopt/torch/_deploy/utils/torch_onnx.py | grep -A 10 -B 10 "FP8QuantExporter\|n\|onnx_exporters"

Repository: NVIDIA/Model-Optimizer

Length of output: 31963


🏁 Script executed:

# Check the actual file content more carefully
grep -n "class.*Exporter" modelopt/onnx/export/*.py

Repository: NVIDIA/Model-Optimizer

Length of output: 564


🏁 Script executed:

# Look at what might be calling process_model on FP8QuantExporter
rg -n "process_model\|post_process" modelopt/torch

Repository: NVIDIA/Model-Optimizer

Length of output: 48


Add opset >= 19 validation before FP8 Q/DQ conversion in FP8QuantExporter.post_process().

The code converts TRT custom ops to native QuantizeLinear/DequantizeLinear with FLOAT8E4M3FN and the saturate attribute, but does not verify that the model's opset is >= 19 (the minimum required for these operators). When callers invoke get_onnx_bytes_and_metadata() with onnx_opset < 19 on an FP8-quantized model, the post-processor will silently generate an invalid ONNX model instead of upgrading the opset or raising an error.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/onnx/export/fp8_exporter.py` around lines 121 - 147, In
FP8QuantExporter.post_process(), before converting
TRT_FP8QuantizeLinear/TRT_FP8DequantizeLinear to native
QuantizeLinear/DequantizeLinear using FLOAT8E4M3FN and the saturate attribute,
validate the model opset version is >= 19; locate the method
FP8QuantExporter.post_process and check the graph/model opset (opset_import or
graph.model.opset_import) and if opset < 19 either raise a clear exception
(e.g., ValueError) telling callers to use onnx_opset >= 19 or programmatically
upgrade the model opset to 19 before performing the conversions (and then
proceed with the existing replacement logic for TRT_FP8QuantizeLinear and
TRT_FP8DequantizeLinear).

return gs.export_onnx(graph)
Comment thread
gcunhase marked this conversation as resolved.
6 changes: 3 additions & 3 deletions modelopt/onnx/export/nvfp4_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def compute_scales(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
logger.debug(f"Found {len(fp4_qdq_nodes)} FP4QDQ nodes to process")

for node in fp4_qdq_nodes:
idx = initializer_indices.get(node.input[0], None)
idx = initializer_indices.get(node.input[0])
assert idx is not None, f"Initializer for weight '{node.input[0]}' not found."

tensor = initializers[idx]
Expand Down Expand Up @@ -259,7 +259,7 @@ def compress_weights(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
fp4_qdq_nodes = [node for node in graph.node if node.op_type == "TRT_FP4QDQ"]

for node in fp4_qdq_nodes:
idx = initializer_indices.get(node.input[0], None)
idx = initializer_indices.get(node.input[0])
assert idx is not None, f"Initializer for weight '{node.input[0]}' not found."

tensor = initializers[idx]
Expand Down Expand Up @@ -365,7 +365,7 @@ def _cast_input_dtypes(node: onnx.NodeProto, precision_dtype: str):
logger.debug(f"Found {len(fp4_qdq_nodes)} FP4QDQ nodes to convert")

for node in fp4_qdq_nodes:
idx = initializer_indices.get(node.input[0], None)
idx = initializer_indices.get(node.input[0])
assert idx is not None, f"Initializer for weight '{node.input[0]}' not found."
initializers_to_delete.append(graph.initializer[idx].name)

Expand Down
Loading
Loading