Skip to content

Commit 0186223

Browse files
committed
update function to replace duplicate casts
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent e2abd9d commit 0186223

File tree

5 files changed

+89
-160
lines changed

5 files changed

+89
-160
lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ NVIDIA Model Optimizer Changelog (Linux)
2222
- Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
2323
- Add support for image-text data calibration in PTQ for Nemotron VL models.
2424
- Add PTQ support for Nemotron Parse.
25+
- Replace modelopt FP8 QDQ nodes with native ONNX QDQ nodes
2526

2627
0.41 (2026-01-19)
2728
^^^^^^^^^^^^^^^^^

modelopt/onnx/autocast/precisionconverter.py

Lines changed: 2 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,42 +1147,13 @@ def _is_same_type_cast(self, node: onnx.NodeProto) -> bool:
11471147
output_type = utils.get_cast_to_type(node)
11481148
return all(inp_type == output_type for inp_type in input_types) and input_types is not None
11491149

1150-
def _is_sequential_cast(self, node: onnx.NodeProto) -> bool:
1151-
assert node.op_type == "Cast"
1152-
output_type = utils.get_cast_to_type(node)
1153-
1154-
# Cast to high precision -> cast to low precision, first cast has no impact and can be safely removed
1155-
# Cast to low precision -> cast to high precision affects precision and should not be removed
1156-
precision_order = [
1157-
TensorProto.DOUBLE,
1158-
TensorProto.FLOAT,
1159-
TensorProto.FLOAT16,
1160-
TensorProto.BFLOAT16,
1161-
]
1162-
consumers = [
1163-
n for n in utils.get_consumer_nodes(self.model, node.output[0]) if n.op_type == "Cast"
1164-
]
1165-
1166-
# If the first cast has additional consumers, we should not remove it
1167-
if len(consumers) != 1:
1168-
return False
1169-
1170-
next_node = consumers[0]
1171-
first_cast_type = output_type
1172-
second_cast_type = utils.get_cast_to_type(next_node)
1173-
1174-
return (
1175-
first_cast_type in precision_order
1176-
and second_cast_type in precision_order
1177-
and precision_order.index(first_cast_type) <= precision_order.index(second_cast_type)
1178-
)
1179-
11801150
def _remove_redundant_casts(self):
11811151
"""Removes both sequential casts and casts that don't change precision.
11821152
11831153
This method optimizes the graph by removing unnecessary cast operations that either:
11841154
1. Don't actually change the data type
11851155
2. Could be replaced by a single cast operation
1156+
3. Can be folded into a preceding Constant node
11861157
"""
11871158
if self.custom_ops:
11881159
self.model = self._propagate_types_shapes_custom_ops(self.model)
@@ -1198,35 +1169,7 @@ def _remove_redundant_casts(self):
11981169
check_type=True,
11991170
)
12001171

1201-
nodes_to_remove = []
1202-
for node in self.model.graph.node:
1203-
if node.op_type == "Cast":
1204-
# Find cast nodes that don't change precision
1205-
if self._is_same_type_cast(node):
1206-
nodes_to_remove.append(node)
1207-
self._bypass_cast_node(node)
1208-
logger.debug(f"Found redundant same-type cast: {node.name}")
1209-
continue
1210-
1211-
# Find sequential casts that don't change precision
1212-
if self._is_sequential_cast(node):
1213-
nodes_to_remove.append(node)
1214-
self._bypass_cast_node(node)
1215-
logger.debug(f"Found removable double-cast: {node.name}")
1216-
1217-
# Find foldable Constant -> Cast. Initializers are handled by _convert_initializers.
1218-
if self._is_foldable_constant_cast_pattern(node):
1219-
nodes_to_remove.append(node)
1220-
cast_producers = utils.get_producer_nodes(self.model, node.input[0])
1221-
assert len(cast_producers) == 1 and cast_producers[0].op_type == "Constant"
1222-
constant_producer = cast_producers[0]
1223-
self._convert_constant_values(constant_producer, node)
1224-
self._bypass_cast_node(node)
1225-
logger.debug(f"Found foldable Constant->Cast pattern, removing {node.name}")
1226-
1227-
logger.debug(f"Removing redundant casts: {[n.name for n in nodes_to_remove]}")
1228-
for node in nodes_to_remove:
1229-
self.model.graph.node.remove(node)
1172+
self.model = onnx_utils.remove_redundant_casts(self.model)
12301173

12311174
def _fix_network_output_names(self):
12321175
modified = False
@@ -1360,80 +1303,6 @@ def _get_tensor_type(self, tensor_name):
13601303
return self.initializer_map[tensor_name].data_type
13611304
raise Exception(f"did not find tensor {tensor_name}")
13621305

1363-
def _convert_constant_values(self, const_node, cast_node: onnx.NodeProto) -> None:
1364-
original_tensor = const_node.attribute[0].t
1365-
if original_tensor.data_type == onnx.TensorProto.BFLOAT16:
1366-
original_data = onnx_utils.read_f16_tensor_as_fp32(original_tensor)
1367-
else:
1368-
original_data = onnx.numpy_helper.to_array(original_tensor)
1369-
1370-
# Precompute casted value
1371-
cast_to_type = utils.get_cast_to_type(cast_node)
1372-
cast_dtype = onnx.helper.tensor_dtype_to_np_dtype(cast_to_type)
1373-
1374-
# Handle bfloat16 conversion manually since numpy doesn't support it natively
1375-
if cast_to_type == onnx.TensorProto.BFLOAT16:
1376-
casted_data = original_data.astype(ml_dtypes.bfloat16)
1377-
else:
1378-
casted_data = original_data.astype(cast_dtype)
1379-
1380-
# Create a new constant node with casted data
1381-
if cast_to_type == onnx.TensorProto.BFLOAT16:
1382-
# Create TensorProto manually for bfloat16
1383-
tensor_proto = onnx.TensorProto()
1384-
tensor_proto.name = const_node.output[0]
1385-
tensor_proto.data_type = onnx.TensorProto.BFLOAT16
1386-
tensor_proto.dims.extend(casted_data.shape)
1387-
# Convert bfloat16 to raw bytes
1388-
bf16_bytes = casted_data.astype(ml_dtypes.bfloat16).view(np.uint16)
1389-
tensor_proto.raw_data = bf16_bytes.tobytes()
1390-
else:
1391-
# Create tensor manually to ensure proper handling
1392-
tensor_proto = onnx.numpy_helper.from_array(casted_data)
1393-
tensor_proto.name = const_node.output[0]
1394-
1395-
new_const_node = onnx.helper.make_node(
1396-
"Constant",
1397-
inputs=[],
1398-
outputs=const_node.output,
1399-
value=tensor_proto,
1400-
name=const_node.name,
1401-
)
1402-
1403-
# Replace the original constant node with the new constant node
1404-
# The scope of this function is to convert the constant node data. Removing the cast is done later.
1405-
for node in utils.get_consumer_nodes(self.model, const_node.name):
1406-
for i, input_name in enumerate(node.input):
1407-
if input_name == const_node.name:
1408-
node.input[i] = new_const_node.output[0]
1409-
break
1410-
1411-
const_idx = -1
1412-
for i, node in enumerate(self.model.graph.node):
1413-
if node == const_node:
1414-
const_idx = i
1415-
break
1416-
1417-
self.model.graph.node.remove(const_node)
1418-
self.model.graph.node.insert(const_idx, new_const_node)
1419-
# The Cast node is the sole consumer of the Constant node, guaranteed by _is_foldable_constant_cast_pattern
1420-
cast_node.input[0] = new_const_node.output[0]
1421-
1422-
def _is_foldable_constant_cast_pattern(self, node: onnx.NodeProto) -> bool:
1423-
"""Constant -> Cast and Cast is the only consumer of the Constant node."""
1424-
assert node.op_type == "Cast"
1425-
1426-
producer = utils.get_producer_nodes(self.model, node.input[0])
1427-
1428-
const_producer = (
1429-
producer[0] if len(producer) == 1 and producer[0].op_type == "Constant" else None
1430-
)
1431-
1432-
if const_producer:
1433-
get_consumer_nodes = utils.get_consumer_nodes(self.model, const_producer.output[0])
1434-
return len(get_consumer_nodes) == 1 and get_consumer_nodes[0] == node
1435-
return False
1436-
14371306
def _sanitize_model(self):
14381307
graph_sanitizer = GraphSanitizer(
14391308
self.model,

modelopt/onnx/export/fp8_exporter.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def post_process(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
127127
# Create FP8 zero point constant
128128
zp_tensor = onnx.TensorProto()
129129
zp_tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN
130+
zp_tensor.dims.extend([1]) # 1-element tensor
130131
zp_tensor.raw_data = b"\x00" # Zero in FP8
131132
zp_values = LazyValues(zp_tensor)
132133
zero_point = gs.Constant(node.name + "_zero_point", zp_values)

modelopt/onnx/utils.py

Lines changed: 83 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,48 +1215,106 @@ def onnx_type_str_to_enum(dtype: str) -> int:
12151215
return getattr(onnx.TensorProto, dtype)
12161216

12171217

1218-
def remove_duplicate_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
1219-
"""Removes consecutive Cast nodes that cast to the same type.
1218+
def remove_redundant_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
1219+
"""Removes redundant Cast nodes from an ONNX model.
12201220
1221-
Example: Cast(to=FP16) -> Cast(to=FP16) becomes just Cast(to=FP16)
1221+
Handles three patterns:
1222+
1. Same-type casts: Cast where input type == output type (no-op)
1223+
2. Sequential casts: Cast(to=high_prec) -> Cast(to=low_prec), first cast removed
1224+
3. Constant->Cast folding: Fold cast into preceding Constant node's data
1225+
1226+
Args:
1227+
onnx_model: The ONNX model to optimize.
1228+
1229+
Returns:
1230+
onnx.ModelProto: Model with redundant casts removed.
12221231
"""
1232+
import ml_dtypes
1233+
12231234
graph = gs.import_onnx(onnx_model)
12241235
removed_count = 0
12251236

1237+
# Precision ordering: lower index = higher precision
1238+
precision_order = {
1239+
onnx.TensorProto.DOUBLE: 0,
1240+
onnx.TensorProto.FLOAT: 1,
1241+
onnx.TensorProto.FLOAT16: 2,
1242+
onnx.TensorProto.BFLOAT16: 3,
1243+
}
1244+
1245+
def _get_onnx_type(tensor):
1246+
"""Get ONNX type enum from a GS tensor's dtype."""
1247+
if tensor.dtype is None:
1248+
return None
1249+
try:
1250+
return onnx.helper.np_dtype_to_tensor_dtype(tensor.dtype)
1251+
except Exception:
1252+
return None
1253+
1254+
def _bypass_cast(node):
1255+
"""Reconnect consumers of cast output to use cast input, removing the cast."""
1256+
inp = node.inputs[0]
1257+
out = node.outputs[0]
1258+
for consumer in list(out.outputs):
1259+
for i, consumer_inp in enumerate(consumer.inputs):
1260+
if consumer_inp is out:
1261+
consumer.inputs[i] = inp
1262+
for i, graph_out in enumerate(graph.outputs):
1263+
if graph_out is out:
1264+
graph.outputs[i] = inp
1265+
node.outputs.clear()
1266+
12261267
for node in list(graph.nodes):
12271268
if node.op != "Cast":
12281269
continue
12291270

1230-
# Check if output goes to exactly one Cast node
1231-
if len(node.outputs) != 1 or len(node.outputs[0].outputs) != 1:
1271+
cast_to = node.attrs.get("to")
1272+
if cast_to is None:
12321273
continue
12331274

1234-
next_node = node.outputs[0].outputs[0]
1235-
if next_node.op != "Cast":
1236-
continue
1275+
input_tensor = node.inputs[0]
1276+
output_tensor = node.outputs[0]
12371277

1238-
first_to = node.attrs.get("to")
1239-
second_to = next_node.attrs.get("to")
1240-
1241-
# Only handle same-type casts
1242-
if first_to != second_to:
1278+
# Pattern 1: Same-type cast (no-op)
1279+
input_type = _get_onnx_type(input_tensor)
1280+
if input_type is not None and input_type == cast_to:
1281+
_bypass_cast(node)
1282+
removed_count += 1
1283+
logger.debug(f"Removed same-type cast: {node.name}")
12431284
continue
12441285

1245-
# Bypass the second cast - keep first, remove second
1246-
input_tensor = node.outputs[0]
1247-
output_tensor = next_node.outputs[0]
1248-
1249-
for consumer in list(output_tensor.outputs):
1250-
for i, inp in enumerate(consumer.inputs):
1251-
if inp == output_tensor:
1252-
consumer.inputs[i] = input_tensor
1253-
next_node.outputs.clear()
1254-
removed_count += 1
1255-
logger.debug(f"Removed duplicate cast: {next_node.name} (same type as {node.name})")
1286+
# Pattern 2: Sequential casts where first can be removed
1287+
# Cast(to=high) -> Cast(to=low): first cast has no effect
1288+
cast_consumers = output_tensor.outputs
1289+
if len(cast_consumers) == 1 and cast_consumers[0].op == "Cast":
1290+
next_cast_to = cast_consumers[0].attrs.get("to")
1291+
if (
1292+
cast_to in precision_order
1293+
and next_cast_to in precision_order
1294+
and precision_order[cast_to] <= precision_order[next_cast_to]
1295+
):
1296+
_bypass_cast(node)
1297+
removed_count += 1
1298+
logger.debug(f"Removed sequential cast: {node.name}")
1299+
continue
1300+
1301+
# Pattern 3: Constant -> Cast folding (only if constant has single consumer)
1302+
if isinstance(input_tensor, Constant) and len(input_tensor.outputs) == 1:
1303+
try:
1304+
if cast_to == onnx.TensorProto.BFLOAT16:
1305+
input_tensor.values = input_tensor.values.astype(ml_dtypes.bfloat16)
1306+
else:
1307+
cast_dtype = onnx.helper.tensor_dtype_to_np_dtype(cast_to)
1308+
input_tensor.values = input_tensor.values.astype(cast_dtype)
1309+
_bypass_cast(node)
1310+
removed_count += 1
1311+
logger.debug(f"Folded Constant->Cast: {node.name}")
1312+
except Exception as e:
1313+
logger.debug(f"Failed to fold Constant->Cast {node.name}: {e}")
12561314

12571315
if removed_count > 0:
12581316
graph.cleanup().toposort()
1259-
logger.info(f"Removed {removed_count} duplicate Cast nodes")
1317+
logger.info(f"Removed {removed_count} redundant Cast nodes")
12601318

12611319
return gs.export_onnx(graph)
12621320

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@
5050
get_output_names,
5151
get_output_shapes,
5252
infer_shapes,
53-
remove_duplicate_casts,
5453
remove_node_training_mode,
54+
remove_redundant_casts,
5555
)
5656
from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
5757
from modelopt.torch.utils import flatten_tree, standardize_named_model_args
@@ -589,7 +589,7 @@ def get_onnx_bytes_and_metadata(
589589
# Change FP32 cast nodes feeding into Concat/Add to FP16
590590
onnx_opt_graph = change_casts_to_fp16(onnx_opt_graph, ["Concat", "Add"])
591591

592-
onnx_opt_graph = remove_duplicate_casts(onnx_opt_graph)
592+
onnx_opt_graph = remove_redundant_casts(onnx_opt_graph)
593593

594594
# TensorRT expects all scales to be positive
595595
onnx_opt_graph = replace_zero_scale_with_smallest_nonzero(onnx_opt_graph)

0 commit comments

Comments
 (0)