Commit df090ec

Fix TRT strongly-typed parsing for torch_onnx quantized exports
Three related fixes so all torch_onnx tests build a TensorRT engine under --stronglyTyped:

- onnx/utils.py: Add fold_qdq_scale_fp16_to_fp32_casts and extend fold_dq_fp32_to_fp16_casts to propagate FP16 through nested DQ-scale chains (NVFP4 double-DQ). Without this, the outer DQ output was retyped to FP16 while its FP32 scale stayed in the graph, leaving the downstream Gemm in FP32 and mismatching the FP16 bias.
- torch/_deploy/utils/torch_onnx.py: Run fold_qdq_scale_fp16_to_fp32_casts after fold_dq_fp32_to_fp16_casts so Cast(FP16->FP32) nodes injected in front of Q/DQ scale inputs by onnxconverter_common are removed.
- examples/torch_onnx/torch_quant_to_onnx.py: Skip downsample.reduction (Swin/SwinV2 4D Linear incompatible with TRT DynamicQuantize) and pass strict=False to load_calib_amax so quantizers that never saw a tensor with calibration_data_size=1 do not crash calibration.

Signed-off-by: ajrasane <arasane@nvidia.com>
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent 76e6be1 · commit df090ec

3 files changed: 143 additions & 28 deletions
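
The commit message references building under --stronglyTyped. A quick way to sanity-check an exported model against a strongly-typed TensorRT parse is sketched below; this is not part of the commit, and it assumes trtexec is on PATH and that the export was saved as model.onnx (both assumptions).

import subprocess

# Build a strongly-typed TensorRT engine from the exported ONNX file (paths assumed).
subprocess.run(
    ["trtexec", "--onnx=model.onnx", "--stronglyTyped"],
    check=True,
)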


examples/torch_onnx/torch_quant_to_onnx.py

Lines changed: 7 additions & 3 deletions
@@ -122,11 +122,15 @@ def get_quant_config(quantize_mode):
 
 
 def filter_func(name):
-    """Filter function to exclude certain layers from quantization."""
+    """Filter function to exclude certain layers from quantization.
+
+    ``downsample.reduction`` (Swin/SwinV2) is excluded because it operates on 4D tensors
+    and TRT's DynamicQuantize layer (used for MXFP8/NVFP4) requires 2D/3D input.
+    """
     pattern = re.compile(
         r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding|"
         r"pos_embed|time_text_embed|context_embedder|norm_out|x_embedder|patch_embed|cpb_mlp|"
-        r"maxpool|global_pool).*"
+        r"maxpool|global_pool|downsample\.reduction).*"
     )
     return pattern.match(name) is not None
 

@@ -192,7 +196,7 @@ def _calibrate_uncalibrated_quantizers(model, data_loader):
 
     for quantizer in uncalibrated:
         quantizer.disable_calib()
-        quantizer.load_calib_amax()
+        quantizer.load_calib_amax(strict=False)
 
 
 def quantize_model(model, config, data_loader=None):
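
A quick way to see what the widened regex now excludes (a standalone sketch, not part of the commit; the module paths below are made-up Swin-style examples):

import re

# Same pattern as filter_func above, reproduced standalone for illustration.
pattern = re.compile(
    r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding|"
    r"pos_embed|time_text_embed|context_embedder|norm_out|x_embedder|patch_embed|cpb_mlp|"
    r"maxpool|global_pool|downsample\.reduction).*"
)

# Hypothetical Swin/SwinV2 module names, for illustration only.
print(bool(pattern.match("layers.0.downsample.reduction")))  # True  -> excluded from quantization
print(bool(pattern.match("layers.0.blocks.1.mlp.fc1")))      # False -> still quantized

Excluding the patch-merging Linear keeps its 4D input out of TRT's DynamicQuantize path, per the docstring added above.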

modelopt/onnx/utils.py

Lines changed: 132 additions & 25 deletions
@@ -1513,12 +1513,18 @@ def fold_dq_fp32_to_fp16_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
     2. Updating the DQ output type to FP16 in value_info
     3. Bypassing and removing the Cast node
 
+    NVFP4 uses a nested DQ chain (scale is itself a DQ output). When the outer DQ's scale
+    is produced by another DQ, recursively retype the inner DQ's chain so the whole
+    chain produces FP16 tensors under strongly-typed TRT parsing.
+
     Args:
         onnx_model: The ONNX model with DQ -> Cast(FP32->FP16) patterns.
 
     Returns:
         The ONNX model with Cast nodes removed and DQ outputs set to FP16.
     """
+    import numpy as np
+
     dq_ops = {"DequantizeLinear", "TRT_FP8DequantizeLinear"}
 
     # Build a map of tensor name -> producer node
@@ -1532,51 +1538,66 @@ def fold_dq_fp32_to_fp16_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
         init.name: init for init in onnx_model.graph.initializer
     }
 
+    value_info_map: dict[str, onnx.ValueInfoProto] = {
+        vi.name: vi for vi in onnx_model.graph.value_info
+    }
+
+    retyped_dq_outputs: set[str] = set()
+
+    def _convert_fp32_init_to_fp16(init: onnx.TensorProto) -> None:
+        scale_data = np.frombuffer(init.raw_data, dtype=np.float32)
+        if not scale_data.size:
+            scale_data = np.array(init.float_data, dtype=np.float32)
+        init.data_type = onnx.TensorProto.FLOAT16
+        init.raw_data = scale_data.astype(np.float16).tobytes()
+        del init.float_data[:]
+
+    def _retype_dq_chain(dq_node: onnx.NodeProto, depth: int = 0) -> None:
+        """Propagate FP16 output type down through a DQ's scale chain."""
+        if depth > 4 or len(dq_node.input) < 2:
+            return
+        scale_name = dq_node.input[1]
+        scale_init = initializer_map.get(scale_name)
+        if scale_init is not None:
+            if scale_init.data_type == onnx.TensorProto.FLOAT:
+                _convert_fp32_init_to_fp16(scale_init)
+            return
+        scale_producer = producer_map.get(scale_name)
+        if scale_producer is None or scale_producer.op_type not in dq_ops:
+            return
+        _retype_dq_chain(scale_producer, depth + 1)
+        if scale_name in value_info_map:
+            value_info_map[scale_name].type.tensor_type.elem_type = onnx.TensorProto.FLOAT16
+        retyped_dq_outputs.add(scale_name)
+
     nodes_to_remove = []
     for node in onnx_model.graph.node:
         if node.op_type != "Cast":
             continue
 
-        # Check: Cast target is FP16
         cast_to = None
         for attr in node.attribute:
             if attr.name == "to":
                 cast_to = attr.i
         if cast_to != onnx.TensorProto.FLOAT16:
             continue
 
-        # Check: producer is a DQ node
         producer = producer_map.get(node.input[0])
        if producer is None or producer.op_type not in dq_ops:
            continue
 
-        # Convert the DQ scale initializer from FP32 to FP16
-        # DQ inputs: [input, scale, (zero_point)]
-        if len(producer.input) >= 2:
-            scale_name = producer.input[1]
-            if scale_name in initializer_map:
-                scale_init = initializer_map[scale_name]
-                if scale_init.data_type == onnx.TensorProto.FLOAT:
-                    import numpy as np
-
-                    scale_data = np.frombuffer(scale_init.raw_data, dtype=np.float32)
-                    if not scale_data.size:
-                        scale_data = np.array(scale_init.float_data, dtype=np.float32)
-                    scale_fp16 = scale_data.astype(np.float16)
-                    scale_init.data_type = onnx.TensorProto.FLOAT16
-                    scale_init.raw_data = scale_fp16.tobytes()
-                    del scale_init.float_data[:]
-
-        # Bypass the Cast node
+        _retype_dq_chain(producer)
+
         _bypass_cast_node(onnx_model, node)
         nodes_to_remove.append(node)
 
-        # Update the DQ output type in value_info
         dq_output_name = producer.output[0]
-        for vi in onnx_model.graph.value_info:
-            if vi.name == dq_output_name:
-                vi.type.tensor_type.elem_type = onnx.TensorProto.FLOAT16
-                break
+        retyped_dq_outputs.add(dq_output_name)
+
+    for name in retyped_dq_outputs:
+        vi = value_info_map.get(name)
+        if vi is not None:
+            vi.type.tensor_type.elem_type = onnx.TensorProto.FLOAT16
 
     logger.debug(f"Folded {len(nodes_to_remove)} DQ -> Cast(FP32->FP16) patterns")
     for node in nodes_to_remove:
@@ -1585,6 +1606,92 @@ def fold_dq_fp32_to_fp16_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
     return onnx_model
 
 
+def fold_qdq_scale_fp16_to_fp32_casts(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
+    """Remove Cast(FP16->FP32) nodes feeding into Q/DQ scale inputs.
+
+    When convert_float_to_float16 blocks QuantizeLinear/DequantizeLinear, it inserts
+    Cast(FP16->FP32) nodes before every scale input. In opset >=20 Q/DQ natively accept
+    FP16 scales, and leaving the cast in place forces DQ outputs to FP32, breaking
+    downstream FP16 matmul/add operations under strongly-typed TRT parsing.
+
+    This function bypasses each such Cast and, when the upstream Constant is FP16,
+    wires the DQ output to FP16 in value_info so shape inference stays consistent.
+
+    Args:
+        onnx_model: The ONNX model with Cast(FP16->FP32) -> Q/DQ.scale patterns.
+
+    Returns:
+        The ONNX model with redundant scale-path casts removed.
+    """
+    qdq_ops = {
+        "QuantizeLinear",
+        "DequantizeLinear",
+        "TRT_FP8QuantizeLinear",
+        "TRT_FP8DequantizeLinear",
+    }
+
+    producer_map: dict[str, onnx.NodeProto] = {}
+    consumer_map: dict[str, list[tuple[onnx.NodeProto, int]]] = {}
+    for node in onnx_model.graph.node:
+        for out in node.output:
+            producer_map[out] = node
+        for idx, inp in enumerate(node.input):
+            if inp:
+                consumer_map.setdefault(inp, []).append((node, idx))
+
+    type_map = _build_tensor_type_map(onnx_model)
+
+    nodes_to_remove: list[onnx.NodeProto] = []
+    dq_outputs_retyped: set[str] = set()
+    visited_casts: set[int] = set()
+    for node in onnx_model.graph.node:
+        if node.op_type not in qdq_ops or len(node.input) < 2:
+            continue
+
+        scale_name = node.input[1]
+        cast_node = producer_map.get(scale_name)
+        if cast_node is None or cast_node.op_type != "Cast":
+            continue
+        if id(cast_node) in visited_casts:
+            # Already handled (e.g. shared scale Cast across paired Q/DQ).
+            if node.op_type.endswith("DequantizeLinear"):
+                dq_outputs_retyped.add(node.output[0])
+            continue
+        if get_cast_to_type(cast_node) != onnx.TensorProto.FLOAT:
+            continue
+        if type_map.get(cast_node.input[0]) != onnx.TensorProto.FLOAT16:
+            continue
+
+        # Only bypass when every consumer of this Cast is a Q/DQ scale input; otherwise
+        # other ops would silently receive FP16 instead of the FP32 they requested.
+        cast_output = cast_node.output[0]
+        consumers = consumer_map.get(cast_output, [])
+        if not consumers or not all(
+            c.op_type in qdq_ops and i == 1 for c, i in consumers
+        ):
+            continue
+
+        # Bypass the cast so the scale stays FP16
+        _bypass_cast_node(onnx_model, cast_node)
+        nodes_to_remove.append(cast_node)
+        visited_casts.add(id(cast_node))
+
+        # For DQ nodes, the output type follows the scale type — update value_info.
+        if node.op_type.endswith("DequantizeLinear"):
+            dq_outputs_retyped.add(node.output[0])
+
+    for vi in onnx_model.graph.value_info:
+        if vi.name in dq_outputs_retyped:
+            vi.type.tensor_type.elem_type = onnx.TensorProto.FLOAT16
+
+    logger.debug(f"Folded {len(nodes_to_remove)} Cast(FP16->FP32) -> Q/DQ.scale patterns")
+    for cast_node in nodes_to_remove:
+        if cast_node in onnx_model.graph.node:
+            onnx_model.graph.node.remove(cast_node)
+
+    return onnx_model
+
+
 def remove_node_training_mode(onnx_model: onnx.ModelProto, node_op_type: str) -> onnx.ModelProto:
     """Remove `training_mode` attribute and extra training outputs from nodes of a given op type.
 
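
For reference, here is a hand-built miniature of the DQ -> Cast(FP32->FP16) weight path that fold_dq_fp32_to_fp16_casts targets. This is an illustrative sketch only, not code from the commit: tensor names, shapes, and the 0.02 scale are invented, and the comments describe the expected "after" state per the docstrings above.

import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# "Before" state: an int8 weight dequantized with an FP32 scale, then cast to FP16 for an
# FP16 MatMul. This is the DQ -> Cast(FP32->FP16) pattern the fold removes.
w_q = numpy_helper.from_array(np.zeros((8, 8), dtype=np.int8), name="w_q")
w_scale = numpy_helper.from_array(np.array(0.02, dtype=np.float32), name="w_scale")

dq = helper.make_node("DequantizeLinear", ["w_q", "w_scale"], ["w_fp32"], name="w_dq")
cast = helper.make_node("Cast", ["w_fp32"], ["w_fp16"], to=TensorProto.FLOAT16, name="w_cast")
matmul = helper.make_node("MatMul", ["x", "w_fp16"], ["y"], name="mm")

graph = helper.make_graph(
    [dq, cast, matmul],
    "dq_cast_pattern",
    inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT16, [1, 8])],
    outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT16, [1, 8])],
    initializer=[w_q, w_scale],
    value_info=[helper.make_tensor_value_info("w_fp32", TensorProto.FLOAT, [8, 8])],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])

# Expected effect of fold_dq_fp32_to_fp16_casts(model): the Cast node is bypassed and
# removed, w_scale is rewritten as an FP16 initializer, and the DQ output is retyped to
# FLOAT16 in value_info, so strongly-typed TRT parsing sees a uniformly FP16 weight path.
# For NVFP4, where the scale is itself produced by an inner DequantizeLinear instead of
# being an initializer, _retype_dq_chain walks that inner DQ chain the same way.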

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,7 @@
     change_casts_to_fp16,
     check_model_uses_external_data,
     fold_dq_fp32_to_fp16_casts,
+    fold_qdq_scale_fp16_to_fp32_casts,
     get_input_names,
     get_input_shapes,
     get_node_names,
@@ -652,6 +653,9 @@ def get_onnx_bytes_and_metadata(
         onnx_opt_graph = change_casts_to_fp16(onnx_opt_graph, op_list)
         # Remove Cast(FP32->FP16) nodes after DQ by setting DQ output to FP16 directly
         onnx_opt_graph = fold_dq_fp32_to_fp16_casts(onnx_opt_graph)
+        # Remove Cast(FP16->FP32) feeding Q/DQ scales so DQ stays FP16 for downstream
+        # MatMul/Add layers under strongly-typed TRT parsing.
+        onnx_opt_graph = fold_qdq_scale_fp16_to_fp32_casts(onnx_opt_graph)
     else:
         onnx_opt_graph = convert_to_f16(
             onnx_opt_graph, low_precision_type=weights_dtype, keep_io_types=False
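
And a matching miniature of the Cast(FP16->FP32) -> Q/DQ.scale pattern that the new pass removes. Again this is an illustrative sketch, not code from the commit: tensor names and values are invented, and the modelopt.onnx.utils import path mentioned in the trailing comment is assumed from the file locations in this diff.

import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# "Before" state left by convert_float_to_float16 when Q/DQ ops are block-listed: the
# FP16 scale constant is cast back to FP32 before feeding DequantizeLinear, so the DQ
# output becomes FP32, leaving the Gemm with mixed FP32/FP16 inputs (the mismatch the
# commit message describes).
w_q = numpy_helper.from_array(np.zeros((8, 8), dtype=np.int8), name="w_q")
scale_fp16 = numpy_helper.from_array(np.array(0.05, dtype=np.float16), name="scale_fp16")
bias_fp16 = numpy_helper.from_array(np.zeros(8, dtype=np.float16), name="bias_fp16")

cast = helper.make_node("Cast", ["scale_fp16"], ["scale_fp32"], to=TensorProto.FLOAT, name="scale_cast")
dq = helper.make_node("DequantizeLinear", ["w_q", "scale_fp32"], ["w_deq"], name="w_dq")
gemm = helper.make_node("Gemm", ["x", "w_deq", "bias_fp16"], ["y"], name="gemm")

graph = helper.make_graph(
    [cast, dq, gemm],
    "qdq_scale_cast_pattern",
    inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT16, [1, 8])],
    outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT16, [1, 8])],
    initializer=[w_q, scale_fp16, bias_fp16],
)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 20)])

# The deploy path applies the two folds in the order shown in the diff above, e.g.
# (assumed import path):
#   from modelopt.onnx.utils import fold_dq_fp32_to_fp16_casts, fold_qdq_scale_fp16_to_fp32_casts
#   model = fold_dq_fp32_to_fp16_casts(model)
#   model = fold_qdq_scale_fp16_to_fp32_casts(model)
# After the second fold, scale_cast is bypassed, the DQ keeps its FP16 scale, and the DQ
# output is retyped to FP16, so the Gemm sees consistent FP16 inputs under --stronglyTyped.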
