[Auto-23/24][ONNX][Autocast] Clear stale Cast-output type metadata before ORT InferenceSession load (#1565)

gcunhase · claude · web-flow · commit f99d83eeab48 · 2026-05-29T19:38:49.000Z
### What does this PR do? Type of change: Bug fix **Error:** ``` onnxruntime.capi.onnxruntime_pybind11_state.Fail: [ONNXRuntimeError] : 1 : FAIL : Type Error: Type (tensor(float16)) of output arg (node_5bc985fa) of node (node_5bc985fa) does not match expected type (tensor(float)). ``` **Root cause:** - Some ONNX exporters emit `graph.output` / `value_info` entries whose dtype disagrees with the upstream `Cast` node's `to` attribute. - ORT's type checker rejects such models on session load. **Fix:** - New helper `modelopt.onnx.utils.clear_stale_value_info()` reconciles each `graph.output` elem_type to its producing Cast's `to`, then clears `value_info` so ORT recomputes intermediate types. - Called from `autocast/referencerunner.py` and `quantization/quantize.py::_preprocess_onnx`. ### Usage ```python # Internal fix; no new flag introduced. Generic CLI to exercise the affected Autocast path: $ python -m modelopt.onnx.autocast --onnx=model.onnx ``` ### Testing ``` pytest tests/unit/onnx/test_onnx_utils.py::test_clear_stale_value_info ``` ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). 
- Is this change backward compatible?: ✅ - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: N/A - Did you write any new necessary tests?: ✅ - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ❌  ## Summary by CodeRabbit * **New Features** * Automatic cleaning and reconciliation of stale ONNX type metadata before runtime and quantization; reconciled model files are produced and used when inconsistencies are found. * **Tests** * New unit tests covering metadata-cleaning behavior across cast/type scenarios to ensure correctness and prevent regressions.  [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/Model-Optimizer/pull/1565?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack)   --------- Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com> Co-authored-by: modelopt-fix-agent-bot (Claude Opus 4.7) <noreply@anthropic.com>
diff --git a/modelopt/onnx/autocast/referencerunner.py b/modelopt/onnx/autocast/referencerunner.py
@@ -295,6 +295,8 @@ def run(self, inputs=None):
         ort.set_default_logger_severity(3)
 
         model_copy = copy.deepcopy(self.model)
+        # Clear stale type metadata to prevent type check failures in ORT
+        onnx_utils.clear_stale_value_info(model_copy)
         modify_outputs = ModifyOnnxOutputs(model_copy, outputs=constants.MARK_ALL)
 
         # Load the modified model and create an inference session
diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py
@@ -73,6 +73,7 @@
 from modelopt.onnx.utils import (
     BASE_MIN_OPSET,
     QDQ_PRECISION_MIN_OPSET,
+    clear_stale_value_info,
     duplicate_shared_constants,
     get_opset_version,
     name_onnx_nodes,
@@ -118,6 +119,13 @@ def _preprocess_onnx(
         use_external_data_format,
         intermediate_generated_files,
     )
+
+    # Clear stale type metadata to prevent type check failures in ORT
+    if clear_stale_value_info(onnx_model):
+        onnx_path = os.path.join(output_dir, f"{model_name}_reconciled.onnx")
+        save_onnx(onnx_model, onnx_path, use_external_data_format)
+        intermediate_generated_files.append(onnx_path)
+
     if has_custom_op:
         onnx_path = os.path.join(output_dir, f"{model_name}_ort_support.onnx")
         save_onnx(onnx_model, onnx_path, use_external_data_format)
diff --git a/modelopt/onnx/utils.py b/modelopt/onnx/utils.py
@@ -1860,3 +1860,37 @@ def change_casts_to_fp16(model: onnx.ModelProto, target_op_types: list[str]) ->
                 break
 
     return model
+
+
+def clear_stale_value_info(model: onnx.ModelProto) -> int:
+    """Reconcile Cast-fed graph outputs with their Cast and drop all ``value_info``.
+
+    For each ``graph.output`` whose name is the first output of a ``Cast`` node,
+    the declared ``elem_type`` is overwritten with that Cast's ``to`` dtype when
+    the two differ. Afterwards every ``graph.value_info`` entry is removed, so
+    intermediate tensor types are re-derived from the operators (e.g. by ORT).
+
+    Args:
+        model: In-memory onnx ModelProto; it is modified in place.
+
+    Returns:
+        Number of graph outputs rewritten plus number of ``value_info`` entries removed.
+    """
+    graph = model.graph
+    cast_dtype_by_tensor = {}
+    for node in graph.node:
+        if node.op_type == "Cast" and node.output:
+            cast_dtype_by_tensor[node.output[0]] = get_cast_to_type(node)
+
+    reconciled = 0
+    for graph_output in graph.output:
+        expected = cast_dtype_by_tensor.get(graph_output.name)
+        if expected is None or graph_output.type.tensor_type.elem_type == expected:
+            continue
+        graph_output.type.tensor_type.elem_type = expected
+        reconciled += 1
+
+    cleared = len(graph.value_info)
+    if cleared:
+        del graph.value_info[:]
+    return reconciled + cleared
diff --git a/tests/unit/onnx/test_onnx_utils.py b/tests/unit/onnx/test_onnx_utils.py
@@ -30,6 +30,7 @@
 
 from modelopt.onnx.trt_utils import load_onnx_model
 from modelopt.onnx.utils import (
+    clear_stale_value_info,
     get_input_names_from_bytes,
     get_output_names_from_bytes,
     randomize_weights_onnx_bytes,
@@ -329,3 +330,37 @@ def test_ir_version_support(tmp_path):
     assert model_reload.ir_version == 10, (
         f"The maximum supported IR version is 10, but version {model_reload.ir_version} was detected."
     )
+
+
+def _make_cast_model(cast_to, output_elem_type, with_value_info=False):
+    """Return a one-node model X -> Cast(to=cast_to) -> Y with a FLOAT16 input X."""
+    shape = [1, 4]
+    cast_node = make_node("Cast", ["X"], ["Y"], to=cast_to, name="cast")
+    x_info = make_tensor_value_info("X", onnx.TensorProto.FLOAT16, shape)
+    y_info = make_tensor_value_info("Y", output_elem_type, shape)
+    # Optionally record Y as FLOAT16 in value_info, independent of the Cast target.
+    vi = [make_tensor_value_info("Y", onnx.TensorProto.FLOAT16, shape)] if with_value_info else []
+    graph = make_graph([cast_node], "cast_graph", [x_info], [y_info], value_info=vi)
+    return make_model(graph, producer_name="modelopt test", opset_imports=[make_opsetid("", 17)])
+
+
+@pytest.mark.parametrize(
+    ("output_elem_type", "with_value_info", "expected_count"),
+    [
+        # Stale output dtype plus one value_info entry: one reconciled + one cleared.
+        pytest.param(onnx.TensorProto.FLOAT16, True, 2, id="stale_output_and_value_info"),
+        # Output already agrees with Cast.to and there is no value_info: nothing to do.
+        pytest.param(onnx.TensorProto.FLOAT, False, 0, id="no_op_when_matching"),
+    ],
+)
+def test_clear_stale_value_info(output_elem_type, with_value_info, expected_count):
+    """Check output dtype reconciliation, value_info clearing and the returned count."""
+    fp32 = onnx.TensorProto.FLOAT
+    model = _make_cast_model(fp32, output_elem_type, with_value_info=with_value_info)
+
+    count = clear_stale_value_info(model)
+
+    # After the call the single graph output must carry the Cast's target dtype.
+    assert model.graph.output[0].type.tensor_type.elem_type == fp32
+    assert not model.graph.value_info
+    assert count == expected_count