Skip to content

Commit bdc04f1

Browse files
authored
[6056809] Fix TRT dependency in ModelOpt ONNX quantization (#1189)
### What does this PR do? Type of change: Bug fix Regression bug introduced by the Autotune integration into ModelOpt ONNX quantization (#951), making ModelOpt dependent on TensorRT in all scenarios. This PR fixes this issue by requiring TensorRT only when `--autotune` is enabled. ### Usage ```python $ python -m modelopt.onnx.quantization --onnx_path=${MODEL_NAME}.onnx ``` ### Testing See bug 6056809. ### Before your PR is "*Ready for review*" Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.). - Is this change backward compatible?: ✅ - If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`: N/A - Did you write any new necessary tests?: ✅ - Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: N/A <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Autotune dependency failures now surface as clearer runtime errors instead of only logging warnings. * **Chores** * Centralized autotune presets and numeric defaults into a shared configuration. * Core autotune components are conditionally exposed so initialization succeeds when optional acceleration libraries are absent. * Deferred autotune imports to runtime to improve failure handling. * **Tests** * Added a test ensuring the quantization CLI/parser initializes correctly without optional acceleration libraries. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com>
1 parent 4255bc6 commit bdc04f1

6 files changed

Lines changed: 104 additions & 50 deletions

File tree

modelopt/onnx/quantization/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
import numpy as np
2222

23-
from modelopt.onnx.quantization.autotune import (
23+
from modelopt.onnx.quantization.autotune.utils import (
2424
MODE_PRESETS,
2525
StoreWithExplicitFlag,
2626
get_node_filter_list,

modelopt/onnx/quantization/autotune/__init__.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,35 +20,42 @@
2020
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
2121
"""
2222

23-
# Expose Autotune modes
24-
from .__main__ import MODE_PRESETS
23+
# Expose Autotune modes and CLI utilities
24+
from .utils import MODE_PRESETS, StoreWithExplicitFlag, get_node_filter_list
2525

26-
# Core data structures
27-
from .autotuner import QDQAutotuner
28-
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
29-
from .common import (
30-
AutotunerError,
31-
AutotunerNotInitializedError,
32-
Config,
33-
InsertionScheme,
34-
InvalidSchemeError,
35-
PatternCache,
36-
PatternSchemes,
37-
Region,
38-
RegionType,
39-
)
40-
from .insertion_points import (
41-
ChildRegionInputInsertionPoint,
42-
ChildRegionOutputInsertionPoint,
43-
NodeInputInsertionPoint,
44-
ResolvedInsertionPoint,
45-
)
46-
from .region_pattern import RegionPattern
47-
from .region_search import CombinedRegionSearch
48-
from .utils import StoreWithExplicitFlag, get_node_filter_list
26+
# Core data structures (requires TensorRT)
27+
try:
28+
from .autotuner import QDQAutotuner
29+
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
30+
from .common import (
31+
AutotunerError,
32+
AutotunerNotInitializedError,
33+
Config,
34+
InsertionScheme,
35+
InvalidSchemeError,
36+
PatternCache,
37+
PatternSchemes,
38+
Region,
39+
RegionType,
40+
)
41+
from .insertion_points import (
42+
ChildRegionInputInsertionPoint,
43+
ChildRegionOutputInsertionPoint,
44+
NodeInputInsertionPoint,
45+
ResolvedInsertionPoint,
46+
)
47+
from .region_pattern import RegionPattern
48+
from .region_search import CombinedRegionSearch
49+
except ImportError as e:
50+
from modelopt.onnx.logging_config import logger
4951

50-
__all__ = [
51-
"MODE_PRESETS",
52+
logger.warning(
53+
f"Failed to import Autotune dependencies: '{e}'. Ignore if Autotune is not being used."
54+
)
55+
56+
__all__ = ["MODE_PRESETS", "StoreWithExplicitFlag", "get_node_filter_list"]
57+
58+
_OPTIONAL_EXPORTS = [
5259
"AutotunerError",
5360
"AutotunerNotInitializedError",
5461
"ChildRegionInputInsertionPoint",
@@ -65,8 +72,7 @@
6572
"RegionPattern",
6673
"RegionType",
6774
"ResolvedInsertionPoint",
68-
"StoreWithExplicitFlag",
6975
"TensorRTPyBenchmark",
7076
"TrtExecBenchmark",
71-
"get_node_filter_list",
7277
]
78+
__all__.extend(name for name in _OPTIONAL_EXPORTS if name in globals())

modelopt/onnx/quantization/autotune/__main__.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222

2323
from modelopt.onnx.logging_config import logger
2424
from modelopt.onnx.quantization.autotune.utils import (
25+
DEFAULT_NUM_SCHEMES,
26+
DEFAULT_TIMING_RUNS,
27+
DEFAULT_WARMUP_RUNS,
28+
MODE_PRESETS,
2529
StoreWithExplicitFlag,
2630
get_node_filter_list,
2731
validate_file_path,
@@ -32,21 +36,9 @@
3236
)
3337

3438
DEFAULT_OUTPUT_DIR = "./autotuner_output"
35-
DEFAULT_NUM_SCHEMES = 50
3639
DEFAULT_QUANT_TYPE = "int8"
3740
DEFAULT_DQ_DTYPE = "float32"
3841
DEFAULT_TIMING_CACHE = str(Path(tempfile.gettempdir()) / "trtexec_timing.cache")
39-
DEFAULT_WARMUP_RUNS = 50
40-
DEFAULT_TIMING_RUNS = 100
41-
MODE_PRESETS = {
42-
"quick": {"schemes_per_region": 30, "warmup_runs": 10, "timing_runs": 50},
43-
"default": {
44-
"schemes_per_region": DEFAULT_NUM_SCHEMES,
45-
"warmup_runs": DEFAULT_WARMUP_RUNS,
46-
"timing_runs": DEFAULT_TIMING_RUNS,
47-
},
48-
"extensive": {"schemes_per_region": 200, "warmup_runs": 50, "timing_runs": 200},
49-
}
5042

5143

5244
def apply_mode_presets(args) -> None:

modelopt/onnx/quantization/autotune/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@
2121

2222
from modelopt.onnx.logging_config import logger
2323

24+
DEFAULT_NUM_SCHEMES = 50
25+
DEFAULT_WARMUP_RUNS = 50
26+
DEFAULT_TIMING_RUNS = 100
27+
28+
MODE_PRESETS = {
29+
"quick": {"schemes_per_region": 30, "warmup_runs": 10, "timing_runs": 50},
30+
"default": {
31+
"schemes_per_region": DEFAULT_NUM_SCHEMES,
32+
"warmup_runs": DEFAULT_WARMUP_RUNS,
33+
"timing_runs": DEFAULT_TIMING_RUNS,
34+
},
35+
"extensive": {"schemes_per_region": 200, "warmup_runs": 50, "timing_runs": 200},
36+
}
37+
2438

2539
class StoreWithExplicitFlag(argparse.Action):
2640
"""Store the value and set an 'explicit' flag on the namespace so mode presets do not override."""

modelopt/onnx/quantization/quantize.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,6 @@
4747

4848
from modelopt.onnx.logging_config import configure_logging, logger
4949
from modelopt.onnx.op_types import is_data_dependent_shape_op
50-
51-
try:
52-
from modelopt.onnx.quantization.autotune.workflows import (
53-
init_benchmark_instance,
54-
region_pattern_autotuning_workflow,
55-
)
56-
except ImportError:
57-
logger.warning("Failed to import Autotune dependencies")
5850
from modelopt.onnx.quantization.calib_utils import (
5951
CalibrationDataProvider,
6052
CalibrationDataType,
@@ -287,6 +279,17 @@ def _find_nodes_to_quantize_autotune(
287279
"""Extracts quantization information from Autotune to provide ORT quantization."""
288280
logger.info("Running Auto Q/DQ with TensorRT")
289281

282+
try:
283+
from modelopt.onnx.quantization.autotune.workflows import (
284+
init_benchmark_instance,
285+
region_pattern_autotuning_workflow,
286+
)
287+
except ImportError as e:
288+
raise RuntimeError(
289+
f"Failed to import Autotune dependencies: '{e}'. "
290+
"Make sure that all Autotune requirements are installed (i.e., TensorRT)."
291+
)
292+
290293
benchmark_instance = init_benchmark_instance(
291294
use_trtexec=use_trtexec,
292295
plugin_libraries=trt_plugins,
@@ -295,6 +298,7 @@ def _find_nodes_to_quantize_autotune(
295298
timing_runs=timing_runs,
296299
trtexec_args=trtexec_args.split() if trtexec_args else None,
297300
)
301+
298302
if benchmark_instance is None:
299303
raise RuntimeError("Failed to initialize TensorRT benchmark")
300304

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import importlib
17+
import sys
18+
19+
import pytest
20+
21+
22+
def test_quantization_cli_parser_imports_without_tensorrt():
23+
"""Verify the CLI parser can be constructed without TensorRT installed."""
24+
with pytest.MonkeyPatch.context() as mp:
25+
# Force tensorrt import to fail, even if it's actually installed
26+
mp.setitem(sys.modules, "tensorrt", None)
27+
28+
# Reload the autotune package so it picks up the blocked import
29+
import modelopt.onnx.quantization.autotune
30+
31+
importlib.reload(modelopt.onnx.quantization.autotune)
32+
33+
from modelopt.onnx.quantization.__main__ import get_parser
34+
35+
parser = get_parser()
36+
args = parser.parse_args(["--onnx_path", "dummy.onnx"])
37+
assert args.onnx_path == "dummy.onnx"
38+
assert args.quantize_mode == "int8"

0 commit comments

Comments (0)