diff --git a/CHANGELOG.rst b/CHANGELOG.rst index aafe955dd0..ae94ef2ab3 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,6 +20,7 @@ NVIDIA Model Optimizer Changelog - Add ``nvfp4_omlp_only`` quantization format for NVFP4 quantization. This is similar to ``nvfp4_mlp_only`` but also quantizes the output projection layer in attention. - ``pass_through_bwd`` in the quantization config is now default to True. Please set it to False if you want to use STE with zeroed outlier gradients for potentially better QAT accuracy. - Add :meth:`compute_quantization_mse` API to measure per-quantizer mean-squared quantization error, with flexible wildcard and callable filtering. +- **AutoQDQ**: New tool for automated Q/DQ (Quantize/Dequantize) placement optimization for ONNX models. Uses TensorRT latency measurements to choose insertion schemes that minimize inference time. Discovers regions automatically, groups them by structural pattern, and tests multiple Q/DQ schemes per pattern. Supports INT8 and FP8 quantization, pattern cache for warm-start on similar models, checkpoint/resume, and importing patterns from an existing QDQ baseline. CLI: ``python -m modelopt.onnx.quantization.autotune``. See the AutoQDQ guide in the documentation. 
**Misc** diff --git a/modelopt/onnx/quantization/autotune/__main__.py b/modelopt/onnx/quantization/autotune/__main__.py index 877d1a0170..cb7b3c2810 100644 --- a/modelopt/onnx/quantization/autotune/__main__.py +++ b/modelopt/onnx/quantization/autotune/__main__.py @@ -27,12 +27,50 @@ ) DEFAULT_OUTPUT_DIR = "./autotuner_output" -DEFAULT_NUM_SCHEMES = 30 +DEFAULT_NUM_SCHEMES = 50 DEFAULT_QUANT_TYPE = "int8" DEFAULT_DQ_DTYPE = "float32" DEFAULT_TIMING_CACHE = str(Path(tempfile.gettempdir()) / "trtexec_timing.cache") -DEFAULT_WARMUP_RUNS = 5 -DEFAULT_TIMING_RUNS = 20 +DEFAULT_WARMUP_RUNS = 50 +DEFAULT_TIMING_RUNS = 100 +MODE_PRESETS = { + "quick": {"schemes_per_region": 30, "warmup_runs": 10, "timing_runs": 50}, + "default": { + "schemes_per_region": DEFAULT_NUM_SCHEMES, + "warmup_runs": DEFAULT_WARMUP_RUNS, + "timing_runs": DEFAULT_TIMING_RUNS, + }, + "extensive": {"schemes_per_region": 200, "warmup_runs": 50, "timing_runs": 200}, +} + + +class _StoreWithExplicitFlag(argparse.Action): + """Store the value and set an 'explicit' flag on the namespace so mode presets do not override.""" + + def __init__(self, explicit_attr: str, *args, **kwargs): + self._explicit_attr = explicit_attr + super().__init__(*args, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + setattr(namespace, self._explicit_attr, True) + + +def apply_mode_presets(args) -> None: + """Apply --mode preset to schemes_per_region, warmup_runs, timing_runs. + + Only applies preset for an option when that option was not explicitly set on the + command line (explicit flags override the preset even when the value equals the default). 
+ """ + if args.mode not in MODE_PRESETS: + return + preset = MODE_PRESETS[args.mode] + if not getattr(args, "_explicit_num_schemes", False): + args.num_schemes = preset["schemes_per_region"] + if not getattr(args, "_explicit_warmup_runs", False): + args.warmup_runs = preset["warmup_runs"] + if not getattr(args, "_explicit_timing_runs", False): + args.timing_runs = preset["timing_runs"] def validate_file_path(path: str | None, description: str) -> Path | None: @@ -94,12 +132,15 @@ def run_autotune() -> int: - 130: Interrupted by user (Ctrl+C) """ args = _get_autotune_parser().parse_args() + apply_mode_presets(args) model_path = validate_file_path(args.onnx_path, "Model file") validate_file_path(args.qdq_baseline, "QDQ baseline model") output_dir = Path(args.output_dir) log_benchmark_config(args) trtexec_args = getattr(args, "trtexec_benchmark_args", None) + if trtexec_args and isinstance(trtexec_args, str): + trtexec_args = trtexec_args.split() benchmark_instance = init_benchmark_instance( use_trtexec=args.use_trtexec, plugin_libraries=args.plugin_libraries, @@ -167,6 +208,12 @@ def _get_autotune_parser() -> argparse.ArgumentParser: # Basic usage python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx + # Quick mode (fewer schemes and benchmark runs for fast iteration) + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --mode quick + + # Extensive mode (more schemes and runs for thorough tuning) + python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --mode extensive + # Import patterns from QDQ baseline model python -m modelopt.onnx.quantization.autotune \\ --onnx_path model.onnx --qdq_baseline baseline.onnx @@ -198,13 +245,26 @@ def _get_autotune_parser() -> argparse.ArgumentParser: # Autotuning Strategy strategy_group = parser.add_argument_group("Autotuning Strategy") + strategy_group.add_argument( + "--mode", + type=str, + default="default", + choices=["quick", "default", "extensive"], + help="Preset for 
schemes_per_region, warmup_runs, and timing_runs. " + "'quick': fewer schemes/runs for fast iteration; " + "'default': balanced; " + "'extensive': more schemes/runs for thorough tuning. " + "Explicit --schemes_per_region, --warmup_runs, --timing_runs override the preset.", + ) strategy_group.add_argument( "--schemes_per_region", "-s", type=int, default=DEFAULT_NUM_SCHEMES, dest="num_schemes", - help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_num_schemes", + help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)", ) strategy_group.add_argument( "--pattern_cache", @@ -268,13 +328,17 @@ def _get_autotune_parser() -> argparse.ArgumentParser: "--warmup_runs", type=int, default=DEFAULT_WARMUP_RUNS, - help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_warmup_runs", + help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)", ) trt_group.add_argument( "--timing_runs", type=int, default=DEFAULT_TIMING_RUNS, - help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})", + action=_StoreWithExplicitFlag, + explicit_attr="_explicit_timing_runs", + help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)", ) trt_group.add_argument( "--plugin_libraries", diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py index db9652e561..fc63f6690b 100644 --- a/tests/_test_utils/onnx/quantization/autotune/models.py +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -25,12 +25,19 @@ def _create_simple_conv_onnx_model(): """Build ONNX model: Input -> Conv -> Relu -> Output (minimal for autotuner tests).""" - input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [32, 3, 224, 224]) + input_tensor = 
helper.make_tensor_value_info( + "input", onnx.TensorProto.FLOAT, [64, 32, 224, 224] + ) output_tensor = helper.make_tensor_value_info( - "output", onnx.TensorProto.FLOAT, [32, 64, 224, 224] + "output", onnx.TensorProto.FLOAT, [64, 64, 224, 224] ) conv_node = helper.make_node( - "Conv", inputs=["input", "conv_weight"], outputs=["conv_out"], name="conv" + "Conv", + inputs=["input", "conv_weight"], + outputs=["conv_out"], + name="conv", + kernel_shape=[3, 3], + pads=[1, 1, 1, 1], ) relu_node = helper.make_node("Relu", inputs=["conv_out"], outputs=["output"], name="relu") graph = helper.make_graph( @@ -40,7 +47,7 @@ def _create_simple_conv_onnx_model(): [output_tensor], initializer=[ helper.make_tensor( - "conv_weight", onnx.TensorProto.FLOAT, [64, 3, 3, 3], [0.1] * (64 * 3 * 3 * 3) + "conv_weight", onnx.TensorProto.FLOAT, [64, 32, 3, 3], [0.1] * (64 * 32 * 3 * 3) ) ], ) diff --git a/tests/gpu/onnx/quantization/autotune/test_workflow.py b/tests/gpu/onnx/quantization/autotune/test_workflow.py index b448135acf..8066766a9c 100644 --- a/tests/gpu/onnx/quantization/autotune/test_workflow.py +++ b/tests/gpu/onnx/quantization/autotune/test_workflow.py @@ -35,7 +35,6 @@ def simple_conv_model(): return _test_models._create_simple_conv_onnx_model() -@pytest.mark.skip(reason="TODO: Fix test and enable") @pytest.mark.parametrize("use_trtexec", [True, False]) def test_export_quantized_model(use_trtexec, simple_conv_model): """Test exporting quantized model with Q/DQ.""" diff --git a/tests/unit/onnx/quantization/autotune/test_autotune_config.py b/tests/unit/onnx/quantization/autotune/test_autotune_config.py index 9ec99d65d1..98274fbf81 100644 --- a/tests/unit/onnx/quantization/autotune/test_autotune_config.py +++ b/tests/unit/onnx/quantization/autotune/test_autotune_config.py @@ -14,11 +14,17 @@ # limitations under the License. """ -Tests for the Config class in the autotuner. +Tests for the Config class and CLI mode presets in the autotuner. 
-Tests configuration parameter validation and defaults. +Tests configuration parameter validation, defaults, and CLI --mode preset +selection and explicit-flag precedence. """ +from modelopt.onnx.quantization.autotune.__main__ import ( + MODE_PRESETS, + _get_autotune_parser, + apply_mode_presets, +) from modelopt.onnx.quantization.autotune.common import Config @@ -95,3 +101,96 @@ def test_pattern_cache_params(self): assert config.pattern_cache_minimum_distance == 3 assert config.pattern_cache_max_entries_per_pattern == 10 + + +class TestModePresets: + """Test --mode preset selection and explicit-flag precedence.""" + + @staticmethod + def _parse_cli(argv): + """Parse argv with the autotune CLI parser and apply mode presets.""" + parser = _get_autotune_parser() + args = parser.parse_args(argv) + apply_mode_presets(args) + return args + + def test_mode_quick_applies_preset_when_no_explicit_flags(self): + """With --mode quick and no explicit schemes/warmup/timing, preset values are used.""" + args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "quick"]) + preset = MODE_PRESETS["quick"] + assert args.num_schemes == preset["schemes_per_region"] + assert args.warmup_runs == preset["warmup_runs"] + assert args.timing_runs == preset["timing_runs"] + + def test_mode_default_applies_preset_when_no_explicit_flags(self): + """With --mode default and no explicit flags, preset values are used.""" + args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "default"]) + preset = MODE_PRESETS["default"] + assert args.num_schemes == preset["schemes_per_region"] + assert args.warmup_runs == preset["warmup_runs"] + assert args.timing_runs == preset["timing_runs"] + + def test_mode_extensive_applies_preset_when_no_explicit_flags(self): + """With --mode extensive and no explicit flags, preset values are used.""" + args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "extensive"]) + preset = MODE_PRESETS["extensive"] + assert args.num_schemes == 
preset["schemes_per_region"] + assert args.warmup_runs == preset["warmup_runs"] + assert args.timing_runs == preset["timing_runs"] + + def test_explicit_schemes_per_region_overrides_mode_preset(self): + """Explicit --schemes_per_region is kept even when it differs from preset.""" + args = self._parse_cli( + ["--onnx_path", "model.onnx", "--mode", "default", "--schemes_per_region", "99"] + ) + assert args.num_schemes == 99 + assert args.warmup_runs == MODE_PRESETS["default"]["warmup_runs"] + assert args.timing_runs == MODE_PRESETS["default"]["timing_runs"] + + def test_explicit_default_value_not_overridden_by_mode(self): + """Explicit --schemes_per_region 30 is not overridden by --mode default (preset: 50).""" + args = self._parse_cli( + ["--onnx_path", "model.onnx", "--mode", "default", "--schemes_per_region", "30"] + ) + assert args.num_schemes == 30 + + def test_explicit_warmup_runs_overrides_mode_preset(self): + """Explicit --warmup_runs is kept and not overridden by preset.""" + args = self._parse_cli( + ["--onnx_path", "model.onnx", "--mode", "extensive", "--warmup_runs", "3"] + ) + assert args.warmup_runs == 3 + assert args.num_schemes == MODE_PRESETS["extensive"]["schemes_per_region"] + assert args.timing_runs == MODE_PRESETS["extensive"]["timing_runs"] + + def test_explicit_timing_runs_overrides_mode_preset(self): + """Explicit --timing_runs is kept and not overridden by preset.""" + args = self._parse_cli( + ["--onnx_path", "model.onnx", "--mode", "quick", "--timing_runs", "7"] + ) + assert args.timing_runs == 7 + assert args.num_schemes == MODE_PRESETS["quick"]["schemes_per_region"] + assert args.warmup_runs == MODE_PRESETS["quick"]["warmup_runs"] + + def test_multiple_explicit_overrides_mode_preset(self): + """Multiple explicit flags override only their respective preset values.""" + args = self._parse_cli( + [ + "--onnx_path", + "model.onnx", + "--mode", + "extensive", + "--schemes_per_region", + "10", + "--timing_runs", + "5", + ] + ) + assert 
args.num_schemes == 10 + assert args.timing_runs == 5 + assert args.warmup_runs == MODE_PRESETS["extensive"]["warmup_runs"] + + def test_short_flag_schemes_per_region_overrides_mode(self): + """Short form -s for schemes_per_region is treated as explicit and overrides preset.""" + args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "default", "-s", "25"]) + assert args.num_schemes == 25 diff --git a/tests/unit/onnx/quantization/autotune/test_pattern_cache.py b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py new file mode 100644 index 0000000000..294501ff03 --- /dev/null +++ b/tests/unit/onnx/quantization/autotune/test_pattern_cache.py @@ -0,0 +1,183 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests for PatternCache in the autotuner. + +Covers pattern cache creation, serialization, YAML round-trip, and scheme management. 
+""" + +import os +import tempfile + +from modelopt.onnx.quantization.autotune.common import ( + InsertionScheme, + NodeInputInsertionPoint, + PatternCache, + PatternSchemes, +) +from modelopt.onnx.quantization.autotune.region_pattern import RegionPattern + + +class TestPatternCache: + """Test PatternCache functionality.""" + + @staticmethod + def _create_test_pattern(signature: str, size: int = 2): + """Create a test RegionPattern.""" + return RegionPattern(signature=signature, size=size) + + def test_empty_cache_creation(self): + """Test creating an empty PatternCache.""" + cache = PatternCache() + assert len(cache.pattern_schemes) == 0 + assert cache.pattern_schemes is not None + + def test_add_pattern_schemes(self): + """Test adding pattern schemes to cache.""" + cache = PatternCache() + pattern = self._create_test_pattern("Conv->Relu") + ps = PatternSchemes(pattern=pattern) + scheme = InsertionScheme() + scheme.latency_ms = 10.0 + ps.schemes.append(scheme) + cache.add_pattern_schemes(ps) + assert len(cache.pattern_schemes) == 1 + assert cache.pattern_schemes[0].pattern_signature == "Conv->Relu" + + def test_multiple_patterns(self): + """Test cache with multiple pattern schemes.""" + cache = PatternCache() + pattern_sigs = ["Conv->Relu", "Gemm->Relu", "Conv->Add->Relu"] + for pattern_sig in pattern_sigs: + pattern = self._create_test_pattern(pattern_sig) + ps = PatternSchemes(pattern=pattern) + scheme = InsertionScheme() + scheme.latency_ms = 10.0 + len(pattern_sig) + ps.schemes.append(scheme) + cache.add_pattern_schemes(ps) + assert len(cache.pattern_schemes) == 3 + found_patterns = [ps.pattern_signature for ps in cache.pattern_schemes] + for pattern_sig in pattern_sigs: + assert pattern_sig in found_patterns + + def test_serialization_empty(self): + """Test serialization of empty cache.""" + cache = PatternCache() + data = cache.to_dict() + assert "pattern_schemes" in data + assert len(data["pattern_schemes"]) == 0 + restored = PatternCache.from_dict(data) + 
assert len(restored.pattern_schemes) == 0 + + def test_serialization_with_data(self): + """Test serialization with pattern schemes.""" + cache = PatternCache(minimum_distance=0) + pattern = self._create_test_pattern("Conv->Relu") + ps = PatternSchemes(pattern=pattern) + scheme1 = InsertionScheme() + scheme1.node_inputs = [NodeInputInsertionPoint(0, 0)] + scheme1.latency_ms = 10.0 + ps.schemes.append(scheme1) + scheme2 = InsertionScheme() + scheme2.node_inputs = [ + NodeInputInsertionPoint(0, 0), + NodeInputInsertionPoint(1, 0), + NodeInputInsertionPoint(2, 0), + NodeInputInsertionPoint(3, 0), + NodeInputInsertionPoint(4, 0), + ] + scheme2.latency_ms = 12.0 + ps.schemes.append(scheme2) + cache.add_pattern_schemes(ps) + data = cache.to_dict() + restored = PatternCache.from_dict(data) + assert len(restored.pattern_schemes) == 1 + restored_ps = restored.pattern_schemes[0] + assert restored_ps.pattern_signature == "Conv->Relu" + assert len(restored_ps.schemes) == 2 + assert restored_ps.best_scheme is not None + assert restored_ps.best_scheme.latency_ms == 10.0 + assert restored_ps.schemes[0].latency_ms == 10.0 + + def test_yaml_round_trip(self): + """Test saving and loading cache as YAML.""" + cache = PatternCache() + pattern = self._create_test_pattern("Gemm->Relu") + ps = PatternSchemes(pattern=pattern) + scheme = InsertionScheme() + scheme.latency_ms = 15.0 + ps.schemes.append(scheme) + cache.add_pattern_schemes(ps) + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + yaml_path = f.name + try: + cache.save(yaml_path) + restored = PatternCache.load(yaml_path) + assert len(restored.pattern_schemes) == 1 + assert restored.pattern_schemes[0].pattern_signature == "Gemm->Relu" + assert restored.pattern_schemes[0].schemes[0].latency_ms == 15.0 + finally: + if os.path.exists(yaml_path): + os.unlink(yaml_path) + + def test_update_cache(self): + """Test updating existing pattern in cache (merges schemes).""" + cache = 
PatternCache(minimum_distance=0) + pattern1 = self._create_test_pattern("Conv->Relu") + ps1 = PatternSchemes(pattern=pattern1) + scheme1 = InsertionScheme() + scheme1.latency_ms = 10.0 + ps1.schemes.append(scheme1) + cache.add_pattern_schemes(ps1) + pattern2 = self._create_test_pattern("Conv->Relu") + ps2 = PatternSchemes(pattern=pattern2) + scheme2 = InsertionScheme() + scheme2.latency_ms = 8.0 + scheme2.node_inputs = [NodeInputInsertionPoint(0, 0)] + ps2.schemes.append(scheme2) + cache.add_pattern_schemes(ps2) + assert len(cache.pattern_schemes) == 1 + conv_relu_ps = cache.pattern_schemes[0] + assert conv_relu_ps.pattern_signature == "Conv->Relu" + assert len(conv_relu_ps.schemes) == 2 + assert conv_relu_ps.best_scheme is not None + assert conv_relu_ps.best_scheme.latency_ms == 8.0 + + def test_get_best_scheme(self): + """Test retrieving best scheme for a pattern.""" + cache = PatternCache(minimum_distance=0) + pattern = self._create_test_pattern("Conv->Relu") + ps = PatternSchemes(pattern=pattern) + scheme1 = InsertionScheme() + scheme1.node_inputs = [NodeInputInsertionPoint(0, 0)] + scheme1.latency_ms = 12.0 + ps.schemes.append(scheme1) + scheme2 = InsertionScheme() + scheme2.node_inputs = [NodeInputInsertionPoint(1, 0)] + scheme2.latency_ms = 8.0 + ps.schemes.append(scheme2) + scheme3 = InsertionScheme() + scheme3.node_inputs = [NodeInputInsertionPoint(2, 0)] + scheme3.latency_ms = 10.0 + ps.schemes.append(scheme3) + cache.add_pattern_schemes(ps) + conv_relu_ps = cache.pattern_schemes[0] + assert conv_relu_ps.pattern_signature == "Conv->Relu" + assert len(conv_relu_ps.schemes) == 3 + best = conv_relu_ps.best_scheme + assert best is not None + assert best.latency_ms == 8.0 + latencies = sorted([s.latency_ms for s in conv_relu_ps.schemes]) + assert latencies == [8.0, 10.0, 12.0] diff --git a/tests/unit/onnx/quantization/autotune/test_region.py b/tests/unit/onnx/quantization/autotune/test_region.py index a27b1c98ca..5a733017d9 100644 --- 
a/tests/unit/onnx/quantization/autotune/test_region.py +++ b/tests/unit/onnx/quantization/autotune/test_region.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License");