1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -20,6 +20,7 @@ NVIDIA Model Optimizer Changelog
- Add ``nvfp4_omlp_only`` quantization format for NVFP4 quantization. This is similar to ``nvfp4_mlp_only`` but also quantizes the output projection layer in attention.
- ``pass_through_bwd`` in the quantization config now defaults to True. Set it to False to use STE with zeroed outlier gradients for potentially better QAT accuracy.
- Add :meth:`compute_quantization_mse <modelopt.torch.quantization.model_quant.compute_quantization_mse>` API to measure per-quantizer mean-squared quantization error, with flexible wildcard and callable filtering.
- **AutoQDQ**: New tool for automated Q/DQ (Quantize/Dequantize) placement optimization for ONNX models. Uses TensorRT latency measurements to choose insertion schemes that minimize inference time. Discovers regions automatically, groups them by structural pattern, and tests multiple Q/DQ schemes per pattern. Supports INT8 and FP8 quantization, pattern cache for warm-start on similar models, checkpoint/resume, and importing patterns from an existing QDQ baseline. CLI: ``python -m modelopt.onnx.quantization.autotune``. See the AutoQDQ guide in the documentation.

**Misc**

76 changes: 70 additions & 6 deletions modelopt/onnx/quantization/autotune/__main__.py
@@ -27,12 +27,50 @@
)

DEFAULT_OUTPUT_DIR = "./autotuner_output"
DEFAULT_NUM_SCHEMES = 30
DEFAULT_NUM_SCHEMES = 50
DEFAULT_QUANT_TYPE = "int8"
DEFAULT_DQ_DTYPE = "float32"
DEFAULT_TIMING_CACHE = str(Path(tempfile.gettempdir()) / "trtexec_timing.cache")
DEFAULT_WARMUP_RUNS = 5
DEFAULT_TIMING_RUNS = 20
DEFAULT_WARMUP_RUNS = 50
DEFAULT_TIMING_RUNS = 100
MODE_PRESETS = {
"quick": {"schemes_per_region": 30, "warmup_runs": 10, "timing_runs": 50},
"default": {
"schemes_per_region": DEFAULT_NUM_SCHEMES,
"warmup_runs": DEFAULT_WARMUP_RUNS,
"timing_runs": DEFAULT_TIMING_RUNS,
},
"extensive": {"schemes_per_region": 200, "warmup_runs": 50, "timing_runs": 200},
}


class _StoreWithExplicitFlag(argparse.Action):
"""Store the value and set an 'explicit' flag on the namespace so mode presets do not override."""

def __init__(self, explicit_attr: str, *args, **kwargs):
self._explicit_attr = explicit_attr
super().__init__(*args, **kwargs)

def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
setattr(namespace, self._explicit_attr, True)


def apply_mode_presets(args) -> None:
"""Apply --mode preset to schemes_per_region, warmup_runs, timing_runs.

Only applies preset for an option when that option was not explicitly set on the
command line (explicit flags override the preset even when the value equals the default).
"""
if args.mode not in MODE_PRESETS:
return
preset = MODE_PRESETS[args.mode]
if not getattr(args, "_explicit_num_schemes", False):
args.num_schemes = preset["schemes_per_region"]
if not getattr(args, "_explicit_warmup_runs", False):
args.warmup_runs = preset["warmup_runs"]
if not getattr(args, "_explicit_timing_runs", False):
args.timing_runs = preset["timing_runs"]

def validate_file_path(path: str | None, description: str) -> Path | None:
@@ -94,12 +132,15 @@ def run_autotune() -> int:
- 130: Interrupted by user (Ctrl+C)
"""
args = _get_autotune_parser().parse_args()
apply_mode_presets(args)
model_path = validate_file_path(args.onnx_path, "Model file")
validate_file_path(args.qdq_baseline, "QDQ baseline model")
output_dir = Path(args.output_dir)

log_benchmark_config(args)
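# If the extra trtexec benchmark arguments were passed as one space-separated string, split them into a list.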
trtexec_args = getattr(args, "trtexec_benchmark_args", None)
if trtexec_args and isinstance(trtexec_args, str):
trtexec_args = trtexec_args.split()
benchmark_instance = init_benchmark_instance(
use_trtexec=args.use_trtexec,
plugin_libraries=args.plugin_libraries,
@@ -167,6 +208,12 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
# Basic usage
python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx

# Quick mode (fewer schemes and benchmark runs for fast iteration)
python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --mode quick

# Extensive mode (more schemes and runs for thorough tuning)
python -m modelopt.onnx.quantization.autotune --onnx_path model.onnx --mode extensive

# Import patterns from QDQ baseline model
python -m modelopt.onnx.quantization.autotune \\
--onnx_path model.onnx --qdq_baseline baseline.onnx
@@ -198,13 +245,26 @@ def _get_autotune_parser() -> argparse.ArgumentParser:

# Autotuning Strategy
strategy_group = parser.add_argument_group("Autotuning Strategy")
strategy_group.add_argument(
"--mode",
type=str,
default="default",
choices=["quick", "default", "extensive"],
help="Preset for schemes_per_region, warmup_runs, and timing_runs. "
"'quick': fewer schemes/runs for fast iteration; "
"'default': balanced; "
"'extensive': more schemes/runs for thorough tuning. "
"Explicit --schemes_per_region, --warmup_runs, --timing_runs override the preset.",
)
strategy_group.add_argument(
"--schemes_per_region",
"-s",
type=int,
default=DEFAULT_NUM_SCHEMES,
dest="num_schemes",
help=f"Number of schemes to test per region (default: {DEFAULT_NUM_SCHEMES})",
action=_StoreWithExplicitFlag,
explicit_attr="_explicit_num_schemes",
help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)",
)
strategy_group.add_argument(
"--pattern_cache",
@@ -268,13 +328,17 @@ def _get_autotune_parser() -> argparse.ArgumentParser:
"--warmup_runs",
type=int,
default=DEFAULT_WARMUP_RUNS,
help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS})",
action=_StoreWithExplicitFlag,
explicit_attr="_explicit_warmup_runs",
help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)",
)
trt_group.add_argument(
"--timing_runs",
type=int,
default=DEFAULT_TIMING_RUNS,
help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS})",
action=_StoreWithExplicitFlag,
explicit_attr="_explicit_timing_runs",
help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)",
)
trt_group.add_argument(
"--plugin_libraries",
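For readers skimming the diff, the precedence introduced above can be summarized in a short sketch (an illustration, not part of the change; it assumes the package is installed so the CLI module is importable): explicit flags always win, and the --mode preset only fills options that were not given explicitly.

# Illustrative only -- mirrors the unit tests added further down.
from modelopt.onnx.quantization.autotune.__main__ import (
    MODE_PRESETS,
    _get_autotune_parser,
    apply_mode_presets,
)

parser = _get_autotune_parser()
# 'quick' preset with an explicit --timing_runs override.
args = parser.parse_args(
    ["--onnx_path", "model.onnx", "--mode", "quick", "--timing_runs", "25"]
)
apply_mode_presets(args)
assert args.num_schemes == MODE_PRESETS["quick"]["schemes_per_region"]  # 30, from the preset
assert args.warmup_runs == MODE_PRESETS["quick"]["warmup_runs"]  # 10, from the preset
assert args.timing_runs == 25  # explicit flag overrides the preset's 50
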
15 changes: 11 additions & 4 deletions tests/_test_utils/onnx/quantization/autotune/models.py
@@ -25,12 +25,19 @@

def _create_simple_conv_onnx_model():
"""Build ONNX model: Input -> Conv -> Relu -> Output (minimal for autotuner tests)."""
input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [32, 3, 224, 224])
input_tensor = helper.make_tensor_value_info(
"input", onnx.TensorProto.FLOAT, [64, 32, 224, 224]
)
output_tensor = helper.make_tensor_value_info(
"output", onnx.TensorProto.FLOAT, [32, 64, 224, 224]
"output", onnx.TensorProto.FLOAT, [64, 64, 224, 224]
)
conv_node = helper.make_node(
"Conv", inputs=["input", "conv_weight"], outputs=["conv_out"], name="conv"
"Conv",
inputs=["input", "conv_weight"],
outputs=["conv_out"],
name="conv",
kernel_shape=[3, 3],
pads=[1, 1, 1, 1],
)
relu_node = helper.make_node("Relu", inputs=["conv_out"], outputs=["output"], name="relu")
graph = helper.make_graph(
@@ -40,7 +47,7 @@ def _create_simple_conv_onnx_model():
[output_tensor],
initializer=[
helper.make_tensor(
"conv_weight", onnx.TensorProto.FLOAT, [64, 3, 3, 3], [0.1] * (64 * 3 * 3 * 3)
"conv_weight", onnx.TensorProto.FLOAT, [64, 32, 3, 3], [0.1] * (64 * 32 * 3 * 3)
)
],
)
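Quick sanity check on the updated test model (an illustrative sketch, not part of the change): with kernel_shape [3, 3] and pads [1, 1, 1, 1], the stride-1 Conv keeps the 224x224 spatial size, the weight's 32 input channels now match the input tensor, and its 64 output channels match the declared [64, 64, 224, 224] output. Assuming the helper returns an onnx.ModelProto and is importable (the import path below is hypothetical), shape inference confirms this:

import onnx

# Hypothetical import path; in the repo the helper lives in
# tests/_test_utils/onnx/quantization/autotune/models.py.
from _test_utils.onnx.quantization.autotune import models as _test_models

model = _test_models._create_simple_conv_onnx_model()
onnx.checker.check_model(model)
inferred = onnx.shape_inference.infer_shapes(model)
conv_out = next(vi for vi in inferred.graph.value_info if vi.name == "conv_out")
dims = [d.dim_value for d in conv_out.type.tensor_type.shape.dim]
# (224 + 1 + 1 - 3) + 1 = 224 spatial; 64 output channels from the weight's first dim.
assert dims == [64, 64, 224, 224]
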
1 change: 0 additions & 1 deletion tests/gpu/onnx/quantization/autotune/test_workflow.py
@@ -35,7 +35,6 @@ def simple_conv_model():
return _test_models._create_simple_conv_onnx_model()


@pytest.mark.skip(reason="TODO: Fix test and enable")
@pytest.mark.parametrize("use_trtexec", [True, False])
def test_export_quantized_model(use_trtexec, simple_conv_model):
"""Test exporting quantized model with Q/DQ."""
103 changes: 101 additions & 2 deletions tests/unit/onnx/quantization/autotune/test_autotune_config.py
@@ -14,11 +14,17 @@
# limitations under the License.

"""
Tests for the Config class in the autotuner.
Tests for the Config class and CLI mode presets in the autotuner.

Tests configuration parameter validation and defaults.
Tests configuration parameter validation, defaults, and CLI --mode preset
selection and explicit-flag precedence.
"""

from modelopt.onnx.quantization.autotune.__main__ import (
MODE_PRESETS,
_get_autotune_parser,
apply_mode_presets,
)
from modelopt.onnx.quantization.autotune.common import Config


@@ -95,3 +101,96 @@ def test_pattern_cache_params(self):

assert config.pattern_cache_minimum_distance == 3
assert config.pattern_cache_max_entries_per_pattern == 10


class TestModePresets:
"""Test --mode preset selection and explicit-flag precedence."""

@staticmethod
def _parse_cli(argv):
"""Parse argv with the autotune CLI parser and apply mode presets."""
parser = _get_autotune_parser()
args = parser.parse_args(argv)
apply_mode_presets(args)
return args

def test_mode_quick_applies_preset_when_no_explicit_flags(self):
"""With --mode quick and no explicit schemes/warmup/timing, preset values are used."""
args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "quick"])
preset = MODE_PRESETS["quick"]
assert args.num_schemes == preset["schemes_per_region"]
assert args.warmup_runs == preset["warmup_runs"]
assert args.timing_runs == preset["timing_runs"]

def test_mode_default_applies_preset_when_no_explicit_flags(self):
"""With --mode default and no explicit flags, preset values are used."""
args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "default"])
preset = MODE_PRESETS["default"]
assert args.num_schemes == preset["schemes_per_region"]
assert args.warmup_runs == preset["warmup_runs"]
assert args.timing_runs == preset["timing_runs"]

def test_mode_extensive_applies_preset_when_no_explicit_flags(self):
"""With --mode extensive and no explicit flags, preset values are used."""
args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "extensive"])
preset = MODE_PRESETS["extensive"]
assert args.num_schemes == preset["schemes_per_region"]
assert args.warmup_runs == preset["warmup_runs"]
assert args.timing_runs == preset["timing_runs"]

def test_explicit_schemes_per_region_overrides_mode_preset(self):
"""Explicit --schemes_per_region is kept even when it differs from preset."""
args = self._parse_cli(
["--onnx_path", "model.onnx", "--mode", "default", "--schemes_per_region", "99"]
)
assert args.num_schemes == 99
assert args.warmup_runs == MODE_PRESETS["default"]["warmup_runs"]
assert args.timing_runs == MODE_PRESETS["default"]["timing_runs"]

def test_explicit_default_value_not_overridden_by_mode(self):
"""Explicit --schemes_per_region 30 (parser default) is not overridden by --mode default."""
args = self._parse_cli(
["--onnx_path", "model.onnx", "--mode", "default", "--schemes_per_region", "30"]
)
assert args.num_schemes == 30

def test_explicit_warmup_runs_overrides_mode_preset(self):
"""Explicit --warmup_runs is kept and not overridden by preset."""
args = self._parse_cli(
["--onnx_path", "model.onnx", "--mode", "extensive", "--warmup_runs", "3"]
)
assert args.warmup_runs == 3
assert args.num_schemes == MODE_PRESETS["extensive"]["schemes_per_region"]
assert args.timing_runs == MODE_PRESETS["extensive"]["timing_runs"]

def test_explicit_timing_runs_overrides_mode_preset(self):
"""Explicit --timing_runs is kept and not overridden by preset."""
args = self._parse_cli(
["--onnx_path", "model.onnx", "--mode", "quick", "--timing_runs", "7"]
)
assert args.timing_runs == 7
assert args.num_schemes == MODE_PRESETS["quick"]["schemes_per_region"]
assert args.warmup_runs == MODE_PRESETS["quick"]["warmup_runs"]

def test_multiple_explicit_overrides_mode_preset(self):
"""Multiple explicit flags override only their respective preset values."""
args = self._parse_cli(
[
"--onnx_path",
"model.onnx",
"--mode",
"extensive",
"--schemes_per_region",
"10",
"--timing_runs",
"5",
]
)
assert args.num_schemes == 10
assert args.timing_runs == 5
assert args.warmup_runs == MODE_PRESETS["extensive"]["warmup_runs"]

def test_short_flag_schemes_per_region_overrides_mode(self):
"""Short form -s for schemes_per_region is treated as explicit and overrides preset."""
args = self._parse_cli(["--onnx_path", "model.onnx", "--mode", "default", "-s", "25"])
assert args.num_schemes == 25