Skip to content

Commit acabf87

Browse files
kevalmorabia97, milesial, gcunhase, cjluo-nv, and meenchen
authored
## Cherry-picked PRs

- #1393
- #1389
- #1268
- #1397
- #1402
- #1411
- #1410
- #1419
- #1408
- #1416

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **New Features**
  * SPEEDBench now uses stratified sampling for deterministic, balanced dataset selection.
  * Added legacy quantization conversion shims for INT4, MXFP8 and FP4→2DQ workflows.
  * AWQ Lite: fallback handling for uncalibrated per-expert quantizers during export.
* **Bug Fixes**
  * Clamp FP8 scales in NVFP4 quantization to avoid NaNs.
  * Fixed warmup steps formatting in finetune launch script.
* **Improvements**
  * LM-Eval integration updated for v0.4.10+ compatibility.
  * TensorRT execution routed through a dedicated trtexec helper.
* **Tests**
  * Added regression tests covering quantization shims, FP8 scale behavior, export fallbacks, and LM eval.

[![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/NVIDIA/Model-Optimizer/pull/1426)

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: Alexandre Milesi <milesial@users.noreply.github.com>
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com>
Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
Signed-off-by: weimingc <17592131+meenchen@users.noreply.github.com>
Signed-off-by: weimingc <weimingc@nvidia.com>
Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com>
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
Co-authored-by: milesial <milesial@users.noreply.github.com>
Co-authored-by: Gwena Cunha <4861122+gcunhase@users.noreply.github.com>
Co-authored-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com>
Co-authored-by: Wei-Ming Chen <17592131+meenchen@users.noreply.github.com>
Co-authored-by: sugunav14 <178320438+sugunav14@users.noreply.github.com>
Co-authored-by: Ajinkya Rasane <131806219+ajrasane@users.noreply.github.com>
1 parent cc06062 commit acabf87

23 files changed

Lines changed: 961 additions & 91 deletions

File tree

examples/llm_eval/lm_eval_hf.py

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,15 @@
4242

4343
import datasets
4444
from lm_eval import utils
45-
from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
45+
from packaging.version import Version
4646

47-
if not version("lm_eval").startswith("0.4.8"):
48-
warnings.warn(
49-
f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. "
50-
"Later versions may have incompatible API changes."
51-
)
47+
if Version(version("lm_eval")) < Version("0.4.10"):
48+
raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.")
49+
50+
from lm_eval._cli import HarnessCLI
5251
from lm_eval.api.model import T
5352
from lm_eval.models.huggingface import HFLM
53+
from lm_eval.utils import setup_logging
5454
from quantization_utils import quantize_model
5555
from sparse_attention_utils import sparsify_model
5656

@@ -160,9 +160,24 @@ def create_from_arg_string(
160160
HFLM.create_from_arg_string = classmethod(create_from_arg_string)
161161

162162

163-
def setup_parser_with_modelopt_args():
164-
"""Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
165-
parser = setup_parser()
163+
# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are
164+
# moved out of the argparse namespace and into args.model_args so they reach
165+
# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them).
166+
_MODELOPT_ARG_KEYS = (
167+
"quant_cfg",
168+
"calib_batch_size",
169+
"calib_size",
170+
"auto_quantize_bits",
171+
"auto_quantize_method",
172+
"auto_quantize_score_size",
173+
"auto_quantize_checkpoint",
174+
"compress",
175+
"sparse_cfg",
176+
)
177+
178+
179+
def _add_modelopt_args(parser):
180+
"""Extend an lm-eval argument parser with ModelOpt quantization and sparsity options."""
166181
parser.add_argument(
167182
"--quant_cfg",
168183
type=str,
@@ -221,33 +236,45 @@ def setup_parser_with_modelopt_args():
221236
type=str,
222237
help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
223238
)
224-
return parser
225239

226240

227-
if __name__ == "__main__":
228-
parser = setup_parser_with_modelopt_args()
229-
args = parse_eval_args(parser)
230-
model_args = utils.simple_parse_args_string(args.model_args)
241+
def _inject_modelopt_args_into_model_args(args):
242+
"""Move ModelOpt args from the argparse namespace into args.model_args.
243+
244+
args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt
245+
keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't
246+
reject them as unknown kwargs.
247+
"""
248+
model_args = dict(args.model_args) if args.model_args else {}
231249

232-
if args.trust_remote_code:
250+
if getattr(args, "trust_remote_code", False):
251+
# Propagate the user-provided --trust_remote_code flag (not hardcoded).
233252
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
234253
model_args["trust_remote_code"] = True
235254
args.trust_remote_code = None
236255

237-
model_args.update(
238-
{
239-
"quant_cfg": args.quant_cfg,
240-
"auto_quantize_bits": args.auto_quantize_bits,
241-
"auto_quantize_method": args.auto_quantize_method,
242-
"auto_quantize_score_size": args.auto_quantize_score_size,
243-
"auto_quantize_checkpoint": args.auto_quantize_checkpoint,
244-
"calib_batch_size": args.calib_batch_size,
245-
"calib_size": args.calib_size,
246-
"compress": args.compress,
247-
"sparse_cfg": args.sparse_cfg,
248-
}
249-
)
256+
for key in _MODELOPT_ARG_KEYS:
257+
if hasattr(args, key):
258+
model_args[key] = getattr(args, key)
259+
delattr(args, key)
250260

251261
args.model_args = model_args
252262

253-
cli_evaluate(args)
263+
264+
if __name__ == "__main__":
    # Script entry point: build lm-eval's CLI, graft the ModelOpt options onto
    # its `run` subcommand, then hand the adjusted namespace back to lm-eval.
    setup_logging()
    harness = HarnessCLI()

    # Model/task arguments live on the `run` subcommand, so that is the parser
    # to extend. `_subparsers` is private lm-eval API; translate a lookup
    # failure into an explicit error that names the installed version.
    try:
        run_subparser = harness._subparsers.choices["run"]
    except (AttributeError, KeyError) as lookup_error:
        raise RuntimeError(
            "Cannot locate lm-eval's `run` subparser; the HarnessCLI internals may "
            f"have changed. Installed lm-eval version: {version('lm_eval')}."
        ) from lookup_error

    _add_modelopt_args(run_subparser)
    parsed_args = harness.parse_args()
    # Relocate the ModelOpt options into parsed_args.model_args before execution.
    _inject_modelopt_args_into_model_args(parsed_args)
    harness.execute(parsed_args)

examples/llm_eval/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
fire>=0.5.0
2-
lm_eval[api,ifeval]==0.4.8
2+
lm_eval[api,ifeval]>=0.4.10
33
peft>=0.5.0
44
rwkv>=0.7.3
55
torchvision

examples/llm_sparsity/weight_sparsity/launch_finetune.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
8888
--save_total_limit 10 \
8989
--learning_rate 2e-5 \
9090
--weight_decay 0.1 \
91-
--warmup_steps 0.0 \
91+
--warmup_steps 0 \
9292
--lr_scheduler_type cosine \
9393
--logging_steps 1 \
9494
--fsdp 'full_shard auto_wrap' \

examples/puzzletron/requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
lm-eval==0.4.8
21
math-verify
32
ray
43
# Likely works for transformers v5 also, but we need to test it

examples/specdec_bench/specdec_bench/datasets/speed.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -737,10 +737,40 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data
737737
}
738738
table = table.replace_schema_metadata(new_meta or None)
739739
dataset = HFDataset(table)
740-
if self.num_samples is not None:
741-
dataset = dataset.select(range(self.num_samples))
740+
if self.num_samples is not None and self.num_samples < len(dataset):
741+
dataset = self._stratified_select(dataset, self.num_samples)
742742
return dataset
743743

744+
@staticmethod
def _stratified_select(dataset: "Dataset", n: int) -> "Dataset":
    """Select up to ``n`` rows, balanced round-robin across the ``category`` column.

    Rows are grouped by their ``category`` value in first-seen order, then
    taken one per category per round. Once a smaller category is exhausted
    the remaining categories keep contributing, so exactly ``n`` rows are
    returned whenever ``0 <= n <= len(dataset)``. Row indices are taken in
    dataset order (no randomness), so the selection is deterministic.

    Falls back to ``dataset.select(range(n))`` when the ``category`` column
    is absent or holds a single distinct value.

    Args:
        dataset: Object exposing ``column_names``, column access via
            ``dataset["category"]`` and ``select(indices)``.
        n: Number of rows requested. A non-positive value yields an empty
            selection, matching the ``range(n)`` fallback.

    Returns:
        The result of ``dataset.select`` on the chosen row indices.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import chain, islice, zip_longest

    if "category" not in dataset.column_names:
        return dataset.select(range(n))

    rows_by_category: dict[str, list[int]] = {}
    for row_idx, category in enumerate(dataset["category"]):
        rows_by_category.setdefault(category, []).append(row_idx)
    if len(rows_by_category) <= 1:
        return dataset.select(range(n))

    # zip_longest produces one "round" per step (one slot per category);
    # exhausted categories are padded with a sentinel that is filtered out.
    exhausted = object()
    round_robin = (
        row_idx
        for row_idx in chain.from_iterable(
            zip_longest(*rows_by_category.values(), fillvalue=exhausted)
        )
        if row_idx is not exhausted
    )
    # Bounding with islice (instead of testing ``len(...) == n`` after each
    # append) makes n <= 0 return an empty selection rather than every row.
    return dataset.select(list(islice(round_robin, max(n, 0))))
773+
744774
def _resolve_external_data(
745775
self, dataset: "Dataset", speed_config: config_type | str
746776
) -> "Dataset":

modelopt/onnx/export/nvfp4_exporter.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray:
3939
4040
Note: The first dimension of the array must be divisible by 2
4141
as two FP4 values are packed into a single byte.
42+
43+
Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
44+
compatibility shim. Do not rename or change the signature without updating that
45+
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
4246
"""
4347
array_f32_t = torch.from_numpy(array)
4448
array_f32_t_shape = array_f32_t.shape
@@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq(
7680
):
7781
"""Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes.
7882
83+
Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
84+
compatibility shim. Do not rename or change the signature without updating that
85+
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
86+
7987
Args:
8088
graph: The ONNX graph containing the node to replace.
8189
node: The node to be replaced.

modelopt/onnx/quantization/autotune/benchmark.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
import os
3232
import re
3333
import shutil
34-
import subprocess # nosec B404
3534
import tempfile
3635
import time
3736
from abc import ABC, abstractmethod
@@ -42,7 +41,7 @@
4241
import torch
4342

4443
from modelopt.onnx.logging_config import logger
45-
from modelopt.onnx.quantization.ort_utils import _check_for_trtexec
44+
from modelopt.onnx.quantization.ort_utils import _check_for_trtexec, _run_trtexec
4645

4746
TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None
4847
if TRT_AVAILABLE:
@@ -159,7 +158,6 @@ def __init__(
159158
warmup_runs: int = 5,
160159
timing_runs: int = 10,
161160
plugin_libraries: list[str] | None = None,
162-
trtexec_path: str = "trtexec",
163161
trtexec_args: list[str] | None = None,
164162
):
165163
"""Initialize the trtexec benchmark.
@@ -169,14 +167,11 @@ def __init__(
169167
warmup_runs: See :meth:`Benchmark.__init__`.
170168
timing_runs: See :meth:`Benchmark.__init__`.
171169
plugin_libraries: See :meth:`Benchmark.__init__`.
172-
trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which
173-
looks for the binary in PATH.
174170
trtexec_args: Additional command-line arguments to pass to trtexec.
175171
These are appended after the standard arguments.
176172
Example: ['--fp16', '--workspace=4096', '--verbose']
177173
"""
178174
super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries)
179-
self.trtexec_path = trtexec_path
180175
self.trtexec_args = trtexec_args if trtexec_args is not None else []
181176
self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_")
182177
self.engine_path = os.path.join(self.temp_dir, "engine.trt")
@@ -186,7 +181,6 @@ def __init__(
186181
self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms"
187182

188183
self._base_cmd = [
189-
self.trtexec_path,
190184
f"--avgRuns={self.timing_runs}",
191185
f"--iterations={self.timing_runs}",
192186
f"--warmUp={self.warmup_runs}",
@@ -268,13 +262,14 @@ def run(
268262
self.logger.debug(f"Wrote model bytes to temporary file: {model_path}")
269263

270264
cmd = [*self._base_cmd, f"--onnx={model_path}"]
271-
self.logger.debug(f"Running: {' '.join(cmd)}")
272-
result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603
265+
full_cmd = ["trtexec", *cmd]
266+
self.logger.debug(f"Running: {' '.join(full_cmd)}")
267+
result = _run_trtexec(cmd)
273268
self._write_log_file(
274269
log_file,
275270
"\n".join(
276271
[
277-
f"Command: {' '.join(cmd)}",
272+
f"Command: {' '.join(full_cmd)}",
278273
f"Return code: {result.returncode}",
279274
"=" * 80,
280275
"STDOUT:",
@@ -301,8 +296,9 @@ def run(
301296
self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms")
302297
return latency
303298
except FileNotFoundError:
304-
self.logger.error(f"trtexec binary not found: {self.trtexec_path}")
305-
self.logger.error("Please ensure TensorRT is installed and trtexec path is correct")
299+
self.logger.error(
300+
"'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
301+
)
306302
return float("inf")
307303
except Exception as e:
308304
self.logger.error(f"Benchmark failed: {e}")

modelopt/onnx/quantization/ort_utils.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,30 @@ def _check_lib_in_ld_library_path(ld_library_path, lib_pattern):
4646
return False, None
4747

4848

49+
def _run_trtexec(
50+
args: list[str] | None = None, timeout: float | None = None
51+
) -> subprocess.CompletedProcess:
52+
"""Run a 'trtexec' command via subprocess.
53+
54+
Args:
55+
args: Arguments to pass to trtexec (without the 'trtexec' command itself).
56+
timeout: Optional subprocess timeout in seconds.
57+
58+
Returns:
59+
The completed subprocess result.
60+
61+
Raises:
62+
FileNotFoundError: If the 'trtexec' binary is not found in PATH.
63+
"""
64+
cmd = ["trtexec", *(args or [])]
65+
try:
66+
return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # nosec B603
67+
except FileNotFoundError as e:
68+
raise FileNotFoundError(
69+
"'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
70+
) from e
71+
72+
4973
def _check_for_trtexec(min_version: str = "10.0") -> str:
5074
"""Check if the `trtexec` CLI tool is available in PATH and is >= min_version.
5175
@@ -89,7 +113,7 @@ def _parse_version_from_string(version_str: str) -> str | None:
89113
)
90114

91115
try:
92-
result = subprocess.run([trtexec_path], capture_output=True, text=True, timeout=5) # nosec B603
116+
result = _run_trtexec(timeout=5)
93117
banner_output = result.stdout + result.stderr
94118
parsed_version = _parse_version_from_string(banner_output)
95119

0 commit comments

Comments
 (0)