NVIDIA
diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py
Lines changed: 56 additions & 29 deletions
diff --git a/examples/llm_eval/requirements.txt b/examples/llm_eval/requirements.txt
Lines changed: 1 addition & 1 deletion
diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh
Lines changed: 1 addition & 1 deletion
diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt
Lines changed: 0 additions & 1 deletion
diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py
Lines changed: 32 additions & 2 deletions
diff --git a/modelopt/onnx/export/nvfp4_exporter.py b/modelopt/onnx/export/nvfp4_exporter.py
Lines changed: 8 additions & 0 deletions
diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py
Lines changed: 8 additions & 12 deletions
diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py
Lines changed: 25 additions & 1 deletion
@@ -42,15 +42,15 @@
 
 import datasets
 from lm_eval import utils
-from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
+from packaging.version import Version
 
-if not version("lm_eval").startswith("0.4.8"):
-    warnings.warn(
-        f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. "
-        "Later versions may have incompatible API changes."
-    )
+if Version(version("lm_eval")) < Version("0.4.10"):
+    raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.")
+
+from lm_eval._cli import HarnessCLI
 from lm_eval.api.model import T
 from lm_eval.models.huggingface import HFLM
+from lm_eval.utils import setup_logging
 from quantization_utils import quantize_model
 from sparse_attention_utils import sparsify_model
 
@@ -160,9 +160,24 @@ def create_from_arg_string(
 HFLM.create_from_arg_string = classmethod(create_from_arg_string)
 
 
-def setup_parser_with_modelopt_args():
-    """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
-    parser = setup_parser()
+# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are
+# moved out of the argparse namespace and into args.model_args so they reach
+# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them).
+# Each name is looked up with getattr/delattr on the parsed namespace, so it must equal
+# the attribute name argparse assigns to the matching flag (e.g. --quant_cfg -> quant_cfg).
+_MODELOPT_ARG_KEYS = (
+    "quant_cfg",
+    "calib_batch_size",
+    "calib_size",
+    "auto_quantize_bits",
+    "auto_quantize_method",
+    "auto_quantize_score_size",
+    "auto_quantize_checkpoint",
+    "compress",
+    "sparse_cfg",
+)
+
+
+def _add_modelopt_args(parser):
+    """Extend an lm-eval argument parser with ModelOpt quantization and sparsity options."""
     parser.add_argument(
         "--quant_cfg",
         type=str,
@@ -221,33 +236,45 @@ def setup_parser_with_modelopt_args():
         type=str,
         help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
     )
-    return parser
 
 
-if __name__ == "__main__":
-    parser = setup_parser_with_modelopt_args()
-    args = parse_eval_args(parser)
-    model_args = utils.simple_parse_args_string(args.model_args)
+def _inject_modelopt_args_into_model_args(args):
+    """Move ModelOpt args from the argparse namespace into args.model_args.
+
+    args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt
+    keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't
+    reject them as unknown kwargs.
+
+    Args:
+        args: Parsed namespace; mutated in place. ``args.model_args`` is replaced
+            by a new dict (a falsy value such as ``None`` becomes ``{}``), and every
+            key in ``_MODELOPT_ARG_KEYS`` present on the namespace is copied into
+            it as-is (including ``None`` values) and then deleted from ``args``.
+    """
+    model_args = dict(args.model_args) if args.model_args else {}
 
-    if args.trust_remote_code:
+    if getattr(args, "trust_remote_code", False):
+        # Propagate the user-provided --trust_remote_code flag (not hardcoded).
         datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
         model_args["trust_remote_code"] = True
+        # NOTE(review): presumably cleared so lm-eval does not process the flag again — confirm.
         args.trust_remote_code = None
 
-    model_args.update(
-        {
-            "quant_cfg": args.quant_cfg,
-            "auto_quantize_bits": args.auto_quantize_bits,
-            "auto_quantize_method": args.auto_quantize_method,
-            "auto_quantize_score_size": args.auto_quantize_score_size,
-            "auto_quantize_checkpoint": args.auto_quantize_checkpoint,
-            "calib_batch_size": args.calib_batch_size,
-            "calib_size": args.calib_size,
-            "compress": args.compress,
-            "sparse_cfg": args.sparse_cfg,
-        }
-    )
+    for key in _MODELOPT_ARG_KEYS:
+        if hasattr(args, key):
+            model_args[key] = getattr(args, key)
+            delattr(args, key)
 
     args.model_args = model_args
 
-    cli_evaluate(args)
+
+if __name__ == "__main__":
+    setup_logging()
+    cli = HarnessCLI()
+    # The `run` subcommand owns the model/task arguments; extend that parser.
+    # `_subparsers` is private API; guard so a future lm-eval refactor surfaces a
+    # clear error instead of an opaque AttributeError.
+    try:
+        run_parser = cli._subparsers.choices["run"]
+    except (AttributeError, KeyError) as e:
+        raise RuntimeError(
+            "Cannot locate lm-eval's `run` subparser; the HarnessCLI internals may "
+            f"have changed. Installed lm-eval version: {version('lm_eval')}."
+        ) from e
+    # Register the ModelOpt flags before parsing so they are accepted on the command line.
+    _add_modelopt_args(run_parser)
+    args = cli.parse_args()
+    # Relocate the ModelOpt values into args.model_args before lm-eval consumes the namespace.
+    _inject_modelopt_args_into_model_args(args)
+    cli.execute(args)
@@ -1,5 +1,5 @@
 fire>=0.5.0
-lm_eval[api,ifeval]==0.4.8
+lm_eval[api,ifeval]>=0.4.10
 peft>=0.5.0
 rwkv>=0.7.3
 torchvision
@@ -88,7 +88,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
     --save_total_limit 10 \
     --learning_rate 2e-5 \
     --weight_decay 0.1 \
-    --warmup_steps 0.0 \
+    --warmup_steps 0 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
     --fsdp 'full_shard auto_wrap' \
 
@@ -1,4 +1,3 @@
-lm-eval==0.4.8
 math-verify
 ray
 # Likely works for transformers v5 also, but we need to test it
 
@@ -737,10 +737,40 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data
                     }
                     table = table.replace_schema_metadata(new_meta or None)
                 dataset = HFDataset(table)
-        if self.num_samples is not None:
-            dataset = dataset.select(range(self.num_samples))
+        if self.num_samples is not None and self.num_samples < len(dataset):
+            dataset = self._stratified_select(dataset, self.num_samples)
         return dataset
 
+    @staticmethod
+    def _stratified_select(dataset: "Dataset", n: int) -> "Dataset":
+        """Select up to ``n`` samples, balanced across the ``category`` column.
+
+        Rows are taken round-robin across categories (first-appearance order)
+        until ``n`` are collected; exhausted categories are skipped, so exactly
+        ``n`` rows are returned whenever ``0 < n <= len(dataset)``. Falls back
+        to ``range(n)`` (empty for ``n <= 0``) when ``n`` is not positive,
+        ``category`` is absent, or there is only one category. Indices are
+        taken in order (not random) so behavior is deterministic.
+        """
+        # n <= 0 must short-circuit: the ``== n`` exit below would never fire.
+        if n <= 0 or "category" not in dataset.column_names:
+            return dataset.select(range(n))
+        cat_to_rows: dict[str, list[int]] = {}
+        for i, c in enumerate(dataset["category"]):
+            cat_to_rows.setdefault(c, []).append(i)
+        if len(cat_to_rows) <= 1:
+            return dataset.select(range(n))
+        cat_lists = list(cat_to_rows.values())
+        interleaved: list[int] = []
+        max_len = max(len(c) for c in cat_lists)
+        for i in range(max_len):
+            for c in cat_lists:
+                if i < len(c):
+                    interleaved.append(c[i])
+                    if len(interleaved) == n:
+                        return dataset.select(interleaved)
+        return dataset.select(interleaved)
+
     def _resolve_external_data(
         self, dataset: "Dataset", speed_config: config_type | str
     ) -> "Dataset":
 
@@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray:
 
     Note: The first dimension of the array must be divisible by 2
     as two FP4 values are packed into a single byte.
+
+    Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
+    compatibility shim. Do not rename or change the signature without updating that
+    shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
     """
     array_f32_t = torch.from_numpy(array)
     array_f32_t_shape = array_f32_t.shape
@@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq(
 ):
     """Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes.
 
+    Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
+    compatibility shim. Do not rename or change the signature without updating that
+    shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
+
     Args:
         graph: The ONNX graph containing the node to replace.
         node: The node to be replaced.
 
@@ -31,7 +31,6 @@
 import os
 import re
 import shutil
-import subprocess  # nosec B404
 import tempfile
 import time
 from abc import ABC, abstractmethod
@@ -42,7 +41,7 @@
 import torch
 
 from modelopt.onnx.logging_config import logger
-from modelopt.onnx.quantization.ort_utils import _check_for_trtexec
+from modelopt.onnx.quantization.ort_utils import _check_for_trtexec, _run_trtexec
 
 TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None
 if TRT_AVAILABLE:
@@ -159,7 +158,6 @@ def __init__(
         warmup_runs: int = 5,
         timing_runs: int = 10,
         plugin_libraries: list[str] | None = None,
-        trtexec_path: str = "trtexec",
         trtexec_args: list[str] | None = None,
     ):
         """Initialize the trtexec benchmark.
@@ -169,14 +167,11 @@ def __init__(
             warmup_runs: See :meth:`Benchmark.__init__`.
             timing_runs: See :meth:`Benchmark.__init__`.
             plugin_libraries: See :meth:`Benchmark.__init__`.
-            trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which
-                         looks for the binary in PATH.
             trtexec_args: Additional command-line arguments to pass to trtexec.
                          These are appended after the standard arguments.
                          Example: ['--fp16', '--workspace=4096', '--verbose']
         """
         super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries)
-        self.trtexec_path = trtexec_path
         self.trtexec_args = trtexec_args if trtexec_args is not None else []
         self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_")
         self.engine_path = os.path.join(self.temp_dir, "engine.trt")
@@ -186,7 +181,6 @@ def __init__(
         self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms"
 
         self._base_cmd = [
-            self.trtexec_path,
             f"--avgRuns={self.timing_runs}",
             f"--iterations={self.timing_runs}",
             f"--warmUp={self.warmup_runs}",
@@ -268,13 +262,14 @@ def run(
                 self.logger.debug(f"Wrote model bytes to temporary file: {model_path}")
 
             cmd = [*self._base_cmd, f"--onnx={model_path}"]
-            self.logger.debug(f"Running: {' '.join(cmd)}")
-            result = subprocess.run(cmd, capture_output=True, text=True)  # nosec B603
+            full_cmd = ["trtexec", *cmd]
+            self.logger.debug(f"Running: {' '.join(full_cmd)}")
+            result = _run_trtexec(cmd)
             self._write_log_file(
                 log_file,
                 "\n".join(
                     [
-                        f"Command: {' '.join(cmd)}",
+                        f"Command: {' '.join(full_cmd)}",
                         f"Return code: {result.returncode}",
                         "=" * 80,
                         "STDOUT:",
@@ -301,8 +296,9 @@ def run(
             self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms")
             return latency
         except FileNotFoundError:
-            self.logger.error(f"trtexec binary not found: {self.trtexec_path}")
-            self.logger.error("Please ensure TensorRT is installed and trtexec path is correct")
+            self.logger.error(
+                "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
+            )
             return float("inf")
         except Exception as e:
             self.logger.error(f"Benchmark failed: {e}")
 
@@ -46,6 +46,30 @@ def _check_lib_in_ld_library_path(ld_library_path, lib_pattern):
     return False, None
 
 
+def _run_trtexec(
+    args: list[str] | None = None, timeout: float | None = None
+) -> subprocess.CompletedProcess:
+    """Run a 'trtexec' command via subprocess.
+
+    Args:
+        args: Arguments to pass to trtexec (without the 'trtexec' command itself).
+        timeout: Optional timeout in seconds; ``subprocess.TimeoutExpired`` propagates on expiry.
+
+    Returns:
+        The completed subprocess result (stdout/stderr captured as text; return code not checked).
+
+    Raises:
+        FileNotFoundError: If the 'trtexec' binary is not found in PATH.
+    """
+    cmd = ["trtexec", *(args or [])]
+    try:
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)  # nosec B603
+    except FileNotFoundError as e:
+        raise FileNotFoundError(
+            "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
+        ) from e
+
+
 def _check_for_trtexec(min_version: str = "10.0") -> str:
     """Check if the `trtexec` CLI tool is available in PATH and is >= min_version.
 
@@ -89,7 +113,7 @@ def _parse_version_from_string(version_str: str) -> str | None:
         )
 
     try:
-        result = subprocess.run([trtexec_path], capture_output=True, text=True, timeout=5)  # nosec B603
+        result = _run_trtexec(timeout=5)
         banner_output = result.stdout + result.stderr
         parsed_version = _parse_version_from_string(banner_output)
@@ -1,4 +1,3 @@
-lm-eval==0.4.8
 math-verify
 ray
 # Likely works for transformers v5 also, but we need to test it