
Commit 4727635

fix(llm_eval): migrate lm_eval_hf.py to lm-eval >= 0.4.10 HarnessCLI
lm-eval 0.4.10 replaced lm_eval.__main__.{setup_parser, parse_eval_args} with a HarnessCLI-based interface in lm_eval._cli, breaking the script's import. Drive HarnessCLI directly: extend the run subparser with the ModelOpt args, then move them out of the namespace into args.model_args so EvaluatorConfig.from_cli does not reject them.

Bump the pinned lm-eval versions in the examples/llm_eval and examples/puzzletron requirements, and add an end-to-end test that runs lm_eval_hf.py against a tiny qwen3.

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

1 parent e2d29c8 · commit 4727635

4 files changed · 74 additions & 35 deletions
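
The migration pattern is easy to see in isolation. Below is a minimal, self-contained sketch of the same two-step flow — extend the `run` subparser, then fold the extra flags into `model_args` — using a plain argparse stand-in for HarnessCLI; `EXTRA_KEYS` and the flag names are illustrative, not from the commit.

```python
import argparse

# Hypothetical stand-ins: two flags instead of the full ModelOpt set, and a
# plain argparse parser instead of HarnessCLI. Only the flow matches the commit.
EXTRA_KEYS = ("quant_cfg", "calib_size")

parser = argparse.ArgumentParser(prog="lm_eval_hf")
run = parser.add_subparsers(dest="command").add_parser("run")
run.add_argument(
    "--model_args",
    type=lambda s: dict(kv.split("=", 1) for kv in s.split(",")),
    default={},
)
for key in EXTRA_KEYS:  # step 1: extend the "run" subparser with extra flags
    run.add_argument(f"--{key}")

args = parser.parse_args(["run", "--model_args", "pretrained=m", "--quant_cfg", "FP8"])

model_args = dict(args.model_args)
for key in EXTRA_KEYS:  # step 2: move the extra keys out of the namespace
    model_args[key] = getattr(args, key)
    delattr(args, key)
args.model_args = model_args

print(args.model_args)  # {'pretrained': 'm', 'quant_cfg': 'FP8', 'calib_size': None}
print(vars(args))       # only 'command' and 'model_args' remain
```

The `delattr` step is what keeps a downstream config constructor (EvaluatorConfig.from_cli in the real commit) from seeing attributes it does not recognize.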


examples/llm_eval/lm_eval_hf.py

Lines changed: 46 additions & 29 deletions
```diff
@@ -42,15 +42,15 @@
 
 import datasets
 from lm_eval import utils
-from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
+from packaging.version import Version
 
-if not version("lm_eval").startswith("0.4.8"):
-    warnings.warn(
-        f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. "
-        "Later versions may have incompatible API changes."
-    )
+if Version(version("lm_eval")) < Version("0.4.10"):
+    raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.")
+
+from lm_eval._cli import HarnessCLI
 from lm_eval.api.model import T
 from lm_eval.models.huggingface import HFLM
+from lm_eval.utils import setup_logging
 from quantization_utils import quantize_model
 from sparse_attention_utils import sparsify_model
 
```
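Why the guard parses versions instead of matching prefixes: string comparison is lexicographic, so a prefix or string check mis-handles the two-digit patch release this commit targets. A standalone illustration (plain Python, nothing from the diff):

```python
from packaging.version import Version

# "0.4.10" sorts *before* "0.4.9" as a string, and the old startswith("0.4.8")
# check could only ever pin one exact release line.
assert "0.4.9" > "0.4.10"                    # lexicographic: wrong order
assert Version("0.4.9") < Version("0.4.10")  # parsed: correct order
```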
```diff
@@ -160,9 +160,24 @@ def create_from_arg_string(
 HFLM.create_from_arg_string = classmethod(create_from_arg_string)
 
 
-def setup_parser_with_modelopt_args():
-    """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
-    parser = setup_parser()
+# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are
+# moved out of the argparse namespace and into args.model_args so they reach
+# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them).
+_MODELOPT_ARG_KEYS = (
+    "quant_cfg",
+    "calib_batch_size",
+    "calib_size",
+    "auto_quantize_bits",
+    "auto_quantize_method",
+    "auto_quantize_score_size",
+    "auto_quantize_checkpoint",
+    "compress",
+    "sparse_cfg",
+)
+
+
+def _add_modelopt_args(parser):
+    """Extend an lm-eval argument parser with ModelOpt quantization and sparsity options."""
     parser.add_argument(
         "--quant_cfg",
         type=str,
```
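
A design note on `_MODELOPT_ARG_KEYS`: the tuple and the `parser.add_argument` calls in `_add_modelopt_args` list the same names and must be kept in sync by hand. A hypothetical variant (not what the commit does) could derive one from the other, at the cost of per-flag `help`/`type` metadata:

```python
# Hypothetical alternative, not in the commit: register flags directly from the
# key tuple so the list and the parser cannot drift apart.
for key in _MODELOPT_ARG_KEYS:
    parser.add_argument(f"--{key}", type=str)
```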
```diff
@@ -221,33 +236,35 @@ def setup_parser_with_modelopt_args():
         type=str,
         help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
     )
-    return parser
 
 
-if __name__ == "__main__":
-    parser = setup_parser_with_modelopt_args()
-    args = parse_eval_args(parser)
-    model_args = utils.simple_parse_args_string(args.model_args)
+def _inject_modelopt_args_into_model_args(args):
+    """Move ModelOpt args from the argparse namespace into args.model_args.
+
+    args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt
+    keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't
+    reject them as unknown kwargs.
+    """
+    model_args = dict(args.model_args) if args.model_args else {}
 
-    if args.trust_remote_code:
+    if getattr(args, "trust_remote_code", False):
         datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
         model_args["trust_remote_code"] = True
         args.trust_remote_code = None
 
-    model_args.update(
-        {
-            "quant_cfg": args.quant_cfg,
-            "auto_quantize_bits": args.auto_quantize_bits,
-            "auto_quantize_method": args.auto_quantize_method,
-            "auto_quantize_score_size": args.auto_quantize_score_size,
-            "auto_quantize_checkpoint": args.auto_quantize_checkpoint,
-            "calib_batch_size": args.calib_batch_size,
-            "calib_size": args.calib_size,
-            "compress": args.compress,
-            "sparse_cfg": args.sparse_cfg,
-        }
-    )
+    for key in _MODELOPT_ARG_KEYS:
+        if hasattr(args, key):
+            model_args[key] = getattr(args, key)
+            delattr(args, key)
 
     args.model_args = model_args
 
-    cli_evaluate(args)
+
+if __name__ == "__main__":
+    setup_logging()
+    cli = HarnessCLI()
+    # The `run` subcommand owns the model/task arguments; extend that parser.
+    _add_modelopt_args(cli._subparsers.choices["run"])
+    args = cli.parse_args()
+    _inject_modelopt_args_into_model_args(args)
+    cli.execute(args)
```
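
One subtle change above: the old code read `args.trust_remote_code` directly, while the new code uses `getattr(..., False)`, which stays safe if the flag is absent from the namespace the subcommand parser produces. A two-line illustration on a bare namespace (hypothetical, not lm-eval's actual namespace):

```python
from types import SimpleNamespace

ns = SimpleNamespace()  # namespace without the flag, as a subcommand parser may produce
# ns.trust_remote_code would raise AttributeError here;
assert getattr(ns, "trust_remote_code", False) is False  # the default keeps it safe
```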

examples/llm_eval/requirements.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 fire>=0.5.0
-lm_eval[api,ifeval]==0.4.8
+lm_eval[api,ifeval]>=0.4.10
 peft>=0.5.0
 rwkv>=0.7.3
 torchvision
```

examples/puzzletron/requirements.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-lm-eval==0.4.8
 math-verify
 ray
 # Likely works for transformers v5 also, but we need to test it
```

tests/examples/llm_eval/test_llm_eval.py

Lines changed: 27 additions & 4 deletions
```diff
@@ -15,16 +15,39 @@
 
 import subprocess
 
-from _test_utils.examples.models import TINY_LLAMA_PATH
-from _test_utils.examples.run_command import run_llm_ptq_command
+from _test_utils.examples.run_command import (
+    extend_cmd_parts,
+    run_example_command,
+    run_llm_ptq_command,
+)
 from _test_utils.torch.misc import minimum_sm
+from _test_utils.torch.transformers_models import create_tiny_qwen3_dir
+
+
+def test_lm_eval_hf(tmp_path):
+    """End-to-end smoke test: run lm_eval_hf.py against a tiny qwen3 on a small
+    slice of mmlu. Verifies the HarnessCLI integration (lm-eval >= 0.4.10)
+    plus our HFLM.create_from_arg_obj override actually execute."""
+    model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
+
+    cmd_parts = extend_cmd_parts(
+        ["python", "lm_eval_hf.py"],
+        model="hf",
+        model_args=f"pretrained={model_dir}",
+        tasks="mmlu",
+        num_fewshot=5,
+        limit=0.1,
+        batch_size=8,
+    )
+    run_example_command(cmd_parts, "llm_eval")
 
 
 @minimum_sm(89)
-def test_llama_eval_fp8():
+def test_qwen3_eval_fp8(tmp_path):
+    model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
     try:
         run_llm_ptq_command(
-            model=TINY_LLAMA_PATH,
+            model=str(model_dir),
             quant="fp8",
             tasks="mmlu,lm_eval,simple_eval",
             calib=64,
```
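
For reference, the smoke test above assembles roughly the following command. This flattened form assumes `extend_cmd_parts` renders each keyword as a `--key value` pair, which is a guess about this repo-internal helper:

```python
# Hypothetical flattened command; exact rendering depends on extend_cmd_parts.
cmd_parts = [
    "python", "lm_eval_hf.py",
    "--model", "hf",
    "--model_args", "pretrained=<tiny-qwen3 dir>",  # placeholder for the tmp model dir
    "--tasks", "mmlu",
    "--num_fewshot", "5",
    "--limit", "0.1",  # lm-eval treats a float < 1 as a fraction of the task's examples
    "--batch_size", "8",
]
```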
