
Commit 4727635

fix(llm_eval): migrate lm_eval_hf.py to lm-eval >= 0.4.10 HarnessCLI
lm-eval 0.4.10 replaced lm_eval.__main__.{setup_parser, parse_eval_args} with a HarnessCLI-based interface in lm_eval._cli, breaking the script's import. Drive HarnessCLI directly: extend the run subparser with the ModelOpt args, then move them out of the namespace into args.model_args so EvaluatorConfig.from_cli does not reject them.

Bump the pinned lm-eval versions in the examples/llm_eval and examples/puzzletron requirements, and add an end-to-end test that runs lm_eval_hf.py against a tiny qwen3.

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>

1 parent e2d29c8 · commit 4727635

4 files changed · 74 additions & 35 deletions
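
The migration pattern is easy to see in isolation. Below is a minimal, self-contained sketch of the same two-step flow — extend the `run` subparser, then fold the extra flags into `model_args` — using a plain argparse stand-in for HarnessCLI; `EXTRA_KEYS` and the flag names are illustrative, not from the commit.

```python
import argparse

# Hypothetical stand-ins: two flags instead of the full ModelOpt set, and a
# plain argparse parser instead of HarnessCLI. Only the flow matches the commit.
EXTRA_KEYS = ("quant_cfg", "calib_size")

parser = argparse.ArgumentParser(prog="lm_eval_hf")
run = parser.add_subparsers(dest="command").add_parser("run")
run.add_argument(
    "--model_args",
    type=lambda s: dict(kv.split("=", 1) for kv in s.split(",")),
    default={},
)
for key in EXTRA_KEYS:  # step 1: extend the "run" subparser with extra flags
    run.add_argument(f"--{key}")

args = parser.parse_args(["run", "--model_args", "pretrained=m", "--quant_cfg", "FP8"])

model_args = dict(args.model_args)
for key in EXTRA_KEYS:  # step 2: move the extra keys out of the namespace
    model_args[key] = getattr(args, key)
    delattr(args, key)
args.model_args = model_args

print(args.model_args)  # {'pretrained': 'm', 'quant_cfg': 'FP8', 'calib_size': None}
print(vars(args))       # only 'command' and 'model_args' remain
```

The `delattr` step is what keeps a downstream config constructor (EvaluatorConfig.from_cli in the real commit) from seeing attributes it does not recognize.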


examples/llm_eval/lm_eval_hf.py

Lines changed: 46 additions & 29 deletions
```diff
@@ -42,15 +42,15 @@
 
 import datasets
 from lm_eval import utils
-from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
+from packaging.version import Version
 
-if not version("lm_eval").startswith("0.4.8"):
-    warnings.warn(
-        f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. "
-        "Later versions may have incompatible API changes."
-    )
+if Version(version("lm_eval")) < Version("0.4.10"):
+    raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.")
+
+from lm_eval._cli import HarnessCLI
 from lm_eval.api.model import T
 from lm_eval.models.huggingface import HFLM
+from lm_eval.utils import setup_logging
 from quantization_utils import quantize_model
 from sparse_attention_utils import sparsify_model
 
```
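Why the guard parses versions instead of matching prefixes: string comparison is lexicographic, so a prefix or string check mis-handles the two-digit patch release this commit targets. A standalone illustration (plain Python, nothing from the diff):

```python
from packaging.version import Version

# "0.4.10" sorts *before* "0.4.9" as a string, and the old startswith("0.4.8")
# check could only ever pin one exact release line.
assert "0.4.9" > "0.4.10"                    # lexicographic: wrong order
assert Version("0.4.9") < Version("0.4.10")  # parsed: correct order
```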
```diff
@@ -160,9 +160,24 @@ def create_from_arg_string(
 HFLM.create_from_arg_string = classmethod(create_from_arg_string)
 
 
-def setup_parser_with_modelopt_args():
-    """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
-    parser = setup_parser()
+# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are
+# moved out of the argparse namespace and into args.model_args so they reach
+# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them).
+_MODELOPT_ARG_KEYS = (
+    "quant_cfg",
+    "calib_batch_size",
+    "calib_size",
+    "auto_quantize_bits",
+    "auto_quantize_method",
+    "auto_quantize_score_size",
+    "auto_quantize_checkpoint",
+    "compress",
+    "sparse_cfg",
+)
+
+
+def _add_modelopt_args(parser):
+    """Extend an lm-eval argument parser with ModelOpt quantization and sparsity options."""
     parser.add_argument(
         "--quant_cfg",
         type=str,
```
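
A design note on `_MODELOPT_ARG_KEYS`: the tuple and the `parser.add_argument` calls in `_add_modelopt_args` list the same names and must be kept in sync by hand. A hypothetical variant (not what the commit does) could derive one from the other, at the cost of per-flag `help`/`type` metadata:

```python
# Hypothetical alternative, not in the commit: register flags directly from the
# key tuple so the list and the parser cannot drift apart.
for key in _MODELOPT_ARG_KEYS:
    parser.add_argument(f"--{key}", type=str)
```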
```diff
@@ -221,33 +236,35 @@ def setup_parser_with_modelopt_args():
         type=str,
         help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
     )
-    return parser
 
 
-if __name__ == "__main__":
-    parser = setup_parser_with_modelopt_args()
-    args = parse_eval_args(parser)
-    model_args = utils.simple_parse_args_string(args.model_args)
+def _inject_modelopt_args_into_model_args(args):
+    """Move ModelOpt args from the argparse namespace into args.model_args.
+
+    args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt
+    keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't
+    reject them as unknown kwargs.
+    """
+    model_args = dict(args.model_args) if args.model_args else {}
 
-    if args.trust_remote_code:
+    if getattr(args, "trust_remote_code", False):
         datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
         model_args["trust_remote_code"] = True
         args.trust_remote_code = None
 
-    model_args.update(
-        {
-            "quant_cfg": args.quant_cfg,
-            "auto_quantize_bits": args.auto_quantize_bits,
-            "auto_quantize_method": args.auto_quantize_method,
-            "auto_quantize_score_size": args.auto_quantize_score_size,
-            "auto_quantize_checkpoint": args.auto_quantize_checkpoint,
-            "calib_batch_size": args.calib_batch_size,
-            "calib_size": args.calib_size,
-            "compress": args.compress,
-            "sparse_cfg": args.sparse_cfg,
-        }
-    )
+    for key in _MODELOPT_ARG_KEYS:
+        if hasattr(args, key):
+            model_args[key] = getattr(args, key)
+            delattr(args, key)
 
     args.model_args = model_args
 
-    cli_evaluate(args)
+
+if __name__ == "__main__":
+    setup_logging()
+    cli = HarnessCLI()
+    # The `run` subcommand owns the model/task arguments; extend that parser.
+    _add_modelopt_args(cli._subparsers.choices["run"])
+    args = cli.parse_args()
+    _inject_modelopt_args_into_model_args(args)
+    cli.execute(args)
```
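
One subtle change above: the old code read `args.trust_remote_code` directly, while the new code uses `getattr(..., False)`, which stays safe if the flag is absent from the namespace the subcommand parser produces. A two-line illustration on a bare namespace (hypothetical, not lm-eval's actual namespace):

```python
from types import SimpleNamespace

ns = SimpleNamespace()  # namespace without the flag, as a subcommand parser may produce
# ns.trust_remote_code would raise AttributeError here;
assert getattr(ns, "trust_remote_code", False) is False  # the default keeps it safe
```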

examples/llm_eval/requirements.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 fire>=0.5.0
-lm_eval[api,ifeval]==0.4.8
+lm_eval[api,ifeval]>=0.4.10
 peft>=0.5.0
 rwkv>=0.7.3
 torchvision
```

examples/puzzletron/requirements.txt

Lines changed: 0 additions & 1 deletion
```diff
@@ -1,4 +1,3 @@
-lm-eval==0.4.8
 math-verify
 ray
 # Likely works for transformers v5 also, but we need to test it
```

tests/examples/llm_eval/test_llm_eval.py

Lines changed: 27 additions & 4 deletions
```diff
@@ -15,16 +15,39 @@
 
 import subprocess
 
-from _test_utils.examples.models import TINY_LLAMA_PATH
-from _test_utils.examples.run_command import run_llm_ptq_command
+from _test_utils.examples.run_command import (
+    extend_cmd_parts,
+    run_example_command,
+    run_llm_ptq_command,
+)
 from _test_utils.torch.misc import minimum_sm
+from _test_utils.torch.transformers_models import create_tiny_qwen3_dir
+
+
+def test_lm_eval_hf(tmp_path):
+    """End-to-end smoke test: run lm_eval_hf.py against a tiny qwen3 on a small
+    slice of mmlu. Verifies the HarnessCLI integration (lm-eval >= 0.4.10)
+    plus our HFLM.create_from_arg_obj override actually execute."""
+    model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
+
+    cmd_parts = extend_cmd_parts(
+        ["python", "lm_eval_hf.py"],
+        model="hf",
+        model_args=f"pretrained={model_dir}",
+        tasks="mmlu",
+        num_fewshot=5,
+        limit=0.1,
+        batch_size=8,
+    )
+    run_example_command(cmd_parts, "llm_eval")
 
 
 @minimum_sm(89)
-def test_llama_eval_fp8():
+def test_qwen3_eval_fp8(tmp_path):
+    model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True)
     try:
         run_llm_ptq_command(
-            model=TINY_LLAMA_PATH,
+            model=str(model_dir),
             quant="fp8",
             tasks="mmlu,lm_eval,simple_eval",
             calib=64,
```
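
For reference, the smoke test above assembles roughly the following command. This flattened form assumes `extend_cmd_parts` renders each keyword as a `--key value` pair, which is a guess about this repo-internal helper:

```python
# Hypothetical flattened command; exact rendering depends on extend_cmd_parts.
cmd_parts = [
    "python", "lm_eval_hf.py",
    "--model", "hf",
    "--model_args", "pretrained=<tiny-qwen3 dir>",  # placeholder for the tmp model dir
    "--tasks", "mmlu",
    "--num_fewshot", "5",
    "--limit", "0.1",  # lm-eval treats a float < 1 as a fraction of the task's examples
    "--batch_size", "8",
]
```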
