 
 logger = logging.getLogger(__name__)
 
-_TASK_MAP_LM_EVAL: dict[str, str] = {
-    "mmlu": "mmlu",
-    "gpqa": "gpqa_diamond",
-    "math": "hendrycks_math",
-    "gsm8k": "gsm8k",
-}
-
-_TASK_MAP_EVALCHEMY: dict[str, str] = {
-    "ifeval": "ifeval",
-    "alpacaeval": "alpaca_eval_v2",
-    "arena_hard": "arena_hard",
-    "mtbench": "mt_bench",
-    "wildbench": "wildbench",
-    "mixeval": "mixeval",
-    "zeroeval": "zeroeval",
-    "math500": "math_500",
-    "aime24": "aime2024",
-    "aime25": "aime2025",
-    "amc23": "amc2023",
-    "gpqa_diamond": "gpqa_diamond",
-    "humaneval": "humaneval",
-    "livecodebench": "livecodebench",
-    "gsm8k": "gsm8k",
-}
-
-
-def _map_results(raw_results: dict, tasks: list[str], task_map: dict[str, str]) -> dict:
+
+def _map_results(raw_results: dict, tasks: list[str]) -> dict:
     """Extract per-task accuracy metrics from lm-eval / evalchemy output."""
     scores: dict[str, float] = {}
     results_section = raw_results.get("results", {})
     for task in tasks:
-        lm_task = task_map.get(task, task)
-        task_r = results_section.get(lm_task, {})
+        task_r = results_section.get(task, {})
 
         acc = None
         for key in (
@@ -125,7 +99,6 @@ def run_harness(cfg: dict, hf_token: str | None = None) -> dict:
 
     Raises:
         ImportError: If lm_eval (or evalchemy for that backend) is not installed.
-        ValueError: If a requested task name is not in the backend's task map.
     """
     # pylint: disable=import-outside-toplevel
     try:
@@ -154,19 +127,8 @@ def run_harness(cfg: dict, hf_token: str | None = None) -> dict:
     gcs_results_path = cfg.get("gcs_results_path")
     token = resolve_token(cfg, hf_token)
 
-    task_map = _TASK_MAP_EVALCHEMY if backend == "evalchemy" else _TASK_MAP_LM_EVAL
     lm_model_type = "local-chat-completions" if backend == "evalchemy" else "local-completions"
 
-    lm_tasks: list[str] = []
-    for t in tasks:
-        lm_task = task_map.get(t)
-        if lm_task is None:
-            raise ValueError(
-                f"No {backend} task mapping for '{t}'. "
-                f"Known tasks: {list(task_map.keys())}"
-            )
-        lm_tasks.append(lm_task)
-
     with build_server_manager(cfg, token) as server:
         import jax as _jax
         from jax.experimental import multihost_utils as _multihost_utils
@@ -191,14 +153,14 @@ def run_harness(cfg: dict, hf_token: str | None = None) -> dict:
         logger.info(
             "Running %s tasks %s via %s at %s",
             backend,
-            lm_tasks,
+            tasks,
             lm_model_type,
             server.base_url,
         )
         raw_results = lm_eval_lib.simple_evaluate(
             model=lm_model_type,
             model_args=model_args,
-            tasks=lm_tasks,
+            tasks=tasks,
             num_fewshot=num_fewshot,
             limit=num_samples,
             log_samples=False,
@@ -214,7 +176,7 @@ def run_harness(cfg: dict, hf_token: str | None = None) -> dict:
         if not is_rank0:
             return {}
 
-        scores = _map_results(raw_results, tasks, task_map)
+        scores = _map_results(raw_results, tasks)
         logger.info("%s scores: %s", backend, scores)
 
         output = write_results(
@@ -253,9 +215,8 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         nargs="+",
         default=["mmlu"],
         help=(
-            "Benchmark task names. "
-            "lm_eval choices: " + ", ".join(_TASK_MAP_LM_EVAL) + ". "
-            "evalchemy choices: " + ", ".join(_TASK_MAP_EVALCHEMY) + "."
+            "lm-eval task names passed directly to simple_evaluate. "
+            "Any task registered in lm-eval or evalchemy is accepted (e.g. gsm8k, mmlu, gpqa_diamond, ifeval, math_500)."
         ),
     )
     parser.add_argument(
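
For reference, a minimal sketch of the pass-through call this change relies on: task names go straight into `lm_eval.simple_evaluate` with no mapping dict in between, and per-task metrics come back keyed by the same names. The model name, server URL, and shot count below are illustrative placeholders, not values from this PR, and the exact `model_args` string depends on the local server setup.

```python
# Minimal pass-through sketch (placeholder model name/URL, not from this PR).
import lm_eval

raw_results = lm_eval.simple_evaluate(
    model="local-completions",
    model_args="model=my-model,base_url=http://localhost:8000/v1/completions",
    tasks=["gsm8k", "mmlu"],  # any registered lm-eval task name, no mapping step
    num_fewshot=5,
    limit=None,
    log_samples=False,
)
# Per-task metrics land under raw_results["results"][<task name>],
# which is what _map_results reads when building the scores dict.
```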