Skip to content

Commit 88849da

Browse files
authored
feat: show throughput metrics in final results table (#1078)
* feat: surface throughput metrics in final results table
  Reported-by: luodian
* fix: harden limit normalization for iterator slicing
  Reported-by: luodian
* fix: support --limit -1 as full-sample evaluation
  Reported-by: luodian
* docs: clarify --limit argument semantics
  Reported-by: luodian
* refactor: standardize stability terminology on repeats
  Reported-by: luodian
* refactor: prefer repeats naming with num_samples alias
  Reported-by: luodian
* refactor: simplify throughput reporting and OpenRouter MME script
  Reported-by: luodian
* refactor: unify throughput metric names
  Reported-by: luodian
* refactor: standardize total_elapsed_time metric naming
  Reported-by: luodian
* Run black on __main__.py for PR 1078
* fix(vllm): pass chat_template directly for lint stability
1 parent c68539e commit 88849da

19 files changed

Lines changed: 298 additions & 88 deletions
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
3+
set -euo pipefail
4+
5+
# OpenRouter + MME quick test (simple version)
6+
# - Default uses all samples (`LIMIT=-1`)
7+
# - Stability metrics appear when `REPEATS > 1`
8+
9+
export HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
10+
export OPENAI_API_KEY="${OPENAI_API_KEY:-${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set in environment}}"
11+
export OPENAI_API_BASE="${OPENAI_API_BASE:-https://openrouter.ai/api/v1}"
12+
13+
MODEL_VERSION="${MODEL_VERSION:-mistralai/ministral-3b-2512}"
14+
TASKS="${TASKS:-mme}"
15+
REPEATS="${REPEATS:-1}"
16+
LIMIT="${LIMIT:--1}"
17+
BATCH_SIZE="${BATCH_SIZE:-1}"
18+
OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_mme_stats/}"
19+
VERBOSITY="${VERBOSITY:-INFO}"
20+
21+
echo "[INFO] OpenRouter MME test"
22+
echo "[INFO] model=${MODEL_VERSION} tasks=${TASKS} repeats=${REPEATS} limit=${LIMIT}"
23+
echo "[INFO] output_path=${OUTPUT_PATH}"
24+
25+
python3 -m lmms_eval \
26+
--model openai_compatible \
27+
--model_args "model_version=${MODEL_VERSION}" \
28+
--tasks "${TASKS}" \
29+
--batch_size "${BATCH_SIZE}" \
30+
--repeats "${REPEATS}" \
31+
--limit "${LIMIT}" \
32+
--output_path "${OUTPUT_PATH}" \
33+
--log_samples \
34+
--verbosity "${VERBOSITY}"

lmms_eval/__main__.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def parse_eval_args() -> argparse.Namespace:
230230
"--limit",
231231
type=float,
232232
default=None,
233-
help="Limit the number of examples per task. " "If <1, limit is a percentage of the total number of examples.",
233+
help=("Limit examples per task: use -1 (or omit) for all samples, " "0 < limit < 1 for a fraction of the dataset, and limit >= 1 " "for an absolute sample count."),
234234
)
235235
parser.add_argument(
236236
"--offset",
@@ -387,10 +387,12 @@ def parse_eval_args() -> argparse.Namespace:
387387
)
388388
parser.add_argument(
389389
"-n",
390+
"--repeats",
390391
"--num_samples",
392+
dest="repeats",
391393
type=int,
392394
default=1,
393-
help="Number of samples per question for model stability measurement. " "When n > 1, enables k-samples mode and computes EA, CA, IV, CR metrics.",
395+
help=("Number of repeated generations per question for model stability " "measurement. Backward-compatible alias: --num_samples. " "When n > 1, enables k-samples " "mode and computes EA, CA, IV, CR metrics."),
394396
)
395397
parser.add_argument("--baseline", type=str, default=None, help="Baseline for paired t-test comparison. Accepts: local JSONL path, hf://user/repo, or preset name (e.g., qwen25vl).")
396398

@@ -587,8 +589,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
587589
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
588590
eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.")
589591

590-
if args.limit:
592+
if args.limit is not None and args.limit != -1:
591593
eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
594+
if args.limit is not None and args.limit < 0 and args.limit != -1:
595+
raise ValueError("--limit must be -1 or non-negative")
592596
if args.offset < 0:
593597
raise ValueError("--offset must be >= 0")
594598

@@ -675,7 +679,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
675679
distributed_executor_backend="torchrun" if (torch.distributed.is_available() and torch.distributed.is_initialized()) else "accelerate",
676680
force_simple=args.force_simple,
677681
launcher_args=args.launcher_args,
678-
num_samples=args.num_samples,
682+
repeats=args.repeats,
679683
baseline=args.baseline,
680684
**request_caching_args,
681685
)

lmms_eval/api/task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,7 +1662,7 @@ def task_name(self) -> Any:
16621662
return getattr(self.config, "task", None)
16631663

16641664
def __repr__(self):
1665-
return f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_samples={len(self.eval_docs)})"
1665+
return f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"repeats={getattr(self.config, 'repeats', None)})"
16661666

16671667

16681668
class ConfigurableMessagesTask(ConfigurableTask):
@@ -1712,4 +1712,4 @@ def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Inst
17121712
return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, task_name=self.config.task, doc_id=doc_id, **kwargs)
17131713

17141714
def __repr__(self):
1715-
return f"ConfigurableMessagesTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_samples={len(self.eval_docs)})"
1715+
return f"ConfigurableMessagesTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"repeats={getattr(self.config, 'repeats', None)})"

lmms_eval/evaluator.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def simple_evaluate(
8484
distributed_executor_backend: str = "accelerate",
8585
cli_args=None,
8686
force_simple: bool = False,
87-
num_samples: int = 1,
87+
repeats: int = 1,
8888
baseline: Optional[str] = None,
8989
):
9090
"""Instantiate and evaluate a model on a list of tasks.
@@ -124,6 +124,8 @@ def simple_evaluate(
124124
If True, write out an example document and model input for checking task integrity
125125
:param log_samples: bool
126126
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
127+
:param repeats: int
128+
Number of repeated generations per question for k-samples stability metrics.
127129
:param system_instruction: str
128130
System instruction to be applied to the prompt
129131
:param apply_chat_template: bool
@@ -247,11 +249,11 @@ def _adjust_config(task_dict):
247249
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
248250
# eval_logger.info(f"Setting fewshot random generator seed to {fewshot_random_seed}")
249251

250-
# Handle num_samples for model stability measurement (k-samples mode)
251-
if num_samples > 1:
252+
# Handle repeated generations for model stability measurement (k-samples mode)
253+
if repeats > 1:
252254
default_repeats = task_obj.get_config("repeats") or 1
253-
eval_logger.info(f"[Model Stability] Setting repeats={num_samples} for {task_name} (was: {default_repeats})")
254-
task_obj.set_config(key="repeats", value=num_samples)
255+
eval_logger.info(f"[Model Stability] Setting repeats={repeats} for {task_name} (was: {default_repeats})")
256+
task_obj.set_config(key="repeats", value=repeats)
255257

256258
adjusted_task_dict[task_name] = task_obj
257259

@@ -271,6 +273,10 @@ def _adjust_config(task_dict):
271273
fewshot_as_multiturn=fewshot_as_multiturn,
272274
)
273275

276+
from lmms_eval.models.model_utils.gen_metrics import reset_logged_metrics
277+
278+
reset_logged_metrics()
279+
274280
# Getting the rank settings
275281
local_rank = int(os.environ.get("LOCAL_RANK", 0))
276282
global_rank = int(os.environ.get("RANK", 0))
@@ -296,6 +302,8 @@ def _adjust_config(task_dict):
296302
)
297303

298304
if global_rank == 0:
305+
from lmms_eval.models.model_utils.gen_metrics import summarize_logged_metrics
306+
299307
if isinstance(model, str):
300308
model_name = model
301309
elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"):
@@ -330,6 +338,9 @@ def _adjust_config(task_dict):
330338
)
331339
results["git_hash"] = get_git_commit_hash()
332340
results["date"] = datetime_str
341+
throughput_summary = summarize_logged_metrics()
342+
if throughput_summary:
343+
results["throughput"] = throughput_summary
333344
# add_env_info(results) # additional environment info to results
334345
# add_tokenizer_info(results, lm) # additional info about tokenizer
335346

lmms_eval/evaluator_utils.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def calculate_clt_aggregate_metric(self) -> None:
162162
self.agg_metrics[f"{metric}_stderr_clustered,{filter_key}"] = "N/A"
163163

164164
def calculate_stability_metrics(self) -> None:
165-
"""Calculate model stability metrics (EA, CA, IV, CR) when num_samples > 1.
165+
"""Calculate model stability metrics (EA, CA, IV, CR) when repeats > 1.
166166
167167
These metrics measure model consistency across multiple samples per question.
168168
Only computed when repeats > 1 (k-samples mode).
@@ -285,10 +285,14 @@ def print_writeout(task) -> None:
285285
eval_logger.info(f"Request: {str(inst)}")
286286

287287

288-
def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
289-
if limit is not None:
290-
limit = int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
291-
return limit
288+
def get_sample_size(task, limit: Optional[Union[int, float]]) -> Union[int, None]:
289+
if limit is None or limit == -1:
290+
return None
291+
if limit < 0:
292+
raise ValueError(f"limit must be -1 or non-negative, got {limit}")
293+
if 0 < limit < 1.0:
294+
return int(math.ceil(len(task.eval_docs) * limit))
295+
return int(limit)
292296

293297

294298
def prepare_print_tasks(

lmms_eval/models/chat/huggingface.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def _collate(x):
210210
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
211211
num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
212212
pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
213-
e2e_latency = 0
213+
total_elapsed_time = 0
214214
total_tokens = 0
215215
for chunk in chunks:
216216
ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
@@ -288,7 +288,7 @@ def _collate(x):
288288
)
289289

290290
# Calculate timing metrics for batch
291-
e2e_latency += end_time - start_time
291+
total_elapsed_time += end_time - start_time
292292
total_tokens += sum(len(ids) for ids in generated_ids_trimmed)
293293

294294
for ans, context in zip(answers, texts):
@@ -303,11 +303,11 @@ def _collate(x):
303303
# reorder this group of results back to original unsorted form
304304
res = re_ords.get_original(res)
305305
# Calculate average speed
306-
avg_speed = total_tokens / e2e_latency if e2e_latency > 0 else 0
306+
avg_speed = total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
307307
# Log metrics
308308
metric_dict = {
309-
"total_tokens": total_tokens,
310-
"e2e_latency": e2e_latency,
309+
"total_gen_tokens": total_tokens,
310+
"total_elapsed_time": total_elapsed_time,
311311
"avg_speed": avg_speed,
312312
}
313313
log_metrics(**metric_dict)

lmms_eval/models/chat/internvl_hf.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def _collate(x):
227227
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
228228
num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
229229
pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
230-
e2e_latency = 0
230+
total_elapsed_time = 0
231231
total_tokens = 0
232232
for chunk in chunks:
233233
ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
@@ -309,12 +309,12 @@ def _collate(x):
309309
)
310310

311311
# Calculate timing metrics
312-
e2e_latency += end_time - start_time
312+
total_elapsed_time += end_time - start_time
313313
total_tokens += sum(len(ids) for ids in generated_ids_trimmed)
314314
except Exception as e:
315315
eval_logger.error(f"Error {e} in generating")
316316
cont = ""
317-
e2e_latency += 0
317+
total_elapsed_time += 0
318318
total_tokens += 0
319319

320320
if self.accelerator.is_main_process and doc_id[0] % 100 == 0:
@@ -327,9 +327,9 @@ def _collate(x):
327327
res = re_ords.get_original(res)
328328

329329
metric_dict = {
330-
"total_tokens": total_tokens,
331-
"e2e_latency": e2e_latency,
332-
"avg_speed": total_tokens / e2e_latency if e2e_latency > 0 else 0,
330+
"total_gen_tokens": total_tokens,
331+
"total_elapsed_time": total_elapsed_time,
332+
"avg_speed": total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0,
333333
"additional_metrics": {
334334
"rank": self.rank,
335335
},

lmms_eval/models/chat/llava_hf.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _collate(x):
4141
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
4242
num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
4343
pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
44-
e2e_latency = 0
44+
total_elapsed_time = 0
4545
total_tokens = 0
4646
for chunk in chunks:
4747
ctx, doc_to_messages, all_gen_kwargs, doc_id, task, split = zip(*chunk)
@@ -100,13 +100,13 @@ def _collate(x):
100100
cont = cont[:, inputs["input_ids"].shape[-1] :]
101101

102102
# Calculate timing metrics
103-
e2e_latency += end_time - start_time
103+
total_elapsed_time += end_time - start_time
104104
total_tokens += cont.shape[-1] if len(cont.shape) > 1 else len(cont)
105105

106106
except Exception as e:
107107
eval_logger.error(f"Error {e} in generating")
108108
cont = ""
109-
e2e_latency += 0
109+
total_elapsed_time += 0
110110
total_tokens += 0
111111

112112
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0] if cont != "" else ""
@@ -121,9 +121,9 @@ def _collate(x):
121121
res = re_ords.get_original(res)
122122

123123
metric_dict = {
124-
"total_tokens": total_tokens,
125-
"e2e_latency": e2e_latency,
126-
"avg_speed": total_tokens / e2e_latency if e2e_latency > 0 else 0,
124+
"total_gen_tokens": total_tokens,
125+
"total_elapsed_time": total_elapsed_time,
126+
"avg_speed": total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0,
127127
"additional_metrics": {
128128
"rank": self.rank,
129129
},

lmms_eval/models/chat/llava_onevision1_5.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def _collate(x):
4141
num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
4242
pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
4343

44-
e2e_latency = 0.0
44+
total_elapsed_time = 0.0
4545
total_tokens = 0
4646

4747
if self.batch_size > 1:
@@ -135,7 +135,7 @@ def _collate(x):
135135
with torch.inference_mode():
136136
cont = self.model.generate(**gen_args)
137137
end_time = time.time()
138-
e2e_latency += end_time - start_time
138+
total_elapsed_time += end_time - start_time
139139

140140
# Remove prompt tokens
141141
cont = cont[:, inputs["input_ids"].shape[-1] :]
@@ -153,9 +153,9 @@ def _collate(x):
153153
res = re_ords.get_original(res)
154154

155155
metric_dict = {
156-
"total_tokens": total_tokens,
157-
"e2e_latency": e2e_latency,
158-
"avg_speed": total_tokens / e2e_latency if e2e_latency > 0 else 0,
156+
"total_gen_tokens": total_tokens,
157+
"total_elapsed_time": total_elapsed_time,
158+
"avg_speed": total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0,
159159
"additional_metrics": {
160160
"rank": self.rank,
161161
},

lmms_eval/models/chat/longvila.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def generate_until(self, requests) -> List[str]:
168168

169169
batch_size = self.batch_size_per_gpu
170170
batched_requests = [requests[i : i + batch_size] for i in range(0, len(requests), batch_size)]
171-
e2e_latency = 0
171+
total_elapsed_time = 0
172172
for batch_requests in batched_requests:
173173
prompt_embeds_list = []
174174
params_list = []
@@ -190,7 +190,7 @@ def generate_until(self, requests) -> List[str]:
190190
self.add_request_response_to_cache(req, text)
191191

192192
# Calculate timing metrics for batch
193-
e2e_latency += end_time - start_time
193+
total_elapsed_time += end_time - start_time
194194

195195
assert len(response_text) == len(batch_requests)
196196
res.extend(response_text)

0 commit comments

Comments
 (0)