|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | +# SPDX-FileCopyrightText: Copyright contributors to the Aphrodite project |
| 4 | +"""Single-request model performance benchmark. |
| 5 | +
|
| 6 | +This intentionally mirrors the compact output style of exllamav3's |
| 7 | +``eval/perf.py`` while running through Aphrodite's normal offline engine. |
| 8 | +""" |
| 9 | + |
| 10 | +import argparse |
| 11 | +import logging |
| 12 | +import os |
| 13 | +import sys |
| 14 | +import time |
| 15 | +from collections.abc import Sequence |
| 16 | +from contextlib import contextmanager |
| 17 | +from types import TracebackType |
| 18 | + |
| 19 | +import numpy as np |
| 20 | +from rich.console import Console |
| 21 | +from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn |
| 22 | + |
| 23 | +from aphrodite.inputs import TokensPrompt |
| 24 | +from aphrodite.outputs import RequestOutput |
| 25 | + |
# ANSI escape sequences used to colour the benchmark table.
ESC = "\u001b"
COL_DEFAULT = f"{ESC}[0m"  # reset attributes after a coloured span
COL_GREEN = f"{ESC}[32;1m"  # bold green: measured tokens/s values
COL_YELLOW = f"{ESC}[33;1m"  # bold yellow: section headings
| 30 | + |
| 31 | + |
@contextmanager
def _suppress_startup_logs(enabled: bool):
    """Temporarily silence Aphrodite logging and process-level stdout/stderr.

    While the context is active:

    * ``APHRODITE_LOGGING_LEVEL`` is set to ``"ERROR"``;
    * the ``aphrodite`` logger and each of its handlers are set to ``ERROR``;
    * ``logging.disable(logging.INFO)`` is in effect;
    * file descriptors 1 and 2 point at ``os.devnull``.

    All of it is undone on exit, whether the body returns or raises, and also
    when the set-up itself fails part-way through.

    Args:
        enabled: When false, the context manager does nothing at all.
    """
    if not enabled:
        yield
        return

    def flush_std_streams() -> None:
        # Python buffers text above the raw descriptors. Without a flush,
        # text written before the redirect could end up in devnull, and text
        # written during it could surface once the descriptors are restored.
        for stream in (sys.stdout, sys.stderr):
            if stream is None:
                continue
            try:
                stream.flush()
            except (OSError, ValueError):
                # Best effort: a closed or broken stream must not prevent
                # redirection or restoration.
                pass

    aphrodite_logger = logging.getLogger("aphrodite")
    previous_disable_level = logging.root.manager.disable
    previous_env_level = os.environ.get("APHRODITE_LOGGING_LEVEL")
    previous_aphrodite_level = aphrodite_logger.level
    # Remember each handler together with its level so restoration does not
    # depend on the handler list keeping the same order and length.
    previous_handler_levels = [(handler, handler.level) for handler in aphrodite_logger.handlers]

    stdout_fd = stderr_fd = devnull_fd = None
    try:
        flush_std_streams()
        stdout_fd = os.dup(1)
        stderr_fd = os.dup(2)
        devnull_fd = os.open(os.devnull, os.O_WRONLY)

        os.environ["APHRODITE_LOGGING_LEVEL"] = "ERROR"
        aphrodite_logger.setLevel(logging.ERROR)
        for handler, _ in previous_handler_levels:
            handler.setLevel(logging.ERROR)
        logging.disable(logging.INFO)
        os.dup2(devnull_fd, 1)
        os.dup2(devnull_fd, 2)
        yield
    finally:
        try:
            flush_std_streams()
            # Only undo what was actually set up; any of the descriptors may
            # still be None if an earlier step raised.
            if stdout_fd is not None:
                os.dup2(stdout_fd, 1)
                os.close(stdout_fd)
            if stderr_fd is not None:
                os.dup2(stderr_fd, 2)
                os.close(stderr_fd)
            if devnull_fd is not None:
                os.close(devnull_fd)
        finally:
            # Logging state is restored even if descriptor restoration fails.
            logging.disable(previous_disable_level)
            if previous_env_level is None:
                os.environ.pop("APHRODITE_LOGGING_LEVEL", None)
            else:
                os.environ["APHRODITE_LOGGING_LEVEL"] = previous_env_level
            aphrodite_logger.setLevel(previous_aphrodite_level)
            for handler, level in previous_handler_levels:
                handler.setLevel(level)
| 70 | + |
| 71 | + |
| 72 | +class _StartupProgress: |
| 73 | + def __init__(self, enabled: bool) -> None: |
| 74 | + self.enabled = enabled |
| 75 | + self.progress: Progress | None = None |
| 76 | + self.console_file = None |
| 77 | + |
| 78 | + def __enter__(self) -> "_StartupProgress": |
| 79 | + if self.enabled: |
| 80 | + self.console_file = os.fdopen(os.dup(sys.stdout.fileno()), "w", buffering=1) |
| 81 | + self.progress = Progress( |
| 82 | + SpinnerColumn(), |
| 83 | + TextColumn("[progress.description]{task.description}"), |
| 84 | + TimeElapsedColumn(), |
| 85 | + console=Console(file=self.console_file), |
| 86 | + transient=True, |
| 87 | + ) |
| 88 | + self.progress.start() |
| 89 | + self.progress.add_task("Loading, compiling, and capturing model...", total=None) |
| 90 | + return self |
| 91 | + |
| 92 | + def __exit__( |
| 93 | + self, |
| 94 | + exc_type: type[BaseException] | None, |
| 95 | + exc: BaseException | None, |
| 96 | + traceback: TracebackType | None, |
| 97 | + ) -> None: |
| 98 | + if self.progress is not None: |
| 99 | + if exc_type is None: |
| 100 | + for task_id in self.progress.task_ids: |
| 101 | + self.progress.update(task_id, description="Model ready") |
| 102 | + self.progress.stop() |
| 103 | + if self.console_file is not None: |
| 104 | + self.console_file.close() |
| 105 | + |
| 106 | + |
def _get_lengths(max_length: int, start: int = 256) -> list[int]:
    """Return the doubling sequence of lengths to benchmark.

    The sequence begins at ``start`` and doubles until it reaches
    ``max_length``, which is always the last element. If ``max_length`` is
    smaller than ``start``, the result is ``[max_length]`` so that no returned
    length ever exceeds the requested maximum.

    Args:
        max_length: Largest length to include; must be at least 1.
        start: First length of the sequence; must be at least 1.

    Returns:
        Strictly increasing lengths ending in ``max_length``.

    Raises:
        ValueError: If ``max_length`` or ``start`` is less than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}.")
    if start < 1:
        raise ValueError(f"start must be at least 1, got {start}.")

    length = min(start, max_length)
    lengths = [length]
    while length < max_length:
        length = min(length * 2, max_length)
        lengths.append(length)
    return lengths
| 114 | + |
| 115 | + |
def _get_vocab_size(llm) -> int:  # type: ignore[no-untyped-def]
    """Return the vocabulary size to sample random prompt token ids from.

    The tokenizer's size is ``len(tokenizer)``, or ``len(tokenizer.get_vocab())``
    when the tokenizer does not support ``len``. If the HF config exposes
    ``vocab_size``, the smaller of the two values is returned; otherwise the
    tokenizer's size is returned.
    """
    config_size = getattr(llm.model_config.hf_config, "vocab_size", None)
    tok = llm.get_tokenizer()
    try:
        tok_size = len(tok)
    except TypeError:
        # No __len__ on this tokenizer: count its vocabulary entries instead.
        tok_size = len(tok.get_vocab())

    return tok_size if config_size is None else min(config_size, tok_size)
| 127 | + |
| 128 | + |
def _make_prompt(
    rng: np.random.Generator,
    length: int,
    vocab_size: int,
) -> TokensPrompt:
    """Build a prompt of ``length`` random token ids drawn from ``[0, vocab_size)``."""
    random_ids = rng.integers(low=0, high=vocab_size, size=length, dtype=np.int64).tolist()
    return TokensPrompt(prompt_token_ids=random_ids)
| 136 | + |
| 137 | + |
def _prefill_time(output: RequestOutput, wall_time: float) -> float:
    """Return the prefill duration of a finished request.

    The duration is ``first_token_ts - scheduled_ts`` taken from
    ``output.metrics``. ``wall_time`` is returned instead when the request has
    no metrics or when that difference is zero or negative.
    """
    stats = output.metrics
    if stats is None:
        return wall_time

    elapsed = stats.first_token_ts - stats.scheduled_ts
    return wall_time if elapsed <= 0 else elapsed
| 147 | + |
| 148 | + |
def _decode_time(output: RequestOutput, wall_time: float) -> float:
    """Return the decode duration of a finished request.

    The duration is ``last_token_ts - first_token_ts`` taken from
    ``output.metrics``. ``wall_time`` is returned instead when the request has
    no metrics or when that difference is zero or negative.
    """
    stats = output.metrics
    if stats is None:
        return wall_time

    elapsed = stats.last_token_ts - stats.first_token_ts
    return wall_time if elapsed <= 0 else elapsed
| 158 | + |
| 159 | + |
def _run_generate(llm, prompt: TokensPrompt, max_tokens: int):  # type: ignore[no-untyped-def]
    """Run one request through ``llm.generate`` and time the call.

    The request uses temperature 0.0, ``top_p`` 1.0, ignores EOS, skips
    detokenization and produces at most ``max_tokens`` tokens.

    Returns:
        A ``(output, seconds)`` tuple: the first element of the list returned
        by ``llm.generate`` and the wall-clock duration of that call.
    """
    from aphrodite import SamplingParams

    params = SamplingParams(
        max_tokens=max_tokens,
        temperature=0.0,
        top_p=1.0,
        ignore_eos=True,
        detokenize=False,
    )
    started_at = time.perf_counter()
    request_outputs = llm.generate(prompt, sampling_params=params, use_tqdm=False)
    finished_at = time.perf_counter()
    return request_outputs[0], finished_at - started_at
| 174 | + |
| 175 | + |
def _measure_prefill(
    args: argparse.Namespace,
    llm,
    lengths: Sequence[int],
    rng: np.random.Generator,
    vocab_size: int,
    *,
    warmup: bool = False,
) -> dict[int, float]:
    """Measure prefill throughput for each prompt length.

    Each length gets a fresh random prompt and a one-token request; the
    throughput is the prompt length divided by the prefill time.

    Args:
        args: Parsed CLI arguments (not read by this function).
        llm: Engine used to run the requests.
        lengths: Prompt lengths to measure, in order.
        rng: Random generator used to build the prompts.
        vocab_size: Exclusive upper bound for the random token ids.
        warmup: When true, nothing is printed.

    Returns:
        Mapping from prompt length to prompt tokens per second.
    """
    throughput_by_length: dict[int, float] = {}
    for length in lengths:
        request_output, elapsed = _run_generate(
            llm,
            _make_prompt(rng, length, vocab_size),
            max_tokens=1,
        )
        tokens_per_second = length / _prefill_time(request_output, elapsed)
        throughput_by_length[length] = tokens_per_second
        if warmup:
            continue
        print(f"Length {length:6}: {COL_GREEN}{tokens_per_second:10.2f}{COL_DEFAULT} tokens/s", flush=True)
    return throughput_by_length
| 194 | + |
| 195 | + |
def _measure_generate(
    args: argparse.Namespace,
    llm,
    contexts: Sequence[int],
    rng: np.random.Generator,
    vocab_size: int,
    *,
    warmup: bool = False,
) -> dict[int, float]:
    """Measure decode throughput at each context length.

    Args:
        args: Parsed CLI arguments; ``gen_tokens`` is the number of decode
            steps to measure.
        llm: Engine used to run the requests.
        contexts: Context lengths to measure, in order.
        rng: Random generator used to build the prompts.
        vocab_size: Exclusive upper bound for the random token ids.
        warmup: When true, nothing is printed.

    Returns:
        Mapping from context length to generated tokens per second.
    """
    throughput_by_context: dict[int, float] = {}
    for context_len in contexts:
        # Aphrodite needs at least one prompt token, so context 0 is run with
        # a single-token prompt but still reported as 0 to match the output
        # of exllamav3's eval/perf.py.
        request_prompt = _make_prompt(rng, max(context_len, 1), vocab_size)

        # The prefill step already yields the first generated token. One
        # extra token is requested so that the decode interval spans exactly
        # args.gen_tokens decode steps.
        request_output, elapsed = _run_generate(
            llm,
            request_prompt,
            max_tokens=args.gen_tokens + 1,
        )
        tokens_per_second = args.gen_tokens / _decode_time(request_output, elapsed)
        throughput_by_context[context_len] = tokens_per_second
        if warmup:
            continue
        print(f"Context {context_len:6}: {COL_GREEN}{tokens_per_second:10.2f}{COL_DEFAULT} tokens/s", flush=True)
    return throughput_by_context
| 221 | + |
| 222 | + |
def add_cli_args(parser: argparse.ArgumentParser) -> None:
    """Register the benchmark options, then the engine options, on ``parser``.

    The benchmark's own options are added first, in the order listed below.
    ``EngineArgs.add_cli_args`` then adds the engine options, and finally the
    defaults for ``enable_prefix_caching`` and ``disable_log_stats`` are both
    set to ``False``.
    """
    benchmark_options = (
        (
            ("model_tag",),
            dict(nargs="?", help="Model name or path. Equivalent to --model for this benchmark."),
        ),
        (
            ("-m", "--model-dir"),
            dict(dest="model_dir", help="Model name or path, matching exllamav3 eval/perf.py."),
        ),
        (
            ("-max_length", "--max-length"),
            dict(type=int, default=32768, help="Max context length to measure."),
        ),
        (
            ("-chunk_size", "--chunk-size"),
            dict(type=int, default=4096, help="Chunk size used for the default max-num-batched-tokens."),
        ),
        (
            ("--gen-tokens",),
            dict(type=int, default=100, help="Number of decode tokens to measure for each context length."),
        ),
        (
            ("-spf", "--skip-prefill"),
            dict(action="store_true", help="Skip measuring prefill speed."),
        ),
        (
            ("-swu", "--skip-warmup"),
            dict(action="store_true", help="Skip warmup passes."),
        ),
        (
            ("--show-startup-logs",),
            dict(
                action="store_true",
                help="Show normal Aphrodite engine startup logs instead of the compact startup progress line.",
            ),
        ),
    )
    for option_names, option_settings in benchmark_options:
        parser.add_argument(*option_names, **option_settings)

    # Building engine CLI args can initialize parts of the platform layer.
    # Keep that quiet so `bench perf` has a compact exllamav3-like surface.
    with _suppress_startup_logs(enabled=True):
        from aphrodite.engine.arg_utils import EngineArgs

        EngineArgs.add_cli_args(parser)
    parser.set_defaults(enable_prefix_caching=False, disable_log_stats=False)
| 279 | + |
| 280 | + |
def main(args: argparse.Namespace) -> None:
    """Run the prefill and generation throughput benchmark.

    The model is taken from ``args.model_dir``, ``args.model_tag`` or
    ``args.model``, in that order of precedence. The engine is built from the
    CLI arguments, with ``max_num_batched_tokens`` defaulting to
    ``args.chunk_size`` when unset. Prefill throughput is measured first
    (unless ``--skip-prefill``), then generation throughput; each measured
    pass is preceded by an unprinted warmup pass unless ``--skip-warmup``.

    Args:
        args: Parsed arguments from a parser prepared by ``add_cli_args``.

    Raises:
        ValueError: If no model was given, or if the engine's
            ``max_model_len`` is smaller than
            ``max_length + gen_tokens + 1``.
    """
    model = args.model_dir or args.model_tag or args.model
    if model is None:
        raise ValueError("aphrodite bench perf requires a model via MODEL, -m, or --model.")
    args.model = model

    if args.max_num_batched_tokens is None:
        args.max_num_batched_tokens = args.chunk_size

    # Keep request-level timing metrics enabled, but avoid interleaving the
    # normal "Request completed" log line with the perf.py-style table.
    logging.getLogger("aphrodite.v1.metrics.loggers").setLevel(logging.WARNING)

    hide_startup_logs = not args.show_startup_logs
    # The spinner context is entered first so that it duplicates stdout
    # before the inner context redirects descriptors 1 and 2.
    with _StartupProgress(enabled=hide_startup_logs):
        with _suppress_startup_logs(enabled=hide_startup_logs):
            from aphrodite import LLM
            from aphrodite.engine.arg_utils import EngineArgs

            engine_args = EngineArgs.from_cli_args(args)
            llm = LLM.from_engine_args(engine_args)

    # The longest request is a max_length prompt plus gen_tokens + 1
    # generated tokens. This is a user-input check, so raise explicitly
    # rather than assert: assertions are removed under `python -O`.
    max_required_len = args.max_length + args.gen_tokens + 1
    if llm.llm_engine.model_config.max_model_len < max_required_len:
        raise ValueError(
            f"Please ensure max_model_len is at least {max_required_len} tokens for this benchmark."
        )

    vocab_size = _get_vocab_size(llm)
    rng = np.random.default_rng(args.seed)

    print(f" -- Model: {model}", flush=True)
    print(f" -- Chunk size: {args.chunk_size}", flush=True)
    print(flush=True)

    prefill_lengths = _get_lengths(args.max_length)
    generate_contexts = [0] + _get_lengths(max(args.max_length - 256, 256))

    if not args.skip_prefill:
        if not args.skip_warmup:
            warmup_prefill_lengths = _get_lengths(min(args.chunk_size, args.max_length))
            _measure_prefill(args, llm, warmup_prefill_lengths, rng, vocab_size, warmup=True)

        print(f"{COL_YELLOW}Prefill:{COL_DEFAULT}", flush=True)
        _measure_prefill(args, llm, prefill_lengths, rng, vocab_size)
        print(flush=True)

    if not args.skip_warmup:
        warmup_generate_contexts = [0] + _get_lengths(min(args.chunk_size, args.max_length))
        _measure_generate(args, llm, warmup_generate_contexts, rng, vocab_size, warmup=True)

    print(f"{COL_YELLOW}Generation{COL_DEFAULT}", flush=True)
    _measure_generate(args, llm, generate_contexts, rng, vocab_size)
    print(flush=True)
    del llm
0 commit comments