Skip to content

Commit e03dafe

Browse files
authored
feat: add perf benchmark script (#1645)
Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent 0927708 commit e03dafe

3 files changed

Lines changed: 364 additions & 1 deletion

File tree

aphrodite/benchmarks/perf.py

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
# SPDX-FileCopyrightText: Copyright contributors to the Aphrodite project
4+
"""Single-request model performance benchmark.
5+
6+
This intentionally mirrors the compact output style of exllamav3's
7+
``eval/perf.py`` while running through Aphrodite's normal offline engine.
8+
"""
9+
10+
import argparse
11+
import logging
12+
import os
13+
import sys
14+
import time
15+
from collections.abc import Sequence
16+
from contextlib import contextmanager
17+
from types import TracebackType
18+
19+
import numpy as np
20+
from rich.console import Console
21+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
22+
23+
from aphrodite.inputs import TokensPrompt
24+
from aphrodite.outputs import RequestOutput
25+
26+
# ANSI escape sequences used to colour the benchmark table.
ESC = "\u001b"
COL_DEFAULT = ESC + "[0m"  # reset all attributes
COL_GREEN = ESC + "[32;1m"  # bold green, used for throughput numbers
COL_YELLOW = ESC + "[33;1m"  # bold yellow, used for section headings
30+
31+
32+
@contextmanager
def _suppress_startup_logs(enabled: bool):
    """Temporarily silence stdout/stderr and Aphrodite logging.

    While the context is active (and ``enabled`` is true):

    * file descriptors 1 and 2 are redirected to ``os.devnull``;
    * ``APHRODITE_LOGGING_LEVEL`` is set to ``"ERROR"`` in ``os.environ``;
    * the ``"aphrodite"`` logger and each of its handlers are raised to
      ``logging.ERROR``;
    * ``logging.disable(logging.INFO)`` is applied process-wide.

    Everything is put back on exit, including when the body raises.

    Args:
        enabled: When false the context manager does nothing.
    """
    if not enabled:
        yield
        return

    def _flush_std_streams() -> None:
        # Drain Python-level buffers so text written on one side of the
        # descriptor swap cannot surface on the other side.
        for stream in (sys.stdout, sys.stderr):
            if stream is None:
                continue
            try:
                stream.flush()
            except (OSError, ValueError):
                # Closed or broken stream: there is nothing left to flush.
                pass

    previous_disable_level = logging.root.manager.disable
    previous_env_level = os.environ.get("APHRODITE_LOGGING_LEVEL")
    aphrodite_logger = logging.getLogger("aphrodite")
    previous_aphrodite_level = aphrodite_logger.level
    # Remember each handler together with its level. Restoring by list
    # position would touch the wrong handler if the handler list changes
    # while the context is active.
    previous_handler_levels = [(handler, handler.level) for handler in aphrodite_logger.handlers]

    # -1 marks "not acquired" so the cleanup below only releases what exists.
    stdout_fd = stderr_fd = devnull_fd = -1
    try:
        _flush_std_streams()
        stdout_fd = os.dup(1)
        stderr_fd = os.dup(2)
        devnull_fd = os.open(os.devnull, os.O_WRONLY)

        os.environ["APHRODITE_LOGGING_LEVEL"] = "ERROR"
        aphrodite_logger.setLevel(logging.ERROR)
        for handler in aphrodite_logger.handlers:
            handler.setLevel(logging.ERROR)
        logging.disable(logging.INFO)
        os.dup2(devnull_fd, 1)
        os.dup2(devnull_fd, 2)
        yield
    finally:
        # Anything still buffered was produced while suppressed; flush it
        # into devnull before the real descriptors come back.
        _flush_std_streams()
        if stdout_fd >= 0:
            os.dup2(stdout_fd, 1)
            os.close(stdout_fd)
        if stderr_fd >= 0:
            os.dup2(stderr_fd, 2)
            os.close(stderr_fd)
        if devnull_fd >= 0:
            os.close(devnull_fd)
        logging.disable(previous_disable_level)
        if previous_env_level is None:
            os.environ.pop("APHRODITE_LOGGING_LEVEL", None)
        else:
            os.environ["APHRODITE_LOGGING_LEVEL"] = previous_env_level
        aphrodite_logger.setLevel(previous_aphrodite_level)
        for handler, level in previous_handler_levels:
            handler.setLevel(level)
70+
71+
72+
class _StartupProgress:
73+
def __init__(self, enabled: bool) -> None:
74+
self.enabled = enabled
75+
self.progress: Progress | None = None
76+
self.console_file = None
77+
78+
def __enter__(self) -> "_StartupProgress":
79+
if self.enabled:
80+
self.console_file = os.fdopen(os.dup(sys.stdout.fileno()), "w", buffering=1)
81+
self.progress = Progress(
82+
SpinnerColumn(),
83+
TextColumn("[progress.description]{task.description}"),
84+
TimeElapsedColumn(),
85+
console=Console(file=self.console_file),
86+
transient=True,
87+
)
88+
self.progress.start()
89+
self.progress.add_task("Loading, compiling, and capturing model...", total=None)
90+
return self
91+
92+
def __exit__(
93+
self,
94+
exc_type: type[BaseException] | None,
95+
exc: BaseException | None,
96+
traceback: TracebackType | None,
97+
) -> None:
98+
if self.progress is not None:
99+
if exc_type is None:
100+
for task_id in self.progress.task_ids:
101+
self.progress.update(task_id, description="Model ready")
102+
self.progress.stop()
103+
if self.console_file is not None:
104+
self.console_file.close()
105+
106+
107+
def _get_lengths(max_length: int) -> list[int]:
    """Return the doubling ladder of lengths to benchmark, capped at ``max_length``.

    The ladder starts at 256 and doubles until it reaches ``max_length``,
    which is always the final entry. If ``max_length`` is below 256 the
    ladder is just ``[max_length]``, so no entry ever exceeds the cap.

    Args:
        max_length: Largest length to include; must be at least 1.

    Returns:
        Strictly increasing lengths ending in ``max_length``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}.")

    # Clamp the starting point so a small cap is not overshot by the first rung.
    length = min(256, max_length)
    lengths = [length]
    while length < max_length:
        length = min(length * 2, max_length)
        lengths.append(length)
    return lengths
114+
115+
116+
def _get_vocab_size(llm) -> int:  # type: ignore[no-untyped-def]
    """Return the number of token ids that are safe to sample from.

    This is the smaller of the ``vocab_size`` on the model's HF config and
    the tokenizer's size. If the config has no ``vocab_size`` (or it is
    ``None``), the tokenizer's size is used alone.
    """
    configured_size = getattr(llm.model_config.hf_config, "vocab_size", None)
    tok = llm.get_tokenizer()
    try:
        tok_size = len(tok)
    except TypeError:
        # Tokenizer object without __len__: count its vocabulary mapping.
        tok_size = len(tok.get_vocab())

    return tok_size if configured_size is None else min(configured_size, tok_size)
127+
128+
129+
def _make_prompt(
    rng: np.random.Generator,
    length: int,
    vocab_size: int,
) -> TokensPrompt:
    """Build a synthetic prompt of ``length`` random token ids.

    Ids are drawn from ``rng`` as int64 values in ``[0, vocab_size)``.
    """
    sampled = rng.integers(low=0, high=vocab_size, size=length, dtype=np.int64)
    ids = sampled.tolist()
    return TokensPrompt(prompt_token_ids=ids)
136+
137+
138+
def _prefill_time(output: RequestOutput, wall_time: float) -> float:
    """Return the time to use as the prefill duration for ``output``.

    The value is ``metrics.first_token_ts - metrics.scheduled_ts``.
    ``wall_time`` is returned instead when the output carries no metrics or
    when that difference is zero or negative.
    """
    stats = output.metrics
    if stats is None:
        return wall_time

    elapsed = stats.first_token_ts - stats.scheduled_ts
    return wall_time if elapsed <= 0 else elapsed
147+
148+
149+
def _decode_time(output: RequestOutput, wall_time: float) -> float:
    """Return the time to use as the decode duration for ``output``.

    The value is ``metrics.last_token_ts - metrics.first_token_ts``.
    ``wall_time`` is returned instead when the output carries no metrics or
    when that difference is zero or negative.
    """
    stats = output.metrics
    if stats is None:
        return wall_time

    elapsed = stats.last_token_ts - stats.first_token_ts
    return wall_time if elapsed <= 0 else elapsed
158+
159+
160+
def _run_generate(llm, prompt: TokensPrompt, max_tokens: int):  # type: ignore[no-untyped-def]
    """Run one greedy generation and time it.

    Sampling uses temperature 0, ignores EOS so exactly ``max_tokens`` are
    requested, and skips detokenization.

    Returns:
        A ``(first_output, wall_seconds)`` tuple, where ``wall_seconds`` is
        the ``time.perf_counter`` interval around ``llm.generate``.
    """
    from aphrodite import SamplingParams

    sampling_options = {
        "temperature": 0.0,
        "top_p": 1.0,
        "ignore_eos": True,
        "max_tokens": max_tokens,
        "detokenize": False,
    }
    params = SamplingParams(**sampling_options)

    started = time.perf_counter()
    generated = llm.generate(prompt, sampling_params=params, use_tqdm=False)
    finished = time.perf_counter()
    return generated[0], finished - started
174+
175+
176+
def _measure_prefill(
    args: argparse.Namespace,
    llm,
    lengths: Sequence[int],
    rng: np.random.Generator,
    vocab_size: int,
    *,
    warmup: bool = False,
) -> dict[int, float]:
    """Measure prefill throughput for each prompt length.

    Each length gets a fresh random prompt and a single generated token.
    Throughput is the prompt length divided by the prefill time.

    Args:
        args: Parsed CLI namespace (not read by this function).
        llm: Engine object handed to ``_run_generate``.
        lengths: Prompt lengths to measure, in order.
        rng: Random generator used to build prompts.
        vocab_size: Exclusive upper bound for sampled token ids.
        warmup: When true, results are computed but not printed.

    Returns:
        Mapping of prompt length to tokens per second.
    """
    throughput: dict[int, float] = {}
    for length in lengths:
        output, wall_time = _run_generate(llm, _make_prompt(rng, length, vocab_size), max_tokens=1)
        tokens_per_second = length / _prefill_time(output, wall_time)
        throughput[length] = tokens_per_second
        if warmup:
            continue
        print(f"Length {length:6}: {COL_GREEN}{tokens_per_second:10.2f}{COL_DEFAULT} tokens/s", flush=True)
    return throughput
194+
195+
196+
def _measure_generate(
    args: argparse.Namespace,
    llm,
    contexts: Sequence[int],
    rng: np.random.Generator,
    vocab_size: int,
    *,
    warmup: bool = False,
) -> dict[int, float]:
    """Measure decode throughput at each context length.

    Args:
        args: Parsed CLI namespace; ``args.gen_tokens`` is the number of
            decode tokens timed per context.
        llm: Engine object handed to ``_run_generate``.
        contexts: Context lengths to measure, in order.
        rng: Random generator used to build prompts.
        vocab_size: Exclusive upper bound for sampled token ids.
        warmup: When true, results are computed but not printed.

    Returns:
        Mapping of context length to decode tokens per second.
    """
    throughput: dict[int, float] = {}
    # One more token than gen_tokens is requested: the first generated token
    # comes out of the prefill step, so the decode interval then spans
    # args.gen_tokens decode steps.
    requested_tokens = args.gen_tokens + 1
    for context_len in contexts:
        # Aphrodite needs at least one prompt token, so context 0 runs with a
        # one-token prompt but keeps the label 0 to match exllamav3's
        # eval/perf.py output.
        prompt = _make_prompt(rng, max(context_len, 1), vocab_size)
        output, wall_time = _run_generate(llm, prompt, max_tokens=requested_tokens)
        tokens_per_second = args.gen_tokens / _decode_time(output, wall_time)
        throughput[context_len] = tokens_per_second
        if warmup:
            continue
        print(f"Context {context_len:6}: {COL_GREEN}{tokens_per_second:10.2f}{COL_DEFAULT} tokens/s", flush=True)
    return throughput
221+
222+
223+
def add_cli_args(parser: argparse.ArgumentParser) -> None:
    """Register the perf benchmark options, then the engine options, on ``parser``.

    Benchmark-specific arguments are added first, in the order listed below.
    ``EngineArgs.add_cli_args`` follows, and finally the defaults for
    ``enable_prefix_caching`` and ``disable_log_stats`` are both set to
    ``False``.
    """
    benchmark_options = (
        (
            ("model_tag",),
            {"nargs": "?", "help": "Model name or path. Equivalent to --model for this benchmark."},
        ),
        (
            ("-m", "--model-dir"),
            {"dest": "model_dir", "help": "Model name or path, matching exllamav3 eval/perf.py."},
        ),
        (
            ("-max_length", "--max-length"),
            {"type": int, "default": 32768, "help": "Max context length to measure."},
        ),
        (
            ("-chunk_size", "--chunk-size"),
            {"type": int, "default": 4096, "help": "Chunk size used for the default max-num-batched-tokens."},
        ),
        (
            ("--gen-tokens",),
            {"type": int, "default": 100, "help": "Number of decode tokens to measure for each context length."},
        ),
        (
            ("-spf", "--skip-prefill"),
            {"action": "store_true", "help": "Skip measuring prefill speed."},
        ),
        (
            ("-swu", "--skip-warmup"),
            {"action": "store_true", "help": "Skip warmup passes."},
        ),
        (
            ("--show-startup-logs",),
            {
                "action": "store_true",
                "help": "Show normal Aphrodite engine startup logs instead of the compact startup progress line.",
            },
        ),
    )
    for flags, options in benchmark_options:
        parser.add_argument(*flags, **options)

    # Building engine CLI args can initialize parts of the platform layer.
    # Keep that quiet so `bench perf` has a compact exllamav3-like surface.
    with _suppress_startup_logs(enabled=True):
        from aphrodite.engine.arg_utils import EngineArgs

        EngineArgs.add_cli_args(parser)
    parser.set_defaults(enable_prefix_caching=False, disable_log_stats=False)
279+
280+
281+
def main(args: argparse.Namespace) -> None:
    """Run the single-request prefill and decode benchmark.

    The model is taken from ``-m/--model-dir``, then the positional
    ``model_tag``, then ``--model``. If ``max_num_batched_tokens`` was not
    given it defaults to ``args.chunk_size``. The engine is built from the
    CLI arguments, and prefill and generation throughput tables are printed
    (each preceded by a silent warmup pass unless ``--skip-warmup`` is set).

    Args:
        args: Namespace produced by a parser configured with ``add_cli_args``.

    Raises:
        ValueError: If no model was specified, or if the engine's
            ``max_model_len`` is smaller than
            ``args.max_length + args.gen_tokens + 1``.
    """
    model = args.model_dir or args.model_tag or args.model
    if model is None:
        raise ValueError("aphrodite bench perf requires a model via MODEL, -m, or --model.")
    args.model = model

    if args.max_num_batched_tokens is None:
        args.max_num_batched_tokens = args.chunk_size

    # Keep request-level timing metrics enabled, but avoid interleaving the
    # normal "Request completed" log line with the perf.py-style table.
    logging.getLogger("aphrodite.v1.metrics.loggers").setLevel(logging.WARNING)

    hide_startup_logs = not args.show_startup_logs
    # The progress display is entered first so it can duplicate stdout before
    # the log suppressor redirects the descriptor.
    with _StartupProgress(enabled=hide_startup_logs), _suppress_startup_logs(enabled=hide_startup_logs):
        from aphrodite import LLM
        from aphrodite.engine.arg_utils import EngineArgs

        engine_args = EngineArgs.from_cli_args(args)
        llm = LLM.from_engine_args(engine_args)

    max_required_len = args.max_length + args.gen_tokens + 1
    # Raise explicitly rather than assert: assertions are stripped under
    # `python -O`, which would silently skip this configuration check.
    if llm.llm_engine.model_config.max_model_len < max_required_len:
        raise ValueError(f"Please ensure max_model_len is at least {max_required_len} tokens for this benchmark.")

    vocab_size = _get_vocab_size(llm)
    rng = np.random.default_rng(args.seed)

    print(f" -- Model: {model}", flush=True)
    print(f" -- Chunk size: {args.chunk_size}", flush=True)
    print(flush=True)

    prefill_lengths = _get_lengths(args.max_length)
    generate_contexts = [0] + _get_lengths(max(args.max_length - 256, 256))

    if not args.skip_prefill:
        if not args.skip_warmup:
            warmup_prefill_lengths = _get_lengths(min(args.chunk_size, args.max_length))
            _measure_prefill(args, llm, warmup_prefill_lengths, rng, vocab_size, warmup=True)

        print(f"{COL_YELLOW}Prefill:{COL_DEFAULT}", flush=True)
        _measure_prefill(args, llm, prefill_lengths, rng, vocab_size)
        print(flush=True)

    if not args.skip_warmup:
        warmup_generate_contexts = [0] + _get_lengths(min(args.chunk_size, args.max_length))
        _measure_generate(args, llm, warmup_generate_contexts, rng, vocab_size, warmup=True)

    print(f"{COL_YELLOW}Generation{COL_DEFAULT}", flush=True)
    _measure_generate(args, llm, generate_contexts, rng, vocab_size)
    print(flush=True)
    del llm

aphrodite/entrypoints/cli/benchmark/main.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

44
import argparse
5+
import logging
56
import sys
67
import typing
78

@@ -20,6 +21,7 @@ def _import_bench_subcommand_modules() -> None:
2021
# when `aphrodite bench` is actually invoked.
2122
import aphrodite.entrypoints.cli.benchmark.latency # noqa: F401
2223
import aphrodite.entrypoints.cli.benchmark.mm_processor # noqa: F401
24+
import aphrodite.entrypoints.cli.benchmark.perf # noqa: F401
2325
import aphrodite.entrypoints.cli.benchmark.serve # noqa: F401
2426
import aphrodite.entrypoints.cli.benchmark.startup # noqa: F401
2527
import aphrodite.entrypoints.cli.benchmark.sweep # noqa: F401
@@ -55,7 +57,12 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu
5557
# before the subcommand don't break detection.
5658
first_positional = next((arg for arg in sys.argv[1:] if not arg.startswith("-")), None)
5759
if first_positional == self.name:
58-
_import_bench_subcommand_modules()
60+
previous_disable_level = logging.root.manager.disable
61+
logging.disable(logging.INFO)
62+
try:
63+
_import_bench_subcommand_modules()
64+
finally:
65+
logging.disable(previous_disable_level)
5966
for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
6067
cmd_subparser = bench_subparsers.add_parser(
6168
cmd_cls.name,
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
# SPDX-FileCopyrightText: Copyright contributors to the Aphrodite project
4+
import argparse
5+
6+
from aphrodite.benchmarks.perf import add_cli_args, main
7+
from aphrodite.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
8+
9+
10+
class BenchmarkPerfSubcommand(BenchmarkSubcommandBase):
    """The `perf` subcommand for `aphrodite bench`.

    Thin adapter: both hooks delegate to the functions imported from
    ``aphrodite.benchmarks.perf``.
    """

    name = "perf"
    help = "Benchmark single-request prefill and decode throughput."

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Register the perf benchmark's arguments on ``parser``."""
        # Inside the method body the bare name resolves to the module-level
        # import, not to this classmethod, so this is not a recursive call.
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the perf benchmark with the parsed ``args``."""
        main(args)

0 commit comments

Comments
 (0)