From e78827e5870ed8378970b4edadab4fb52d80f666 Mon Sep 17 00:00:00 2001
From: aerdem4
Date: Mon, 16 Jun 2025 16:26:45 +0300
Subject: [PATCH] Remove cloning for non-mutating LPs to prevent run-time increase

Signed-off-by: aerdem4
---
 .../vllm/performance_profiling.ipynb          | 1902 +++++++++++++++++
 logits_processor_zoo/vllm/cite_prompt.py      |    4 -
 .../vllm/generation_length.py                 |    4 -
 logits_processor_zoo/vllm/last_phrase.py      |    2 +-
 logits_processor_zoo/vllm/multiple_choice.py  |    3 -
 logits_processor_zoo/vllm/trigger_phrase.py   |    2 +
 pyproject.toml                                |    2 +-
 7 files changed, 1906 insertions(+), 13 deletions(-)
 create mode 100644 example_notebooks/vllm/performance_profiling.ipynb

diff --git a/example_notebooks/vllm/performance_profiling.ipynb b/example_notebooks/vllm/performance_profiling.ipynb
new file mode 100644
index 0000000..f6a2f12
--- /dev/null
+++ b/example_notebooks/vllm/performance_profiling.ipynb
@@ -0,0 +1,1902 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "28ed6952",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/home/aerdem/projects/nvidia/logits-processor-zoo\n"
+     ]
+    }
+   ],
+   "source": [
+    "%cd ../.."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b89279fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO 06-16 16:23:14 [__init__.py:243] Automatically detected platform cuda.\n",
+      "INFO 06-16 16:23:16 [__init__.py:31] Available plugins for group vllm.general_plugins:\n",
+      "INFO 06-16 16:23:16 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver\n",
+      "INFO 06-16 16:23:16 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.\n",
+      "WARNING 06-16 16:23:17 [config.py:3135] Casting torch.bfloat16 to torch.float16.\n",
+      "INFO 06-16 16:23:22 [config.py:793] This model supports multiple tasks: {'generate', 'embed', 'reward', 'score', 'classify'}. Defaulting to 'generate'.\n",
+      "WARNING 06-16 16:23:22 [cuda.py:87] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used\n",
+      "INFO 06-16 16:23:22 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=None, compilation_config={\"compile_sizes\": [], \"inductor_compile_config\": {\"enable_auto_functionalized_v2\": false}, \"cudagraph_capture_sizes\": [], \"max_capture_size\": 0}, use_cached_outputs=False, \n",
+      "INFO 06-16 16:23:23 [cuda.py:292] Using Flash Attention backend.\n",
+      "INFO 06-16 16:23:24 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n",
+      "INFO 06-16 16:23:24 [model_runner.py:1170] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n",
+      "INFO 06-16 16:23:24 [weight_utils.py:291] Using model weights format ['*.safetensors']\n",
+      "INFO 06-16 16:23:24 [weight_utils.py:344] No model.safetensors.index.json found in remote.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dd472e91bc2547a79f9973db4060b706",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00

diff --git a/logits_processor_zoo/vllm/cite_prompt.py b/logits_processor_zoo/vllm/cite_prompt.py

                  scores: torch.Tensor) -> torch.Tensor:
         tokens = set(prompt_tokens_ids)
         if self.boost_eos:
diff --git a/logits_processor_zoo/vllm/generation_length.py b/logits_processor_zoo/vllm/generation_length.py
index bc5b37d..3a7bb54 100644
--- a/logits_processor_zoo/vllm/generation_length.py
+++ b/logits_processor_zoo/vllm/generation_length.py
@@ -53,10 +53,6 @@ def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], boost_factor: flo
         self.new_line_token = text_to_token(self.tokenizer, "It is a new line\n", last=True)
         self.complete_sentences = complete_sentences
 
-    def clone(self):
-        return GenLengthLogitsProcessor(self.tokenizer, self.boost_factor, self.p,
-                                        self.complete_sentences, self.boost_token_str)
-
     def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
         gen_length = len(past_token_ids)
diff --git a/logits_processor_zoo/vllm/last_phrase.py b/logits_processor_zoo/vllm/last_phrase.py
index b1fe874..bf26512 100644
--- a/logits_processor_zoo/vllm/last_phrase.py
+++ b/logits_processor_zoo/vllm/last_phrase.py
@@ -41,7 +41,7 @@ def __init__(self, phrase: str, tokenizer: Union[PreTrainedTokenizer, str]):
         self._reset()
         self.phrase = phrase
 
-    # LogitsProcessor can contain a clone attribute to deep copy it
+    # A mutable logits processor is cloned for each prompt in a batch so that prompts do not update the same object
     # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550
     def clone(self):
         return ForceLastPhraseLogitsProcessor(self.phrase, self.tokenizer)
diff --git a/logits_processor_zoo/vllm/multiple_choice.py b/logits_processor_zoo/vllm/multiple_choice.py
index dbfb050..615163e 100644
--- a/logits_processor_zoo/vllm/multiple_choice.py
+++ b/logits_processor_zoo/vllm/multiple_choice.py
@@ -57,9 +57,6 @@ def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], choices: List[str
         self.choice_tokens = [text_to_token(self.tokenizer, choice, last=False) for choice in choices]
         self.boost_first_words = boost_first_words
 
-    def clone(self):
-        return MultipleChoiceLogitsProcessor(self.tokenizer, self.choices, self.delimiter, self.boost_first_words)
-
     def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
         if self.boost_first_words:
diff --git a/logits_processor_zoo/vllm/trigger_phrase.py b/logits_processor_zoo/vllm/trigger_phrase.py
index 09fa46c..356ae7f 100644
--- a/logits_processor_zoo/vllm/trigger_phrase.py
+++ b/logits_processor_zoo/vllm/trigger_phrase.py
@@ -48,6 +48,8 @@ def __init__(self, phrase: str, trigger_token_phrase: str, tokenizer: Union[PreT
         self.trigger_after = trigger_after
         self._reset()
 
+    # A mutable logits processor is cloned for each prompt in a batch so that prompts do not update the same object
+    # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550
     def clone(self):
         return TriggerPhraseLogitsProcessor(self.phrase, self.trigger_token_phrase, self.tokenizer,
                                             self.initial_trigger_count, self.trigger_after)
diff --git a/pyproject.toml b/pyproject.toml
index e102ec5..7a9bad1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "logits-processor-zoo"
-version = "0.1.9"
+version = "0.1.10"
 description = "A collection of LogitsProcessors to customize and enhance LLM behavior for specific tasks."
 authors = ["Ahmet Erdem", "Ivan Sorokin", "Maximilian Jeblick", "Darragh Hanley", "David Austin"]
 readme = "README.md"
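
For readers skimming the diff, here is a minimal sketch (not part of the patch) of the distinction it relies on. As of the vLLM commit linked in the comments above, SamplingParams.clone() copies a logits processor for each prompt only when the processor defines a clone() method; a processor without one is shared by reference across the batch. A processor whose __call__ never writes to self can therefore drop clone() safely, which is what this patch does for the non-mutating processors, while stateful ones such as ForceLastPhraseLogitsProcessor and TriggerPhraseLogitsProcessor keep it. Both classes below are hypothetical, invented purely for illustration:

    from typing import List

    import torch


    class BanTokenLogitsProcessor:
        """Hypothetical non-mutating processor: __call__ reads but never
        writes instance state, so one object can serve every prompt in a
        batch and it defines no clone()."""

        def __init__(self, banned_token_id: int):
            self.banned_token_id = banned_token_id

        def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int],
                     scores: torch.Tensor) -> torch.Tensor:
            # Mutates only the per-step scores tensor, never self.
            scores[self.banned_token_id] = -float("inf")
            return scores


    class TokenCountdownLogitsProcessor:
        """Hypothetical mutating processor: __call__ updates self.remaining,
        so it keeps clone() and each prompt works on its own copy."""

        def __init__(self, eos_token_id: int, max_new_tokens: int):
            self.eos_token_id = eos_token_id
            self.max_new_tokens = max_new_tokens
            self.remaining = max_new_tokens

        def clone(self):
            # Fresh counter per prompt; without this, every prompt in a
            # batch would decrement the same self.remaining.
            return TokenCountdownLogitsProcessor(self.eos_token_id, self.max_new_tokens)

        def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int],
                     scores: torch.Tensor) -> torch.Tensor:
            self.remaining -= 1
            if self.remaining <= 0:
                # Allow only EOS so generation stops.
                only_eos = torch.full_like(scores, -float("inf"))
                only_eos[self.eos_token_id] = 0.0
                return only_eos
            return scores

Under this reading, sharing a read-only processor by reference saves one copy per sequence, which is where the runtime saving claimed in the commit subject comes from; the profiling notebook added by this patch measures that effect.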