From d3ef529cf64949614e5438230ae099120a181d88 Mon Sep 17 00:00:00 2001 From: David Schulmeister Date: Thu, 16 Apr 2026 11:22:58 +0200 Subject: [PATCH 1/3] Fix BART processor compatibility with vLLM 0.18 The multimodal processor in bart.py broke under vLLM 0.18 in three places: - TextDataParser relied on MultiModalDataParser._is_empty, which was removed in 0.18. Replaced with inline emptiness checks for str and list. - create_encoder_prompt previously tokenized `prompt` as the encoder text. In 0.18 `inputs.prompt` passed to this method is the DECODER prompt text, not the encoder text (the encoder content lives in mm_data). The method now returns a single [0] placeholder token; _get_prompt_updates replaces it with the correct number of encoder token slots during rendering. - _call_hf_processor is now sometimes called with an already-tokenized decoder prompt (list[int]) instead of a str. Handle both cases when building result["input_ids"]. Adds tests/test_vllm_018_compat.py with three narrow unit tests covering each of these paths; no GPU required. Signed-off-by: David Schulmeister --- tests/test_vllm_018_compat.py | 47 +++++++++++++++++++++++++++++++++++ vllm_bart_plugin/bart.py | 41 ++++++++++++++++-------------- 2 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 tests/test_vllm_018_compat.py diff --git a/tests/test_vllm_018_compat.py b/tests/test_vllm_018_compat.py new file mode 100644 index 0000000..921ffba --- /dev/null +++ b/tests/test_vllm_018_compat.py @@ -0,0 +1,47 @@ +"""Regression tests for vLLM 0.18 compatibility in the BART processor.""" + +import torch + + +def test_text_data_parser_handles_v018_empty_inputs(): + from vllm_bart_plugin.bart import TextDataParser + + parser = TextDataParser() + + assert parser._parse_text_data("") is None + assert parser._parse_text_data([]) is None + + +def test_create_encoder_prompt_uses_placeholder_token(): + from vllm_bart_plugin.bart import BartMultiModalProcessor + + processor = BartMultiModalProcessor.__new__(BartMultiModalProcessor) + + assert processor.create_encoder_prompt("decoder text", {"texts": ["encoder text"]}) == [0] + + +def test_call_hf_processor_accepts_pretokenized_decoder_prompt(): + from vllm_bart_plugin.bart import BartMultiModalProcessor + + class FakeTokenizer: + def __call__(self, text, return_tensors="pt", **kwargs): + if text == "encoder text": + return {"input_ids": torch.tensor([[11, 12, 13]])} + return {"input_ids": torch.tensor([[21, 22]])} + + class FakeInfo: + def get_tokenizer(self): + return FakeTokenizer() + + processor = BartMultiModalProcessor.__new__(BartMultiModalProcessor) + processor.info = FakeInfo() + + out = processor._call_hf_processor( + [7, 8, 9], + {"texts": ["encoder text"]}, + {}, + {}, + ) + + assert torch.equal(out["encoder_input_ids"], torch.tensor([[11, 12, 13]])) + assert torch.equal(out["input_ids"], torch.tensor([[7, 8, 9]])) diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py index fcced7d..d79777c 100644 --- a/vllm_bart_plugin/bart.py +++ b/vllm_bart_plugin/bart.py @@ -996,7 +996,10 @@ def _parse_text_data( if data is None: return TextProcessorItems(None) - if self._is_empty(data): + # _is_empty was removed in vLLM >=0.18; handle emptiness inline + if isinstance(data, str) and not data: + return None + if isinstance(data, list) and len(data) == 0: return None # Text data should be a string or list of strings @@ -1030,15 +1033,11 @@ def create_encoder_prompt( prompt: str | list[int], mm_data: MultiModalDataDict, ) -> str | list[int]: - if not prompt: - return [0] - tokenizer = self.info.get_tokenizer() - tokens = tokenizer( - prompt, - add_special_tokens=False, - return_tensors="pt", - )["input_ids"].flatten() - return tokens.tolist() + # In vLLM >=0.18, `prompt` here is the DECODER prompt text, not the + # encoder text. The encoder content lives in mm_data ("text" key). + # Always return [0] as a single placeholder token; _get_prompt_updates + # will replace it with the correct number of encoder token slots. + return [0] def create_decoder_prompt( self, @@ -1079,14 +1078,20 @@ def _call_hf_processor( ) result["encoder_input_ids"] = encoder_tokenized["input_ids"] - # Always tokenize the prompt (for decoder or as dummy) - # This will be popped by the base class - prompt_tokenized = tokenizer( - prompt if prompt else "", - return_tensors="pt", - **tok_kwargs, - ) - result["input_ids"] = prompt_tokenized["input_ids"] + # Always produce input_ids for the decoder prompt. + # In vLLM >=0.18 the rendering pipeline may call _call_hf_processor + # with an already-tokenized prompt (a list of ints) instead of a str. + # Handle both cases. + import torch as _torch + if isinstance(prompt, (list, tuple)) and len(prompt) > 0 and isinstance(prompt[0], int): + result["input_ids"] = _torch.tensor([prompt]) + else: + prompt_tokenized = tokenizer( + prompt if prompt else "", + return_tensors="pt", + **tok_kwargs, + ) + result["input_ids"] = prompt_tokenized["input_ids"] return BatchFeature(result) From d656d7c9795b7b678d1dd12b51453192e1a8cce5 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 30 Apr 2026 07:32:24 +0000 Subject: [PATCH 2/3] remove _torch import Signed-off-by: NickLucche --- vllm_bart_plugin/bart.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py index d79777c..62cca02 100644 --- a/vllm_bart_plugin/bart.py +++ b/vllm_bart_plugin/bart.py @@ -1082,9 +1082,8 @@ def _call_hf_processor( # In vLLM >=0.18 the rendering pipeline may call _call_hf_processor # with an already-tokenized prompt (a list of ints) instead of a str. # Handle both cases. - import torch as _torch if isinstance(prompt, (list, tuple)) and len(prompt) > 0 and isinstance(prompt[0], int): - result["input_ids"] = _torch.tensor([prompt]) + result["input_ids"] = torch.tensor([prompt]) else: prompt_tokenized = tokenizer( prompt if prompt else "", From 8d34ba48f3fa15f7f61c5f14c02a15a7d3ec30e8 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 30 Apr 2026 12:30:29 +0000 Subject: [PATCH 3/3] add_special_tokens=False consistent use Signed-off-by: NickLucche --- vllm_bart_plugin/bart.py | 67 ++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py index 62cca02..41e0bb5 100644 --- a/vllm_bart_plugin/bart.py +++ b/vllm_bart_plugin/bart.py @@ -945,6 +945,21 @@ def get_data_parser(self) -> MultiModalDataParser: return TextDataParser() +# vLLM >=0.18 moved tokenization defaults from a global enc-dec override +# (InputPreprocessor._get_tokenization_kw) into per-model ProcessingInfo. +# The old code forced add_special_tokens=False for every is_encoder_decoder +# model; replicate that here so the renderer does not inject extra BOS/EOS +# into the decoder prompt. On vLLM <0.18 the method does not exist on the +# base class and is not needed (the global override handles it). +if hasattr(BaseProcessingInfo, "get_default_tok_params"): + + def _bart_get_default_tok_params(self): + return super(BartProcessingInfo, self).get_default_tok_params() \ + .with_kwargs(add_special_tokens=False) + + BartProcessingInfo.get_default_tok_params = _bart_get_default_tok_params # type: ignore[attr-defined] + + class BartDummyInputsBuilder(BaseDummyInputsBuilder[BartProcessingInfo]): """Builds dummy inputs for profiling BART models.""" @@ -993,14 +1008,9 @@ def _parse_text_data( data: ModalityData[str], ) -> ModalityDataItems[Any, Any] | None: """Parse text data for BART.""" - if data is None: - return TextProcessorItems(None) - # _is_empty was removed in vLLM >=0.18; handle emptiness inline - if isinstance(data, str) and not data: - return None - if isinstance(data, list) and len(data) == 0: - return None + if data is None or not len(data): + return TextProcessorItems(None) # Text data should be a string or list of strings if isinstance(data, str) or is_list_of(data, str): @@ -1033,10 +1043,22 @@ def create_encoder_prompt( prompt: str | list[int], mm_data: MultiModalDataDict, ) -> str | list[int]: - # In vLLM >=0.18, `prompt` here is the DECODER prompt text, not the - # encoder text. The encoder content lives in mm_data ("text" key). - # Always return [0] as a single placeholder token; _get_prompt_updates - # will replace it with the correct number of encoder token slots. + # vLLM compatibility: + # - Legacy (<0.18): prompt is encoder text (str) — tokenize directly. + # - Modern (>=0.18): prompt is decoder token IDs or empty str from + # profiling — return a single [0] placeholder that _get_prompt_updates + # will expand to the real encoder token count. The placeholder IDs + # are structural (KV-cache sizing); the actual encoder computation + # uses encoder_input_ids from mm_kwargs. + if isinstance(prompt, str) and prompt: + tokenizer = self.info.get_tokenizer() + tokens = tokenizer( + prompt, + add_special_tokens=False, + return_tensors="pt", + )["input_ids"].flatten() + return tokens.tolist() + return [0] def create_decoder_prompt( @@ -1055,10 +1077,16 @@ def _call_hf_processor( tok_kwargs: Mapping[str, object], ): """ - BART doesn't have a HuggingFace Processor - it only has a tokenizer. - We tokenize both the prompt (decoder) and encoder text from mm_data. + BART doesn't have a HuggingFace Processor — it only has a tokenizer. + + Produces two sets of token IDs: + - ``encoder_input_ids``: tokenized encoder text from ``mm_data["texts"]`` + - ``input_ids``: tokenized decoder prompt (used by the base class to + build ``prompt_token_ids``) + + Encoder text is always tokenized with ``add_special_tokens=False`` to + match v0.16 behaviour and stay consistent with ``_get_prompt_updates``. """ - # tok_kwargs["add_special_tokens"] = False from transformers.feature_extraction_utils import BatchFeature tokenizer = self.info.get_tokenizer() @@ -1068,13 +1096,13 @@ def _call_hf_processor( result = {} if has_encoder_data: - # Tokenize the encoder text from mm_data encoder_texts = mm_data["texts"] encoder_text = encoder_texts[0] if encoder_texts else "" + # Tokenize the encoder text from mm_data encoder_tokenized = tokenizer( encoder_text, return_tensors="pt", - **tok_kwargs, + add_special_tokens=False, ) result["encoder_input_ids"] = encoder_tokenized["input_ids"] @@ -1109,6 +1137,13 @@ def _get_prompt_updates( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: + """Replace the single [0] encoder placeholder with N placeholder + tokens, where N equals the tokenized length of the encoder text. + + The token count must use ``add_special_tokens=False`` to stay + consistent with ``_call_hf_processor`` (which tokenizes the encoder + text the same way). + """ from vllm.multimodal.processing import PromptReplacement # Get the number of text items to determine token count