From d3ef529cf64949614e5438230ae099120a181d88 Mon Sep 17 00:00:00 2001
From: David Schulmeister <dschulmeist@users.noreply.github.com>
Date: Thu, 16 Apr 2026 11:22:58 +0200
Subject: [PATCH 1/3] Fix BART processor compatibility with vLLM 0.18

The multimodal processor in bart.py broke under vLLM 0.18 in three places:

- TextDataParser relied on MultiModalDataParser._is_empty, which was
  removed in 0.18. Replaced with inline emptiness checks for str and list.

- create_encoder_prompt previously tokenized `prompt` as the encoder text.
  In 0.18 `inputs.prompt` passed to this method is the DECODER prompt text,
  not the encoder text (the encoder content lives in mm_data). The method
  now returns a single [0] placeholder token; _get_prompt_updates replaces
  it with the correct number of encoder token slots during rendering.

- _call_hf_processor is now sometimes called with an already-tokenized
  decoder prompt (list[int]) instead of a str. Handle both cases when
  building result["input_ids"].

Adds tests/test_vllm_018_compat.py with three narrow unit tests covering
each of these paths; no GPU required.

Signed-off-by: David Schulmeister <dschulmeist@users.noreply.github.com>
---
 tests/test_vllm_018_compat.py | 47 +++++++++++++++++++++++++++++++++++
 vllm_bart_plugin/bart.py      | 41 ++++++++++++++++--------------
 2 files changed, 70 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_vllm_018_compat.py
diff --git a/tests/test_vllm_018_compat.py b/tests/test_vllm_018_compat.py
new file mode 100644
index 0000000..921ffba
--- /dev/null
+++ b/tests/test_vllm_018_compat.py
@@ -0,0 +1,47 @@
+"""Regression tests for vLLM 0.18 compatibility in the BART processor."""
+
+import torch
+
+
+def test_text_data_parser_handles_v018_empty_inputs():
+    from vllm_bart_plugin.bart import TextDataParser
+
+    parser = TextDataParser()
+
+    assert parser._parse_text_data("") is None
+    assert parser._parse_text_data([]) is None
+
+
+def test_create_encoder_prompt_uses_placeholder_token():
+    from vllm_bart_plugin.bart import BartMultiModalProcessor
+
+    processor = BartMultiModalProcessor.__new__(BartMultiModalProcessor)
+
+    assert processor.create_encoder_prompt("<s>decoder text", {"texts": ["encoder text"]}) == [0]
+
+
+def test_call_hf_processor_accepts_pretokenized_decoder_prompt():
+    from vllm_bart_plugin.bart import BartMultiModalProcessor
+
+    class FakeTokenizer:
+        def __call__(self, text, return_tensors="pt", **kwargs):
+            if text == "encoder text":
+                return {"input_ids": torch.tensor([[11, 12, 13]])}
+            return {"input_ids": torch.tensor([[21, 22]])}
+
+    class FakeInfo:
+        def get_tokenizer(self):
+            return FakeTokenizer()
+
+    processor = BartMultiModalProcessor.__new__(BartMultiModalProcessor)
+    processor.info = FakeInfo()
+
+    out = processor._call_hf_processor(
+        [7, 8, 9],
+        {"texts": ["encoder text"]},
+        {},
+        {},
+    )
+
+    assert torch.equal(out["encoder_input_ids"], torch.tensor([[11, 12, 13]]))
+    assert torch.equal(out["input_ids"], torch.tensor([[7, 8, 9]]))
diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index fcced7d..d79777c 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -996,7 +996,10 @@ def _parse_text_data(
         if data is None:
             return TextProcessorItems(None)
 
-        if self._is_empty(data):
+        # _is_empty was removed in vLLM >=0.18; handle emptiness inline
+        if isinstance(data, str) and not data:
+            return None
+        if isinstance(data, list) and len(data) == 0:
             return None
 
         # Text data should be a string or list of strings
@@ -1030,15 +1033,11 @@ def create_encoder_prompt(
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
     ) -> str | list[int]:
-        if not prompt:
-            return [0]
-        tokenizer = self.info.get_tokenizer()
-        tokens = tokenizer(
-            prompt,
-            add_special_tokens=False,
-            return_tensors="pt",
-        )["input_ids"].flatten()
-        return tokens.tolist()
+        # In vLLM >=0.18, `prompt` here is the DECODER prompt text, not the
+        # encoder text. The encoder content lives in mm_data ("text" key).
+        # Always return [0] as a single placeholder token; _get_prompt_updates
+        # will replace it with the correct number of encoder token slots.
+        return [0]
 
     def create_decoder_prompt(
         self,
@@ -1079,14 +1078,20 @@ def _call_hf_processor(
             )
             result["encoder_input_ids"] = encoder_tokenized["input_ids"]
 
-        # Always tokenize the prompt (for decoder or as dummy)
-        # This will be popped by the base class
-        prompt_tokenized = tokenizer(
-            prompt if prompt else "",
-            return_tensors="pt",
-            **tok_kwargs,
-        )
-        result["input_ids"] = prompt_tokenized["input_ids"]
+        # Always produce input_ids for the decoder prompt.
+        # In vLLM >=0.18 the rendering pipeline may call _call_hf_processor
+        # with an already-tokenized prompt (a list of ints) instead of a str.
+        # Handle both cases.
+        import torch as _torch
+        if isinstance(prompt, (list, tuple)) and len(prompt) > 0 and isinstance(prompt[0], int):
+            result["input_ids"] = _torch.tensor([prompt])
+        else:
+            prompt_tokenized = tokenizer(
+                prompt if prompt else "",
+                return_tensors="pt",
+                **tok_kwargs,
+            )
+            result["input_ids"] = prompt_tokenized["input_ids"]
 
         return BatchFeature(result)
 

From d656d7c9795b7b678d1dd12b51453192e1a8cce5 Mon Sep 17 00:00:00 2001
From: NickLucche <nlucches@redhat.com>
Date: Thu, 30 Apr 2026 07:32:24 +0000
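Subject: [PATCH 2/3] remove _torch import

Drop the function-local `import torch as _torch` in _call_hf_processor and
call torch.tensor directly.

NOTE(review): this assumes bart.py already imports torch at module level;
that import is outside the context shown in this diff, so please confirm.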

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm_bart_plugin/bart.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index d79777c..62cca02 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -1082,9 +1082,8 @@ def _call_hf_processor(
         # In vLLM >=0.18 the rendering pipeline may call _call_hf_processor
         # with an already-tokenized prompt (a list of ints) instead of a str.
         # Handle both cases.
-        import torch as _torch
         if isinstance(prompt, (list, tuple)) and len(prompt) > 0 and isinstance(prompt[0], int):
-            result["input_ids"] = _torch.tensor([prompt])
+            result["input_ids"] = torch.tensor([prompt])
         else:
             prompt_tokenized = tokenizer(
                 prompt if prompt else "",

From 8d34ba48f3fa15f7f61c5f14c02a15a7d3ec30e8 Mon Sep 17 00:00:00 2001
From: NickLucche <nlucches@redhat.com>
Date: Thu, 30 Apr 2026 12:30:29 +0000
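Subject: [PATCH 3/3] Use add_special_tokens=False consistently

- Override BartProcessingInfo.get_default_tok_params (only when the base
  class provides it, i.e. vLLM >=0.18) so the default tokenization
  parameters carry add_special_tokens=False.

- _call_hf_processor now tokenizes the encoder text with
  add_special_tokens=False instead of forwarding tok_kwargs.

- create_encoder_prompt tokenizes a non-empty str prompt again and returns
  the [0] placeholder for every other input.

- _parse_text_data now returns TextProcessorItems(None) for both None and
  empty input; previously empty input returned None.

NOTE(review): tests/test_vllm_018_compat.py (added in PATCH 1/3) still
asserts that _parse_text_data("") and _parse_text_data([]) return None, and
that create_encoder_prompt("<s>decoder text", ...) returns [0]. Both
assertions appear to be invalidated by this change and the tests are not
updated here; please confirm which behaviour is intended.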

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 vllm_bart_plugin/bart.py | 67 ++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 16 deletions(-)

diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index 62cca02..41e0bb5 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -945,6 +945,21 @@ def get_data_parser(self) -> MultiModalDataParser:
         return TextDataParser()
 
 
+# vLLM >=0.18 moved tokenization defaults from a global enc-dec override
+# (InputPreprocessor._get_tokenization_kw) into per-model ProcessingInfo.
+# The old code forced add_special_tokens=False for every is_encoder_decoder
+# model; replicate that here so the renderer does not inject extra BOS/EOS
+# into the decoder prompt.  On vLLM <0.18 the method does not exist on the
+# base class and is not needed (the global override handles it).
+if hasattr(BaseProcessingInfo, "get_default_tok_params"):
+
+    def _bart_get_default_tok_params(self):
+        return super(BartProcessingInfo, self).get_default_tok_params() \
+            .with_kwargs(add_special_tokens=False)
+
+    BartProcessingInfo.get_default_tok_params = _bart_get_default_tok_params  # type: ignore[attr-defined]
+
+
 class BartDummyInputsBuilder(BaseDummyInputsBuilder[BartProcessingInfo]):
     """Builds dummy inputs for profiling BART models."""
 
@@ -993,14 +1008,9 @@ def _parse_text_data(
         data: ModalityData[str],
     ) -> ModalityDataItems[Any, Any] | None:
         """Parse text data for BART."""
-        if data is None:
-            return TextProcessorItems(None)
-
         # _is_empty was removed in vLLM >=0.18; handle emptiness inline
-        if isinstance(data, str) and not data:
-            return None
-        if isinstance(data, list) and len(data) == 0:
-            return None
+        if data is None or not len(data):
+            return TextProcessorItems(None)
 
         # Text data should be a string or list of strings
         if isinstance(data, str) or is_list_of(data, str):
@@ -1033,10 +1043,22 @@ def create_encoder_prompt(
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
     ) -> str | list[int]:
-        # In vLLM >=0.18, `prompt` here is the DECODER prompt text, not the
-        # encoder text. The encoder content lives in mm_data ("text" key).
-        # Always return [0] as a single placeholder token; _get_prompt_updates
-        # will replace it with the correct number of encoder token slots.
+        # vLLM compatibility:
+        # - Legacy (<0.18): prompt is encoder text (str) — tokenize directly.
+        # - Modern (>=0.18): prompt is decoder token IDs or empty str from
+        #   profiling — return a single [0] placeholder that _get_prompt_updates
+        #   will expand to the real encoder token count.  The placeholder IDs
+        #   are structural (KV-cache sizing); the actual encoder computation
+        #   uses encoder_input_ids from mm_kwargs.
+        if isinstance(prompt, str) and prompt:
+            tokenizer = self.info.get_tokenizer()
+            tokens = tokenizer(
+                prompt,
+                add_special_tokens=False,
+                return_tensors="pt",
+            )["input_ids"].flatten()
+            return tokens.tolist()
+
         return [0]
 
     def create_decoder_prompt(
@@ -1055,10 +1077,16 @@ def _call_hf_processor(
         tok_kwargs: Mapping[str, object],
     ):
         """
-        BART doesn't have a HuggingFace Processor - it only has a tokenizer.
-        We tokenize both the prompt (decoder) and encoder text from mm_data.
+        BART doesn't have a HuggingFace Processor — it only has a tokenizer.
+
+        Produces two sets of token IDs:
+        - ``encoder_input_ids``: tokenized encoder text from ``mm_data["texts"]``
+        - ``input_ids``: tokenized decoder prompt (used by the base class to
+          build ``prompt_token_ids``)
+
+        Encoder text is always tokenized with ``add_special_tokens=False`` to
+        match v0.16 behaviour and stay consistent with ``_get_prompt_updates``.
         """
-        # tok_kwargs["add_special_tokens"] = False
         from transformers.feature_extraction_utils import BatchFeature
 
         tokenizer = self.info.get_tokenizer()
@@ -1068,13 +1096,13 @@ def _call_hf_processor(
         result = {}
 
         if has_encoder_data:
-            # Tokenize the encoder text from mm_data
             encoder_texts = mm_data["texts"]
             encoder_text = encoder_texts[0] if encoder_texts else ""
+            # Tokenize the encoder text from mm_data
             encoder_tokenized = tokenizer(
                 encoder_text,
                 return_tensors="pt",
-                **tok_kwargs,
+                add_special_tokens=False,
             )
             result["encoder_input_ids"] = encoder_tokenized["input_ids"]
 
@@ -1109,6 +1137,13 @@ def _get_prompt_updates(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
+        """Replace the single [0] encoder placeholder with N placeholder
+        tokens, where N equals the tokenized length of the encoder text.
+
+        The token count must use ``add_special_tokens=False`` to stay
+        consistent with ``_call_hf_processor`` (which tokenizes the encoder
+        text the same way).
+        """
         from vllm.multimodal.processing import PromptReplacement
 
         # Get the number of text items to determine token count