Prevent add_special_tokens overwrite for BART

carlesonielfa · carlesonielfa · commit 1ef949e68ad3 · 2026-03-13T10:30:29.000+01:00
diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
@@ -1041,6 +1041,9 @@ def _call_hf_processor(
         has_encoder_data = mm_data is not None and "texts" in mm_data
         result = {}
 
+        # vLLM may pass add_special_tokens in tok_kwargs; we set it ourselves
+        tok_kwargs = {k: v for k, v in tok_kwargs.items() if k != "add_special_tokens"}
+
         if has_encoder_data:
             # Tokenize the encoder text from mm_data
             encoder_texts = mm_data["texts"]
@@ -1152,8 +1155,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config.vocab_size, config.d_model, embed_scale=embed_scale
         )
         # Bias added to logits after lm_head, matching HuggingFace approach
-        self.register_buffer("final_logits_bias",
-                             torch.zeros((1, config.vocab_size)))
+        self.register_buffer("final_logits_bias", torch.zeros((1, config.vocab_size)))
         self.logits_processor = LogitsProcessor(
             self.unpadded_vocab_size, config.vocab_size
         )
@@ -1341,7 +1343,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         "Shared weight embedding already loaded with name "
                         "%s, skipping. This is expected on facebook/bart-large"
                         " like models, where the same shared embedding is "
-                        "present multiple times.", name)
+                        "present multiple times.",
+                        name,
+                    )
                     continue
 
         loader = AutoWeightsLoader(