vllm 0.16.0 support

carlesonielfa · carlesonielfa · commit 4bc4975e44df · 2026-03-06T12:28:44.000+01:00
Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
diff --git a/README.md b/README.md
@@ -15,6 +15,13 @@ This plugin requires [uv](https://docs.astral.sh/uv/) for package management. If
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
+### From Git
+
+Install directly from the Git repository with pip:
+
+```bash
+pip install git+https://github.com/vllm-project/bart-plugin
+```
 
 ### From Source
 
@@ -186,11 +193,14 @@ Notes:
 ```
 bart-plugin/
 ├── vllm_bart_plugin/
-│   ├── __init__.py          # Plugin registration
-│   └── bart.py              # BART model implementation
-├── setup.py                 # Package configuration and entry points
-├── README.md                # This file
-└── LICENSE                  # License file
+│   ├── __init__.py            # Plugin registration
+│   ├── bart.py                # BART model implementation
+│   └── florence2.py           # Florence-2 model implementation
+├── setup.py                   # Package configuration and entry points
+├── README.md                  # This file
+├── LICENSE                    # License file
+├── example_bart_usage.py      # Example usage script for BART
+└── example_florence2_usage.py # Example usage script for Florence-2
 ```
 
 ### Running Tests
diff --git a/example_florence2_usage.py b/example_florence2_usage.py
@@ -12,7 +12,7 @@
 
 def main():
     """Run Florence-2 model examples."""
-    model_name = "microsoft/Florence-2-large"
+    model_name = "microsoft/Florence-2-large-ft"
     tokenizer_name = "Isotr0py/Florence-2-tokenizer"
 
     llm = LLM(
@@ -60,4 +60,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-bart-plugin"
-version = "0.2.0"
+version = "0.3.0"
 description = "BART model plugin for vLLM"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,9 +26,9 @@ classifiers = [
 ]
 
 dependencies = [
-    "vllm>=0.14.0",
+    "vllm>=0.16.0",
     "torch>=2.9.0",
-    "transformers>=4.56.0,<5",
+    "transformers>=4.56.0",
 ]
 
 [project.optional-dependencies]
diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
@@ -29,7 +29,8 @@
 from torch import nn
 from transformers import BartConfig
 from transformers.utils import logging
-from vllm.attention.layer import Attention, AttentionType
+from vllm.model_executor.layers.attention import Attention
+from vllm.v1.attention.backend import AttentionType
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -78,7 +79,7 @@
     EncDecMultiModalProcessor,
     PromptUpdate,
 )
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
 
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
@@ -13,7 +13,8 @@
 from transformers import BartConfig, BatchFeature, BartTokenizer, PretrainedConfig
 from transformers.utils import logging
 
-from vllm.attention.layer import Attention, AttentionType
+from vllm.model_executor.layers.attention import Attention
+from vllm.v1.attention.backend import AttentionType
 from vllm.model_executor.layers.attention.cross_attention import CrossAttention
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.config import CacheConfig, VllmConfig
@@ -53,7 +54,7 @@
     PromptInsertion,
     PromptIndexTargets,
 )
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of