vllm-project · carlesonielfa · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -15,6 +15,13 @@ This plugin requires [uv](https://docs.astral.sh/uv/) for package management. If
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
+### From Git
+
+Install the plugin directly from the GitHub repository:
+
+```bash
+pip install git+https://github.com/vllm-project/bart-plugin
+```
 
 ### From Source
 
@@ -126,10 +133,8 @@ This plugin should work with any BART-based model from HuggingFace, including:
 
 ### Florence-2 Models
 
-- `microsoft/Florence-2-base`
-- `microsoft/Florence-2-large`
-
-Note: Florence-2 requires `trust_remote_code=True` and uses a separate tokenizer (`Isotr0py/Florence-2-tokenizer`).
+- `florence-community/Florence-2-base`
+- `florence-community/Florence-2-large`
 
 ## Evaluation
 
@@ -186,11 +191,14 @@ Notes:
 ```
 bart-plugin/
 ├── vllm_bart_plugin/
-│   ├── __init__.py          # Plugin registration
-│   └── bart.py              # BART model implementation
-├── setup.py                 # Package configuration and entry points
-├── README.md                # This file
-└── LICENSE                  # License file
+│   ├── __init__.py            # Plugin registration
+│   ├── bart.py                # BART model implementation
+│   └── florence2.py           # Florence-2 model implementation
+├── setup.py                   # Package configuration and entry points
+├── README.md                  # This file
+├── LICENSE                    # License file
+├── example_bart_usage.py      # Example usage script for BART
+└── example_florence2_usage.py # Example usage script for Florence-2
 ```
 
 ### Running Tests

diff --git a/example_florence2_usage.py b/example_florence2_usage.py
@@ -5,28 +5,23 @@
 This script demonstrates how to use Florence-2 models with vLLM
 after installing the BART plugin.
 """
-import vllm_bart_plugin
+
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 
 
 def main():
     """Run Florence-2 model examples."""
-    model_name = "microsoft/Florence-2-large"
-    tokenizer_name = "Isotr0py/Florence-2-tokenizer"
+    model_name = "florence-community/Florence-2-large-ft"
 
     llm = LLM(
         model=model_name,
-        tokenizer=tokenizer_name,
         mm_processor_cache_gb=0,
-        trust_remote_code=True,
         enforce_eager=True,
     )
     params = SamplingParams(
         temperature=0.0,
         max_tokens=20,
-        # repetition_penalty is needed to prevent <s> repetition
-        repetition_penalty=1.5,
         # skip_special_tokens=False is needed to present
         # grounding tokens like <loc_0><loc_1>
         skip_special_tokens=False,
@@ -60,4 +55,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-bart-plugin"
-version = "0.2.0"
+version = "0.3.0"
 description = "BART model plugin for vLLM"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,9 +26,9 @@ classifiers = [
 ]
 
 dependencies = [
-    "vllm>=0.14.0",
+    "vllm>=0.16.0",
     "torch>=2.9.0",
-    "transformers>=4.56.0,<5",
+    "transformers>=4.56.0,<6",
 ]
 
 [project.optional-dependencies]
@@ -62,8 +62,20 @@ include = '\.pyi?$'
 profile = "black"
 line_length = 88
 
+[tool.pytest.ini_options]
+markers = [
+    "slow: marks tests requiring a GPU and full model download (deselect with '-m \"not slow\"')",
+]
+
 [tool.mypy]
 python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 ignore_missing_imports = true
+
+[dependency-groups]
+dev = [
+    "black>=26.1.0",
+    "isort>=8.0.1",
+    "pytest>=9.0.2",
+]
diff --git a/tests/test_florence2.py b/tests/test_florence2.py
@@ -0,0 +1,267 @@
+"""Tests for the Florence-2 multimodal model plugin."""
+
+import os
+
+import pytest
+import torch
+from transformers import Florence2Config
+
+MODEL_NAME = "florence-community/Florence-2-base-ft"
+
+
+def _small_vision_config():
+    """Tiny 1-stage Florence2 config for fast CPU tests."""
+    cfg = Florence2Config()
+    vc = cfg.vision_config
+    vc.embed_dim = [64]
+    vc.depths = [1]
+    vc.num_heads = [4]
+    vc.num_groups = [4]
+    vc.patch_size = [7]
+    vc.patch_stride = [4]
+    vc.patch_padding = [3]
+    vc.patch_prenorm = [False]
+    vc.drop_path_rate = 0.0
+    return cfg, vc
+
+
+# ---------------------------------------------------------------------------
+# Unit tests — vision architecture (CPU, no weights)
+# ---------------------------------------------------------------------------
+
+
+class TestFlorenceVisionBackbone:
+    def test_output_shape(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionBackbone
+
+        _, vc = _small_vision_config()
+        out = Florence2VisionBackbone(vc)(torch.randn(2, 3, 64, 64))
+        assert out.shape == (2, vc.embed_dim[-1], 16, 16)
+
+
+class TestFlorenceMultiModalProjector:
+    def test_output_shape(self):
+        from vllm_bart_plugin.florence2 import Florence2MultiModalProjector
+
+        cfg, vc = _small_vision_config()
+        vc.projection_dim = 128
+        m = Florence2MultiModalProjector(cfg)
+        out = m(torch.randn(2, vc.embed_dim[-1], 12, 12))
+        # (B, 1 spatial-avg token + H*W tokens, proj_dim)
+        assert out.shape == (2, 1 + 12 * 12, vc.projection_dim)
+
+
+# ---------------------------------------------------------------------------
+# Integration tests — full model inference (GPU required)
+# ---------------------------------------------------------------------------
+
+
+def _run_task(llm, processor, image, task_prompt, text_input=None, max_tokens=100):
+    """Helper: run one Florence-2 task and return the post-processed result."""
+    from vllm import SamplingParams
+
+    prompt = task_prompt if text_input is None else task_prompt + text_input
+    params = SamplingParams(
+        temperature=0.0, max_tokens=max_tokens, skip_special_tokens=False
+    )
+    outputs = llm.generate(
+        [{"prompt": prompt, "multi_modal_data": {"image": image}}],
+        sampling_params=params,
+    )
+    raw = outputs[0].outputs[0].text
+    return processor.post_process_generation(
+        raw, task=task_prompt, image_size=image.size
+    )
+
+
+@pytest.fixture(scope="module")
+def florence2_llm():
+    from vllm import LLM
+    # NOTE(review): set before LLM() is constructed and never restored afterwards.
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+    return LLM(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        gpu_memory_utilization=0.5,
+        mm_processor_cache_gb=0,
+    )
+
+
+@pytest.fixture(scope="module")
+def florence2_processor():
+    from transformers import AutoProcessor
+
+    return AutoProcessor.from_pretrained(MODEL_NAME)
+
+
+@pytest.fixture(scope="module")
+def stop_sign_image():
+    from vllm.assets.image import ImageAsset
+
+    return ImageAsset("stop_sign").pil_image.convert("RGB")
+
+
+@pytest.mark.slow
+class TestFlorenceInference:
+    # ------------------------------------------------------------------
+    # Caption tasks — check for semantically meaningful keywords
+    # ------------------------------------------------------------------
+
+    def test_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<CAPTION>",
+            max_tokens=30,
+        )
+        text = result["<CAPTION>"].lower()
+        assert (
+            "car" in text or "stop" in text
+        ), f"<CAPTION> output missing expected content: {text!r}"
+
+    def test_detailed_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<DETAILED_CAPTION>",
+            max_tokens=80,
+        )
+        text = result["<DETAILED_CAPTION>"].lower()
+        # Must mention the car and give some background detail — guards against the
+        # KV-cache encoder_seq_lens regression that previously produced garbled output.
+        assert "car" in text, f"<DETAILED_CAPTION> missing 'car': {text!r}"
+        assert len(text.split()) >= 10, f"<DETAILED_CAPTION> too short: {text!r}"
+
+    def test_more_detailed_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<MORE_DETAILED_CAPTION>",
+            max_tokens=100,
+        )
+        text = result["<MORE_DETAILED_CAPTION>"].lower()
+        # Any mention of the sign is accepted; "stop sign" already contains "sign",
+        # so a single substring check covers both phrasings.
+        assert "sign" in text, f"<MORE_DETAILED_CAPTION> missing 'sign': {text!r}"
+        assert len(text.split()) >= 10, f"<MORE_DETAILED_CAPTION> too short: {text!r}"
+
+    # ------------------------------------------------------------------
+    # Structured-output tasks — check schema and key labels
+    # ------------------------------------------------------------------
+
+    def test_object_detection(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm, florence2_processor, stop_sign_image, "<OD>", max_tokens=300
+        )
+        od = result["<OD>"]
+        assert "bboxes" in od and "labels" in od
+        assert len(od["bboxes"]) == len(od["labels"]) > 0
+        # Each bbox must be a 4-element list with non-negative coords
+        for bbox in od["bboxes"]:
+            assert len(bbox) == 4 and all(c >= 0 for c in bbox)
+        labels = od["labels"]
+        assert (
+            "stop sign" in labels
+        ), f"Expected 'stop sign' in OD labels, got: {labels}"
+        assert (
+            "car" in labels or "building" in labels
+        ), f"Expected common objects in OD labels, got: {labels}"
+
+    def test_dense_region_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<DENSE_REGION_CAPTION>",
+            max_tokens=250,
+        )
+        drc = result["<DENSE_REGION_CAPTION>"]
+        assert "bboxes" in drc and "labels" in drc
+        assert len(drc["bboxes"]) == len(drc["labels"]) > 0
+        assert (
+            "stop sign" in drc["labels"]
+        ), f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
+
+    def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<REGION_PROPOSAL>",
+            max_tokens=100,
+        )
+        rp = result["<REGION_PROPOSAL>"]
+        assert "bboxes" in rp and "labels" in rp
+        assert len(rp["bboxes"]) > 0
+        # Region proposal labels are always empty strings
+        assert all(label == "" for label in rp["labels"])
+
+    def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<OCR_WITH_REGION>",
+            max_tokens=250,
+        )
+        ocr = result["<OCR_WITH_REGION>"]
+        assert "quad_boxes" in ocr and "labels" in ocr
+        assert len(ocr["quad_boxes"]) == len(ocr["labels"]) > 0
+        # Each quad box must be 8 coords
+        for quad in ocr["quad_boxes"]:
+            assert len(quad) == 8
+        # "STOP" is the most prominent text in the image
+        joined = " ".join(ocr["labels"])
+        assert (
+            "STOP" in joined
+        ), f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
+
+    def test_caption_to_phrase_grounding(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<CAPTION_TO_PHRASE_GROUNDING>",
+            text_input="A stop sign on a street corner.",
+            max_tokens=80,
+        )
+        cpg = result["<CAPTION_TO_PHRASE_GROUNDING>"]
+        assert "bboxes" in cpg and "labels" in cpg
+        assert len(cpg["bboxes"]) > 0
+        assert any(
+            "stop sign" in lbl.lower() for lbl in cpg["labels"]
+        ), f"Expected 'stop sign' grounded, got labels: {cpg['labels']}"
+
+    # ------------------------------------------------------------------
+    # Batch tests
+    # ------------------------------------------------------------------
+
+    def test_batch_inference(self, florence2_llm, florence2_processor, stop_sign_image):
+        """Multiple prompts in one batch must all produce non-empty output."""
+        from vllm import SamplingParams
+
+        params = SamplingParams(
+            temperature=0.0, max_tokens=30, skip_special_tokens=False
+        )
+        prompts = [
+            {"prompt": "<CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
+            {
+                "prompt": "<DETAILED_CAPTION>",
+                "multi_modal_data": {"image": stop_sign_image},
+            },
+        ]
+        outputs = florence2_llm.generate(prompts, sampling_params=params)
+        assert all(len(o.outputs[0].text) > 0 for o in outputs)