From 4bc4975e44df7866c0752f0043171dfcb620585a Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 6 Mar 2026 08:35:03 +0100
Subject: [PATCH 01/11] vllm 0.16.0 support

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 README.md                     | 20 +++++++++++++++-----
 example_florence2_usage.py    |  4 ++--
 pyproject.toml                |  6 +++---
 vllm_bart_plugin/bart.py      |  5 +++--
 vllm_bart_plugin/florence2.py |  5 +++--
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 60efa88..de93e28 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,13 @@ This plugin requires [uv](https://docs.astral.sh/uv/) for package management. If
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
+### From Git
+
+Install using git as a package index:
+
+```bash
+pip install git+https://github.com/vllm-project/bart-plugin
+```
 
 ### From Source
 
@@ -186,11 +193,14 @@ Notes:
 ```
 bart-plugin/
 ├── vllm_bart_plugin/
-│   ├── __init__.py          # Plugin registration
-│   └── bart.py              # BART model implementation
-├── setup.py                 # Package configuration and entry points
-├── README.md                # This file
-└── LICENSE                  # License file
+│   ├── __init__.py            # Plugin registration
+│   ├── bart.py                # BART model implementation
+│   └── florence2.py           # Florence-2 model implementation
+├── setup.py                   # Package configuration and entry points
+├── README.md                  # This file
+├── LICENSE                    # License file
+├── example_bart_usage.py      # Example usage script for BART
+└── example_florence2_usage.py # Example usage script for Florence-2
 ```
 
 ### Running Tests
diff --git a/example_florence2_usage.py b/example_florence2_usage.py
index 6e9593b..69ca43e 100644
--- a/example_florence2_usage.py
+++ b/example_florence2_usage.py
@@ -12,7 +12,7 @@
 
 def main():
     """Run Florence-2 model examples."""
-    model_name = "microsoft/Florence-2-large"
+    model_name = "microsoft/Florence-2-large-ft"
     tokenizer_name = "Isotr0py/Florence-2-tokenizer"
 
     llm = LLM(
@@ -60,4 +60,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index c2627f2..7256cb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "vllm-bart-plugin"
-version = "0.2.0"
+version = "0.3.0"
 description = "BART model plugin for vLLM"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,9 +26,9 @@ classifiers = [
 ]
 
 dependencies = [
-    "vllm>=0.14.0",
+    "vllm>=0.16.0",
     "torch>=2.9.0",
-    "transformers>=4.56.0,<5",
+    "transformers>=4.56.0",
 ]
 
 [project.optional-dependencies]
diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index f3d6cd5..db98d6f 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -29,7 +29,8 @@
 from torch import nn
 from transformers import BartConfig
 from transformers.utils import logging
-from vllm.attention.layer import Attention, AttentionType
+from vllm.model_executor.layers.attention import Attention
+from vllm.v1.attention.backend import AttentionType
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -78,7 +79,7 @@
     EncDecMultiModalProcessor,
     PromptUpdate,
 )
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
 
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index 3624b1f..3c71bee 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -13,7 +13,8 @@
 from transformers import BartConfig, BatchFeature, BartTokenizer, PretrainedConfig
 from transformers.utils import logging
 
-from vllm.attention.layer import Attention, AttentionType
+from vllm.model_executor.layers.attention import Attention
+from vllm.v1.attention.backend import AttentionType
 from vllm.model_executor.layers.attention.cross_attention import CrossAttention
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.config import CacheConfig, VllmConfig
@@ -53,7 +54,7 @@
     PromptInsertion,
     PromptIndexTargets,
 )
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
 

From b42b2880c450b273acb4c01610a657a5cd180d98 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 6 Mar 2026 11:50:04 +0100
Subject: [PATCH 02/11] Support official HF implementation and transformers > 5

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 README.md                     |   2 +-
 example_florence2_usage.py    |   6 +-
 pyproject.toml                |   2 +-
 vllm_bart_plugin/__init__.py  |   4 +-
 vllm_bart_plugin/florence2.py | 999 ++++++++++++----------------------
 5 files changed, 357 insertions(+), 656 deletions(-)

diff --git a/README.md b/README.md
index de93e28..a56b469 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
 ### From Git
 
-Install using git as a package index:
+Install from git:
 
 ```bash
 pip install git+https://github.com/vllm-project/bart-plugin
diff --git a/example_florence2_usage.py b/example_florence2_usage.py
index 69ca43e..6b01b2b 100644
--- a/example_florence2_usage.py
+++ b/example_florence2_usage.py
@@ -5,19 +5,17 @@
 This script demonstrates how to use Florence-2 models with vLLM
 after installing the BART plugin.
 """
-import vllm_bart_plugin
+
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 
 
 def main():
     """Run Florence-2 model examples."""
-    model_name = "microsoft/Florence-2-large-ft"
-    tokenizer_name = "Isotr0py/Florence-2-tokenizer"
+    model_name = "florence-community/Florence-2-large-ft"
 
     llm = LLM(
         model=model_name,
-        tokenizer=tokenizer_name,
         mm_processor_cache_gb=0,
         trust_remote_code=True,
         enforce_eager=True,
diff --git a/pyproject.toml b/pyproject.toml
index 7256cb6..9cb05c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ classifiers = [
 dependencies = [
     "vllm>=0.16.0",
     "torch>=2.9.0",
-    "transformers>=4.56.0",
+    "transformers>=4.56.0,<6",
 ]
 
 [project.optional-dependencies]
diff --git a/vllm_bart_plugin/__init__.py b/vllm_bart_plugin/__init__.py
index 0648366..df079d7 100644
--- a/vllm_bart_plugin/__init__.py
+++ b/vllm_bart_plugin/__init__.py
@@ -34,10 +34,10 @@ def register_bart_model() -> None:
             "vllm_bart_plugin.florence2:Florence2ForConditionalGeneration",
         )
 
-        logger.info("Successfully registered BART model with vLLM")
+        logger.info("Successfully registered BART and Florence2 models with vLLM")
 
     except Exception as e:
-        logger.error(f"Failed to register BART model: {e}")
+        logger.error(f"Failed to register BART and Florence2 models: {e}")
         raise
 
 
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index 3c71bee..aee2e8c 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -1,17 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Literal, TypedDict, OrderedDict
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any
-
-import torch.nn.functional as F
-from einops import rearrange
+from typing import Literal, TypedDict
 
 import torch
+import torch.nn.functional as F
 from torch import nn
-from transformers import BartConfig, BatchFeature, BartTokenizer, PretrainedConfig
-from transformers.utils import logging
+from transformers import BartConfig, BartTokenizer, BatchFeature, Florence2Config, Florence2Processor
 
 from vllm.model_executor.layers.attention import Attention
 from vllm.v1.attention.backend import AttentionType
@@ -70,544 +66,350 @@ class Florence2ImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channel, height, width)"""
 
 
-# ViT implementation are all copied from
-# https://huggingface.co/microsoft/Florence-2-base/blob/main/modeling_florence2.py
-class LearnedAbsolutePositionEmbedding2D(nn.Module):
-    """
-    This module learns positional embeddings up to a fixed maximum size.
-    """
 
-    def __init__(self, embedding_dim=256, num_pos=50):
-        super().__init__()
-        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
-        self.column_embeddings = nn.Embedding(
-            num_pos, embedding_dim - (embedding_dim // 2))
-
-    def forward(self, pixel_values):
-        """
-        pixel_values: (batch_size, height, width, num_channels) 
-        returns: (batch_size, height, width, embedding_dim * 2)
-        """
-        if len(pixel_values.shape) != 4:
-            raise ValueError('pixel_values must be a 4D tensor')
-        height, width = pixel_values.shape[1:3]
-        width_values = torch.arange(width, device=pixel_values.device)
-        height_values = torch.arange(height, device=pixel_values.device)
-        x_emb = self.column_embeddings(width_values)
-        y_emb = self.row_embeddings(height_values)
-        # (height, width, embedding_dim * 2)
-        pos = torch.cat([
-            x_emb.unsqueeze(0).repeat(height, 1, 1),
-            y_emb.unsqueeze(1).repeat(1, width, 1)
-        ],
-                        dim=-1)
-        # (embedding_dim * 2, height, width)
-        pos = pos.permute(2, 0, 1)
-        pos = pos.unsqueeze(0)
-        # (batch_size, embedding_dim * 2, height, width)
-        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
-        # (batch_size, height, width, embedding_dim * 2)
-        pos = pos.permute(0, 2, 3, 1)
-        return pos
-
-
-class PositionalEmbeddingCosine1D(nn.Module):
-    """
-    This class implements a very simple positional encoding. It follows closely
-    the encoder from the link below:
-    https://pytorch.org/tutorials/beginner/translation_transformer.html
-    Args:
-        embed_dim: The dimension of the embeddings.
-        dropout_prob: The dropout probability.
-        max_seq_len: The maximum length to precompute the positional encodings.
-    """
-
-    def __init__(self, embed_dim: int = 512, max_seq_len: int = 1024) -> None:
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.max_seq_len = max_seq_len
-        # Generate the sinusoidal arrays.
-        factor = math.log(10000)
-        denominator = torch.exp(-factor * torch.arange(0, self.embed_dim, 2) /
-                                self.embed_dim)
-        # Matrix where rows correspond to a positional embedding as a function
-        # of the position index (i.e., the row index).
-        frequencies = \
-            torch.arange(0, self.max_seq_len) \
-            .reshape(self.max_seq_len, 1) * denominator
-        pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
-        # Populate uneven entries.
-        pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
-        pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
-        # Save the positional embeddings in a constant buffer.
-        # self.register_buffer("pos_idx_to_embed", pos_idx_to_embed)
-        self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed,
-                                             requires_grad=False)
-
-    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            seq_embeds: The sequence embeddings in order. Allowed size:
-                1. [T, D], where T is the length of the sequence, and D is the
-                frame embedding dimension.
-                2. [B, T, D], where B is the batch size and T and D are the
-                same as above.
-        Returns a tensor of with the same dimensions as the input: i.e.,
-        [1, T, D] or [T, D].
-        """
-        shape_len = len(seq_embeds.shape)
-        assert 2 <= shape_len <= 3
-        len_seq = seq_embeds.size(-2)
-        assert len_seq <= self.max_seq_len
-        pos_embeds = self.pos_idx_to_embed[0:seq_embeds.size(-2), :]
-        # Adapt pre-computed positional embeddings to the input.
-        if shape_len == 3:
-            pos_embeds = pos_embeds.view(
-                (1, pos_embeds.size(0), pos_embeds.size(1)))
-        return pos_embeds
-
-
-class MySequential(nn.Sequential):
-
-    def forward(self, *inputs):
-        for module in self._modules.values():
-            if isinstance(inputs, tuple):
-                inputs = module(*inputs)
-            else:
-                inputs = module(inputs)
-        return inputs
-
-
-class PreNorm(nn.Module):
-
-    def __init__(self, norm, fn):
-        super().__init__()
-        self.norm = norm
-        self.fn = fn
-
-    def forward(self, x, *args, **kwargs):
-        shortcut = x
-        if self.norm is not None:
-            x, size = self.fn(self.norm(x), *args, **kwargs)
-        else:
-            x, size = self.fn(x, *args, **kwargs)
-
-        x = shortcut + x
-
-        return x, size
 
+def _drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+    random_tensor.floor_()
+    return x.div(keep_prob) * random_tensor
 
-class Mlp(nn.Module):
 
-    def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        act_layer=nn.GELU,
-    ):
+class Florence2VisionDropPath(nn.Module):
+    def __init__(self, drop_prob: float = 0.0):
         super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.net = nn.Sequential(
-            OrderedDict([("fc1", nn.Linear(in_features, hidden_features)),
-                         ("act", act_layer()),
-                         ("fc2", nn.Linear(hidden_features, out_features))]))
-
-    def forward(self, x, size):
-        return self.net(x), size
+        self.drop_prob = drop_prob
 
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _drop_path(x, self.drop_prob, self.training)
 
-class DepthWiseConv2d(nn.Module):
 
-    def __init__(
-        self,
-        dim_in,
-        kernel_size,
-        padding,
-        stride,
-        bias=True,
-    ):
+class Florence2VisionMLP(nn.Module):
+    def __init__(self, embed_dim: int, mlp_ratio: float = 4.0):
         super().__init__()
-        self.dw = nn.Conv2d(dim_in,
-                            dim_in,
-                            kernel_size=kernel_size,
-                            padding=padding,
-                            groups=dim_in,
-                            stride=stride,
-                            bias=bias)
-
-    def forward(self, x, size):
-        B, N, C = x.shape
-        H, W = size
-        assert N == H * W
+        hidden_dim = int(embed_dim * mlp_ratio)
+        self.fc1 = nn.Linear(embed_dim, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim, embed_dim)
+        self.act = nn.GELU()
 
-        x = self.dw(x.transpose(1, 2).view(B, C, H, W))
-        size = (x.size(-2), x.size(-1))
-        x = x.flatten(2).transpose(1, 2)
-        return x, size
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fc2(self.act(self.fc1(x)))
 
 
-class ConvEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
+class Florence2VisionConvEmbed(nn.Module):
+    """Image-to-patch embedding via strided convolution (NCHW in, NCHW out)."""
 
-    def __init__(self,
-                 patch_size=7,
-                 in_chans=3,
-                 embed_dim=64,
-                 stride=4,
-                 padding=2,
-                 norm_layer=None,
-                 pre_norm=True):
+    def __init__(self, patch_size: int, in_channels: int, embed_dim: int,
+                 stride: int, padding: int, pre_norm: bool):
         super().__init__()
-        self.patch_size = patch_size
-
-        self.proj = nn.Conv2d(in_chans,
-                              embed_dim,
-                              kernel_size=patch_size,
-                              stride=stride,
-                              padding=padding)
-
-        dim_norm = in_chans if pre_norm else embed_dim
-        self.norm = norm_layer(dim_norm) if norm_layer else None
-
         self.pre_norm = pre_norm
-
-    def forward(self, x, size):
-        H, W = size
-        if len(x.size()) == 3:
-            if self.norm and self.pre_norm:
-                x = self.norm(x)
-            x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W)
-
-        x = self.proj(x)
-
-        _, _, H, W = x.shape
-        x = rearrange(x, 'b c h w -> b (h w) c')
-        if self.norm and not self.pre_norm:
-            x = self.norm(x)
-
-        return x, (H, W)
+        self.conv = nn.Conv2d(in_channels, embed_dim,
+                              kernel_size=patch_size, stride=stride, padding=padding)
+        dim_norm = in_channels if pre_norm else embed_dim
+        self.norm = nn.LayerNorm(dim_norm)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.pre_norm:
+            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        x = self.conv(x)
+        if not self.pre_norm:
+            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        return x
 
 
-class ChannelAttention(nn.Module):
+class Florence2VisionChannelAttention(nn.Module):
+    """Channel (group) attention — attends over the channel dimension."""
 
-    def __init__(self, dim, groups=8, qkv_bias=True):
+    def __init__(self, dim: int, groups: int, qkv_bias: bool = True):
         super().__init__()
-
         self.groups = groups
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim)
 
-    def forward(self, x, size):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
-
-        qkv = self.qkv(x).reshape(B, N, 3, self.groups,
-                                  C // self.groups).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]
-
-        q = q * (float(N)**-0.5)
-        attention = q.transpose(-1, -2) @ k
-        attention = attention.softmax(dim=-1)
-        x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
-        x = x.transpose(1, 2).reshape(B, N, C)
-        x = self.proj(x)
-        return x, size
-
-
-class ChannelBlock(nn.Module):
-
-    def __init__(self,
-                 dim,
-                 groups,
-                 mlp_ratio=4.,
-                 qkv_bias=True,
-                 drop_path_rate=0.,
-                 act_layer=nn.GELU,
-                 norm_layer=nn.LayerNorm,
-                 conv_at_attn=True,
-                 conv_at_ffn=True):
+        # Reshape: (B, N, 3, groups, C//groups) -> (3, B, groups, N, C//groups)
+        qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups)
+        qkv = qkv.permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)  # each: (B, groups, N, C//groups)
+
+        # Scale by sequence length and compute channel-to-channel attention
+        q = q * (float(N) ** -0.5)
+        attn = (q.transpose(-2, -1) @ k).softmax(dim=-1)  # (B, groups, C//g, C//g)
+        out = (attn @ v.transpose(-2, -1)).transpose(-2, -1)  # (B, groups, N, C//g)
+        out = out.transpose(1, 2).reshape(B, N, C)
+        return self.proj(out)
+
+
+class Florence2VisionChannelBlock(nn.Module):
+    def __init__(self, embed_dim: int, groups: int, mlp_ratio: float = 4.0,
+                 qkv_bias: bool = True, drop_path_rate: float = 0.0):
         super().__init__()
+        self.conv1 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.channel_attn = Florence2VisionChannelAttention(embed_dim, groups, qkv_bias)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.conv2 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.norm2 = nn.LayerNorm(embed_dim)
+        self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, C, H, W = x.shape
+        # Sub-block 1: depthwise conv residual + channel attention
+        x = self.conv1(x) + x
+        x_flat = x.flatten(2).transpose(1, 2)  # (B, H*W, C)
+        residual = x_flat
+        x_flat = residual + self.drop_path1(self.channel_attn(self.norm1(x_flat)))
+        x = x_flat.transpose(1, 2).view(B, C, H, W)
+        # Sub-block 2: depthwise conv residual + FFN
+        x = self.conv2(x) + x
+        x_flat = x.flatten(2).transpose(1, 2)
+        residual = x_flat
+        x_flat = residual + self.drop_path2(self.ffn(self.norm2(x_flat)))
+        x = x_flat.transpose(1, 2).view(B, C, H, W)
+        return x
 
-        self.conv1 = PreNorm(None, DepthWiseConv2d(
-            dim, 3, 1, 1)) if conv_at_attn else None
-        self.channel_attn = PreNorm(
-            norm_layer(dim),
-            ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
-        )
-        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
-                                                   1)) if conv_at_ffn else None
-        self.ffn = PreNorm(
-            norm_layer(dim),
-            Mlp(in_features=dim,
-                hidden_features=int(dim * mlp_ratio),
-                act_layer=act_layer),
-        )
-
-    def forward(self, x, size):
-        if self.conv1:
-            x, size = self.conv1(x, size)
-        x, size = self.channel_attn(x, size)
-
-        if self.conv2:
-            x, size = self.conv2(x, size)
-        x, size = self.ffn(x, size)
-
-        return x, size
-
-
-def window_partition(x, window_size: int):
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size,
-               C)
-    windows = x.permute(0, 1, 3, 2, 4,
-                        5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
-    B = batch_size
-
-    x = windows.view(B, H // window_size, W // window_size, window_size,
-                     window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
 
-    def __init__(self, dim, num_heads, window_size, qkv_bias=True):
+class Florence2VisionWindowAttention(nn.Module):
+    """Window-based local spatial self-attention."""
 
+    def __init__(self, dim: int, num_heads: int, window_size: int, qkv_bias: bool = True):
         super().__init__()
-        self.dim = dim
-        self.window_size = window_size
         self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = float(head_dim)**-0.5
-
+        self.window_size = window_size
+        self.scale = (dim // num_heads) ** -0.5
         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
         self.proj = nn.Linear(dim, dim)
 
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, size):
-
-        H, W = size
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-
-        x = x.view(B, H, W, C)
-
-        pad_l = pad_t = 0
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (B, H, W, C) BHWC
+        B, H, W, C = x.shape
         pad_r = (self.window_size - W % self.window_size) % self.window_size
         pad_b = (self.window_size - H % self.window_size) % self.window_size
-        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
-        _, Hp, Wp, _ = x.shape
-
-        x = window_partition(x, self.window_size)
-        x = x.view(-1, self.window_size * self.window_size, C)
-
-        # W-MSA/SW-MSA
-        # attn_windows = self.attn(x_windows)
-
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads,
-                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-        attn = self.softmax(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        if pad_r > 0 or pad_b > 0:
+            x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
+        Hp, Wp = x.shape[1], x.shape[2]
+
+        # Partition into non-overlapping windows
+        x = x.view(B, Hp // self.window_size, self.window_size,
+                   Wp // self.window_size, self.window_size, C)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, self.window_size ** 2, C)
+
+        Bw, Nw = x.shape[:2]
+        qkv = (self.qkv(x)
+               .reshape(Bw, Nw, 3, self.num_heads, C // self.num_heads)
+               .permute(2, 0, 3, 1, 4))
+        q, k, v = qkv.unbind(0)
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        x = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(Bw, Nw, C)
         x = self.proj(x)
 
-        # merge windows
+        # Merge windows back
         x = x.view(-1, self.window_size, self.window_size, C)
-        x = window_reverse(x, B, self.window_size, Hp, Wp)
-
+        x = x.view(B, Hp // self.window_size, Wp // self.window_size,
+                   self.window_size, self.window_size, C)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, C)
         if pad_r > 0 or pad_b > 0:
             x = x[:, :H, :W, :].contiguous()
+        return x.view(B, H * W, C)
 
-        x = x.view(B, H * W, C)
-
-        return x, size
-
-
-class SpatialBlock(nn.Module):
 
-    def __init__(self,
-                 dim,
-                 num_heads,
-                 window_size,
-                 mlp_ratio=4.,
-                 qkv_bias=True,
-                 drop_path_rate=0.,
-                 act_layer=nn.GELU,
-                 norm_layer=nn.LayerNorm,
-                 conv_at_attn=True,
-                 conv_at_ffn=True):
+class Florence2VisionSpatialBlock(nn.Module):
+    def __init__(self, embed_dim: int, num_heads: int, window_size: int,
+                 mlp_ratio: float = 4.0, qkv_bias: bool = True, drop_path_rate: float = 0.0):
         super().__init__()
+        self.conv1 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.window_attn = Florence2VisionWindowAttention(embed_dim, num_heads, window_size, qkv_bias)
+        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.conv2 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.norm2 = nn.LayerNorm(embed_dim)
+        self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
+        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, C, H, W = x.shape
+        # Sub-block 1: depthwise conv residual + window attention
+        x = self.conv1(x) + x
+        x_flat = x.flatten(2).transpose(1, 2)  # (B, H*W, C)
+        residual = x_flat
+        x_bhwc = self.norm1(x_flat).view(B, H, W, C)
+        x_flat = residual + self.drop_path1(self.window_attn(x_bhwc))
+        x = x_flat.transpose(1, 2).view(B, C, H, W)
+        # Sub-block 2: depthwise conv residual + FFN
+        x = self.conv2(x) + x
+        x_flat = x.flatten(2).transpose(1, 2)
+        residual = x_flat
+        x_flat = residual + self.drop_path2(self.ffn(self.norm2(x_flat)))
+        x = x_flat.transpose(1, 2).view(B, C, H, W)
+        return x
 
-        self.conv1 = PreNorm(None, DepthWiseConv2d(
-            dim, 3, 1, 1)) if conv_at_attn else None
-        self.window_attn = PreNorm(
-            norm_layer(dim),
-            WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
-        )
-        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1,
-                                                   1)) if conv_at_ffn else None
-        self.ffn = PreNorm(
-            norm_layer(dim),
-            Mlp(in_features=dim,
-                hidden_features=int(dim * mlp_ratio),
-                act_layer=act_layer),
-        )
 
-    def forward(self, x, size):
-        if self.conv1:
-            x, size = self.conv1(x, size)
-        x, size = self.window_attn(x, size)
+class Florence2VisionBlock(nn.Module):
+    def __init__(self, embed_dim: int, num_heads: int, num_groups: int,
+                 window_size: int, mlp_ratio: float = 4.0, qkv_bias: bool = True,
+                 spatial_drop_path_rate: float = 0.0, channel_drop_path_rate: float = 0.0):
+        super().__init__()
+        self.spatial_block = Florence2VisionSpatialBlock(
+            embed_dim, num_heads, window_size, mlp_ratio, qkv_bias, spatial_drop_path_rate)
+        self.channel_block = Florence2VisionChannelBlock(
+            embed_dim, num_groups, mlp_ratio, qkv_bias, channel_drop_path_rate)
 
-        if self.conv2:
-            x, size = self.conv2(x, size)
-        x, size = self.ffn(x, size)
-        return x, size
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.channel_block(self.spatial_block(x))
 
 
-class DaViT(nn.Module):
+class Florence2VisionBackbone(nn.Module):
+    """
+    DaViT-based vision backbone for the new Florence-2 architecture.
+    Produces NCHW feature maps for the multi-modal projector.
+    """
 
-    def __init__(
-        self,
-        in_chans=3,
-        num_classes=1000,
-        depths=(1, 1, 3, 1),
-        patch_size=(7, 2, 2, 2),
-        patch_stride=(4, 2, 2, 2),
-        patch_padding=(3, 0, 0, 0),
-        patch_prenorm=(False, False, False, False),
-        embed_dims=(64, 128, 192, 256),
-        num_heads=(3, 6, 12, 24),
-        num_groups=(3, 6, 12, 24),
-        window_size=7,
-        mlp_ratio=4.,
-        qkv_bias=True,
-        drop_path_rate=0.1,
-        norm_layer=nn.LayerNorm,
-        enable_checkpoint=False,
-        conv_at_attn=True,
-        conv_at_ffn=True,
-    ):
+    def __init__(self, config):
         super().__init__()
-
-        self.num_classes = num_classes
-        self.embed_dims = embed_dims
-        self.num_heads = num_heads
-        self.num_groups = num_groups
-        self.num_stages = len(self.embed_dims)
-        self.enable_checkpoint = enable_checkpoint
-        assert self.num_stages == len(self.num_heads) == len(self.num_groups)
-
+        embed_dims = config.embed_dim
         num_stages = len(embed_dims)
-        dpr = [
-            x.item() for x in torch.linspace(0, drop_path_rate,
-                                             sum(depths) * 2)
-        ]
+        depths = config.depths
+        mlp_ratio = getattr(config, 'mlp_ratio', 4.0)
+        qkv_bias = getattr(config, 'qkv_bias', True)
 
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(depths) * 2)]
         depth_offset = 0
+
         convs = []
         blocks = []
-        for i in range(num_stages):
-            conv_embed = ConvEmbed(
-                patch_size=patch_size[i],
-                stride=patch_stride[i],
-                padding=patch_padding[i],
-                in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
-                embed_dim=self.embed_dims[i],
-                norm_layer=norm_layer,
-                pre_norm=patch_prenorm[i])
-            convs.append(conv_embed)
-
-            block = MySequential(*[
-                MySequential(
-                    OrderedDict([('spatial_block',
-                                  SpatialBlock(
-                                      embed_dims[i],
-                                      num_heads[i],
-                                      window_size,
-                                      drop_path_rate=dpr[depth_offset + j * 2],
-                                      qkv_bias=qkv_bias,
-                                      mlp_ratio=mlp_ratio,
-                                      conv_at_attn=conv_at_attn,
-                                      conv_at_ffn=conv_at_ffn,
-                                  )),
-                                 ('channel_block',
-                                  ChannelBlock(
-                                      embed_dims[i],
-                                      num_groups[i],
-                                      drop_path_rate=dpr[depth_offset + j * 2 +
-                                                         1],
-                                      qkv_bias=qkv_bias,
-                                      mlp_ratio=mlp_ratio,
-                                      conv_at_attn=conv_at_attn,
-                                      conv_at_ffn=conv_at_ffn,
-                                  ))])) for j in range(depths[i])
+        for stage_idx in range(num_stages):
+            in_ch = config.in_channels if stage_idx == 0 else embed_dims[stage_idx - 1]
+            convs.append(Florence2VisionConvEmbed(
+                patch_size=config.patch_size[stage_idx],
+                in_channels=in_ch,
+                embed_dim=embed_dims[stage_idx],
+                stride=config.patch_stride[stage_idx],
+                padding=config.patch_padding[stage_idx],
+                pre_norm=config.patch_prenorm[stage_idx],
+            ))
+            stage_blocks = nn.ModuleList([
+                Florence2VisionBlock(
+                    embed_dim=embed_dims[stage_idx],
+                    num_heads=config.num_heads[stage_idx],
+                    num_groups=config.num_groups[stage_idx],
+                    window_size=config.window_size,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    spatial_drop_path_rate=dpr[depth_offset + block_idx * 2],
+                    channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1],
+                )
+                for block_idx in range(depths[stage_idx])
             ])
-            blocks.append(block)
-            depth_offset += depths[i] * 2
+            blocks.append(stage_blocks)
+            depth_offset += depths[stage_idx] * 2
 
         self.convs = nn.ModuleList(convs)
         self.blocks = nn.ModuleList(blocks)
 
-        self.avgpool = nn.AdaptiveAvgPool1d(1)
-
-    @property
-    def dim_out(self):
-        return self.embed_dims[-1]
-
-    def forward_features_unpool(self, x):
-        """
-        forward until avg pooling
-        Args:
-            x (_type_): input image tensor
-        """
-        input_size = (x.size(2), x.size(3))
-        for conv, block in zip(self.convs, self.blocks):
-            x, input_size = conv(x, input_size)
-            x, input_size = block(x, input_size)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Args: x (B, 3, H, W). Returns: (B, C_last, H', W') NCHW feature map."""
+        for conv, block_list in zip(self.convs, self.blocks):
+            x = conv(x)
+            for block in block_list:
+                x = block(x)
         return x
 
-    def forward_features(self, x):
-        x = self.forward_features_unpool(x)
 
-        # (batch_size, num_tokens, token_dim)
-        x = self.avgpool(x.transpose(1, 2))
-        # (batch_size, 1, num_tokens)
-        x = torch.flatten(x, 1)
-        x = self.norms(x)
+class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
+    """2D learned absolute position embedding (NCHW interface)."""
 
-        return x
+    def __init__(self, embedding_dim: int = 256, num_pos: int = 50):
+        super().__init__()
+        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
+        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
 
-    def forward(self, x):
-        x = self.forward_features(x)
-        x = self.head(x)
-        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """x: (B, C, H, W) — returns positional embeddings of same shape."""
+        height, width = x.shape[-2:]
+        x_emb = self.column_embeddings(torch.arange(width, device=x.device))   # (W, C//2)
+        y_emb = self.row_embeddings(torch.arange(height, device=x.device))      # (H, C//2)
+        pos = torch.cat([
+            x_emb.unsqueeze(0).expand(height, -1, -1),
+            y_emb.unsqueeze(1).expand(-1, width, -1),
+        ], dim=-1)  # (H, W, C)
+        return pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)  # (B, C, H, W)
 
-    @classmethod
-    def from_config(cls, config):
-        return cls(
-            depths=config.depths,
-            embed_dims=config.dim_embed,
-            num_heads=config.num_heads,
-            num_groups=config.num_groups,
-            patch_size=config.patch_size,
-            patch_stride=config.patch_stride,
-            patch_padding=config.patch_padding,
-            patch_prenorm=config.patch_prenorm,
-            drop_path_rate=config.drop_path_rate,
-            window_size=config.window_size,
+
+class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
+    """Sinusoidal temporal positional embedding; returns (T, C) without batch dim."""
+
+    def __init__(self, embed_dim: int = 512, max_seq_len: int = 100) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.max_seq_len = max_seq_len
+        factor = math.log(10000)
+        denominator = torch.exp(-factor * torch.arange(0, embed_dim, 2) / embed_dim)
+        frequencies = torch.arange(0, max_seq_len).reshape(max_seq_len, 1) * denominator
+        pos_idx_to_embed = torch.zeros((max_seq_len, embed_dim))
+        pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
+        pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
+        self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed, requires_grad=False)
+
+    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
+        """seq_embeds: (B, T, C) — returns (T, C) positional embeddings."""
+        len_seq = seq_embeds.size(1)
+        assert len_seq <= self.max_seq_len
+        return self.pos_idx_to_embed[0:len_seq, :]  # (T, C)
+
+
+class Florence2MultiModalProjector(nn.Module):
+    """
+    Projects vision backbone features into the language model's embedding space.
+    Applies 2D spatial positional embeddings, a temporal embedding, pools to
+    produce both a spatial-average and a per-token representation, then projects
+    with a linear layer + layer norm.
+
+    Input:  (B, C, H, W) NCHW feature map from Florence2VisionBackbone.
+    Output: (B, 1 + H*W, projection_dim) token embeddings for the encoder.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.vision_config.embed_dim[-1]
+        proj_dim = config.vision_config.projection_dim
+
+        self.image_projection = nn.Linear(embed_dim, proj_dim, bias=False)
+        self.image_proj_norm = nn.LayerNorm(proj_dim)
+        self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(
+            embedding_dim=embed_dim,
+            num_pos=config.vision_config.max_position_embeddings,
         )
+        self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(
+            embed_dim=embed_dim,
+            max_seq_len=config.vision_config.max_temporal_embeddings,
+        )
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        # image_features: (B, C, H, W)
+        B, C, H, W = image_features.shape
+
+        # 2D spatial positional embedding
+        pos = self.image_position_embed(image_features)  # (B, C, H, W)
+        x = (image_features + pos).flatten(2).transpose(1, 2)  # (B, H*W, C)
+
+        # Temporal positional embedding (T=1 for single-frame images)
+        temporal_embed = self.visual_temporal_embed(x[:, :1, :])  # (1, C)
+        x = x + temporal_embed  # broadcast over H*W tokens
+
+        # Pool: spatial average (1 token) + all spatial tokens (H*W tokens)
+        x_t = x.unsqueeze(1)  # (B, 1, H*W, C) — treat as T=1 video
+        spatial_avg = x_t.mean(dim=2)   # (B, 1, C)
+        temporal_avg = x_t.mean(dim=1)  # (B, H*W, C)
+        x = torch.cat([spatial_avg, temporal_avg], dim=1)  # (B, 1+H*W, C)
+
+        x = self.image_projection(x)   # (B, 1+H*W, proj_dim)
+        x = self.image_proj_norm(x)
+        return x
 
 
 # Language backbone and processor implementation
@@ -757,18 +559,18 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 class Florence2ProcessingInfo(BaseProcessingInfo):
 
-    def get_hf_config(self):
+    def get_hf_config(self) -> Florence2Config:
         return self.ctx.get_hf_config()
 
-    def get_hf_processor(self):
+    def get_hf_processor(self) -> Florence2Processor:
         return self.ctx.get_hf_processor()
 
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
-        processor_config = self.ctx.get_hf_image_processor_config()
-        return processor_config["image_seq_length"]
+        processor = self.get_hf_processor()
+        return processor.num_image_tokens
 
 
 class Florence2DummyInputsBuilder(
@@ -785,7 +587,7 @@ def get_dummy_mm_data(
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width = target_height = self.info.get_hf_config().projection_dim
+        target_width = target_height = self.info.get_hf_config().vision_config.projection_dim
 
         return {
             "image":
@@ -805,7 +607,10 @@ def _hf_processor_applies_updates(
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
     ) -> bool:
-        return False
+        # The Florence2Processor already inserts image_token_id placeholders
+        # into the input_ids (577 tokens for a 768x768 image), so we tell
+        # vllm to find those existing placeholders rather than insert new ones.
+        return bool(mm_items.get_all_counts().get("image", 0))
 
     def create_encoder_prompt(
         self,
@@ -819,7 +624,7 @@ def create_decoder_prompt(
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
     ) -> str | list[int]:
-        return [self.info.get_hf_config().eos_token_id]
+        return [self.info.get_hf_config().text_config.eos_token_id]
 
     def _apply_hf_processor_tokens_only(
         self,
@@ -870,9 +675,13 @@ def _get_prompt_updates(
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
         hf_config = self.info.get_hf_config()
-        pad_token_id = hf_config.pad_token_id
+        # Use image_token_id (51289) — this is what the Florence2Processor
+        # inserts into input_ids. With _hf_processor_applies_updates=True,
+        # vllm will FIND these tokens in the existing prompt rather than
+        # inserting new ones (so no token doubling / length overflow).
+        image_token_id = hf_config.image_token_id
         num_image_tokens = self.info.get_num_image_tokens()
-        image_tokens = [pad_token_id] * num_image_tokens
+        image_tokens = [image_token_id] * num_image_tokens
 
         return [
             PromptInsertion(
@@ -902,63 +711,37 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         processor_config = vllm_config.model_config.hf_image_processor_config
 
         self.config = config
-        self.vision_config = config.vision_config
         self.processor_config = processor_config
-        assert config.vision_config.model_type == 'davit', (
-            'only DaViT is supported for now')
-        self.vision_tower = DaViT.from_config(config=config.vision_config)
-        self._build_image_projection_layers(config)
+        assert config.vision_config.model_type == 'florence_vision', (
+            f'only Florence Vision is supported for now. '
+            f'Received model type: {config.vision_config.model_type}')
+        self.vision_tower = Florence2VisionBackbone(config.vision_config)
+        self.multi_modal_projector = Florence2MultiModalProjector(config)
         self.language_model = Florence2LanguageForConditionalGeneration(
             vllm_config=vllm_config.with_hf_config(config.text_config),
             prefix=f"{prefix}.language_model",
         )
-        self.pad_token_id = config.pad_token_id
-
-    def _build_image_projection_layers(self, config: PretrainedConfig):
-        image_dim_out = config.vision_config.dim_embed[-1]
-        dim_projection = config.vision_config.projection_dim
-        self.image_projection = nn.Parameter(
-            torch.empty(image_dim_out, dim_projection))
-        self.image_proj_norm = nn.LayerNorm(dim_projection)
-        image_pos_embed_config = config.vision_config.image_pos_embed
-        if image_pos_embed_config['type'] == 'learned_abs_2d':
-            self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
-                embedding_dim=image_dim_out,
-                num_pos=image_pos_embed_config['max_pos_embeddings'])
-        else:
-            raise NotImplementedError("Florence2 only supports learned_abs_2d "
-                                      "as image position embedding.")
-
-        self.image_feature_source = config.vision_config.image_feature_source
-
-        # temporal embedding
-        visual_temporal_embedding_config = (
-            self.vision_config.visual_temporal_embedding)
-        if visual_temporal_embedding_config['type'] == 'COSINE':
-            self.visual_temporal_embed = PositionalEmbeddingCosine1D(
-                embed_dim=image_dim_out,
-                max_seq_len=visual_temporal_embedding_config[
-                    'max_temporal_embeddings'])
-        else:
-            raise NotImplementedError(
-                'Florence2 only supports COSINE as temporal embedding.')
+        self.pad_token_id = config.text_config.pad_token_id
 
     def _validate_pixel_values(
         self, data: torch.Tensor | list[torch.Tensor]
     ) -> torch.Tensor | list[torch.Tensor]:
+        # The image processor config may use "size" or "crop_size"; if neither
+        # key is present, shape validation is skipped and data is returned as-is.
+        cfg = self.processor_config
+        size = cfg.get("size") or cfg.get("crop_size")
+        if size is None:
+            return data
 
-        size = self.processor_config["size"]
         h, w = size["height"], size["width"]
         expected_dims = (3, h, w)
 
         def _validate_shape(d: torch.Tensor):
             actual_dims = tuple(d.shape)
-
             if actual_dims != expected_dims:
-                expected_expr = tuple(*map(str, expected_dims))
                 raise ValueError(
                     "The expected shape of pixel values per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+                    f"is {expected_dims}. You supplied {actual_dims}.")
 
         for d in data:
             _validate_shape(d)
@@ -966,112 +749,39 @@ def _validate_shape(d: torch.Tensor):
         return data
 
     def _parse_and_validate_image_input(self, **kwargs: object):
-        pixel_values: list[list[torch.Tensor]] | list[torch.Tensor] | torch.Tensor | None = kwargs.pop(
-            "pixel_values", None)
-        image_embeds: list[list[torch.Tensor]] | list[torch.Tensor] | torch.Tensor | None = kwargs.pop(
-                                         "image_embeds", None)
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
 
         if pixel_values is None and image_embeds is None:
             return None
-
         if pixel_values is not None and image_embeds is not None:
-            raise ValueError(
-                "Both pixel values and image embeds are provided.")
-
+            raise ValueError("Both pixel values and image embeds are provided.")
         if pixel_values is not None:
             return Florence2ImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(pixel_values),
             )
-
-        if image_embeds is not None:
-            raise NotImplementedError
-
-        raise AssertionError("This line should be unreachable.")
+        raise NotImplementedError("image_embeds not supported.")
 
     def _parse_and_validate_encoder_input(self, **kwargs: object) -> list[torch.Tensor]:
         encoder_input_ids = kwargs.get("encoder_input_ids", kwargs.get("input_ids"))
-
         if encoder_input_ids is None:
             return []
-
         if not isinstance(encoder_input_ids, (torch.Tensor, list)):
             raise ValueError(
-                "Incorrect type of encoder input_ids. "
-                f"Got type: {type(encoder_input_ids)}"
+                f"Incorrect type of encoder input_ids. Got type: {type(encoder_input_ids)}"
             )
-
-        # Return as a list of tensors (one per item in the batch)
         if isinstance(encoder_input_ids, list):
-            # Already a list - ensure each item is valid
-            result = []
-            for item in encoder_input_ids:
-                if isinstance(item, torch.Tensor):
-                    if item.dim() == 0:
-                        item = item.unsqueeze(0)
-                    result.append(item)
-                else:
-                    result.append(item)
-            return result
-        else:
-            # [1xD]xN times
-            return encoder_input_ids.unsqueeze(1).unbind(dim=0)
+            return [item.unsqueeze(0) if item.dim() == 0 else item
+                    for item in encoder_input_ids]
+        return encoder_input_ids.unsqueeze(1).unbind(dim=0)
 
     def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
-        dtype = next(self.vision_tower.parameters()).dtype
-        pixel_values = pixel_values.to(dtype)
-
-        batch_size, T = pixel_values.size(0), 1
-        x = self.vision_tower.forward_features_unpool(pixel_values)
-        if self.image_pos_embed is not None:
-            x = x.view(batch_size * T, -1, x.shape[-1])
-            num_tokens = x.shape[-2]
-            h, w = int(num_tokens**0.5), int(num_tokens**0.5)
-            assert h * w == num_tokens, (
-                'only support square feature maps for now')
-            x = x.view(batch_size * T, h, w, x.shape[-1])
-            pos_embed = self.image_pos_embed(x)
-            x = x + pos_embed
-            x = x.view(batch_size, T * h * w, x.shape[-1])
-
-        if self.visual_temporal_embed is not None:
-            visual_temporal_embed = self.visual_temporal_embed(
-                x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
-            x = x.view(batch_size, T, -1,
-                       x.shape[-1]) + visual_temporal_embed.view(
-                           1, T, 1, x.shape[-1])
-
-        x_feat_dict = {}
-
-        spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
-        x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
-
-        temporal_avg_pool_x = x.view(batch_size, T, -1,
-                                     x.shape[-1]).mean(dim=1)
-        x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
-
-        x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
-        x_feat_dict['last_frame'] = x
-
-        new_x = []
-        for _image_feature_source in self.image_feature_source:
-            if _image_feature_source not in x_feat_dict:
-                raise ValueError('invalid image feature source: {}'.format(
-                    _image_feature_source))
-            new_x.append(x_feat_dict[_image_feature_source])
-
-        x = torch.cat(new_x, dim=1)
-
-        x = x @ self.image_projection
-        x = self.image_proj_norm(x)
-
-        return x
+        pixel_values = pixel_values.to(next(self.vision_tower.parameters()).dtype)
+        return self.multi_modal_projector(self.vision_tower(pixel_values))
 
-    def _process_image_input(
-            self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
-        assert image_input["type"] == "pixel_values"
-        pixel_values = image_input["data"]
-        return self._encode_image(pixel_values)
+    def _process_image_input(self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
+        return self._encode_image(image_input["data"])
 
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
@@ -1086,14 +796,10 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
 
         if not encoder_input_ids_list:
             raise ValueError(
-                "encoder_input_ids_list is empty - this should not happen. "
-                "Check that multimodal data is being passed correctly."
+                "encoder_input_ids_list is empty - check multimodal data is being passed correctly."
             )
 
-        # Process each encoder input separately and return a list of outputs
-        # NOTE (NickLucche): Basic encoder batching optimization: BART input sequences
-        # can have different lengths. Due to computational load of encoder being very
-        # low here, we batch all sequences to run a single forward by max_seq padding.
+        # Batch encoder inputs (pad to max length if needed) and run a single forward pass.
         lengths = [t.numel() for t in encoder_input_ids_list]
         max_len = max(lengths) if lengths else 0
         assert max_len > 0, "Empty encoder_input_ids encountered."
@@ -1101,45 +807,38 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         if len(encoder_input_ids_list) == 1:
             batch_encoder_input_ids = encoder_input_ids_list[0]
         elif same_len:
-            # [1xD]xN =>NxD
             batch_encoder_input_ids = torch.cat(encoder_input_ids_list, dim=0)
         else:
             batch_encoder_input_ids = torch.full(
                 (len(encoder_input_ids_list), max_len),
-                fill_value=self._pad_id,
+                fill_value=self.pad_token_id,
                 dtype=encoder_input_ids_list[0].dtype,
                 device=encoder_input_ids_list[0].device,
             )
             for i, t in enumerate(encoder_input_ids_list):
                 batch_encoder_input_ids[i, : t.numel()] = t.squeeze()
-        # Create (B, T) positions: 0..T-1 for each item.
-        # batch_encoder_positions = torch.arange(
-        #     max_len,
-        #     dtype=torch.long,
-        #     device=batch_encoder_input_ids.device,
-        # ).unsqueeze(0).expand(batch_encoder_input_ids.size(0), -1)
-
         inputs_embeds = self.language_model.model.encoder.embed_tokens(batch_encoder_input_ids)
-        inputs_embeds = torch.cat([vision_embeddings, inputs_embeds], dim=-2)
+
+        # Replace the leading image_token_id placeholders with vision features.
+        if isinstance(vision_embeddings, torch.Tensor) and vision_embeddings.numel() > 0:
+            num_vision = vision_embeddings.size(1)
+            inputs_embeds = inputs_embeds.clone()
+            inputs_embeds[:, :num_vision, :] = vision_embeddings
         batch_encoder_positions = torch.arange(
             inputs_embeds.size(1),
             dtype=torch.long,
             device=inputs_embeds.device,
         ).unsqueeze(0).expand(inputs_embeds.size(0), -1)
 
-        # Run encoder once on the batch.
+        # Run encoder once on the batch, then split back per item.
         batch_encoder_output = self.language_model.model.encoder(
             input_ids=batch_encoder_input_ids,
             positions=batch_encoder_positions,
             inputs_embeds=inputs_embeds,
         )
-        # Split back into list[(T, H)] to match expected downstream format.
-        # If we had to pad, slice back to the original lengths per item.
         encoder_outputs: list[torch.Tensor] = batch_encoder_output.unbind(dim=0)
         if not same_len:
-            encoder_outputs = [
-                out[:l] for out, l in zip(encoder_outputs, lengths)
-            ]
+            encoder_outputs = [out[:l] for out, l in zip(encoder_outputs, lengths)]
         return encoder_outputs
 
     def forward(
@@ -1149,22 +848,8 @@ def forward(
         intermediate_tensors: IntermediateTensors | None = None,
         inputs_embeds: torch.Tensor | None = None,
         encoder_outputs: torch.Tensor | None = None,
-        # num_encoder_outputs: int | None = None,
         **kwargs,
     ) -> torch.Tensor:
-        r"""
-        Args:
-            input_ids
-                torch.Tensor of *decoder* input token ids.
-            positions
-                torch.Tensor of *decoder* position indices.
-            encoder_input_ids
-                torch.Tensor of *encoder* input token ids.
-            encoder_positions
-                torch.Tensor of *encoder* position indices
-        Returns:
-            Output torch.Tensor
-        """
         if encoder_outputs is not None:
             # Assume same shape for all encoder outputs
             encoder_outputs = torch.cat(encoder_outputs, dim=0)
@@ -1183,5 +868,23 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
+        def _remap(weights: Iterable[tuple[str, torch.Tensor]]):
+            for name, param in weights:
+                # HF checkpoint layout (Florence2ForConditionalGeneration):
+                #   model.vision_tower.*           -> vision_tower.*
+                #   model.multi_modal_projector.*  -> multi_modal_projector.*
+                #   model.language_model.*         -> language_model.model.*
+                #       (HF uses BartModel directly; our wrapper adds .model)
+                #   lm_head.*                      -> language_model.lm_head.*
+                if name.startswith("model.vision_tower."):
+                    name = name[len("model."):]
+                elif name.startswith("model.multi_modal_projector."):
+                    name = name[len("model."):]
+                elif name.startswith("model.language_model."):
+                    name = "language_model.model." + name[len("model.language_model."):]
+                elif name.startswith("lm_head."):
+                    name = "language_model." + name
+                yield name, param
+
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights)
+        return loader.load_weights(_remap(weights))

From 7cdec11d2ee0d660f01bd652df2ce37d5ef0b74c Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 6 Mar 2026 12:15:56 +0100
Subject: [PATCH 03/11] add tests

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 pyproject.toml                |  12 +
 tests/test_florence2.py       | 210 +++++++++++++++++
 vllm_bart_plugin/florence2.py | 420 ++++++++++++++++++++++------------
 3 files changed, 501 insertions(+), 141 deletions(-)
 create mode 100644 tests/test_florence2.py

diff --git a/pyproject.toml b/pyproject.toml
index 9cb05c0..e1789ea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,8 +62,20 @@ include = '\.pyi?$'
 profile = "black"
 line_length = 88
 
+[tool.pytest.ini_options]
+markers = [
+    "slow: marks tests requiring a GPU and full model download (deselect with '-m \"not slow\"')",
+]
+
 [tool.mypy]
 python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 ignore_missing_imports = true
+
+[dependency-groups]
+dev = [
+    "black>=26.1.0",
+    "isort>=8.0.1",
+    "pytest>=9.0.2",
+]
diff --git a/tests/test_florence2.py b/tests/test_florence2.py
new file mode 100644
index 0000000..23ff912
--- /dev/null
+++ b/tests/test_florence2.py
@@ -0,0 +1,210 @@
+"""Tests for the Florence-2 multimodal model plugin."""
+
+import os
+
+import pytest
+import torch
+from transformers import Florence2Config
+
+MODEL_NAME = "florence-community/Florence-2-base-ft"
+
+
+def _small_vision_config():
+    """Tiny 1-stage Florence2 config for fast CPU tests."""
+    cfg = Florence2Config()
+    vc = cfg.vision_config  # edited in place; presumably the same object cfg holds
+    vc.embed_dim = [64]  # one-element lists -> the backbone builds a single stage
+    vc.depths = [1]
+    vc.num_heads = [4]
+    vc.num_groups = [4]
+    vc.patch_size = [7]
+    vc.patch_stride = [4]
+    vc.patch_padding = [3]
+    vc.patch_prenorm = [False]
+    vc.drop_path_rate = 0.0  # zero rate: every drop-path layer becomes nn.Identity
+    return cfg, vc
+
+
+# ---------------------------------------------------------------------------
+# Unit tests — vision architecture (CPU, no weights)
+# ---------------------------------------------------------------------------
+
+
+class TestFlorenceVisionDropPath:
+    def test_eval_is_identity(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionDropPath
+
+        m = Florence2VisionDropPath(drop_prob=0.9).eval()
+        x = torch.randn(2, 16)
+        assert torch.equal(m(x), x)  # eval mode must return the input unchanged
+
+    def test_training_drops_samples(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionDropPath
+
+        torch.manual_seed(0)
+        out = Florence2VisionDropPath(drop_prob=0.5).train()(torch.ones(64, 16))
+        # Input is all ones, so only a real drop yields 0 (rescaling alone gives != 1).
+        assert (out == 0).any()
+
+
+class TestFlorenceVisionConvEmbed:
+    @pytest.mark.parametrize("pre_norm", [True, False])
+    def test_output_channels(self, pre_norm):
+        from vllm_bart_plugin.florence2 import Florence2VisionConvEmbed
+
+        m = Florence2VisionConvEmbed(
+            patch_size=7,
+            in_channels=3,
+            embed_dim=64,
+            stride=4,
+            padding=3,
+            pre_norm=pre_norm,  # LayerNorm over in_channels (True) or embed_dim (False)
+        )
+        out = m(torch.randn(1, 3, 64, 64))  # NCHW input
+        assert out.shape[1] == 64  # channels == embed_dim; spatial dims unchecked
+
+
+class TestFlorenceVisionWindowAttention:
+    def test_exact_window(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionWindowAttention
+
+        m = Florence2VisionWindowAttention(dim=32, num_heads=4, window_size=4)
+        assert m(torch.randn(1, 4, 4, 32)).shape == (1, 16, 32)  # (B, H*W, C) out
+
+    def test_input_requires_padding(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionWindowAttention
+
+        m = Florence2VisionWindowAttention(dim=32, num_heads=4, window_size=4)
+        # 6 % 4 != 0 exercises the pad-then-crop path; output is still (B, 6*6, C)
+        assert m(torch.randn(1, 6, 6, 32)).shape == (1, 36, 32)
+
+
+class TestFlorenceVisionBackbone:
+    def test_output_shape(self):
+        from vllm_bart_plugin.florence2 import Florence2VisionBackbone
+
+        _, vc = _small_vision_config()
+        out = Florence2VisionBackbone(vc)(torch.randn(2, 3, 64, 64))
+        assert out.shape == (2, vc.embed_dim[-1], 16, 16)  # 16 = (64+6-7)//4 + 1
+
+
+class TestFlorenceVisionPositionalEmbeddingCosine1D:
+    def test_output_shape_and_no_batch_dim(self):
+        from vllm_bart_plugin.florence2 import (
+            Florence2VisionPositionalEmbeddingCosine1D,
+        )
+
+        m = Florence2VisionPositionalEmbeddingCosine1D(embed_dim=64, max_seq_len=100)
+        assert m(torch.randn(2, 5, 64)).shape == (5, 64)  # (seq_len, embed_dim)
+
+    def test_raises_if_exceeds_max(self):
+        from vllm_bart_plugin.florence2 import (
+            Florence2VisionPositionalEmbeddingCosine1D,
+        )
+
+        m = Florence2VisionPositionalEmbeddingCosine1D(embed_dim=64, max_seq_len=10)
+        with pytest.raises(AssertionError):
+            m(torch.randn(1, 20, 64))  # seq_len 20 > max_seq_len 10
+
+
+class TestFlorenceMultiModalProjector:
+    def test_output_shape(self):
+        from vllm_bart_plugin.florence2 import Florence2MultiModalProjector
+
+        cfg, vc = _small_vision_config()
+        vc.projection_dim = 128  # set on vc, but the projector is built from cfg
+        m = Florence2MultiModalProjector(cfg)
+        out = m(torch.randn(2, vc.embed_dim[-1], 12, 12))  # (B, C, H, W) feature map
+        # (B, 1 spatial-avg token + H*W tokens, proj_dim)
+        assert out.shape == (2, 1 + 12 * 12, vc.projection_dim)
+
+
+# ---------------------------------------------------------------------------
+# Integration tests — full model inference (GPU required)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture(scope="module")
+def florence2_llm():
+    from vllm import LLM  # lazy: collection never imports vllm
+
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"  # process-wide; never reset
+    return LLM(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        enforce_eager=True,
+        gpu_memory_utilization=0.5,
+        mm_processor_cache_gb=0,
+    )
+
+
+@pytest.fixture(scope="module")
+def stop_sign_image():
+    from vllm.assets.image import ImageAsset  # lazy: collection never imports vllm
+
+    return ImageAsset("stop_sign").pil_image  # one image shared by the whole module
+
+
+@pytest.fixture(scope="module")
+def sampling_params():
+    from vllm import SamplingParams
+
+    return SamplingParams(
+        temperature=0.0,  # greedy decoding
+        max_tokens=20,
+        repetition_penalty=1.5,
+        skip_special_tokens=False,  # presumably so <loc_*> tokens survive decoding
+    )
+
+
+@pytest.mark.slow
+class TestFlorenceInference:
+    def test_caption(self, florence2_llm, stop_sign_image, sampling_params):
+        outputs = florence2_llm.generate(
+            [
+                {
+                    "prompt": "<DETAILED_CAPTION>",
+                    "multi_modal_data": {"image": stop_sign_image},
+                }
+            ],
+            sampling_params=sampling_params,
+        )
+        assert len(outputs[0].outputs[0].text) > 0  # smoke check: non-empty text only
+
+    def test_object_detection_has_loc_tokens(
+        self, florence2_llm, stop_sign_image, sampling_params
+    ):
+        outputs = florence2_llm.generate(
+            [
+                {
+                    "encoder_prompt": {  # explicit encoder/decoder prompt form
+                        "prompt": "<OD>",
+                        "multi_modal_data": {"image": stop_sign_image},
+                    },
+                    "decoder_prompt": "",
+                }
+            ],
+            sampling_params=sampling_params,
+        )
+        assert "<loc_" in outputs[0].outputs[0].text
+
+    def test_batch_inference(self, florence2_llm, stop_sign_image, sampling_params):
+        prompts = [
+            {"prompt": "<CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
+            {
+                "prompt": "<DETAILED_CAPTION>",
+                "multi_modal_data": {"image": stop_sign_image},
+            },
+        ]
+        outputs = florence2_llm.generate(prompts, sampling_params=sampling_params)
+        assert all(len(o.outputs[0].text) > 0 for o in outputs)
+
+    def test_encoder_length_within_limit(self, stop_sign_image):
+        """Processor output must not exceed BART max_position_embeddings."""
+        from transformers import AutoProcessor
+
+        processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+        out = processor(
+            text="<DETAILED_CAPTION>", images=stop_sign_image, return_tensors="pt"
+        )
+        assert out["input_ids"].shape[1] <= 1024  # literal 1024, not read from config
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index aee2e8c..52f9f5e 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -7,17 +7,21 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from transformers import BartConfig, BartTokenizer, BatchFeature, Florence2Config, Florence2Processor
-
-from vllm.model_executor.layers.attention import Attention
-from vllm.v1.attention.backend import AttentionType
-from vllm.model_executor.layers.attention.cross_attention import CrossAttention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from transformers import (
+    BartConfig,
+    BartTokenizer,
+    BatchFeature,
+    Florence2Config,
+    Florence2Processor,
+)
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention.cross_attention import CrossAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -30,6 +34,18 @@
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (
+    MultiModalEmbeddings,
+    SupportsMultiModal,
+    SupportsQuant,
+)
+from vllm.model_executor.models.utils import (
+    AutoWeightsLoader,
+    WeightsMapper,
+    cast_overflow_tensors,
+    flatten_bn,
+    maybe_prefix,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, ModalityData
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
@@ -46,18 +62,21 @@
 from vllm.multimodal.processing import (
     BaseProcessingInfo,
     EncDecMultiModalProcessor,
-    PromptUpdate,
-    PromptInsertion,
     PromptIndexTargets,
+    PromptInsertion,
+    PromptUpdate,
 )
 from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
+from vllm.v1.attention.backend import AttentionType
 
-from vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsQuant
-from vllm.model_executor.models.utils import AutoWeightsLoader, WeightsMapper, cast_overflow_tensors, maybe_prefix, flatten_bn
-
-from vllm_bart_plugin.bart import BartDecoder, BartEncoder, BartParallelLMHead, BartScaledWordEmbedding
+from vllm_bart_plugin.bart import (
+    BartDecoder,
+    BartEncoder,
+    BartParallelLMHead,
+    BartScaledWordEmbedding,
+)
 
 
 class Florence2ImagePixelInputs(TypedDict):
@@ -66,9 +85,9 @@ class Florence2ImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channel, height, width)"""
 
 
-
-
-def _drop_path(x: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+def _drop_path(
+    x: torch.Tensor, drop_prob: float = 0.0, training: bool = False
+) -> torch.Tensor:
     if drop_prob == 0.0 or not training:
         return x
     keep_prob = 1 - drop_prob
@@ -102,12 +121,24 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class Florence2VisionConvEmbed(nn.Module):
     """Image-to-patch embedding via strided convolution (NCHW in, NCHW out)."""
 
-    def __init__(self, patch_size: int, in_channels: int, embed_dim: int,
-                 stride: int, padding: int, pre_norm: bool):
+    def __init__(
+        self,
+        patch_size: int,
+        in_channels: int,
+        embed_dim: int,
+        stride: int,
+        padding: int,
+        pre_norm: bool,
+    ):
         super().__init__()
         self.pre_norm = pre_norm
-        self.conv = nn.Conv2d(in_channels, embed_dim,
-                              kernel_size=patch_size, stride=stride, padding=padding)
+        self.conv = nn.Conv2d(
+            in_channels,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=stride,
+            padding=padding,
+        )
         dim_norm = in_channels if pre_norm else embed_dim
         self.norm = nn.LayerNorm(dim_norm)
 
@@ -145,17 +176,35 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class Florence2VisionChannelBlock(nn.Module):
-    def __init__(self, embed_dim: int, groups: int, mlp_ratio: float = 4.0,
-                 qkv_bias: bool = True, drop_path_rate: float = 0.0):
+    def __init__(
+        self,
+        embed_dim: int,
+        groups: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop_path_rate: float = 0.0,
+    ):
         super().__init__()
-        self.conv1 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.conv1 = nn.Conv2d(
+            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
+        )
         self.norm1 = nn.LayerNorm(embed_dim)
         self.channel_attn = Florence2VisionChannelAttention(embed_dim, groups, qkv_bias)
-        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
-        self.conv2 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.drop_path1 = (
+            Florence2VisionDropPath(drop_path_rate)
+            if drop_path_rate > 0
+            else nn.Identity()
+        )
+        self.conv2 = nn.Conv2d(
+            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
+        )
         self.norm2 = nn.LayerNorm(embed_dim)
         self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
-        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.drop_path2 = (
+            Florence2VisionDropPath(drop_path_rate)
+            if drop_path_rate > 0
+            else nn.Identity()
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, C, H, W = x.shape
@@ -177,7 +226,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class Florence2VisionWindowAttention(nn.Module):
     """Window-based local spatial self-attention."""
 
-    def __init__(self, dim: int, num_heads: int, window_size: int, qkv_bias: bool = True):
+    def __init__(
+        self, dim: int, num_heads: int, window_size: int, qkv_bias: bool = True
+    ):
         super().__init__()
         self.num_heads = num_heads
         self.window_size = window_size
@@ -195,14 +246,22 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         Hp, Wp = x.shape[1], x.shape[2]
 
         # Partition into non-overlapping windows
-        x = x.view(B, Hp // self.window_size, self.window_size,
-                   Wp // self.window_size, self.window_size, C)
-        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, self.window_size ** 2, C)
+        x = x.view(
+            B,
+            Hp // self.window_size,
+            self.window_size,
+            Wp // self.window_size,
+            self.window_size,
+            C,
+        )
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, self.window_size**2, C)
 
         Bw, Nw = x.shape[:2]
-        qkv = (self.qkv(x)
-               .reshape(Bw, Nw, 3, self.num_heads, C // self.num_heads)
-               .permute(2, 0, 3, 1, 4))
+        qkv = (
+            self.qkv(x)
+            .reshape(Bw, Nw, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
         q, k, v = qkv.unbind(0)
         attn = (q @ k.transpose(-2, -1)) * self.scale
         x = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(Bw, Nw, C)
@@ -210,8 +269,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         # Merge windows back
         x = x.view(-1, self.window_size, self.window_size, C)
-        x = x.view(B, Hp // self.window_size, Wp // self.window_size,
-                   self.window_size, self.window_size, C)
+        x = x.view(
+            B,
+            Hp // self.window_size,
+            Wp // self.window_size,
+            self.window_size,
+            self.window_size,
+            C,
+        )
         x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, C)
         if pad_r > 0 or pad_b > 0:
             x = x[:, :H, :W, :].contiguous()
@@ -219,17 +284,38 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class Florence2VisionSpatialBlock(nn.Module):
-    def __init__(self, embed_dim: int, num_heads: int, window_size: int,
-                 mlp_ratio: float = 4.0, qkv_bias: bool = True, drop_path_rate: float = 0.0):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        window_size: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop_path_rate: float = 0.0,
+    ):
         super().__init__()
-        self.conv1 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.conv1 = nn.Conv2d(
+            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
+        )
         self.norm1 = nn.LayerNorm(embed_dim)
-        self.window_attn = Florence2VisionWindowAttention(embed_dim, num_heads, window_size, qkv_bias)
-        self.drop_path1 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
-        self.conv2 = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim)
+        self.window_attn = Florence2VisionWindowAttention(
+            embed_dim, num_heads, window_size, qkv_bias
+        )
+        self.drop_path1 = (
+            Florence2VisionDropPath(drop_path_rate)
+            if drop_path_rate > 0
+            else nn.Identity()
+        )
+        self.conv2 = nn.Conv2d(
+            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
+        )
         self.norm2 = nn.LayerNorm(embed_dim)
         self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
-        self.drop_path2 = Florence2VisionDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+        self.drop_path2 = (
+            Florence2VisionDropPath(drop_path_rate)
+            if drop_path_rate > 0
+            else nn.Identity()
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, C, H, W = x.shape
@@ -250,14 +336,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class Florence2VisionBlock(nn.Module):
-    def __init__(self, embed_dim: int, num_heads: int, num_groups: int,
-                 window_size: int, mlp_ratio: float = 4.0, qkv_bias: bool = True,
-                 spatial_drop_path_rate: float = 0.0, channel_drop_path_rate: float = 0.0):
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        num_groups: int,
+        window_size: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        spatial_drop_path_rate: float = 0.0,
+        channel_drop_path_rate: float = 0.0,
+    ):
         super().__init__()
         self.spatial_block = Florence2VisionSpatialBlock(
-            embed_dim, num_heads, window_size, mlp_ratio, qkv_bias, spatial_drop_path_rate)
+            embed_dim,
+            num_heads,
+            window_size,
+            mlp_ratio,
+            qkv_bias,
+            spatial_drop_path_rate,
+        )
         self.channel_block = Florence2VisionChannelBlock(
-            embed_dim, num_groups, mlp_ratio, qkv_bias, channel_drop_path_rate)
+            embed_dim, num_groups, mlp_ratio, qkv_bias, channel_drop_path_rate
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.channel_block(self.spatial_block(x))
@@ -274,37 +375,43 @@ def __init__(self, config):
         embed_dims = config.embed_dim
         num_stages = len(embed_dims)
         depths = config.depths
-        mlp_ratio = getattr(config, 'mlp_ratio', 4.0)
-        qkv_bias = getattr(config, 'qkv_bias', True)
+        mlp_ratio = getattr(config, "mlp_ratio", 4.0)
+        qkv_bias = getattr(config, "qkv_bias", True)
 
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(depths) * 2)]
+        dpr = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate, sum(depths) * 2)
+        ]
         depth_offset = 0
 
         convs = []
         blocks = []
         for stage_idx in range(num_stages):
             in_ch = config.in_channels if stage_idx == 0 else embed_dims[stage_idx - 1]
-            convs.append(Florence2VisionConvEmbed(
-                patch_size=config.patch_size[stage_idx],
-                in_channels=in_ch,
-                embed_dim=embed_dims[stage_idx],
-                stride=config.patch_stride[stage_idx],
-                padding=config.patch_padding[stage_idx],
-                pre_norm=config.patch_prenorm[stage_idx],
-            ))
-            stage_blocks = nn.ModuleList([
-                Florence2VisionBlock(
+            convs.append(
+                Florence2VisionConvEmbed(
+                    patch_size=config.patch_size[stage_idx],
+                    in_channels=in_ch,
                     embed_dim=embed_dims[stage_idx],
-                    num_heads=config.num_heads[stage_idx],
-                    num_groups=config.num_groups[stage_idx],
-                    window_size=config.window_size,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    spatial_drop_path_rate=dpr[depth_offset + block_idx * 2],
-                    channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1],
+                    stride=config.patch_stride[stage_idx],
+                    padding=config.patch_padding[stage_idx],
+                    pre_norm=config.patch_prenorm[stage_idx],
                 )
-                for block_idx in range(depths[stage_idx])
-            ])
+            )
+            stage_blocks = nn.ModuleList(
+                [
+                    Florence2VisionBlock(
+                        embed_dim=embed_dims[stage_idx],
+                        num_heads=config.num_heads[stage_idx],
+                        num_groups=config.num_groups[stage_idx],
+                        window_size=config.window_size,
+                        mlp_ratio=mlp_ratio,
+                        qkv_bias=qkv_bias,
+                        spatial_drop_path_rate=dpr[depth_offset + block_idx * 2],
+                        channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1],
+                    )
+                    for block_idx in range(depths[stage_idx])
+                ]
+            )
             blocks.append(stage_blocks)
             depth_offset += depths[stage_idx] * 2
 
@@ -326,18 +433,27 @@ class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
     def __init__(self, embedding_dim: int = 256, num_pos: int = 50):
         super().__init__()
         self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
-        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
+        self.column_embeddings = nn.Embedding(
+            num_pos, embedding_dim - (embedding_dim // 2)
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """x: (B, C, H, W) — returns positional embeddings of same shape."""
         height, width = x.shape[-2:]
-        x_emb = self.column_embeddings(torch.arange(width, device=x.device))   # (W, C//2)
-        y_emb = self.row_embeddings(torch.arange(height, device=x.device))      # (H, C//2)
-        pos = torch.cat([
-            x_emb.unsqueeze(0).expand(height, -1, -1),
-            y_emb.unsqueeze(1).expand(-1, width, -1),
-        ], dim=-1)  # (H, W, C)
-        return pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)  # (B, C, H, W)
+        x_emb = self.column_embeddings(
+            torch.arange(width, device=x.device)
+        )  # (W, C//2)
+        y_emb = self.row_embeddings(torch.arange(height, device=x.device))  # (H, C//2)
+        pos = torch.cat(
+            [
+                x_emb.unsqueeze(0).expand(height, -1, -1),
+                y_emb.unsqueeze(1).expand(-1, width, -1),
+            ],
+            dim=-1,
+        )  # (H, W, C)
+        return (
+            pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)
+        )  # (B, C, H, W)
 
 
 class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
@@ -403,11 +519,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 
         # Pool: spatial average (1 token) + all spatial tokens (H*W tokens)
         x_t = x.unsqueeze(1)  # (B, 1, H*W, C) — treat as T=1 video
-        spatial_avg = x_t.mean(dim=2)   # (B, 1, C)
+        spatial_avg = x_t.mean(dim=2)  # (B, 1, C)
         temporal_avg = x_t.mean(dim=1)  # (B, H*W, C)
         x = torch.cat([spatial_avg, temporal_avg], dim=1)  # (B, 1+H*W, C)
 
-        x = self.image_projection(x)   # (B, 1+H*W, proj_dim)
+        x = self.image_projection(x)  # (B, 1+H*W, proj_dim)
         x = self.image_proj_norm(x)
         return x
 
@@ -427,14 +543,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size
 
         self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model)
-        self.encoder = BartEncoder(config,
-                                   cache_config=cache_config,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.encoder")
-        self.decoder = BartDecoder(config,
-                                   cache_config=cache_config,
-                                   quant_config=quant_config,
-                                   prefix=f"{prefix}.decoder")
+        self.encoder = BartEncoder(
+            config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.encoder",
+        )
+        self.decoder = BartDecoder(
+            config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.decoder",
+        )
 
         if self.config.tie_word_embeddings:
             self.encoder.embed_tokens.weight = self.shared.weight
@@ -467,18 +587,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
 
         self.config = config
-        self.model = Florence2LanguageModel(vllm_config=vllm_config,
-                                            prefix=f"{prefix}.model")
-        embed_scale = math.sqrt(
-            config.d_model) if config.scale_embedding else 1.0
+        self.model = Florence2LanguageModel(
+            vllm_config=vllm_config, prefix=f"{prefix}.model"
+        )
+        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
         self.vocab_size = config.vocab_size
-        self.lm_head = BartParallelLMHead(self.vocab_size,
-                                          config.d_model,
-                                          embed_scale=embed_scale)
+        self.lm_head = BartParallelLMHead(
+            self.vocab_size, config.d_model, embed_scale=embed_scale
+        )
 
-        self.logits_processor = LogitsProcessor(self.vocab_size,
-                                                config.vocab_size)
+        self.logits_processor = LogitsProcessor(self.vocab_size, config.vocab_size)
         if self.config.tie_word_embeddings:
             self.lm_head.tie_weights(self.model.shared)
 
@@ -492,10 +611,12 @@ def forward(
         # num_encoder_outputs: int | None = None,
         **kwargs,
     ) -> torch.Tensor:
-        return self.model(input_ids,
-                          positions,
-                          inputs_embeds=inputs_embeds,
-                          encoder_outputs=encoder_outputs)
+        return self.model(
+            input_ids,
+            positions,
+            inputs_embeds=inputs_embeds,
+            encoder_outputs=encoder_outputs,
+        )
 
     def get_encoder_outputs(
         self,
@@ -522,8 +643,7 @@ def compute_logits(
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("encoder_attn.kv_proj", "encoder_attn.k_proj", "k"),
@@ -536,7 +656,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name:
                     continue
                 name = name.replace(weight_name, param_name)
@@ -550,8 +670,7 @@ def load_weights(self, weights: Iterable[tuple[str,
                 if self.config.tie_word_embeddings and "embed_tokens" in name:
                     continue
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
             loaded_params.add(name)
         return loaded_params
@@ -573,8 +692,7 @@ def get_num_image_tokens(self) -> int:
         return processor.num_image_tokens
 
 
-class Florence2DummyInputsBuilder(
-        BaseDummyInputsBuilder[Florence2ProcessingInfo]):
+class Florence2DummyInputsBuilder(BaseDummyInputsBuilder[Florence2ProcessingInfo]):
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         return ""
@@ -587,18 +705,18 @@ def get_dummy_mm_data(
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
-        target_width = target_height = self.info.get_hf_config().vision_config.projection_dim
+        target_width = target_height = (
+            self.info.get_hf_config().vision_config.projection_dim
+        )
 
         return {
-            "image":
-            self._get_dummy_images(width=target_width,
-                                   height=target_height,
-                                   num_images=num_images)
+            "image": self._get_dummy_images(
+                width=target_width, height=target_height, num_images=num_images
+            )
         }
 
 
-class Florence2MultiModalProcessor(
-        EncDecMultiModalProcessor[Florence2ProcessingInfo]):
+class Florence2MultiModalProcessor(EncDecMultiModalProcessor[Florence2ProcessingInfo]):
 
     def _hf_processor_applies_updates(
         self,
@@ -647,14 +765,15 @@ def _call_hf_processor(
     ) -> BatchFeature:
         if mm_data:
             processed_outputs = super()._call_hf_processor(
-                prompt, mm_data, mm_kwargs, tok_kwargs)
+                prompt, mm_data, mm_kwargs, tok_kwargs
+            )
         else:
             hf_processor = self.info.get_hf_processor()
             tokenizer = hf_processor.tokenizer
             prompt = hf_processor._construct_prompts([prompt])[0]
-            processed_outputs = tokenizer(prompt,
-                                          add_special_tokens=True,
-                                          return_tensors="pt")
+            processed_outputs = tokenizer(
+                prompt, add_special_tokens=True, return_tensors="pt"
+            )
         processed_outputs["encoder_input_ids"] = processed_outputs["input_ids"]
         return processed_outputs
 
@@ -695,7 +814,8 @@ def _get_prompt_updates(
 @MULTIMODAL_REGISTRY.register_processor(
     Florence2MultiModalProcessor,
     info=Florence2ProcessingInfo,
-    dummy_inputs=Florence2DummyInputsBuilder)
+    dummy_inputs=Florence2DummyInputsBuilder,
+)
 class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
 
     @classmethod
@@ -712,9 +832,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         self.config = config
         self.processor_config = processor_config
-        assert config.vision_config.model_type == 'florence_vision', (
-            f'only Florence Vision is supported for now. '
-            f'Received model type: {config.vision_config.model_type}')
+        assert config.vision_config.model_type == "florence_vision", (
+            f"only Florence Vision is supported for now. "
+            f"Received model type: {config.vision_config.model_type}"
+        )
         self.vision_tower = Florence2VisionBackbone(config.vision_config)
         self.multi_modal_projector = Florence2MultiModalProjector(config)
         self.language_model = Florence2LanguageForConditionalGeneration(
@@ -741,7 +862,8 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 raise ValueError(
                     "The expected shape of pixel values per batch "
-                    f"is {expected_dims}. You supplied {actual_dims}.")
+                    f"is {expected_dims}. You supplied {actual_dims}."
+                )
 
         for d in data:
             _validate_shape(d)
@@ -772,15 +894,19 @@ def _parse_and_validate_encoder_input(self, **kwargs: object) -> list[torch.Tens
                 f"Incorrect type of encoder input_ids. Got type: {type(encoder_input_ids)}"
             )
         if isinstance(encoder_input_ids, list):
-            return [item.unsqueeze(0) if item.dim() == 0 else item
-                    for item in encoder_input_ids]
+            return [
+                item.unsqueeze(0) if item.dim() == 0 else item
+                for item in encoder_input_ids
+            ]
         return encoder_input_ids.unsqueeze(1).unbind(dim=0)
 
     def _encode_image(self, pixel_values: torch.Tensor) -> torch.Tensor:
         pixel_values = pixel_values.to(next(self.vision_tower.parameters()).dtype)
         return self.multi_modal_projector(self.vision_tower(pixel_values))
 
-    def _process_image_input(self, image_input: Florence2ImagePixelInputs) -> torch.Tensor:
+    def _process_image_input(
+        self, image_input: Florence2ImagePixelInputs
+    ) -> torch.Tensor:
         return self._encode_image(image_input["data"])
 
     def get_language_model(self) -> torch.nn.Module:
@@ -817,18 +943,27 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             )
             for i, t in enumerate(encoder_input_ids_list):
                 batch_encoder_input_ids[i, : t.numel()] = t.squeeze()
-        inputs_embeds = self.language_model.model.encoder.embed_tokens(batch_encoder_input_ids)
+        inputs_embeds = self.language_model.model.encoder.embed_tokens(
+            batch_encoder_input_ids
+        )
 
         # Replace the leading image_token_id placeholders with vision features.
-        if isinstance(vision_embeddings, torch.Tensor) and vision_embeddings.numel() > 0:
+        if (
+            isinstance(vision_embeddings, torch.Tensor)
+            and vision_embeddings.numel() > 0
+        ):
             num_vision = vision_embeddings.size(1)
             inputs_embeds = inputs_embeds.clone()
             inputs_embeds[:, :num_vision, :] = vision_embeddings
-        batch_encoder_positions = torch.arange(
-            inputs_embeds.size(1),
-            dtype=torch.long,
-            device=inputs_embeds.device,
-        ).unsqueeze(0).expand(inputs_embeds.size(0), -1)
+        batch_encoder_positions = (
+            torch.arange(
+                inputs_embeds.size(1),
+                dtype=torch.long,
+                device=inputs_embeds.device,
+            )
+            .unsqueeze(0)
+            .expand(inputs_embeds.size(0), -1)
+        )
 
         # Run encoder once on the batch, then split back per item.
         batch_encoder_output = self.language_model.model.encoder(
@@ -854,10 +989,12 @@ def forward(
             # Assume same shape for all encoder outputs
             encoder_outputs = torch.cat(encoder_outputs, dim=0)
 
-        hidden_states = self.language_model(input_ids,
-                                            positions,
-                                            encoder_outputs=encoder_outputs,
-                                            inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model(
+            input_ids,
+            positions,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+        )
         return hidden_states
 
     def compute_logits(
@@ -866,8 +1003,7 @@ def compute_logits(
     ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
-    def load_weights(self, weights: Iterable[tuple[str,
-                                                   torch.Tensor]]) -> set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         def _remap(weights: Iterable[tuple[str, torch.Tensor]]):
             for name, param in weights:
                 # HF checkpoint layout (Florence2ForConditionalGeneration):
@@ -877,11 +1013,13 @@ def _remap(weights: Iterable[tuple[str, torch.Tensor]]):
                 #       (HF uses BartModel directly; our wrapper adds .model)
                 #   lm_head.*                      -> language_model.lm_head.*
                 if name.startswith("model.vision_tower."):
-                    name = name[len("model."):]
+                    name = name[len("model.") :]
                 elif name.startswith("model.multi_modal_projector."):
-                    name = name[len("model."):]
+                    name = name[len("model.") :]
                 elif name.startswith("model.language_model."):
-                    name = "language_model.model." + name[len("model.language_model."):]
+                    name = (
+                        "language_model.model." + name[len("model.language_model.") :]
+                    )
                 elif name.startswith("lm_head."):
                     name = "language_model." + name
                 yield name, param

From bb29009fe1882a9cc7b765a3b0c81c99eb633ed8 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 6 Mar 2026 12:46:43 +0100
Subject: [PATCH 04/11] No longer require trust_remote_code

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 example_florence2_usage.py | 1 -
 tests/test_florence2.py    | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/example_florence2_usage.py b/example_florence2_usage.py
index 6b01b2b..75967ef 100644
--- a/example_florence2_usage.py
+++ b/example_florence2_usage.py
@@ -17,7 +17,6 @@ def main():
     llm = LLM(
         model=model_name,
         mm_processor_cache_gb=0,
-        trust_remote_code=True,
         enforce_eager=True,
     )
     params = SamplingParams(
diff --git a/tests/test_florence2.py b/tests/test_florence2.py
index 23ff912..cc95a4f 100644
--- a/tests/test_florence2.py
+++ b/tests/test_florence2.py
@@ -131,7 +131,6 @@ def florence2_llm():
     os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
     return LLM(
         model=MODEL_NAME,
-        trust_remote_code=True,
         enforce_eager=True,
         gpu_memory_utilization=0.5,
         mm_processor_cache_gb=0,
@@ -203,7 +202,7 @@ def test_encoder_length_within_limit(self, stop_sign_image):
         """Processor output must not exceed BART max_position_embeddings."""
         from transformers import AutoProcessor
 
-        processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
+        processor = AutoProcessor.from_pretrained(MODEL_NAME)
         out = processor(
             text="<DETAILED_CAPTION>", images=stop_sign_image, return_tensors="pt"
         )

From b60cb018a916fba8a4697bd9661539e38385563b Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Tue, 10 Mar 2026 12:17:14 +0100
Subject: [PATCH 05/11] Fix generation and add better tests

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 example_florence2_usage.py    |   2 -
 tests/test_florence2.py       | 167 +++++++++++++++++++++++-----------
 vllm_bart_plugin/florence2.py |  62 +++++++++++--
 3 files changed, 165 insertions(+), 66 deletions(-)

diff --git a/example_florence2_usage.py b/example_florence2_usage.py
index 75967ef..32c9c96 100644
--- a/example_florence2_usage.py
+++ b/example_florence2_usage.py
@@ -22,8 +22,6 @@ def main():
     params = SamplingParams(
         temperature=0.0,
         max_tokens=20,
-        # repetition_penalty is needed to prevent <s> repetition
-        repetition_penalty=1.5,
         # skip_special_tokens=False is needed to present
         # grounding tokens like <loc_0><loc_1>
         skip_special_tokens=False,
diff --git a/tests/test_florence2.py b/tests/test_florence2.py
index cc95a4f..8aaba8a 100644
--- a/tests/test_florence2.py
+++ b/tests/test_florence2.py
@@ -6,7 +6,13 @@
 import torch
 from transformers import Florence2Config
 
-MODEL_NAME = "florence-community/Florence-2-base-ft"
+# Allow override via env var so CI can point at a local checkpoint.
+MODEL_NAME = os.environ.get(
+    "FLORENCE2_MODEL",
+    os.path.abspath(
+        os.path.join(os.path.dirname(__file__), "../../Florence-2-base-ft")
+    ),
+)
 
 
 def _small_vision_config():
@@ -124,6 +130,20 @@ def test_output_shape(self):
 # ---------------------------------------------------------------------------
 
 
+def _run_task(llm, processor, image, task_prompt, text_input=None, max_tokens=100):
+    """Helper: run one Florence-2 task and return the post-processed result."""
+    from vllm import SamplingParams
+
+    prompt = task_prompt if text_input is None else task_prompt + text_input
+    params = SamplingParams(temperature=0.0, max_tokens=max_tokens, skip_special_tokens=False)
+    outputs = llm.generate(
+        [{"prompt": prompt, "multi_modal_data": {"image": image}}],
+        sampling_params=params,
+    )
+    raw = outputs[0].outputs[0].text
+    return processor.post_process_generation(raw, task=task_prompt, image_size=image.size)
+
+
 @pytest.fixture(scope="module")
 def florence2_llm():
     from vllm import LLM
@@ -138,72 +158,111 @@ def florence2_llm():
 
 
 @pytest.fixture(scope="module")
-def stop_sign_image():
-    from vllm.assets.image import ImageAsset
+def florence2_processor():
+    from transformers import AutoProcessor
 
-    return ImageAsset("stop_sign").pil_image
+    return AutoProcessor.from_pretrained(MODEL_NAME)
 
 
 @pytest.fixture(scope="module")
-def sampling_params():
-    from vllm import SamplingParams
+def stop_sign_image():
+    from vllm.assets.image import ImageAsset
 
-    return SamplingParams(
-        temperature=0.0,
-        max_tokens=20,
-        repetition_penalty=1.5,
-        skip_special_tokens=False,
-    )
+    return ImageAsset("stop_sign").pil_image.convert("RGB")
 
 
 @pytest.mark.slow
 class TestFlorenceInference:
-    def test_caption(self, florence2_llm, stop_sign_image, sampling_params):
-        outputs = florence2_llm.generate(
-            [
-                {
-                    "prompt": "<DETAILED_CAPTION>",
-                    "multi_modal_data": {"image": stop_sign_image},
-                }
-            ],
-            sampling_params=sampling_params,
+    # ------------------------------------------------------------------
+    # Caption tasks — check for semantically meaningful keywords
+    # ------------------------------------------------------------------
+
+    def test_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<CAPTION>", max_tokens=30)
+        text = result["<CAPTION>"].lower()
+        assert "car" in text or "stop" in text, f"<CAPTION> output missing expected content: {text!r}"
+
+    def test_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<DETAILED_CAPTION>", max_tokens=80)
+        text = result["<DETAILED_CAPTION>"].lower()
+        # Must mention the car and give some background detail — guards against the
+        # KV-cache encoder_seq_lens regression that previously produced garbled output.
+        assert "car" in text, f"<DETAILED_CAPTION> missing 'car': {text!r}"
+        assert len(text.split()) >= 10, f"<DETAILED_CAPTION> too short: {text!r}"
+
+    def test_more_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<MORE_DETAILED_CAPTION>", max_tokens=100)
+        text = result["<MORE_DETAILED_CAPTION>"].lower()
+        assert "stop sign" in text or "sign" in text, f"<MORE_DETAILED_CAPTION> missing 'stop sign': {text!r}"
+        assert len(text.split()) >= 10, f"<MORE_DETAILED_CAPTION> too short: {text!r}"
+
+    # ------------------------------------------------------------------
+    # Structured-output tasks — check schema and key labels
+    # ------------------------------------------------------------------
+
+    def test_object_detection(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<OD>", max_tokens=300)
+        od = result["<OD>"]
+        assert "bboxes" in od and "labels" in od
+        assert len(od["bboxes"]) == len(od["labels"]) > 0
+        # Each bbox must be a 4-element list with non-negative coords
+        for bbox in od["bboxes"]:
+            assert len(bbox) == 4 and all(c >= 0 for c in bbox)
+        labels = od["labels"]
+        assert "stop sign" in labels, f"Expected 'stop sign' in OD labels, got: {labels}"
+        assert "car" in labels or "building" in labels, f"Expected common objects in OD labels, got: {labels}"
+
+    def test_dense_region_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<DENSE_REGION_CAPTION>", max_tokens=250)
+        drc = result["<DENSE_REGION_CAPTION>"]
+        assert "bboxes" in drc and "labels" in drc
+        assert len(drc["bboxes"]) == len(drc["labels"]) > 0
+        assert "stop sign" in drc["labels"], f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
+
+    def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<REGION_PROPOSAL>", max_tokens=100)
+        rp = result["<REGION_PROPOSAL>"]
+        assert "bboxes" in rp and "labels" in rp
+        assert len(rp["bboxes"]) > 0
+        # Region proposal labels are always empty strings
+        assert all(label == "" for label in rp["labels"])
+
+    def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<OCR_WITH_REGION>", max_tokens=250)
+        ocr = result["<OCR_WITH_REGION>"]
+        assert "quad_boxes" in ocr and "labels" in ocr
+        assert len(ocr["quad_boxes"]) == len(ocr["labels"]) > 0
+        # Each quad box must be 8 coords
+        for quad in ocr["quad_boxes"]:
+            assert len(quad) == 8
+        # "STOP" is the most prominent text in the image
+        joined = " ".join(ocr["labels"])
+        assert "STOP" in joined, f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
+
+    def test_caption_to_phrase_grounding(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(
+            florence2_llm, florence2_processor, stop_sign_image,
+            "<CAPTION_TO_PHRASE_GROUNDING>", text_input="A stop sign on a street corner.", max_tokens=80,
         )
-        assert len(outputs[0].outputs[0].text) > 0
-
-    def test_object_detection_has_loc_tokens(
-        self, florence2_llm, stop_sign_image, sampling_params
-    ):
-        outputs = florence2_llm.generate(
-            [
-                {
-                    "encoder_prompt": {
-                        "prompt": "<OD>",
-                        "multi_modal_data": {"image": stop_sign_image},
-                    },
-                    "decoder_prompt": "",
-                }
-            ],
-            sampling_params=sampling_params,
+        cpg = result["<CAPTION_TO_PHRASE_GROUNDING>"]
+        assert "bboxes" in cpg and "labels" in cpg
+        assert len(cpg["bboxes"]) > 0
+        assert any("stop sign" in lbl.lower() for lbl in cpg["labels"]), (
+            f"Expected 'stop sign' grounded, got labels: {cpg['labels']}"
         )
-        assert "<loc_" in outputs[0].outputs[0].text
 
-    def test_batch_inference(self, florence2_llm, stop_sign_image, sampling_params):
+    # ------------------------------------------------------------------
+    # Batch tests
+    # ------------------------------------------------------------------
+
+    def test_batch_inference(self, florence2_llm, florence2_processor, stop_sign_image):
+        """Multiple prompts in one batch must all produce non-empty output."""
+        from vllm import SamplingParams
+
+        params = SamplingParams(temperature=0.0, max_tokens=30, skip_special_tokens=False)
         prompts = [
             {"prompt": "<CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
-            {
-                "prompt": "<DETAILED_CAPTION>",
-                "multi_modal_data": {"image": stop_sign_image},
-            },
+            {"prompt": "<DETAILED_CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
         ]
-        outputs = florence2_llm.generate(prompts, sampling_params=sampling_params)
+        outputs = florence2_llm.generate(prompts, sampling_params=params)
         assert all(len(o.outputs[0].text) > 0 for o in outputs)
-
-    def test_encoder_length_within_limit(self, stop_sign_image):
-        """Processor output must not exceed BART max_position_embeddings."""
-        from transformers import AutoProcessor
-
-        processor = AutoProcessor.from_pretrained(MODEL_NAME)
-        out = processor(
-            text="<DETAILED_CAPTION>", images=stop_sign_image, return_tensors="pt"
-        )
-        assert out["input_ids"].shape[1] <= 1024
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index 52f9f5e..e70889b 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -718,6 +718,19 @@ def get_dummy_mm_data(
 
 class Florence2MultiModalProcessor(EncDecMultiModalProcessor[Florence2ProcessingInfo]):
 
+    def __init__(self, info, dummy_inputs, *, cache=None) -> None:
+        super().__init__(info, dummy_inputs, cache=cache)
+        # Florence2Config does not expose decoder_start_token_id at the
+        # top level (it lives in text_config), so vLLM falls back to BOS
+        # (token 0) and incorrectly prepends it to the decoder prompt.
+        # Patch the top-level hf_config so vLLM's _prepare_decoder_input_ids
+        # sees the real value (EOS / token 2) and leaves our prompt intact.
+        hf_config = info.get_hf_config()
+        if getattr(hf_config, "decoder_start_token_id", None) is None:
+            hf_config.decoder_start_token_id = (
+                hf_config.text_config.decoder_start_token_id
+            )
+
     def _hf_processor_applies_updates(
         self,
         prompt_text: str,
@@ -742,7 +755,16 @@ def create_decoder_prompt(
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
     ) -> str | list[int]:
-        return [self.info.get_hf_config().text_config.eos_token_id]
+        text_config = self.info.get_hf_config().text_config
+        # Decoder prompt mirrors what transformers does before open-ended
+        # generation: start with decoder_start_token_id (</s>, token 2),
+        # then include forced_bos_token_id (<s>, token 0) so that vLLM
+        # generates from the same position as transformers step 2.
+        decoder_prompt = [text_config.decoder_start_token_id]
+        forced_bos = getattr(text_config, "forced_bos_token_id", None)
+        if forced_bos is not None:
+            decoder_prompt.append(forced_bos)
+        return decoder_prompt
 
     def _apply_hf_processor_tokens_only(
         self,
@@ -793,20 +815,40 @@ def _get_prompt_updates(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        hf_config = self.info.get_hf_config()
-        # Use image_token_id (51289) — this is what the Florence2Processor
-        # inserts into input_ids. With _hf_processor_applies_updates=True,
-        # vllm will FIND these tokens in the existing prompt rather than
-        # inserting new ones (so no token doubling / length overflow).
-        image_token_id = hf_config.image_token_id
-        num_image_tokens = self.info.get_num_image_tokens()
-        image_tokens = [image_token_id] * num_image_tokens
+        # The placeholder must cover the FULL encoder input sequence (image
+        # tokens + text/task tokens) so that vLLM's _get_encoder_seq_lens
+        # computes the correct value for cross-attention KV cache allocation.
+        # Using only the image token count (577) would cause cross-attention
+        # to read only 577/590 K/V pairs, skipping the task-prompt tokens.
+        #
+        # With _hf_processor_applies_updates=True, vLLM detects the existing
+        # token sequence rather than inserting new tokens. By setting the
+        # insertion to the full encoder_input_ids sequence, the detected
+        # placeholder range covers all 590 encoder tokens.
+        insertion: list[int]
+        image_items = out_mm_kwargs.get("image", [])
+        if image_items:
+            item_data = image_items[0].get_data()
+            enc_ids = item_data.get("encoder_input_ids")
+            if enc_ids is not None:
+                insertion = enc_ids.tolist()
+            else:
+                # Cache hit: encoder_input_ids not available; fall back.
+                hf_config = self.info.get_hf_config()
+                insertion = (
+                    [hf_config.image_token_id] * self.info.get_num_image_tokens()
+                )
+        else:
+            hf_config = self.info.get_hf_config()
+            insertion = (
+                [hf_config.image_token_id] * self.info.get_num_image_tokens()
+            )
 
         return [
             PromptInsertion(
                 modality="image",
                 target=PromptIndexTargets.start(),
-                insertion=image_tokens,
+                insertion=insertion,
             )
         ]
 

From e7407336dbc329c72bc9ecc2bc9652845512c297 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Tue, 10 Mar 2026 12:26:04 +0100
Subject: [PATCH 06/11] Fix florence model name and reformat tests

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 tests/test_florence2.py | 129 ++++++++++++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 31 deletions(-)

diff --git a/tests/test_florence2.py b/tests/test_florence2.py
index 8aaba8a..0e78be0 100644
--- a/tests/test_florence2.py
+++ b/tests/test_florence2.py
@@ -6,13 +6,7 @@
 import torch
 from transformers import Florence2Config
 
-# Allow override via env var so CI can point at a local checkpoint.
-MODEL_NAME = os.environ.get(
-    "FLORENCE2_MODEL",
-    os.path.abspath(
-        os.path.join(os.path.dirname(__file__), "../../Florence-2-base-ft")
-    ),
-)
+MODEL_NAME = "florence-community/Florence-2-base-ft"
 
 
 def _small_vision_config():
@@ -135,13 +129,17 @@ def _run_task(llm, processor, image, task_prompt, text_input=None, max_tokens=10
     from vllm import SamplingParams
 
     prompt = task_prompt if text_input is None else task_prompt + text_input
-    params = SamplingParams(temperature=0.0, max_tokens=max_tokens, skip_special_tokens=False)
+    params = SamplingParams(
+        temperature=0.0, max_tokens=max_tokens, skip_special_tokens=False
+    )
     outputs = llm.generate(
         [{"prompt": prompt, "multi_modal_data": {"image": image}}],
         sampling_params=params,
     )
     raw = outputs[0].outputs[0].text
-    return processor.post_process_generation(raw, task=task_prompt, image_size=image.size)
+    return processor.post_process_generation(
+        raw, task=task_prompt, image_size=image.size
+    )
 
 
 @pytest.fixture(scope="module")
@@ -178,30 +176,60 @@ class TestFlorenceInference:
     # ------------------------------------------------------------------
 
     def test_caption(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<CAPTION>", max_tokens=30)
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<CAPTION>",
+            max_tokens=30,
+        )
         text = result["<CAPTION>"].lower()
-        assert "car" in text or "stop" in text, f"<CAPTION> output missing expected content: {text!r}"
+        assert "car" in text or "stop" in text, (
+            f"<CAPTION> output missing expected content: {text!r}"
+        )
 
-    def test_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<DETAILED_CAPTION>", max_tokens=80)
+    def test_detailed_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<DETAILED_CAPTION>",
+            max_tokens=80,
+        )
         text = result["<DETAILED_CAPTION>"].lower()
         # Must mention the car and give some background detail — guards against the
         # KV-cache encoder_seq_lens regression that previously produced garbled output.
         assert "car" in text, f"<DETAILED_CAPTION> missing 'car': {text!r}"
         assert len(text.split()) >= 10, f"<DETAILED_CAPTION> too short: {text!r}"
 
-    def test_more_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<MORE_DETAILED_CAPTION>", max_tokens=100)
+    def test_more_detailed_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<MORE_DETAILED_CAPTION>",
+            max_tokens=100,
+        )
         text = result["<MORE_DETAILED_CAPTION>"].lower()
-        assert "stop sign" in text or "sign" in text, f"<MORE_DETAILED_CAPTION> missing 'stop sign': {text!r}"
+        assert "stop sign" in text or "sign" in text, (
+            f"<MORE_DETAILED_CAPTION> missing 'stop sign': {text!r}"
+        )
         assert len(text.split()) >= 10, f"<MORE_DETAILED_CAPTION> too short: {text!r}"
 
     # ------------------------------------------------------------------
     # Structured-output tasks — check schema and key labels
     # ------------------------------------------------------------------
 
-    def test_object_detection(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<OD>", max_tokens=300)
+    def test_object_detection(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm, florence2_processor, stop_sign_image, "<OD>", max_tokens=300
+        )
         od = result["<OD>"]
         assert "bboxes" in od and "labels" in od
         assert len(od["bboxes"]) == len(od["labels"]) > 0
@@ -209,18 +237,38 @@ def test_object_detection(self, florence2_llm, florence2_processor, stop_sign_im
         for bbox in od["bboxes"]:
             assert len(bbox) == 4 and all(c >= 0 for c in bbox)
         labels = od["labels"]
-        assert "stop sign" in labels, f"Expected 'stop sign' in OD labels, got: {labels}"
-        assert "car" in labels or "building" in labels, f"Expected common objects in OD labels, got: {labels}"
+        assert "stop sign" in labels, (
+            f"Expected 'stop sign' in OD labels, got: {labels}"
+        )
+        assert "car" in labels or "building" in labels, (
+            f"Expected common objects in OD labels, got: {labels}"
+        )
 
-    def test_dense_region_caption(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<DENSE_REGION_CAPTION>", max_tokens=250)
+    def test_dense_region_caption(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<DENSE_REGION_CAPTION>",
+            max_tokens=250,
+        )
         drc = result["<DENSE_REGION_CAPTION>"]
         assert "bboxes" in drc and "labels" in drc
         assert len(drc["bboxes"]) == len(drc["labels"]) > 0
-        assert "stop sign" in drc["labels"], f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
+        assert "stop sign" in drc["labels"], (
+            f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
+        )
 
     def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<REGION_PROPOSAL>", max_tokens=100)
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<REGION_PROPOSAL>",
+            max_tokens=100,
+        )
         rp = result["<REGION_PROPOSAL>"]
         assert "bboxes" in rp and "labels" in rp
         assert len(rp["bboxes"]) > 0
@@ -228,7 +276,13 @@ def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_ima
         assert all(label == "" for label in rp["labels"])
 
     def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_image):
-        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<OCR_WITH_REGION>", max_tokens=250)
+        result = _run_task(
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<OCR_WITH_REGION>",
+            max_tokens=250,
+        )
         ocr = result["<OCR_WITH_REGION>"]
         assert "quad_boxes" in ocr and "labels" in ocr
         assert len(ocr["quad_boxes"]) == len(ocr["labels"]) > 0
@@ -237,12 +291,20 @@ def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_ima
             assert len(quad) == 8
         # "STOP" is the most prominent text in the image
         joined = " ".join(ocr["labels"])
-        assert "STOP" in joined, f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
+        assert "STOP" in joined, (
+            f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
+        )
 
-    def test_caption_to_phrase_grounding(self, florence2_llm, florence2_processor, stop_sign_image):
+    def test_caption_to_phrase_grounding(
+        self, florence2_llm, florence2_processor, stop_sign_image
+    ):
         result = _run_task(
-            florence2_llm, florence2_processor, stop_sign_image,
-            "<CAPTION_TO_PHRASE_GROUNDING>", text_input="A stop sign on a street corner.", max_tokens=80,
+            florence2_llm,
+            florence2_processor,
+            stop_sign_image,
+            "<CAPTION_TO_PHRASE_GROUNDING>",
+            text_input="A stop sign on a street corner.",
+            max_tokens=80,
         )
         cpg = result["<CAPTION_TO_PHRASE_GROUNDING>"]
         assert "bboxes" in cpg and "labels" in cpg
@@ -259,10 +321,15 @@ def test_batch_inference(self, florence2_llm, florence2_processor, stop_sign_ima
         """Multiple prompts in one batch must all produce non-empty output."""
         from vllm import SamplingParams
 
-        params = SamplingParams(temperature=0.0, max_tokens=30, skip_special_tokens=False)
+        params = SamplingParams(
+            temperature=0.0, max_tokens=30, skip_special_tokens=False
+        )
         prompts = [
             {"prompt": "<CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
-            {"prompt": "<DETAILED_CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
+            {
+                "prompt": "<DETAILED_CAPTION>",
+                "multi_modal_data": {"image": stop_sign_image},
+            },
         ]
         outputs = florence2_llm.generate(prompts, sampling_params=params)
         assert all(len(o.outputs[0].text) > 0 for o in outputs)

From 19d1c6e977717f98eae21b2502110442d02d6b60 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Tue, 10 Mar 2026 16:09:33 +0100
Subject: [PATCH 07/11] Fix BART tests

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 tests/test_model_initialization.py | 19 +++++++++++--------
 vllm_bart_plugin/bart.py           |  8 +++++++-
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/test_model_initialization.py b/tests/test_model_initialization.py
index 2a44aec..7f01bec 100644
--- a/tests/test_model_initialization.py
+++ b/tests/test_model_initialization.py
@@ -1,7 +1,8 @@
 """Tests for BART model initialization."""
 
 import pytest
-import torch
+import vllm_bart_plugin
+vllm_bart_plugin.register_bart_model()
 from vllm import LLM
 
 
@@ -42,6 +43,7 @@ def test_model_with_custom_config(self, small_model_name):
         except Exception as e:
             pytest.fail(f"Failed to load model with config: {e}")
 
+    @pytest.mark.slow
     def test_model_class_initialization(self):
         """Test that model class can be instantiated."""
         from vllm_bart_plugin.bart import BartForConditionalGeneration
@@ -53,7 +55,6 @@ def test_model_class_initialization(self):
 
         model_config = ModelConfig(
             model="facebook/bart-large-cnn",
-            task="generate",
             tokenizer="facebook/bart-large-cnn",
             tokenizer_mode="auto",
             trust_remote_code=False,
@@ -65,7 +66,6 @@ def test_model_class_initialization(self):
         cache_config = CacheConfig(
             block_size=16,
             gpu_memory_utilization=0.3,
-            swap_space_bytes=0,
             cache_dtype="auto",
         )
 
@@ -77,7 +77,9 @@ def test_model_class_initialization(self):
 
         # Try to instantiate the model
         try:
-            model = BartForConditionalGeneration(vllm_config=vllm_config)
+            from vllm.config import set_current_vllm_config
+            with set_current_vllm_config(vllm_config):
+                model = BartForConditionalGeneration(vllm_config=vllm_config)
             assert model is not None
             assert hasattr(model, 'model')
             assert hasattr(model, 'lm_head')
@@ -92,13 +94,14 @@ def test_model_has_required_methods(self):
             'forward',
             'compute_logits',
             'load_weights',
-            'get_multimodal_embeddings',
+            'embed_multimodal',
         ]
 
         for method in required_methods:
             assert hasattr(BartForConditionalGeneration, method), \
                 f"Model missing required method: {method}"
 
+    @pytest.mark.slow
     def test_encoder_decoder_structure(self):
         """Test that BART has proper encoder-decoder structure."""
         from vllm_bart_plugin.bart import BartModel, BartEncoder, BartDecoder
@@ -109,7 +112,6 @@ def test_encoder_decoder_structure(self):
 
         model_config = ModelConfig(
             model="facebook/bart-large-cnn",
-            task="generate",
             tokenizer="facebook/bart-large-cnn",
             tokenizer_mode="auto",
             trust_remote_code=False,
@@ -121,7 +123,6 @@ def test_encoder_decoder_structure(self):
         cache_config = CacheConfig(
             block_size=16,
             gpu_memory_utilization=0.3,
-            swap_space_bytes=0,
             cache_dtype="auto",
         )
 
@@ -131,7 +132,9 @@ def test_encoder_decoder_structure(self):
             load_config=LoadConfig(),
         )
 
-        model = BartModel(vllm_config=vllm_config)
+        from vllm.config import set_current_vllm_config
+        with set_current_vllm_config(vllm_config):
+            model = BartModel(vllm_config=vllm_config)
 
         assert hasattr(model, 'encoder')
         assert hasattr(model, 'decoder')
diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index db98d6f..7d0c5eb 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -928,6 +928,9 @@ def get_mm_max_tokens_per_item(
         config = self.get_hf_config()
         return {"text": config.max_position_embeddings}
 
+    def get_data_parser(self) -> "MultiModalDataParser":
+        return TextDataParser()
+
 
 class BartDummyInputsBuilder(BaseDummyInputsBuilder[BartProcessingInfo]):
     """Builds dummy inputs for profiling BART models."""
@@ -1042,6 +1045,9 @@ def _call_hf_processor(
         has_encoder_data = mm_data is not None and "texts" in mm_data
         result = {}
 
+        # vLLM may pass add_special_tokens in tok_kwargs; we set it ourselves
+        tok_kwargs = {k: v for k, v in tok_kwargs.items() if k != "add_special_tokens"}
+
         if has_encoder_data:
             # Tokenize the encoder text from mm_data
             encoder_texts = mm_data["texts"]
@@ -1108,7 +1114,7 @@ def _get_prompt_updates(
             )
         ]
 
-    def _get_data_parser(self) -> MultiModalDataParser:
+    def build_data_parser(self) -> MultiModalDataParser:
         return TextDataParser()
 
 

From 440d8e2fe94ed2541edeea9ccc6846623e792b7e Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 13 Mar 2026 09:57:03 +0100
Subject: [PATCH 08/11] Simplify Florence-2 by addressing comments

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 tests/test_florence2.py       |  91 ++------
 vllm_bart_plugin/florence2.py | 398 ++--------------------------------
 2 files changed, 40 insertions(+), 449 deletions(-)

diff --git a/tests/test_florence2.py b/tests/test_florence2.py
index 0e78be0..4ed231f 100644
--- a/tests/test_florence2.py
+++ b/tests/test_florence2.py
@@ -30,55 +30,6 @@ def _small_vision_config():
 # ---------------------------------------------------------------------------
 
 
-class TestFlorenceVisionDropPath:
-    def test_eval_is_identity(self):
-        from vllm_bart_plugin.florence2 import Florence2VisionDropPath
-
-        m = Florence2VisionDropPath(drop_prob=0.9).eval()
-        x = torch.randn(2, 16)
-        assert torch.equal(m(x), x)
-
-    def test_training_drops_samples(self):
-        from vllm_bart_plugin.florence2 import Florence2VisionDropPath
-
-        torch.manual_seed(0)
-        m = Florence2VisionDropPath(drop_prob=0.5).train()
-        out = m(torch.ones(64, 16))
-        assert not torch.all(out == 1)
-
-
-class TestFlorenceVisionConvEmbed:
-    @pytest.mark.parametrize("pre_norm", [True, False])
-    def test_output_channels(self, pre_norm):
-        from vllm_bart_plugin.florence2 import Florence2VisionConvEmbed
-
-        m = Florence2VisionConvEmbed(
-            patch_size=7,
-            in_channels=3,
-            embed_dim=64,
-            stride=4,
-            padding=3,
-            pre_norm=pre_norm,
-        )
-        out = m(torch.randn(1, 3, 64, 64))
-        assert out.shape[1] == 64
-
-
-class TestFlorenceVisionWindowAttention:
-    def test_exact_window(self):
-        from vllm_bart_plugin.florence2 import Florence2VisionWindowAttention
-
-        m = Florence2VisionWindowAttention(dim=32, num_heads=4, window_size=4)
-        assert m(torch.randn(1, 4, 4, 32)).shape == (1, 16, 32)
-
-    def test_input_requires_padding(self):
-        from vllm_bart_plugin.florence2 import Florence2VisionWindowAttention
-
-        m = Florence2VisionWindowAttention(dim=32, num_heads=4, window_size=4)
-        # 6 is not divisible by 4; output should still be (B, 6*6, C)
-        assert m(torch.randn(1, 6, 6, 32)).shape == (1, 36, 32)
-
-
 class TestFlorenceVisionBackbone:
     def test_output_shape(self):
         from vllm_bart_plugin.florence2 import Florence2VisionBackbone
@@ -184,9 +135,9 @@ def test_caption(self, florence2_llm, florence2_processor, stop_sign_image):
             max_tokens=30,
         )
         text = result["<CAPTION>"].lower()
-        assert "car" in text or "stop" in text, (
-            f"<CAPTION> output missing expected content: {text!r}"
-        )
+        assert (
+            "car" in text or "stop" in text
+        ), f"<CAPTION> output missing expected content: {text!r}"
 
     def test_detailed_caption(
         self, florence2_llm, florence2_processor, stop_sign_image
@@ -215,9 +166,9 @@ def test_more_detailed_caption(
             max_tokens=100,
         )
         text = result["<MORE_DETAILED_CAPTION>"].lower()
-        assert "stop sign" in text or "sign" in text, (
-            f"<MORE_DETAILED_CAPTION> missing 'stop sign': {text!r}"
-        )
+        assert (
+            "stop sign" in text or "sign" in text
+        ), f"<MORE_DETAILED_CAPTION> missing 'stop sign': {text!r}"
         assert len(text.split()) >= 10, f"<MORE_DETAILED_CAPTION> too short: {text!r}"
 
     # ------------------------------------------------------------------
@@ -237,12 +188,12 @@ def test_object_detection(
         for bbox in od["bboxes"]:
             assert len(bbox) == 4 and all(c >= 0 for c in bbox)
         labels = od["labels"]
-        assert "stop sign" in labels, (
-            f"Expected 'stop sign' in OD labels, got: {labels}"
-        )
-        assert "car" in labels or "building" in labels, (
-            f"Expected common objects in OD labels, got: {labels}"
-        )
+        assert (
+            "stop sign" in labels
+        ), f"Expected 'stop sign' in OD labels, got: {labels}"
+        assert (
+            "car" in labels or "building" in labels
+        ), f"Expected common objects in OD labels, got: {labels}"
 
     def test_dense_region_caption(
         self, florence2_llm, florence2_processor, stop_sign_image
@@ -257,9 +208,9 @@ def test_dense_region_caption(
         drc = result["<DENSE_REGION_CAPTION>"]
         assert "bboxes" in drc and "labels" in drc
         assert len(drc["bboxes"]) == len(drc["labels"]) > 0
-        assert "stop sign" in drc["labels"], (
-            f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
-        )
+        assert (
+            "stop sign" in drc["labels"]
+        ), f"Expected 'stop sign' in dense captions, got: {drc['labels']}"
 
     def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_image):
         result = _run_task(
@@ -291,9 +242,9 @@ def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_ima
             assert len(quad) == 8
         # "STOP" is the most prominent text in the image
         joined = " ".join(ocr["labels"])
-        assert "STOP" in joined, (
-            f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
-        )
+        assert (
+            "STOP" in joined
+        ), f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
 
     def test_caption_to_phrase_grounding(
         self, florence2_llm, florence2_processor, stop_sign_image
@@ -309,9 +260,9 @@ def test_caption_to_phrase_grounding(
         cpg = result["<CAPTION_TO_PHRASE_GROUNDING>"]
         assert "bboxes" in cpg and "labels" in cpg
         assert len(cpg["bboxes"]) > 0
-        assert any("stop sign" in lbl.lower() for lbl in cpg["labels"]), (
-            f"Expected 'stop sign' grounded, got labels: {cpg['labels']}"
-        )
+        assert any(
+            "stop sign" in lbl.lower() for lbl in cpg["labels"]
+        ), f"Expected 'stop sign' grounded, got labels: {cpg['labels']}"
 
     # ------------------------------------------------------------------
     # Batch tests
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index e70889b..6fca270 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -5,7 +5,6 @@
 from typing import Literal, TypedDict
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 from transformers import (
     BartConfig,
@@ -14,6 +13,7 @@
     Florence2Config,
     Florence2Processor,
 )
+from transformers.models.florence2.modeling_florence2 import Florence2VisionBackbone
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -85,348 +85,6 @@ class Florence2ImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channel, height, width)"""
 
 
-def _drop_path(
-    x: torch.Tensor, drop_prob: float = 0.0, training: bool = False
-) -> torch.Tensor:
-    if drop_prob == 0.0 or not training:
-        return x
-    keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
-    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
-    random_tensor.floor_()
-    return x.div(keep_prob) * random_tensor
-
-
-class Florence2VisionDropPath(nn.Module):
-    def __init__(self, drop_prob: float = 0.0):
-        super().__init__()
-        self.drop_prob = drop_prob
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return _drop_path(x, self.drop_prob, self.training)
-
-
-class Florence2VisionMLP(nn.Module):
-    def __init__(self, embed_dim: int, mlp_ratio: float = 4.0):
-        super().__init__()
-        hidden_dim = int(embed_dim * mlp_ratio)
-        self.fc1 = nn.Linear(embed_dim, hidden_dim)
-        self.fc2 = nn.Linear(hidden_dim, embed_dim)
-        self.act = nn.GELU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.fc2(self.act(self.fc1(x)))
-
-
-class Florence2VisionConvEmbed(nn.Module):
-    """Image-to-patch embedding via strided convolution (NCHW in, NCHW out)."""
-
-    def __init__(
-        self,
-        patch_size: int,
-        in_channels: int,
-        embed_dim: int,
-        stride: int,
-        padding: int,
-        pre_norm: bool,
-    ):
-        super().__init__()
-        self.pre_norm = pre_norm
-        self.conv = nn.Conv2d(
-            in_channels,
-            embed_dim,
-            kernel_size=patch_size,
-            stride=stride,
-            padding=padding,
-        )
-        dim_norm = in_channels if pre_norm else embed_dim
-        self.norm = nn.LayerNorm(dim_norm)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        if self.pre_norm:
-            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
-        x = self.conv(x)
-        if not self.pre_norm:
-            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
-        return x
-
-
-class Florence2VisionChannelAttention(nn.Module):
-    """Channel (group) attention — attends over the channel dimension."""
-
-    def __init__(self, dim: int, groups: int, qkv_bias: bool = True):
-        super().__init__()
-        self.groups = groups
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.proj = nn.Linear(dim, dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, N, C = x.shape
-        # Reshape: (B, N, 3, groups, C//groups) -> (3, B, groups, N, C//groups)
-        qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups)
-        qkv = qkv.permute(2, 0, 3, 1, 4)
-        q, k, v = qkv.unbind(0)  # each: (B, groups, N, C//groups)
-
-        # Scale by sequence length and compute channel-to-channel attention
-        q = q * (float(N) ** -0.5)
-        attn = (q.transpose(-2, -1) @ k).softmax(dim=-1)  # (B, groups, C//g, C//g)
-        out = (attn @ v.transpose(-2, -1)).transpose(-2, -1)  # (B, groups, N, C//g)
-        out = out.transpose(1, 2).reshape(B, N, C)
-        return self.proj(out)
-
-
-class Florence2VisionChannelBlock(nn.Module):
-    def __init__(
-        self,
-        embed_dim: int,
-        groups: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.conv1 = nn.Conv2d(
-            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
-        )
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.channel_attn = Florence2VisionChannelAttention(embed_dim, groups, qkv_bias)
-        self.drop_path1 = (
-            Florence2VisionDropPath(drop_path_rate)
-            if drop_path_rate > 0
-            else nn.Identity()
-        )
-        self.conv2 = nn.Conv2d(
-            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
-        )
-        self.norm2 = nn.LayerNorm(embed_dim)
-        self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
-        self.drop_path2 = (
-            Florence2VisionDropPath(drop_path_rate)
-            if drop_path_rate > 0
-            else nn.Identity()
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, C, H, W = x.shape
-        # Sub-block 1: depthwise conv residual + channel attention
-        x = self.conv1(x) + x
-        x_flat = x.flatten(2).transpose(1, 2)  # (B, H*W, C)
-        residual = x_flat
-        x_flat = residual + self.drop_path1(self.channel_attn(self.norm1(x_flat)))
-        x = x_flat.transpose(1, 2).view(B, C, H, W)
-        # Sub-block 2: depthwise conv residual + FFN
-        x = self.conv2(x) + x
-        x_flat = x.flatten(2).transpose(1, 2)
-        residual = x_flat
-        x_flat = residual + self.drop_path2(self.ffn(self.norm2(x_flat)))
-        x = x_flat.transpose(1, 2).view(B, C, H, W)
-        return x
-
-
-class Florence2VisionWindowAttention(nn.Module):
-    """Window-based local spatial self-attention."""
-
-    def __init__(
-        self, dim: int, num_heads: int, window_size: int, qkv_bias: bool = True
-    ):
-        super().__init__()
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.scale = (dim // num_heads) ** -0.5
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.proj = nn.Linear(dim, dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x: (B, H, W, C) BHWC
-        B, H, W, C = x.shape
-        pad_r = (self.window_size - W % self.window_size) % self.window_size
-        pad_b = (self.window_size - H % self.window_size) % self.window_size
-        if pad_r > 0 or pad_b > 0:
-            x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))
-        Hp, Wp = x.shape[1], x.shape[2]
-
-        # Partition into non-overlapping windows
-        x = x.view(
-            B,
-            Hp // self.window_size,
-            self.window_size,
-            Wp // self.window_size,
-            self.window_size,
-            C,
-        )
-        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, self.window_size**2, C)
-
-        Bw, Nw = x.shape[:2]
-        qkv = (
-            self.qkv(x)
-            .reshape(Bw, Nw, 3, self.num_heads, C // self.num_heads)
-            .permute(2, 0, 3, 1, 4)
-        )
-        q, k, v = qkv.unbind(0)
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        x = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(Bw, Nw, C)
-        x = self.proj(x)
-
-        # Merge windows back
-        x = x.view(-1, self.window_size, self.window_size, C)
-        x = x.view(
-            B,
-            Hp // self.window_size,
-            Wp // self.window_size,
-            self.window_size,
-            self.window_size,
-            C,
-        )
-        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, C)
-        if pad_r > 0 or pad_b > 0:
-            x = x[:, :H, :W, :].contiguous()
-        return x.view(B, H * W, C)
-
-
-class Florence2VisionSpatialBlock(nn.Module):
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        window_size: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.conv1 = nn.Conv2d(
-            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
-        )
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.window_attn = Florence2VisionWindowAttention(
-            embed_dim, num_heads, window_size, qkv_bias
-        )
-        self.drop_path1 = (
-            Florence2VisionDropPath(drop_path_rate)
-            if drop_path_rate > 0
-            else nn.Identity()
-        )
-        self.conv2 = nn.Conv2d(
-            embed_dim, embed_dim, kernel_size=3, padding=1, groups=embed_dim
-        )
-        self.norm2 = nn.LayerNorm(embed_dim)
-        self.ffn = Florence2VisionMLP(embed_dim, mlp_ratio)
-        self.drop_path2 = (
-            Florence2VisionDropPath(drop_path_rate)
-            if drop_path_rate > 0
-            else nn.Identity()
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        B, C, H, W = x.shape
-        # Sub-block 1: depthwise conv residual + window attention
-        x = self.conv1(x) + x
-        x_flat = x.flatten(2).transpose(1, 2)  # (B, H*W, C)
-        residual = x_flat
-        x_bhwc = self.norm1(x_flat).view(B, H, W, C)
-        x_flat = residual + self.drop_path1(self.window_attn(x_bhwc))
-        x = x_flat.transpose(1, 2).view(B, C, H, W)
-        # Sub-block 2: depthwise conv residual + FFN
-        x = self.conv2(x) + x
-        x_flat = x.flatten(2).transpose(1, 2)
-        residual = x_flat
-        x_flat = residual + self.drop_path2(self.ffn(self.norm2(x_flat)))
-        x = x_flat.transpose(1, 2).view(B, C, H, W)
-        return x
-
-
-class Florence2VisionBlock(nn.Module):
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        num_groups: int,
-        window_size: int,
-        mlp_ratio: float = 4.0,
-        qkv_bias: bool = True,
-        spatial_drop_path_rate: float = 0.0,
-        channel_drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.spatial_block = Florence2VisionSpatialBlock(
-            embed_dim,
-            num_heads,
-            window_size,
-            mlp_ratio,
-            qkv_bias,
-            spatial_drop_path_rate,
-        )
-        self.channel_block = Florence2VisionChannelBlock(
-            embed_dim, num_groups, mlp_ratio, qkv_bias, channel_drop_path_rate
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.channel_block(self.spatial_block(x))
-
-
-class Florence2VisionBackbone(nn.Module):
-    """
-    DaViT-based vision backbone for the new Florence-2 architecture.
-    Produces NCHW feature maps for the multi-modal projector.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        embed_dims = config.embed_dim
-        num_stages = len(embed_dims)
-        depths = config.depths
-        mlp_ratio = getattr(config, "mlp_ratio", 4.0)
-        qkv_bias = getattr(config, "qkv_bias", True)
-
-        dpr = [
-            x.item() for x in torch.linspace(0, config.drop_path_rate, sum(depths) * 2)
-        ]
-        depth_offset = 0
-
-        convs = []
-        blocks = []
-        for stage_idx in range(num_stages):
-            in_ch = config.in_channels if stage_idx == 0 else embed_dims[stage_idx - 1]
-            convs.append(
-                Florence2VisionConvEmbed(
-                    patch_size=config.patch_size[stage_idx],
-                    in_channels=in_ch,
-                    embed_dim=embed_dims[stage_idx],
-                    stride=config.patch_stride[stage_idx],
-                    padding=config.patch_padding[stage_idx],
-                    pre_norm=config.patch_prenorm[stage_idx],
-                )
-            )
-            stage_blocks = nn.ModuleList(
-                [
-                    Florence2VisionBlock(
-                        embed_dim=embed_dims[stage_idx],
-                        num_heads=config.num_heads[stage_idx],
-                        num_groups=config.num_groups[stage_idx],
-                        window_size=config.window_size,
-                        mlp_ratio=mlp_ratio,
-                        qkv_bias=qkv_bias,
-                        spatial_drop_path_rate=dpr[depth_offset + block_idx * 2],
-                        channel_drop_path_rate=dpr[depth_offset + block_idx * 2 + 1],
-                    )
-                    for block_idx in range(depths[stage_idx])
-                ]
-            )
-            blocks.append(stage_blocks)
-            depth_offset += depths[stage_idx] * 2
-
-        self.convs = nn.ModuleList(convs)
-        self.blocks = nn.ModuleList(blocks)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Args: x (B, 3, H, W). Returns: (B, C_last, H', W') NCHW feature map."""
-        for conv, block_list in zip(self.convs, self.blocks):
-            x = conv(x)
-            for block in block_list:
-                x = block(x)
-        return x
-
-
 class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
     """2D learned absolute position embedding (NCHW interface)."""
 
@@ -440,20 +98,16 @@ def __init__(self, embedding_dim: int = 256, num_pos: int = 50):
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """x: (B, C, H, W) — returns positional embeddings of same shape."""
         height, width = x.shape[-2:]
-        x_emb = self.column_embeddings(
-            torch.arange(width, device=x.device)
-        )  # (W, C//2)
-        y_emb = self.row_embeddings(torch.arange(height, device=x.device))  # (H, C//2)
+        x_emb = self.column_embeddings(torch.arange(width, device=x.device))
+        y_emb = self.row_embeddings(torch.arange(height, device=x.device))
         pos = torch.cat(
             [
                 x_emb.unsqueeze(0).expand(height, -1, -1),
                 y_emb.unsqueeze(1).expand(-1, width, -1),
             ],
             dim=-1,
-        )  # (H, W, C)
-        return (
-            pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)
-        )  # (B, C, H, W)
+        )
+        return pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)
 
 
 class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
@@ -475,7 +129,7 @@ def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
         """seq_embeds: (B, T, C) — returns (T, C) positional embeddings."""
         len_seq = seq_embeds.size(1)
         assert len_seq <= self.max_seq_len
-        return self.pos_idx_to_embed[0:len_seq, :]  # (T, C)
+        return self.pos_idx_to_embed[0:len_seq, :]
 
 
 class Florence2MultiModalProjector(nn.Module):
@@ -835,14 +489,12 @@ def _get_prompt_updates(
             else:
                 # Cache hit: encoder_input_ids not available; fall back.
                 hf_config = self.info.get_hf_config()
-                insertion = (
-                    [hf_config.image_token_id] * self.info.get_num_image_tokens()
-                )
+                insertion = [
+                    hf_config.image_token_id
+                ] * self.info.get_num_image_tokens()
         else:
             hf_config = self.info.get_hf_config()
-            insertion = (
-                [hf_config.image_token_id] * self.info.get_num_image_tokens()
-            )
+            insertion = [hf_config.image_token_id] * self.info.get_num_image_tokens()
 
         return [
             PromptInsertion(
@@ -859,6 +511,14 @@ def _get_prompt_updates(
     dummy_inputs=Florence2DummyInputsBuilder,
 )
 class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+            "model.language_model.": "language_model.model.",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
 
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
@@ -1046,25 +706,5 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        def _remap(weights: Iterable[tuple[str, torch.Tensor]]):
-            for name, param in weights:
-                # HF checkpoint layout (Florence2ForConditionalGeneration):
-                #   model.vision_tower.*           -> vision_tower.*
-                #   model.multi_modal_projector.*  -> multi_modal_projector.*
-                #   model.language_model.*         -> language_model.model.*
-                #       (HF uses BartModel directly; our wrapper adds .model)
-                #   lm_head.*                      -> language_model.lm_head.*
-                if name.startswith("model.vision_tower."):
-                    name = name[len("model.") :]
-                elif name.startswith("model.multi_modal_projector."):
-                    name = name[len("model.") :]
-                elif name.startswith("model.language_model."):
-                    name = (
-                        "language_model.model." + name[len("model.language_model.") :]
-                    )
-                elif name.startswith("lm_head."):
-                    name = "language_model." + name
-                yield name, param
-
         loader = AutoWeightsLoader(self)
-        return loader.load_weights(_remap(weights))
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

From 0a00dbbf16ef92122c6bcead9f9e659c4dedca3e Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 13 Mar 2026 10:31:29 +0100
Subject: [PATCH 09/11] Remove BART `add_special_tokens` fix

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 vllm_bart_plugin/bart.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/vllm_bart_plugin/bart.py b/vllm_bart_plugin/bart.py
index 7d0c5eb..a21e3fc 100644
--- a/vllm_bart_plugin/bart.py
+++ b/vllm_bart_plugin/bart.py
@@ -1045,9 +1045,6 @@ def _call_hf_processor(
         has_encoder_data = mm_data is not None and "texts" in mm_data
         result = {}
 
-        # vLLM may pass add_special_tokens in tok_kwargs; we set it ourselves
-        tok_kwargs = {k: v for k, v in tok_kwargs.items() if k != "add_special_tokens"}
-
         if has_encoder_data:
             # Tokenize the encoder text from mm_data
             encoder_texts = mm_data["texts"]
@@ -1159,8 +1156,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             config.vocab_size, config.d_model, embed_scale=embed_scale
         )
         # Bias added to logits after lm_head, matching HuggingFace approach
-        self.register_buffer("final_logits_bias",
-                             torch.zeros((1, config.vocab_size)))
+        self.register_buffer("final_logits_bias", torch.zeros((1, config.vocab_size)))
         self.logits_processor = LogitsProcessor(
             self.unpadded_vocab_size, config.vocab_size
         )

From 5e78020c7c891765a68cc9a45d3b4d9c9e8717e8 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 13 Mar 2026 15:30:48 +0100
Subject: [PATCH 10/11] Simplify Florence-2 even further

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 tests/test_florence2.py       |  19 -----
 vllm_bart_plugin/florence2.py | 145 +++-------------------------------
 2 files changed, 13 insertions(+), 151 deletions(-)

diff --git a/tests/test_florence2.py b/tests/test_florence2.py
index 4ed231f..38e5731 100644
--- a/tests/test_florence2.py
+++ b/tests/test_florence2.py
@@ -39,25 +39,6 @@ def test_output_shape(self):
         assert out.shape == (2, vc.embed_dim[-1], 16, 16)
 
 
-class TestFlorenceVisionPositionalEmbeddingCosine1D:
-    def test_output_shape_and_no_batch_dim(self):
-        from vllm_bart_plugin.florence2 import (
-            Florence2VisionPositionalEmbeddingCosine1D,
-        )
-
-        m = Florence2VisionPositionalEmbeddingCosine1D(embed_dim=64, max_seq_len=100)
-        assert m(torch.randn(2, 5, 64)).shape == (5, 64)
-
-    def test_raises_if_exceeds_max(self):
-        from vllm_bart_plugin.florence2 import (
-            Florence2VisionPositionalEmbeddingCosine1D,
-        )
-
-        m = Florence2VisionPositionalEmbeddingCosine1D(embed_dim=64, max_seq_len=10)
-        with pytest.raises(AssertionError):
-            m(torch.randn(1, 20, 64))
-
-
 class TestFlorenceMultiModalProjector:
     def test_output_shape(self):
         from vllm_bart_plugin.florence2 import Florence2MultiModalProjector
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
index 6fca270..b215dc1 100644
--- a/vllm_bart_plugin/florence2.py
+++ b/vllm_bart_plugin/florence2.py
@@ -7,58 +7,34 @@
 import torch
 from torch import nn
 from transformers import (
-    BartConfig,
     BartTokenizer,
     BatchFeature,
     Florence2Config,
     Florence2Processor,
 )
-from transformers.models.florence2.modeling_florence2 import Florence2VisionBackbone
-from vllm.config import CacheConfig, VllmConfig
-from vllm.config.lora import LoRAConfig
-from vllm.config.multimodal import BaseDummyOptions
-from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import Attention
-from vllm.model_executor.layers.attention.cross_attention import CrossAttention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
+from transformers.models.florence2.modeling_florence2 import (
+    Florence2MultiModalProjector,
+    Florence2VisionBackbone,
 )
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
-from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import (
     MultiModalEmbeddings,
     SupportsMultiModal,
-    SupportsQuant,
 )
 from vllm.model_executor.models.utils import (
     AutoWeightsLoader,
     WeightsMapper,
-    cast_overflow_tensors,
-    flatten_bn,
-    maybe_prefix,
 )
-from vllm.multimodal import MULTIMODAL_REGISTRY, ModalityData
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
-from vllm.multimodal.parse import (
-    ModalityDataItems,
-    ModalityDataParser,
-    MultiModalDataItems,
-    MultiModalDataParser,
-    ProcessorBatchItems,
-)
+from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseProcessingInfo,
     EncDecMultiModalProcessor,
@@ -68,8 +44,6 @@
 )
 from vllm.multimodal.processing.dummy_inputs import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils.collection_utils import is_list_of
-from vllm.v1.attention.backend import AttentionType
 
 from vllm_bart_plugin.bart import (
     BartDecoder,
@@ -85,104 +59,6 @@ class Florence2ImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channel, height, width)"""
 
 
-class Florence2VisionLearnedAbsolutePositionEmbedding2D(nn.Module):
-    """2D learned absolute position embedding (NCHW interface)."""
-
-    def __init__(self, embedding_dim: int = 256, num_pos: int = 50):
-        super().__init__()
-        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
-        self.column_embeddings = nn.Embedding(
-            num_pos, embedding_dim - (embedding_dim // 2)
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """x: (B, C, H, W) — returns positional embeddings of same shape."""
-        height, width = x.shape[-2:]
-        x_emb = self.column_embeddings(torch.arange(width, device=x.device))
-        y_emb = self.row_embeddings(torch.arange(height, device=x.device))
-        pos = torch.cat(
-            [
-                x_emb.unsqueeze(0).expand(height, -1, -1),
-                y_emb.unsqueeze(1).expand(-1, width, -1),
-            ],
-            dim=-1,
-        )
-        return pos.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)
-
-
-class Florence2VisionPositionalEmbeddingCosine1D(nn.Module):
-    """Sinusoidal temporal positional embedding; returns (T, C) without batch dim."""
-
-    def __init__(self, embed_dim: int = 512, max_seq_len: int = 100) -> None:
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.max_seq_len = max_seq_len
-        factor = math.log(10000)
-        denominator = torch.exp(-factor * torch.arange(0, embed_dim, 2) / embed_dim)
-        frequencies = torch.arange(0, max_seq_len).reshape(max_seq_len, 1) * denominator
-        pos_idx_to_embed = torch.zeros((max_seq_len, embed_dim))
-        pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
-        pos_idx_to_embed[:, 1::2] = torch.cos(frequencies)
-        self.pos_idx_to_embed = nn.Parameter(pos_idx_to_embed, requires_grad=False)
-
-    def forward(self, seq_embeds: torch.Tensor) -> torch.Tensor:
-        """seq_embeds: (B, T, C) — returns (T, C) positional embeddings."""
-        len_seq = seq_embeds.size(1)
-        assert len_seq <= self.max_seq_len
-        return self.pos_idx_to_embed[0:len_seq, :]
-
-
-class Florence2MultiModalProjector(nn.Module):
-    """
-    Projects vision backbone features into the language model's embedding space.
-    Applies 2D spatial positional embeddings, a temporal embedding, pools to
-    produce both a spatial-average and a per-token representation, then projects
-    with a linear layer + layer norm.
-
-    Input:  (B, C, H, W) NCHW feature map from Florence2VisionBackbone.
-    Output: (B, 1 + H*W, projection_dim) token embeddings for the encoder.
-    """
-
-    def __init__(self, config):
-        super().__init__()
-        embed_dim = config.vision_config.embed_dim[-1]
-        proj_dim = config.vision_config.projection_dim
-
-        self.image_projection = nn.Linear(embed_dim, proj_dim, bias=False)
-        self.image_proj_norm = nn.LayerNorm(proj_dim)
-        self.image_position_embed = Florence2VisionLearnedAbsolutePositionEmbedding2D(
-            embedding_dim=embed_dim,
-            num_pos=config.vision_config.max_position_embeddings,
-        )
-        self.visual_temporal_embed = Florence2VisionPositionalEmbeddingCosine1D(
-            embed_dim=embed_dim,
-            max_seq_len=config.vision_config.max_temporal_embeddings,
-        )
-
-    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
-        # image_features: (B, C, H, W)
-        B, C, H, W = image_features.shape
-
-        # 2D spatial positional embedding
-        pos = self.image_position_embed(image_features)  # (B, C, H, W)
-        x = (image_features + pos).flatten(2).transpose(1, 2)  # (B, H*W, C)
-
-        # Temporal positional embedding (T=1 for single-frame images)
-        temporal_embed = self.visual_temporal_embed(x[:, :1, :])  # (1, C)
-        x = x + temporal_embed  # broadcast over H*W tokens
-
-        # Pool: spatial average (1 token) + all spatial tokens (H*W tokens)
-        x_t = x.unsqueeze(1)  # (B, 1, H*W, C) — treat as T=1 video
-        spatial_avg = x_t.mean(dim=2)  # (B, 1, C)
-        temporal_avg = x_t.mean(dim=1)  # (B, H*W, C)
-        x = torch.cat([spatial_avg, temporal_avg], dim=1)  # (B, 1+H*W, C)
-
-        x = self.image_projection(x)  # (B, 1+H*W, proj_dim)
-        x = self.image_proj_norm(x)
-        return x
-
-
-# Language backbone and processor implementation
 class Florence2LanguageModel(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -706,5 +582,10 @@ def compute_logits(
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
+        # Checkpoints ship visual_temporal_embed.pos_idx_to_embed, but transformers
+        # registers it as a buffer (recomputed from config), not a parameter: skip it.
+        loader = AutoWeightsLoader(
+            self,
+            ignore_unexpected_suffixes=["visual_temporal_embed.pos_idx_to_embed"],
+        )
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)

From 07483dc700faf57b1520ff5c6838153fe4e952a1 Mon Sep 17 00:00:00 2001
From: Carles Onielfa <carlesonielfa@gmail.com>
Date: Fri, 13 Mar 2026 18:48:04 +0100
Subject: [PATCH 11/11] Update README

Signed-off-by: Carles Onielfa <carlesonielfa@gmail.com>
---
 README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index a56b469..6e0c11a 100644
--- a/README.md
+++ b/README.md
@@ -133,10 +133,8 @@ This plugin should work with any BART-based model from HuggingFace, including:
 
 ### Florence-2 Models
 
-- `microsoft/Florence-2-base`
-- `microsoft/Florence-2-large`
-
-Note: Florence-2 requires `trust_remote_code=True` and uses a separate tokenizer (`Isotr0py/Florence-2-tokenizer`).
+- `florence-community/Florence-2-base`
+- `florence-community/Florence-2-large`
 
 ## Evaluation