Fix Florence-2 decoder-prompt and encoder-placeholder bugs; add per-task inference tests

carlesonielfa · carlesonielfa · commit c68d68561d1a · 2026-03-10T12:04:36.000+01:00
diff --git a/tests/test_florence2.py b/tests/test_florence2.py
@@ -6,7 +6,13 @@
 import torch
 from transformers import Florence2Config
 
-MODEL_NAME = "florence-community/Florence-2-base-ft"
+# Model resolution order: $FLORENCE2_MODEL, a sibling local checkout, Hub ID.
+_LOCAL_CKPT = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "../../Florence-2-base-ft")
+)
+MODEL_NAME = os.environ.get("FLORENCE2_MODEL") or (
+    _LOCAL_CKPT if os.path.isdir(_LOCAL_CKPT) else "florence-community/Florence-2-base-ft"
+)
 
 
 def _small_vision_config():
@@ -124,6 +130,20 @@ def test_output_shape(self):
 # ---------------------------------------------------------------------------
 
 
+def _run_task(llm, processor, image, task_prompt, text_input=None, max_tokens=100):
+    """Run a single Florence-2 task at temperature 0 and return the processor-parsed result."""
+    from vllm import SamplingParams
+
+    # Tasks that take extra text (e.g. phrase grounding) append it to the task token.
+    full_prompt = task_prompt + text_input if text_input is not None else task_prompt
+    request = {"prompt": full_prompt, "multi_modal_data": {"image": image}}
+    greedy = SamplingParams(temperature=0.0, max_tokens=max_tokens, skip_special_tokens=False)
+    # Special tokens are kept in the text handed to post_process_generation.
+    outputs = llm.generate([request], sampling_params=greedy)
+    generated = outputs[0].outputs[0].text
+    return processor.post_process_generation(generated, task=task_prompt, image_size=image.size)
+
+
 @pytest.fixture(scope="module")
 def florence2_llm():
     from vllm import LLM
@@ -138,72 +158,111 @@ def florence2_llm():
 
 
 @pytest.fixture(scope="module")
-def stop_sign_image():
-    from vllm.assets.image import ImageAsset
+def florence2_processor():
+    from transformers import AutoProcessor
 
-    return ImageAsset("stop_sign").pil_image
+    return AutoProcessor.from_pretrained(MODEL_NAME)
 
 
 @pytest.fixture(scope="module")
-def sampling_params():
-    from vllm import SamplingParams
+def stop_sign_image():
+    from vllm.assets.image import ImageAsset
 
-    return SamplingParams(
-        temperature=0.0,
-        max_tokens=20,
-        repetition_penalty=1.5,
-        skip_special_tokens=False,
-    )
+    return ImageAsset("stop_sign").pil_image.convert("RGB")
 
 
 @pytest.mark.slow
 class TestFlorenceInference:
-    def test_caption(self, florence2_llm, stop_sign_image, sampling_params):
-        outputs = florence2_llm.generate(
-            [
-                {
-                    "prompt": "<DETAILED_CAPTION>",
-                    "multi_modal_data": {"image": stop_sign_image},
-                }
-            ],
-            sampling_params=sampling_params,
+    # ------------------------------------------------------------------
+    # Caption tasks — check for semantically meaningful keywords
+    # ------------------------------------------------------------------
+
+    def test_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        parsed = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<CAPTION>", max_tokens=30)
+        caption = parsed["<CAPTION>"].lower()
+        assert any(kw in caption for kw in ("car", "stop")), f"<CAPTION> output missing expected content: {caption!r}"
+
+    def test_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        """Caption must mention the car and have >= 10 words (guards the KV-cache encoder_seq_lens regression)."""
+        task = "<DETAILED_CAPTION>"
+        parsed = _run_task(florence2_llm, florence2_processor, stop_sign_image, task, max_tokens=80)
+        caption = parsed[task].lower()
+        assert "car" in caption, f"<DETAILED_CAPTION> missing 'car': {caption!r}"
+        assert len(caption.split()) >= 10, f"<DETAILED_CAPTION> too short: {caption!r}"
+
+    def test_more_detailed_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<MORE_DETAILED_CAPTION>", max_tokens=100)
+        text = result["<MORE_DETAILED_CAPTION>"].lower()
+        assert "sign" in text, f"<MORE_DETAILED_CAPTION> missing 'sign': {text!r}"  # "stop sign" already contains "sign"
+        assert len(text.split()) >= 10, f"<MORE_DETAILED_CAPTION> too short: {text!r}"
+
+    # ------------------------------------------------------------------
+    # Structured-output tasks — check schema and key labels
+    # ------------------------------------------------------------------
+
+    def test_object_detection(self, florence2_llm, florence2_processor, stop_sign_image):
+        """<OD> must return paired bboxes/labels containing 'stop sign' plus 'car' or 'building'."""
+        detections = _run_task(florence2_llm, florence2_processor, stop_sign_image, "<OD>", max_tokens=300)["<OD>"]
+        assert "bboxes" in detections and "labels" in detections
+        boxes, labels = detections["bboxes"], detections["labels"]
+        assert len(boxes) == len(labels) > 0
+        # Every box is four non-negative coordinates.
+        for box in boxes:
+            assert len(box) == 4 and all(coord >= 0 for coord in box)
+        assert "stop sign" in labels, f"Expected 'stop sign' in OD labels, got: {labels}"
+        assert "car" in labels or "building" in labels, f"Expected common objects in OD labels, got: {labels}"
+
+    def test_dense_region_caption(self, florence2_llm, florence2_processor, stop_sign_image):
+        task = "<DENSE_REGION_CAPTION>"
+        regions = _run_task(florence2_llm, florence2_processor, stop_sign_image, task, max_tokens=250)[task]
+        assert "bboxes" in regions and "labels" in regions
+        assert len(regions["bboxes"]) == len(regions["labels"]) > 0
+        assert "stop sign" in regions["labels"], f"Expected 'stop sign' in dense captions, got: {regions['labels']}"
+
+    def test_region_proposal(self, florence2_llm, florence2_processor, stop_sign_image):
+        task = "<REGION_PROPOSAL>"
+        proposals = _run_task(florence2_llm, florence2_processor, stop_sign_image, task, max_tokens=100)[task]
+        assert "bboxes" in proposals and "labels" in proposals
+        assert len(proposals["bboxes"]) > 0
+        # Region proposals are unlabeled: every label should be the empty string.
+        assert not any(label != "" for label in proposals["labels"])
+
+    def test_ocr_with_region(self, florence2_llm, florence2_processor, stop_sign_image):
+        """<OCR_WITH_REGION> must return 8-coordinate quads whose joined text contains 'STOP'."""
+        task = "<OCR_WITH_REGION>"
+        regions = _run_task(florence2_llm, florence2_processor, stop_sign_image, task, max_tokens=250)[task]
+        assert "quad_boxes" in regions and "labels" in regions
+        quads, texts = regions["quad_boxes"], regions["labels"]
+        assert len(quads) == len(texts) > 0
+        for quad in quads:
+            assert len(quad) == 8
+        joined = " ".join(texts)
+        assert "STOP" in joined, f"Expected 'STOP' in OCR_WITH_REGION labels, got: {joined!r}"
+
+    def test_caption_to_phrase_grounding(self, florence2_llm, florence2_processor, stop_sign_image):
+        result = _run_task(
+            florence2_llm, florence2_processor, stop_sign_image,
+            "<CAPTION_TO_PHRASE_GROUNDING>", text_input="A stop sign on a street corner.", max_tokens=80,
         )
-        assert len(outputs[0].outputs[0].text) > 0
-
-    def test_object_detection_has_loc_tokens(
-        self, florence2_llm, stop_sign_image, sampling_params
-    ):
-        outputs = florence2_llm.generate(
-            [
-                {
-                    "encoder_prompt": {
-                        "prompt": "<OD>",
-                        "multi_modal_data": {"image": stop_sign_image},
-                    },
-                    "decoder_prompt": "",
-                }
-            ],
-            sampling_params=sampling_params,
+        cpg = result["<CAPTION_TO_PHRASE_GROUNDING>"]
+        assert "bboxes" in cpg and "labels" in cpg
+        assert len(cpg["bboxes"]) > 0
+        assert any("stop sign" in lbl.lower() for lbl in cpg["labels"]), (
+            f"Expected 'stop sign' grounded, got labels: {cpg['labels']}"
         )
-        assert "<loc_" in outputs[0].outputs[0].text
 
-    def test_batch_inference(self, florence2_llm, stop_sign_image, sampling_params):
+    # ------------------------------------------------------------------
+    # Batch tests
+    # ------------------------------------------------------------------
+
+    def test_batch_inference(self, florence2_llm, florence2_processor, stop_sign_image):
+        """Multiple prompts in one batch must all produce non-empty output."""
+        from vllm import SamplingParams
+
+        params = SamplingParams(temperature=0.0, max_tokens=30, skip_special_tokens=False)
         prompts = [
             {"prompt": "<CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
-            {
-                "prompt": "<DETAILED_CAPTION>",
-                "multi_modal_data": {"image": stop_sign_image},
-            },
+            {"prompt": "<DETAILED_CAPTION>", "multi_modal_data": {"image": stop_sign_image}},
         ]
-        outputs = florence2_llm.generate(prompts, sampling_params=sampling_params)
+        outputs = florence2_llm.generate(prompts, sampling_params=params)
         assert all(len(o.outputs[0].text) > 0 for o in outputs)
-
-    def test_encoder_length_within_limit(self, stop_sign_image):
-        """Processor output must not exceed BART max_position_embeddings."""
-        from transformers import AutoProcessor
-
-        processor = AutoProcessor.from_pretrained(MODEL_NAME)
-        out = processor(
-            text="<DETAILED_CAPTION>", images=stop_sign_image, return_tensors="pt"
-        )
-        assert out["input_ids"].shape[1] <= 1024
diff --git a/vllm_bart_plugin/florence2.py b/vllm_bart_plugin/florence2.py
@@ -718,6 +718,19 @@ def get_dummy_mm_data(
 
 class Florence2MultiModalProcessor(EncDecMultiModalProcessor[Florence2ProcessingInfo]):
 
+    def __init__(self, info, dummy_inputs, *, cache=None) -> None:
+        """Initialise, then mirror text_config.decoder_start_token_id onto hf_config."""
+        super().__init__(info, dummy_inputs, cache=cache)
+        # Florence2Config keeps decoder_start_token_id in text_config only, so
+        # vLLM falls back to BOS (token 0) and prepends it to the decoder prompt.
+        # Setting it on the top-level config lets _prepare_decoder_input_ids see
+        # the real value (EOS / token 2) and leave our prompt intact.
+        hf_config = info.get_hf_config()
+        if getattr(hf_config, "decoder_start_token_id", None) is None:
+            hf_config.decoder_start_token_id = (
+                hf_config.text_config.decoder_start_token_id
+            )
+
     def _hf_processor_applies_updates(
         self,
         prompt_text: str,
@@ -742,7 +755,16 @@ def create_decoder_prompt(
         prompt: str | list[int],
         mm_data: MultiModalDataDict,
     ) -> str | list[int]:
-        return [self.info.get_hf_config().text_config.eos_token_id]
+        # Reproduce the decoder state transformers has reached before
+        # open-ended generation starts: decoder_start_token_id (</s>, token 2)
+        # comes first, followed by forced_bos_token_id (<s>, token 0) when the
+        # config defines one, so vLLM continues from the same position as
+        # transformers step 2.
+        text_cfg = self.info.get_hf_config().text_config
+        start_id = text_cfg.decoder_start_token_id
+        forced_bos_id = getattr(text_cfg, "forced_bos_token_id", None)
+        seed = [start_id] if forced_bos_id is None else [start_id, forced_bos_id]
+        return seed
 
     def _apply_hf_processor_tokens_only(
         self,
@@ -793,20 +815,40 @@ def _get_prompt_updates(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargsItems,
     ) -> Sequence[PromptUpdate]:
-        hf_config = self.info.get_hf_config()
-        # Use image_token_id (51289) — this is what the Florence2Processor
-        # inserts into input_ids. With _hf_processor_applies_updates=True,
-        # vllm will FIND these tokens in the existing prompt rather than
-        # inserting new ones (so no token doubling / length overflow).
-        image_token_id = hf_config.image_token_id
-        num_image_tokens = self.info.get_num_image_tokens()
-        image_tokens = [image_token_id] * num_image_tokens
+        # The placeholder must cover the FULL encoder input sequence (image
+        # tokens + text/task tokens) so that vLLM's _get_encoder_seq_lens
+        # computes the correct value for cross-attention KV cache allocation.
+        # Using only the image token count (577) would cause cross-attention
+        # to read only 577/590 K/V pairs, skipping the task-prompt tokens.
+        #
+        # With _hf_processor_applies_updates=True, vLLM detects the existing
+        # token sequence rather than inserting new tokens. By setting the
+        # insertion to the full encoder_input_ids sequence, the detected
+        # placeholder range covers all 590 encoder tokens.
+        enc_ids = None
+        image_items = out_mm_kwargs.get("image", [])
+        if image_items:
+            # Only the first image item is consulted.
+            enc_ids = image_items[0].get_data().get("encoder_input_ids")
+
+        insertion: list[int]
+        if enc_ids is not None:
+            insertion = enc_ids.tolist()
+        else:
+            # Single fallback for "no image item" and "encoder_input_ids absent"
+            # (e.g. on a cache hit): use an image-token-only placeholder.
+            # NOTE(review): this path covers only the image tokens, i.e. the
+            # shorter length the comment above warns about.
+            hf_config = self.info.get_hf_config()
+            insertion = (
+                [hf_config.image_token_id] * self.info.get_num_image_tokens()
+            )
 
         return [
             PromptInsertion(
                 modality="image",
                 target=PromptIndexTargets.start(),
-                insertion=image_tokens,
+                insertion=insertion,
             )
         ]