qwen3_vl: match HF reference by fixing two upstream mlx-vlm bugs

Blaizzy · claude · Blaizzy · commit 45a501c79bb8 · 2026-04-24T00:05:42.000+02:00
On the 6-query × 6-image retrieval benchmark, the mlx-embeddings output
had max|cosine diff| = 0.087 vs HF transformers reference and only 83%
top-1 agreement. Three fixes close the gap to max 0.006 diff and 100%
top-1/top-3 agreement:

1. Forward the embedder's MIN_PIXELS/MAX_PIXELS (4096..1,843,200) onto
   the inner image_processor. The Qwen3-VL preprocessor_config.json
   lists the full-context size bounds (16 MP), so without this override
   the image_processor resized to a different grid than the HF reference
   and the comparison ran on different visual tokens.

2. Work around mlx-vlm bug in Qwen3-VL get_input_embeddings: the
   upstream assigns `mx.eval(deepstack_image_embeds)` to
   `deepstack_visual_embeds`, but mx.eval returns None — so multi-scale
   deepstack features were silently dropped at every LM layer the
   model was supposed to inject them into. Re-run the vision tower in
   our Model.get_input_embeddings when we detect this.

3. Patch mlx-vlm's `_deepstack_process` on the language-model instance:
   upstream indexes the full concatenated visual_embeds at each batch
   sample's image positions, which only works for batch_size=1. Our
   patched version slices visual_embeds per sample using a running
   offset so multi-image batches work.

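Once (2) is fixed (here or upstream), (3) surfaces immediately on
multi-image batches — they're stacked bugs: (2) drops the deepstack
features before (3) can mis-index them, and (3) is harmless for
single-image batches.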

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlx_embeddings/models/qwen3_vl/model.py b/mlx_embeddings/models/qwen3_vl/model.py
@@ -2,6 +2,7 @@
 from typing import Any, Dict, Optional, Tuple
 
 import mlx.core as mx
+import numpy as np
 from mlx_lm.models.base import create_causal_mask
 from mlx_vlm.models.qwen3_vl import LanguageModel as Qwen3VLLanguageModel
 from mlx_vlm.models.qwen3_vl import Model as Qwen3VLBackbone
@@ -11,6 +12,40 @@
 from ..base import BaseModelArgs, BaseModelOutput, normalize_embeddings
 
 
+def _patched_deepstack_process(
+    self,
+    hidden_states: mx.array,
+    visual_pos_masks: mx.array,
+    visual_embeds: mx.array,
+) -> mx.array:
+    """Fixed version of mlx-vlm's Qwen3-VL ``_deepstack_process``.
+
+    Upstream passes the full concatenated ``visual_embeds`` (all samples)
+    into each sample's ``batch_result.at[batch_indices].add(...)``, which
+    only broadcasts when the batch has one image. This version slices
+    ``visual_embeds`` per sample using the running offset of image-token
+    counts so it works for multi-image batches.
+    """
+    batch_size = hidden_states.shape[0]
+    updated = []
+    offset = 0
+    for b in range(batch_size):
+        batch_mask = visual_pos_masks[b]
+        batch_hidden = hidden_states[b]
+        batch_indices = mx.array(np.where(batch_mask)[0], dtype=mx.uint32)
+        n = int(batch_indices.shape[0])
+        if n == 0:
+            updated.append(batch_hidden)
+            continue
+        batch_result = mx.array(batch_hidden)
+        batch_result = batch_result.at[batch_indices].add(
+            visual_embeds[offset : offset + n]
+        )
+        offset += n
+        updated.append(batch_result)
+    return mx.stack(updated, axis=0)
+
+
 def build_qwen3_vl_config(vlm_config: Dict[str, Any]) -> ModelConfig:
     base_config = dict(vlm_config)
     base_config["model_type"] = "qwen3_vl"
@@ -159,6 +194,14 @@ class Model(Qwen3VLBackbone):
     def __init__(self, config: ModelArgs):
         self.args = config
         super().__init__(build_qwen3_vl_config(config.vlm_config))
+        # Fix upstream mlx-vlm Qwen3-VL bug (as of 0.4.4): _deepstack_process
+        # indexes the full concatenated visual_embeds at each batch sample's
+        # image positions, which is only correct for batch_size=1. Patch the
+        # instance with a version that slices visual_embeds per sample.
+        lm_inner = self.language_model.model
+        lm_inner._deepstack_process = _patched_deepstack_process.__get__(
+            lm_inner, type(lm_inner)
+        )
 
     @property
     def visual(self):
@@ -178,6 +221,29 @@ def get_video_features(
     ) -> mx.array:
         return self.vision_tower(pixel_values, video_grid_thw)[0]
 
+    def get_input_embeddings(self, input_ids=None, pixel_values=None, **kwargs):
+        # Work around an mlx-vlm bug (as of 0.4.4): Qwen3-VL's
+        # get_input_embeddings assigns `mx.eval(deepstack_image_embeds)` to
+        # `deepstack_visual_embeds`, but mx.eval returns None — so multi-scale
+        # deepstack features are silently dropped, costing ~0.1 cosine on the
+        # final image embedding. If they came back None but we actually have
+        # images, re-run the vision tower just to grab the deepstack list.
+        feats = super().get_input_embeddings(
+            input_ids=input_ids, pixel_values=pixel_values, **kwargs
+        )
+        if (
+            pixel_values is not None
+            and feats.deepstack_visual_embeds is None
+            and getattr(self.config.vision_config, "deepstack_visual_indexes", None)
+        ):
+            image_grid_thw = kwargs.get("image_grid_thw")
+            video_grid_thw = kwargs.get("video_grid_thw")
+            grid_thw = image_grid_thw if image_grid_thw is not None else video_grid_thw
+            dtype = self.vision_tower.patch_embed.proj.weight.dtype
+            _, deepstack = self.vision_tower(pixel_values.astype(dtype), grid_thw)
+            feats.deepstack_visual_embeds = deepstack
+        return feats
+
     def get_binary_logits(self, pooled: mx.array) -> mx.array:
         if hasattr(self.language_model, "lm_head"):
             token_logits = self.language_model.lm_head(pooled)
diff --git a/mlx_embeddings/models/qwen3_vl/processor.py b/mlx_embeddings/models/qwen3_vl/processor.py
@@ -711,6 +711,14 @@ def from_pretrained(cls, model_path, **kwargs):
         kwargs.pop("use_fast", None)
 
         processor = Qwen3VLProcessor.from_pretrained(model_path, **kwargs)
+        # preprocessor_config.json often caps max_pixels at the full Qwen3-VL
+        # context (e.g. 1,310,720 or 16M); the embedder-specific resize budget
+        # (4096..1,843,200) must win so our resize matches the HF reference.
+        processor.image_processor.min_pixels = min_pixels
+        processor.image_processor.max_pixels = max_pixels
+        if processor.video_processor is not None:
+            processor.video_processor.min_pixels = min_pixels
+            processor.video_processor.max_pixels = max_pixels
         return cls(
             processor=processor,
             embedding_max_length=embedding_max_length,