qwen3_vl: fetch image URLs and chat_template.jinja from Hub

Blaizzy · claude · Blaizzy · commit 2ab88f130db5 · 2026-04-23T22:34:05.000+02:00
README examples surfaced two gaps in the port:

- Image inputs passed as https:// URLs (the embedding/reranker README
  examples) hit `FileNotFoundError` because `_to_numpy_image` treated
  every string as a local path. Detect URLs and fetch via requests.

- `Qwen/Qwen3-VL-Reranker-2B` ships its chat template in
  chat_template.jinja, not in tokenizer_config.json. Add a
  `_load_qwen_vl_text` helper (local-then-Hub) and fall back to it when
  neither processor_config.json nor the tokenizer carries a template.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mlx_embeddings/models/qwen3_vl/processor.py b/mlx_embeddings/models/qwen3_vl/processor.py
@@ -103,10 +103,17 @@ def _smart_resize_image(
 
 
 def _to_numpy_image(img) -> np.ndarray:
+    from io import BytesIO
+
     from PIL import Image
 
     if isinstance(img, str):
-        img = Image.open(img)
+        if img.startswith(("http://", "https://")):
+            import requests
+
+            img = Image.open(BytesIO(requests.get(img, timeout=30).content))
+        else:
+            img = Image.open(img)
     if hasattr(img, "convert"):
         img = img.convert("RGB")
         arr = np.array(img)
@@ -372,6 +379,21 @@ def _load_qwen_vl_json(pretrained_model_name_or_path, relative_name: str):
         return None
 
 
+def _load_qwen_vl_text(pretrained_model_name_or_path, relative_name: str):
+    from pathlib import Path
+
+    local = Path(pretrained_model_name_or_path) / relative_name
+    if local.exists():
+        return local.read_text(encoding="utf-8")
+    try:
+        from huggingface_hub import hf_hub_download
+
+        fetched = Path(hf_hub_download(pretrained_model_name_or_path, relative_name))
+        return fetched.read_text(encoding="utf-8")
+    except Exception:
+        return None
+
+
 def _qwen_vl_image_kwargs(pretrained_model_name_or_path, default_patch_size: int = 16):
     proc_cfg = (
         _load_qwen_vl_json(pretrained_model_name_or_path, "processor_config.json") or {}
@@ -639,6 +661,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         chat_template = proc_cfg.get(
             "chat_template", getattr(tokenizer, "chat_template", None)
         )
+        # Some checkpoints (e.g. Qwen3-VL-Reranker-2B) ship the template in
+        # chat_template.jinja on the Hub instead of tokenizer_config.json.
+        if chat_template is None:
+            chat_template = _load_qwen_vl_text(
+                pretrained_model_name_or_path, "chat_template.jinja"
+            )
+            if chat_template is not None:
+                tokenizer.chat_template = chat_template
 
         return cls(
             image_processor=image_processor,