Blaizzy · Apr 23, 2026
diff --git a/README.md b/README.md
@@ -39,37 +39,65 @@ pip install mlx-embeddings
 
 Qwen3-VL uses a model-specific processor and a high-level `model.process(...)` API for multimodal embedding and reranking.
 
-#### Multimodal Embedding
+#### Multimodal Retrieval
+
+Text-to-image retrieval over a small gallery — embed images once, then score any number of text queries against them. Full notebook (with heatmap + top-K plot): [`examples/qwen3_vl_retrieval.ipynb`](examples/qwen3_vl_retrieval.ipynb).
 
 ```python
+from io import BytesIO
+
+import matplotlib.pyplot as plt
 import mlx.core as mx
-from mlx_embeddings import load
+import numpy as np
+import requests
+from PIL import Image
 
-model, processor = load("Qwen/Qwen3-VL-Embedding-2B")
+from mlx_embeddings import load
 
-inputs = [
-    {
-        "text": "A woman playing with her dog on a beach at sunset.",
-        "instruction": "Retrieve images or text relevant to the user's query.",
-    },
-    {
-        "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset."
-    },
-    {
-        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
-    },
-    {
-        "text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset.",
-        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
-    },
+GALLERY = [  # (label, image source) pairs; each source is loaded with fetch()
+    ("woman with dog on beach",
+     "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"),
+    ("two cats on a couch",
+     "http://images.cocodataset.org/val2017/000000039769.jpg"),
+    ("tennis player on a court",
+     "http://images.cocodataset.org/val2017/000000000872.jpg"),
+    ("bear in the wild",
+     "http://images.cocodataset.org/val2017/000000000285.jpg"),
+    ("dark train tunnel",
+     "http://images.cocodataset.org/val2017/000000001268.jpg"),
+    ("group of people standing together",
+     "http://images.cocodataset.org/val2017/000000001000.jpg"),
 ]
+QUERIES = [  # text queries, each scored against every gallery image
+    "a person spending time with their pet outdoors at sunset",
+    "sleepy cats relaxing indoors",
+    "someone playing a racquet sport",
+    "wildlife in a natural habitat",
+    "the inside of a transit tunnel",
+    "a crowd of people gathered outside",
+]
+INSTRUCTION = "Retrieve images that match the user's query."  # attached to every text query as its "instruction"
+
+def fetch(src):  # Load an http(s) URL or a local path as an RGB PIL image.
+    if src.startswith(("http://", "https://")):  # remote source: download with timeout=30; NOTE(review): HTTP status is not checked
+        return Image.open(BytesIO(requests.get(src, timeout=30).content)).convert("RGB")
+    return Image.open(src).convert("RGB")  # anything else is opened as a local file path
 
-embeddings = model.process(inputs, processor=processor)
-similarity = embeddings @ embeddings.T
+labels, urls = zip(*GALLERY)  # split the pairs into parallel tuples
+images = [fetch(u) for u in urls]
+
+model, processor = load("Qwen/Qwen3-VL-Embedding-2B")
+img_embeds = model.process([{"image": i} for i in images], processor=processor)  # gallery embeddings, computed once
+txt_embeds = model.process(
+    [{"text": q, "instruction": INSTRUCTION} for q in QUERIES], processor=processor,
+)
+sim = np.array((txt_embeds @ img_embeds.T).astype(mx.float32))  # sim[q, i]: query q vs. image i; cast to float32 before the NumPy conversion
 
-mx.eval(embeddings, similarity)
-print(embeddings.shape)  # (4, 2048)
-print(similarity)
+for qi, q in enumerate(QUERIES):
+    top = np.argsort(-sim[qi])[:3]  # indices of the 3 highest-scoring images, best first
+    print(f"q{qi}: {q}")
+    for k, idx in enumerate(top):
+        print(f"  #{k + 1}  {sim[qi, idx]:.3f}  {labels[idx]}")  # rank, score, gallery label
 ```
 
 #### Multimodal Reranking