fixes

anakin87 · anakin87 · commit 5d62dd66996b · 2026-04-15T09:02:57.000+02:00
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
@@ -30,6 +30,7 @@ env:
   PYTHONUNBUFFERED: "1"
   FORCE_COLOR: "1"
   VLLM_MODEL: "Qwen/Qwen3-0.6B"
+  VLLM_EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
   # we only test on Ubuntu to keep vLLM server running simple
   TEST_MATRIX_OS: '["ubuntu-latest"]'
   # vLLM is not compatible with Python 3.14. https://github.com/vllm-project/vllm/issues/34096
@@ -88,12 +89,13 @@ jobs:
             "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
             --torch-backend cpu
 
-      - name: Start vLLM server
+      - name: Start vLLM chat server
         env:
           VLLM_TARGET_DEVICE: "cpu"
           VLLM_CPU_KVCACHE_SPACE: "4"
         run: |
           nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
+            --port 8000 \
             --reasoning-parser qwen3 \
             --max-model-len 1024 \
             --enforce-eager \
@@ -102,20 +104,45 @@ jobs:
             --tool-call-parser hermes \
             --max-num-seqs 1 &
 
-          # Wait for the vLLM server to be ready with a timeout of 300 seconds
+          # Wait for the vLLM chat server to be ready with a timeout of 300 seconds
           timeout=300
           while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
-            echo "Waiting for vLLM server to start..."
+            echo "Waiting for vLLM chat server to start..."
             sleep 10
             ((timeout-=10))
           done
 
           if [ $timeout -eq 0 ]; then
-            echo "Timed out waiting for vLLM server to start."
+            echo "Timed out waiting for vLLM chat server to start."
             exit 1
           fi
 
-          echo "vLLM server started successfully."
+          echo "vLLM chat server started successfully."
+
+      - name: Start vLLM embedding server
+        env:
+          VLLM_TARGET_DEVICE: "cpu"
+          VLLM_CPU_KVCACHE_SPACE: "4"
+        run: |
+          nohup hatch run -- vllm serve ${{ env.VLLM_EMBEDDING_MODEL }} \
+            --port 8001 \
+            --enforce-eager \
+            --max-num-seqs 1 &
+
+          # Poll /health every 10 seconds, for at most 300 seconds, until the embedding server is ready
+          timeout=300
+          while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8001/health > /dev/null 2>&1; do
+            echo "Waiting for vLLM embedding server to start..."
+            sleep 10
+            timeout=$((timeout - 10))  # plain assignment: ((timeout-=10)) returns 1 at zero and would trip bash -e
+          done
+          # Probe once more instead of testing the counter: the server may have come up during the last sleep
+          if ! curl -sSf http://localhost:8001/health > /dev/null 2>&1; then
+            echo "Timed out waiting for vLLM embedding server to start."
+            exit 1
+          fi
+
+          echo "vLLM embedding server started successfully."
 
       - name: Lint
         if: matrix.python-version == '3.10' && runner.os == 'Linux'
diff --git a/integrations/vllm/README.md b/integrations/vllm/README.md
@@ -13,12 +13,17 @@ Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/hay
 
 To run integration tests locally, you need two vLLM servers running in parallel: one for the chat generator on port `8000` and one for the embedders on port `8001`. Refer to the [workflow file](https://github.com/deepset-ai/haystack-core-integrations/blob/main/.github/workflows/vllm.yml) for more details.
 
-For example, on macOs, you can install [vLLM-metal](https://github.com/vllm-project/vllm-metal) and start both servers with:
+For example, on macOS, you can install [vLLM-metal](https://github.com/vllm-project/vllm-metal) and start the chat generator server with:
 
 ```bash
 # chat generator server (port 8000)
 source ~/.venv-vllm-metal/bin/activate && vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3 --max-model-len 1024 --enforce-eager --enable-auto-tool-choice --tool-call-parser hermes
+```
 
+vLLM-metal does not support embedding models. On macOS, you can instead run the embedding server from the vLLM CPU Docker image, publishing the container's port `8000` on host port `8001`:
+
+```bash
 # embedders server (port 8001)
-source ~/.venv-vllm-metal/bin/activate && vllm serve sergeyzh/rubert-tiny-turbo --port 8001 --enforce-eager --max-num-seqs 1
+docker run --rm -p 8001:8000 -e VLLM_CPU_OMP_THREADS_BIND=0-3 vllm/vllm-openai-cpu:latest \
+    --model sentence-transformers/all-MiniLM-L6-v2 --enforce-eager
 ```
diff --git a/integrations/vllm/tests/test_document_embedder.py b/integrations/vllm/tests/test_document_embedder.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from unittest.mock import AsyncMock, MagicMock
 
+import numpy as np
 import pytest
 from haystack import Document
 from haystack.utils import Secret
@@ -12,7 +13,7 @@
 
 from haystack_integrations.components.embedders.vllm import VLLMDocumentEmbedder
 
-MODEL = "sergeyzh/rubert-tiny-turbo"
+MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 API_BASE_URL = "http://localhost:8001/v1"
 
 
@@ -235,12 +236,13 @@ async def test_run_async(self):
         assert [d.embedding for d in result["documents"]] == [[0.5], [0.6]]
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_live_run(self):
         embedder = VLLMDocumentEmbedder(model=MODEL, api_base_url=API_BASE_URL)
 
         docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
+            Document(content="I love cheese"),
+            Document(content="Cheddar is my favorite food"),
+            Document(content="A transformer is a deep learning architecture"),
         ]
 
         result = embedder.run(docs)
@@ -250,3 +252,29 @@ def test_run(self):
         for doc in docs_with_embeddings:
             assert isinstance(doc.embedding, list)
             assert isinstance(doc.embedding[0], float)
+
+        embeddings = [np.array(d.embedding) for d in docs_with_embeddings]
+
+        def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+            return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+        assert cosine_similarity(embeddings[0], embeddings[1]) > cosine_similarity(embeddings[0], embeddings[2])
+
+    @pytest.mark.integration
+    @pytest.mark.asyncio
+    async def test_live_run_async(self):
+        """Embed three documents via ``run_async`` with the embedder pointed at ``API_BASE_URL``."""
+        texts = [
+            "I love cheese",
+            "Cheddar is my favorite food",
+            "A transformer is a deep learning architecture",
+        ]
+        docs = [Document(content=text) for text in texts]
+        embedder = VLLMDocumentEmbedder(model=MODEL, api_base_url=API_BASE_URL)
+        embedded_docs = (await embedder.run_async(docs))["documents"]
+
+        assert len(embedded_docs) == len(docs)
+        for embedded in embedded_docs:
+            # Each returned document must carry a list embedding whose first component is a float.
+            assert isinstance(embedded.embedding, list)
+            assert isinstance(embedded.embedding[0], float)
diff --git a/integrations/vllm/tests/test_text_embedder.py b/integrations/vllm/tests/test_text_embedder.py
@@ -10,7 +10,7 @@
 
 from haystack_integrations.components.embedders.vllm import VLLMTextEmbedder
 
-MODEL = "sergeyzh/rubert-tiny-turbo"
+MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 API_BASE_URL = "http://localhost:8001/v1"
 
 
@@ -175,8 +175,16 @@ async def test_run_async(self):
         assert result["embedding"] == [0.3, 0.4]
 
     @pytest.mark.integration
-    def test_run(self):
+    def test_live_run(self):
         embedder = VLLMTextEmbedder(model=MODEL, api_base_url=API_BASE_URL)
         result = embedder.run("The food was delicious")
         assert isinstance(result["embedding"], list)
         assert all(isinstance(x, float) for x in result["embedding"])
+
+    @pytest.mark.integration
+    @pytest.mark.asyncio
+    async def test_live_run_async(self):
+        result = await VLLMTextEmbedder(model=MODEL, api_base_url=API_BASE_URL).run_async("The food was delicious")
+        assert isinstance(result["embedding"], list)
+        assert result["embedding"]  # all() below passes vacuously on an empty list, so require content first
+        assert all(isinstance(x, float) for x in result["embedding"])