improvements

anakin87 · anakin87 · commit bb20fc7c6f96 · 2026-04-14T16:52:45.000+02:00
diff --git a/integrations/vllm/pyproject.toml b/integrations/vllm/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["haystack-ai>=2.23.0", "openai"]
+dependencies = ["haystack-ai>=2.23.0", "openai", "more_itertools", "tqdm"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm#readme"
diff --git a/integrations/vllm/src/haystack_integrations/common/vllm/utils.py b/integrations/vllm/src/haystack_integrations/common/vllm/utils.py
@@ -19,10 +19,9 @@ def _create_openai_clients(
     """
     Build sync and async OpenAI clients pointing at a vLLM server.
 
-    A placeholder api key is used when the user did not supply one and no `VLLM_API_KEY` env var is
-    set, because the OpenAI client requires a non-empty value. `timeout` and `max_retries` are only
-    forwarded when provided: when None, the OpenAI client's own defaults apply and no `OPENAI_*`
-    env vars are read.
+    A placeholder api key is used when the user did not supply one and no `VLLM_API_KEY` env var is set, because the
+    OpenAI client requires a non-empty value.
+    `timeout` and `max_retries` are only forwarded when provided: when None, the OpenAI client's own defaults apply.
     """
     resolved_api_key = "placeholder-api-key"
     if api_key is not None and (value := api_key.resolve_value()):
diff --git a/integrations/vllm/src/haystack_integrations/components/embedders/vllm/document_embedder.py b/integrations/vllm/src/haystack_integrations/components/embedders/vllm/document_embedder.py
@@ -31,7 +31,7 @@ class VLLMDocumentEmbedder:
     Before using this component, start a vLLM server with an embedding model:
 
     ```bash
-    vllm serve intfloat/e5-mistral-7b-instruct
+    vllm serve google/embeddinggemma-300m
     ```
 
     For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
@@ -44,7 +44,7 @@ class VLLMDocumentEmbedder:
 
     doc = Document(content="I love pizza!")
 
-    document_embedder = VLLMDocumentEmbedder(model="intfloat/e5-mistral-7b-instruct")
+    document_embedder = VLLMDocumentEmbedder(model="google/embeddinggemma-300m")
 
     result = document_embedder.run([doc])
     print(result["documents"][0].embedding)
@@ -57,8 +57,8 @@ class VLLMDocumentEmbedder:
 
     ```python
     document_embedder = VLLMDocumentEmbedder(
-        model="jinaai/jina-embeddings-v3",
-        extra_parameters={"dimensions": 32, "truncate_prompt_tokens": 256},
+        model="google/embeddinggemma-300m",
+        extra_parameters={"truncate_prompt_tokens": 256, "truncation_side": "right"},
     )
     ```
     """
@@ -71,6 +71,7 @@ def __init__(
         api_base_url: str = "http://localhost:8000/v1",
         prefix: str = "",
         suffix: str = "",
+        dimensions: int | None = None,
         batch_size: int = 32,
         progress_bar: bool = True,
         meta_fields_to_embed: list[str] | None = None,
@@ -84,16 +85,21 @@ def __init__(
         """
         Creates an instance of VLLMDocumentEmbedder.
 
-        :param model: The name of the model served by vLLM (e.g., "intfloat/e5-mistral-7b-instruct").
+        :param model: The name of the model served by vLLM. Check
+            [vLLM's documentation](https://docs.vllm.ai/en/stable/models/pooling_models) for more information.
         :param api_key: The vLLM API key. Defaults to the `VLLM_API_KEY` environment variable.
             Only required if the vLLM server was started with `--api-key`.
         :param api_base_url: The base URL of the vLLM server.
         :param prefix: A string to add at the beginning of each text.
         :param suffix: A string to add at the end of each text.
-        :param batch_size: Number of Documents to encode at once.
-        :param progress_bar: Whether to show a progress bar. Disable in production to keep logs clean.
-        :param meta_fields_to_embed: List of meta fields to embed along with the Document text.
-        :param embedding_separator: Separator used to concatenate the meta fields to the Document text.
+        :param dimensions: The number of dimensions of the resulting embedding. Only models trained with
+            Matryoshka Representation Learning support this parameter. See
+            [vLLM's documentation](https://docs.vllm.ai/en/stable/models/pooling_models/embed/#matryoshka-embeddings)
+            for more information.
+        :param batch_size: Number of documents to encode at once.
+        :param progress_bar: Whether to show a progress bar.
+        :param meta_fields_to_embed: List of meta fields to embed along with the document text.
+        :param embedding_separator: Separator used to concatenate the meta fields to the document text.
         :param timeout: Timeout in seconds for vLLM client calls. If not set, the OpenAI client default applies.
         :param max_retries: Maximum number of retries for failed requests. If not set, the OpenAI client
             default applies.
@@ -104,15 +110,15 @@ def __init__(
             the component logs the error and continues processing the remaining documents.
         :param extra_parameters: Additional parameters forwarded as `extra_body` to the vLLM embeddings
             endpoint. Use this to pass parameters not part of the standard OpenAI Embeddings API, such as
-            `dimensions` (for Matryoshka models), `truncate_prompt_tokens`, `truncation_side`,
-            `additional_data`, `use_activation`, etc. See the
-            [vLLM Embeddings API docs](https://docs.vllm.ai/en/stable/models/pooling_models.html#openai-compatible-embeddings-api).
+            `truncate_prompt_tokens`, `truncation_side`, `additional_data`, `use_activation`, etc. See the
+            [vLLM Embeddings API docs](https://docs.vllm.ai/en/stable/models/pooling_models/embed/#openai-compatible-embeddings-api).
         """
         self.model = model
         self.api_key = api_key
         self.api_base_url = api_base_url
         self.prefix = prefix
         self.suffix = suffix
+        self.dimensions = dimensions
         self.batch_size = batch_size
         self.progress_bar = progress_bar
         self.meta_fields_to_embed = meta_fields_to_embed or []
@@ -149,10 +155,11 @@ def to_dict(self) -> dict[str, Any]:
         return default_to_dict(
             self,
             model=self.model,
-            api_key=self.api_key.to_dict() if self.api_key else None,
+            api_key=self.api_key,
             api_base_url=self.api_base_url,
             prefix=self.prefix,
             suffix=self.suffix,
+            dimensions=self.dimensions,
             batch_size=self.batch_size,
             progress_bar=self.progress_bar,
             meta_fields_to_embed=self.meta_fields_to_embed,
@@ -183,6 +190,8 @@ def _prepare_texts_to_embed(self, documents: list[Document]) -> dict[str, str]:
 
     def _prepare_input(self, inputs: list[str]) -> dict[str, Any]:
         kwargs: dict[str, Any] = {"model": self.model, "input": inputs, "encoding_format": "float"}
+        if self.dimensions is not None:
+            kwargs["dimensions"] = self.dimensions
         if self.extra_parameters:
             kwargs["extra_body"] = self.extra_parameters
         return kwargs
diff --git a/integrations/vllm/src/haystack_integrations/components/embedders/vllm/text_embedder.py b/integrations/vllm/src/haystack_integrations/components/embedders/vllm/text_embedder.py
@@ -25,7 +25,7 @@ class VLLMTextEmbedder:
     Before using this component, start a vLLM server with an embedding model:
 
     ```bash
-    vllm serve intfloat/e5-mistral-7b-instruct
+    vllm serve google/embeddinggemma-300m
     ```
 
     For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
@@ -35,7 +35,7 @@ class VLLMTextEmbedder:
     ```python
     from haystack_integrations.components.embedders.vllm import VLLMTextEmbedder
 
-    text_embedder = VLLMTextEmbedder(model="intfloat/e5-mistral-7b-instruct")
+    text_embedder = VLLMTextEmbedder(model="google/embeddinggemma-300m")
     print(text_embedder.run("I love pizza!"))
     ```
 
@@ -46,8 +46,8 @@ class VLLMTextEmbedder:
 
     ```python
     text_embedder = VLLMTextEmbedder(
-        model="jinaai/jina-embeddings-v3",
-        extra_parameters={"dimensions": 32, "truncate_prompt_tokens": 256},
+        model="google/embeddinggemma-300m",
+        extra_parameters={"truncate_prompt_tokens": 256, "truncation_side": "right"},
     )
     ```
     """
@@ -60,6 +60,7 @@ def __init__(
         api_base_url: str = "http://localhost:8000/v1",
         prefix: str = "",
         suffix: str = "",
+        dimensions: int | None = None,
         timeout: float | None = None,
         max_retries: int | None = None,
         http_client_kwargs: dict[str, Any] | None = None,
@@ -74,6 +75,10 @@ def __init__(
         :param api_base_url: The base URL of the vLLM server.
         :param prefix: A string to add at the beginning of each text to embed.
         :param suffix: A string to add at the end of each text to embed.
+        :param dimensions: The number of dimensions of the resulting embedding. Only models trained with
+            Matryoshka Representation Learning support this parameter. See
+            [vLLM's documentation](https://docs.vllm.ai/en/stable/models/pooling_models/embed/#matryoshka-embeddings)
+            for more information.
         :param timeout: Timeout in seconds for vLLM client calls. If not set, the OpenAI client default applies.
         :param max_retries: Maximum number of retries for failed requests. If not set, the OpenAI client
             default applies.
@@ -82,15 +87,15 @@ def __init__(
             [HTTPX documentation](https://www.python-httpx.org/api/#client).
         :param extra_parameters: Additional parameters forwarded as `extra_body` to the vLLM embeddings
             endpoint. Use this to pass parameters not part of the standard OpenAI Embeddings API, such as
-            `dimensions` (for Matryoshka models), `truncate_prompt_tokens`, `truncation_side`,
-            `additional_data`, `use_activation`, etc. See the
-            [vLLM Embeddings API docs](https://docs.vllm.ai/en/stable/models/pooling_models.html#openai-compatible-embeddings-api).
+            `truncate_prompt_tokens`, `truncation_side`, `additional_data`, `use_activation`, etc. See the
+            [vLLM Embeddings API docs](https://docs.vllm.ai/en/stable/models/pooling_models/embed/#openai-compatible-embeddings-api).
         """
         self.model = model
         self.api_key = api_key
         self.api_base_url = api_base_url
         self.prefix = prefix
         self.suffix = suffix
+        self.dimensions = dimensions
         self.timeout = timeout
         self.max_retries = max_retries
         self.http_client_kwargs = http_client_kwargs
@@ -126,6 +131,7 @@ def to_dict(self) -> dict[str, Any]:
             api_base_url=self.api_base_url,
             prefix=self.prefix,
             suffix=self.suffix,
+            dimensions=self.dimensions,
             timeout=self.timeout,
             max_retries=self.max_retries,
             http_client_kwargs=self.http_client_kwargs,
@@ -150,6 +156,8 @@ def _prepare_input(self, text: str) -> dict[str, Any]:
             "input": self.prefix + text + self.suffix,
             "encoding_format": "float",
         }
+        if self.dimensions is not None:
+            kwargs["dimensions"] = self.dimensions
         if self.extra_parameters:
             kwargs["extra_body"] = self.extra_parameters
         return kwargs
diff --git a/integrations/vllm/tests/test_document_embedder.py b/integrations/vllm/tests/test_document_embedder.py
@@ -39,6 +39,7 @@ def test_init_default(self, monkeypatch):
         assert embedder.api_base_url == "http://localhost:8000/v1"
         assert embedder.prefix == ""
         assert embedder.suffix == ""
+        assert embedder.dimensions is None
         assert embedder.batch_size == 32
         assert embedder.progress_bar is True
         assert embedder.meta_fields_to_embed == []
@@ -56,6 +57,7 @@ def test_init_with_parameters(self):
             api_base_url="http://my-vllm-server:8000/v1",
             prefix="START",
             suffix="END",
+            dimensions=64,
             batch_size=64,
             progress_bar=False,
             meta_fields_to_embed=["test_field"],
@@ -67,6 +69,7 @@ def test_init_with_parameters(self):
         assert embedder.api_base_url == "http://my-vllm-server:8000/v1"
         assert embedder.prefix == "START"
         assert embedder.suffix == "END"
+        assert embedder.dimensions == 64
         assert embedder.batch_size == 64
         assert embedder.progress_bar is False
         assert embedder.meta_fields_to_embed == ["test_field"]
@@ -101,6 +104,7 @@ def test_to_dict(self, monkeypatch):
                 "api_base_url": "http://localhost:8000/v1",
                 "prefix": "",
                 "suffix": "",
+                "dimensions": None,
                 "batch_size": 32,
                 "progress_bar": True,
                 "meta_fields_to_embed": [],
@@ -123,6 +127,7 @@ def test_from_dict(self, monkeypatch):
                 "api_base_url": "http://localhost:8000/v1",
                 "prefix": "",
                 "suffix": "",
+                "dimensions": 32,
                 "batch_size": 32,
                 "progress_bar": True,
                 "meta_fields_to_embed": [],
@@ -131,15 +136,15 @@ def test_from_dict(self, monkeypatch):
                 "max_retries": None,
                 "http_client_kwargs": None,
                 "raise_on_failure": False,
-                "extra_parameters": {"dimensions": 32},
+                "extra_parameters": None,
             },
         }
         embedder = VLLMDocumentEmbedder.from_dict(data)
         assert embedder.api_key == Secret.from_env_var("VLLM_API_KEY", strict=False)
         assert embedder.model == MODEL
         assert embedder.api_base_url == "http://localhost:8000/v1"
         assert embedder.batch_size == 32
-        assert embedder.extra_parameters == {"dimensions": 32}
+        assert embedder.dimensions == 32
 
     def test_prepare_texts_to_embed(self):
         embedder = VLLMDocumentEmbedder(
@@ -149,14 +154,15 @@ def test_prepare_texts_to_embed(self):
         texts = embedder._prepare_texts_to_embed([doc])
         assert texts == {doc.id: "[ML | hello]"}
 
-    def test_prepare_input_adds_extra_body(self):
-        embedder = VLLMDocumentEmbedder(model=MODEL, extra_parameters={"dimensions": 32})
+    def test_prepare_input_adds_dimensions_and_extra_body(self):
+        embedder = VLLMDocumentEmbedder(model=MODEL, dimensions=32, extra_parameters={"truncate_prompt_tokens": 256})
         kwargs = embedder._prepare_input(["a", "b"])
         assert kwargs == {
             "model": MODEL,
             "input": ["a", "b"],
             "encoding_format": "float",
-            "extra_body": {"dimensions": 32},
+            "dimensions": 32,
+            "extra_body": {"truncate_prompt_tokens": 256},
         }
 
     def test_run_wrong_input_format(self):
diff --git a/integrations/vllm/tests/test_text_embedder.py b/integrations/vllm/tests/test_text_embedder.py
@@ -33,6 +33,7 @@ def test_init_default(self, monkeypatch):
         assert embedder.model == MODEL
         assert embedder.prefix == ""
         assert embedder.suffix == ""
+        assert embedder.dimensions is None
         assert embedder.timeout is None
         assert embedder.max_retries is None
         assert embedder.http_client_kwargs is None
@@ -48,6 +49,7 @@ def test_init_with_parameters(self):
             api_base_url="http://my-vllm-server:8000/v1",
             prefix="START",
             suffix="END",
+            dimensions=64,
             timeout=10.0,
             max_retries=2,
             http_client_kwargs={"proxy": "https://proxy.example.com"},
@@ -58,6 +60,7 @@ def test_init_with_parameters(self):
         assert embedder.model == MODEL
         assert embedder.prefix == "START"
         assert embedder.suffix == "END"
+        assert embedder.dimensions == 64
         assert embedder.timeout == 10.0
         assert embedder.max_retries == 2
         assert embedder.http_client_kwargs == {"proxy": "https://proxy.example.com"}
@@ -90,6 +93,7 @@ def test_to_dict(self, monkeypatch):
                 "api_base_url": "http://localhost:8000/v1",
                 "prefix": "",
                 "suffix": "",
+                "dimensions": None,
                 "timeout": None,
                 "max_retries": None,
                 "http_client_kwargs": None,
@@ -107,26 +111,30 @@ def test_from_dict(self, monkeypatch):
                 "api_base_url": "http://localhost:8000/v1",
                 "prefix": "",
                 "suffix": "",
+                "dimensions": 32,
                 "timeout": None,
                 "max_retries": None,
                 "http_client_kwargs": None,
-                "extra_parameters": {"dimensions": 32},
+                "extra_parameters": None,
             },
         }
         embedder = VLLMTextEmbedder.from_dict(data)
         assert embedder.api_key == Secret.from_env_var("VLLM_API_KEY", strict=False)
         assert embedder.model == MODEL
         assert embedder.api_base_url == "http://localhost:8000/v1"
-        assert embedder.extra_parameters == {"dimensions": 32}
+        assert embedder.dimensions == 32
 
-    def test_prepare_input_adds_extra_body(self):
-        embedder = VLLMTextEmbedder(model=MODEL, prefix="[", suffix="]", extra_parameters={"dimensions": 32})
+    def test_prepare_input_adds_dimensions_and_extra_body(self):
+        embedder = VLLMTextEmbedder(
+            model=MODEL, prefix="[", suffix="]", dimensions=32, extra_parameters={"truncate_prompt_tokens": 256}
+        )
         kwargs = embedder._prepare_input("hello")
         assert kwargs == {
             "model": MODEL,
             "input": "[hello]",
             "encoding_format": "float",
-            "extra_body": {"dimensions": 32},
+            "dimensions": 32,
+            "extra_body": {"truncate_prompt_tokens": 256},
         }
 
     def test_run_wrong_input_format(self):
@@ -135,7 +143,7 @@ def test_run_wrong_input_format(self):
             embedder.run(text=["text_1", "text_2"])
 
     def test_run_with_mock(self):
-        embedder = VLLMTextEmbedder(model=MODEL, prefix="[", suffix="]", extra_parameters={"dimensions": 2})
+        embedder = VLLMTextEmbedder(model=MODEL, prefix="[", suffix="]", dimensions=2)
         embedder._client = MagicMock()
         embedder._client.embeddings.create.return_value = _fake_response([[0.1, 0.2]])
         embedder._is_warmed_up = True
@@ -144,7 +152,7 @@ def test_run_with_mock(self):
 
         call_kwargs = embedder._client.embeddings.create.call_args.kwargs
         assert call_kwargs["input"] == "[hello]"
-        assert call_kwargs["extra_body"] == {"dimensions": 2}
+        assert call_kwargs["dimensions"] == 2
         assert result == {
             "embedding": [0.1, 0.2],
             "meta": {"model": "fake-model", "usage": {"prompt_tokens": 5, "total_tokens": 5}},

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ classifiers = [`
`22`	`22`	`"Programming Language :: Python :: Implementation :: CPython",`
`23`	`23`	`"Programming Language :: Python :: Implementation :: PyPy",`
`24`	`24`	`]`
`25`		`-dependencies = ["haystack-ai>=2.23.0", "openai"]`
	`25`	`+dependencies = ["haystack-ai>=2.23.0", "openai", "more_itertools", "tqdm"]`
`26`	`26`
`27`	`27`	`[project.urls]`
`28`	`28`	`Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm#readme"`