From b223aa369a10c5e0463bfa903bc51726448ad688 Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Thu, 19 Mar 2026 13:05:06 +0530 Subject: [PATCH 01/15] feat: add sparse vector storage to ElasticsearchDocumentStore (#2939) --- .../elasticsearch/document_store.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index f35dca8d0c..eb4c58cef5 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -85,6 +85,7 @@ def __init__( api_key: Secret | str | None = Secret.from_env_var("ELASTIC_API_KEY", strict=False), api_key_id: Secret | str | None = Secret.from_env_var("ELASTIC_API_KEY_ID", strict=False), embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine", + sparse_vector_field: str | None = None, **kwargs: Any, ): """ @@ -125,6 +126,7 @@ def __init__( self._api_key = api_key self._api_key_id = api_key_id self._embedding_similarity_function = embedding_similarity_function + self._sparse_vector_field = sparse_vector_field self._custom_mapping = custom_mapping self._kwargs = kwargs self._initialized = False @@ -155,6 +157,8 @@ def __init__( } ], } + if self._sparse_vector_field: + self._default_mappings["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} def _ensure_initialized(self): """ @@ -276,6 +280,7 @@ def to_dict(self) -> dict[str, Any]: api_key=self._api_key.to_dict() if isinstance(self._api_key, Secret) else None, api_key_id=self._api_key_id.to_dict() if isinstance(self._api_key_id, Secret) else None, embedding_similarity_function=self._embedding_similarity_function, + 
sparse_vector_field=self._sparse_vector_field, **self._kwargs, ) @@ -457,12 +462,17 @@ def write_documents( if "sparse_embedding" in doc_dict: sparse_embedding = doc_dict.pop("sparse_embedding", None) if sparse_embedding: - logger.warning( - "Document {doc_id} has the `sparse_embedding` field set," - "but storing sparse embeddings in Elasticsearch is not currently supported." - "The `sparse_embedding` field will be ignored.", - doc_id=doc.id, - ) + if self._sparse_vector_field: + doc_dict[self._sparse_vector_field] = { + str(idx): val for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"]) + } + else: + logger.warning( + "Document {doc_id} has the `sparse_embedding` field set, " + "but `sparse_vector_field` is not configured for this ElasticsearchDocumentStore. " + "The `sparse_embedding` field will be ignored.", + doc_id=doc.id, + ) elasticsearch_actions.append( { "_op_type": action, @@ -544,12 +554,17 @@ async def write_documents_async( if "sparse_embedding" in doc_dict: sparse_embedding = doc_dict.pop("sparse_embedding", None) if sparse_embedding: - logger.warning( - "Document {doc_id} has the `sparse_embedding` field set," - "but storing sparse embeddings in Elasticsearch is not currently supported." - "The `sparse_embedding` field will be ignored.", - doc_id=doc.id, - ) + if self._sparse_vector_field: + doc_dict[self._sparse_vector_field] = { + str(idx): val for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"]) + } + else: + logger.warning( + "Document {doc_id} has the `sparse_embedding` field set, " + "but `sparse_vector_field` is not configured for this ElasticsearchDocumentStore. 
" + "The `sparse_embedding` field will be ignored.", + doc_id=doc.id, + ) action = { "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index", From 17ce682e143357301c51561126a844ada3587cb3 Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Thu, 19 Mar 2026 13:05:25 +0530 Subject: [PATCH 02/15] test: update retriever tests for new ElasticsearchDocumentStore serialization - Update test_bm25_retriever.py and test_embedding_retriever.py to include sparse_vector_field in serialized document_store init parameters. --- integrations/elasticsearch/tests/test_bm25_retriever.py | 5 +++-- integrations/elasticsearch/tests/test_embedding_retriever.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/integrations/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py index 4be26c7b35..92b7e82794 100644 --- a/integrations/elasticsearch/tests/test_bm25_retriever.py +++ b/integrations/elasticsearch/tests/test_bm25_retriever.py @@ -56,6 +56,7 @@ def test_to_dict(_mock_elasticsearch_client): "custom_mapping": None, "index": "default", "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, @@ -74,7 +75,7 @@ def test_from_dict(_mock_elasticsearch_client): "type": "haystack_integrations.components.retrievers.elasticsearch.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { - "init_parameters": {"hosts": "some fake host", "index": "default"}, + "init_parameters": {"hosts": "some fake host", "index": "default", "sparse_vector_field": None}, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, @@ -99,7 +100,7 @@ def test_from_dict_no_filter_policy(_mock_elasticsearch_client): "type": "haystack_integrations.components.retrievers.elasticsearch.bm25_retriever.ElasticsearchBM25Retriever", 
"init_parameters": { "document_store": { - "init_parameters": {"hosts": "some fake host", "index": "default"}, + "init_parameters": {"hosts": "some fake host", "index": "default", "sparse_vector_field": None}, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, diff --git a/integrations/elasticsearch/tests/test_embedding_retriever.py b/integrations/elasticsearch/tests/test_embedding_retriever.py index abb00050f0..555ddce727 100644 --- a/integrations/elasticsearch/tests/test_embedding_retriever.py +++ b/integrations/elasticsearch/tests/test_embedding_retriever.py @@ -55,6 +55,7 @@ def test_to_dict(_mock_elasticsearch_client): "custom_mapping": None, "index": "default", "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, @@ -73,7 +74,7 @@ def test_from_dict(_mock_elasticsearch_client): "type": t, "init_parameters": { "document_store": { - "init_parameters": {"hosts": "some fake host", "index": "default"}, + "init_parameters": {"hosts": "some fake host", "index": "default", "sparse_vector_field": None}, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, @@ -96,7 +97,7 @@ def test_from_dict_no_filter_policy(_mock_elasticsearch_client): "type": t, "init_parameters": { "document_store": { - "init_parameters": {"hosts": "some fake host", "index": "default"}, + "init_parameters": {"hosts": "some fake host", "index": "default", "sparse_vector_field": None}, "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, From 29541c212c3a30ff76f3f728977758f2a8a417db Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Thu, 19 Mar 2026 13:05:57 +0530 Subject: [PATCH 03/15] test: add sync and async tests for sparse vector storage - Add 
test_write_documents_with_sparse_vectors and test_write_documents_with_sparse_embedding_warning to test_document_store.py - Add test_write_documents_async_with_sparse_vectors to test_document_store_async.py - Update existing warning test in test_document_store_async.py - Add test_init_with_sparse_vector_field and update serialization tests. --- .../tests/test_document_store.py | 40 +++++++++++++++++++ .../tests/test_document_store_async.py | 19 ++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 1966c341b9..93aef6226c 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -8,6 +8,7 @@ import pytest from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] from haystack.dataclasses.document import Document +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import DocumentStoreBaseExtendedTests @@ -62,6 +63,7 @@ def test_to_dict(): "custom_mapping": None, "index": "default", "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, } @@ -76,6 +78,7 @@ def test_from_dict(): "api_key": None, "api_key_id": None, "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, } document_store = ElasticsearchDocumentStore.from_dict(data) @@ -83,6 +86,7 @@ def test_from_dict(): assert document_store._index == "default" assert document_store._custom_mapping is None assert document_store._api_key is None + assert document_store._sparse_vector_field is None assert document_store._api_key_id is None assert document_store._embedding_similarity_function == "cosine" @@ -127,6 +131,7 @@ def 
test_from_dict_with_api_keys_env_vars(): "api_key": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}, "api_key_id": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False}, "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, } @@ -145,6 +150,7 @@ def test_from_dict_with_api_keys_str(): "api_key": "my_api_key", "api_key_id": "my_api_key_id", "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, } @@ -287,6 +293,33 @@ def test_write_documents(self, document_store: ElasticsearchDocumentStore): with pytest.raises(DuplicateDocumentError): document_store.write_documents(docs, DuplicatePolicy.FAIL) + def test_write_documents_with_sparse_vectors(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_sync_sparse", sparse_vector_field="sparse_vec" + ) + store.client.options(ignore_status=[400, 404]).indices.delete(index="test_sync_sparse") + + doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) + store.write_documents([doc]) + + # check ES natively + raw_doc = store.client.get(index="test_sync_sparse", id="1") + assert raw_doc["_source"]["sparse_vec"] == {"0": 0.5, "1": 0.5} + + store.client.indices.delete(index="test_sync_sparse") + + def test_write_documents_with_sparse_embedding_warning(self, document_store, caplog): + """Test write_documents with document containing sparse_embedding field""" + doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) + + document_store.write_documents([doc]) + assert "but `sparse_vector_field` is not configured" in caplog.text + + results = document_store.filter_documents() + assert len(results) == 1 + assert results[0].id == "1" + assert not hasattr(results[0], "sparse_embedding") or results[0].sparse_embedding is None + def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore): 
document_store.write_documents( [ @@ -462,6 +495,13 @@ def test_write_documents_different_embedding_sizes_fail(self, document_store: El with pytest.raises(DocumentStoreError): document_store.write_documents(docs) + def test_init_with_sparse_vector_field(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_init_sparse", sparse_vector_field="sparse_vec" + ) + assert "sparse_vec" in store._default_mappings["properties"] + assert store._default_mappings["properties"]["sparse_vec"]["type"] == "sparse_vector" + @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_init_with_custom_mapping(self, mock_elasticsearch): custom_mapping = { diff --git a/integrations/elasticsearch/tests/test_document_store_async.py b/integrations/elasticsearch/tests/test_document_store_async.py index 3aa0552f86..5576b2c71a 100644 --- a/integrations/elasticsearch/tests/test_document_store_async.py +++ b/integrations/elasticsearch/tests/test_document_store_async.py @@ -152,13 +152,30 @@ async def test_write_documents_async_with_sparse_embedding_warning(self, documen doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) await document_store.write_documents_async([doc]) - assert "but storing sparse embeddings in Elasticsearch is not currently supported." 
in caplog.text + assert "but `sparse_vector_field` is not configured" in caplog.text results = await document_store.filter_documents_async() assert len(results) == 1 assert results[0].id == "1" assert not hasattr(results[0], "sparse_embedding") or results[0].sparse_embedding is None + @pytest.mark.asyncio + async def test_write_documents_async_with_sparse_vectors(self): + """Test write_documents with document containing sparse_embedding field""" + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_async_sparse", sparse_vector_field="sparse_vec" + ) + store.client.options(ignore_status=[400, 404]).indices.delete(index="test_async_sparse") + + doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) + await store.write_documents_async([doc]) + + # check ES natively + raw_doc = await store.async_client.get(index="test_async_sparse", id="1") + assert raw_doc["_source"]["sparse_vec"] == {"0": 0.5, "1": 0.5} + + store.client.indices.delete(index="test_async_sparse") + @pytest.mark.asyncio async def test_delete_all_documents_async(self, document_store): docs = [ From b3413457bc685c68dcad6427449654973a4a7834 Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Thu, 19 Mar 2026 13:16:40 +0530 Subject: [PATCH 04/15] style: fix B905 (strict zip) and E501 (line length) linting errors --- .../document_stores/elasticsearch/document_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index eb4c58cef5..ca813e6199 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -464,7 +464,8 @@ def write_documents( if 
sparse_embedding: if self._sparse_vector_field: doc_dict[self._sparse_vector_field] = { - str(idx): val for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"]) + str(idx): val + for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"], strict=True) } else: logger.warning( @@ -556,7 +557,8 @@ async def write_documents_async( if sparse_embedding: if self._sparse_vector_field: doc_dict[self._sparse_vector_field] = { - str(idx): val for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"]) + str(idx): val + for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"], strict=True) } else: logger.warning( From 8f85f8d7f5e015460d467c1a3ddcbbadac576c1a Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Thu, 19 Mar 2026 13:29:12 +0530 Subject: [PATCH 05/15] style: fix mypy type inference for _default_mappings --- .../document_stores/elasticsearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index ca813e6199..869880acf8 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -136,7 +136,7 @@ def __init__( raise ValueError(msg) if not self._custom_mapping: - self._default_mappings = { + self._default_mappings: dict[str, Any] = { "properties": { "embedding": { "type": "dense_vector", From b8f77c1b9b5578b5da0ab25d1c51842a000e6883 Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Sat, 21 Mar 2026 23:06:46 +0530 Subject: [PATCH 06/15] refactor: address PR review feedback for sparse vector storage - Add SPECIAL_FIELDS validation for sparse_vector_field in __init__ - Add sparse_vector_field to __init__ 
docstring - Inject sparse_vector mapping into custom_mapping when both provided - Extract _handle_sparse_embedding helper to deduplicate write methods - Convert _deserialize_document to reconstruct SparseEmbedding on read --- .../elasticsearch/document_store.py | 82 +++++++++++-------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index 869880acf8..06360b2d16 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -117,6 +117,9 @@ def __init__( To choose the most appropriate function, look for information about your embedding model. To understand how document scores are computed, see the Elasticsearch [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params) + :param sparse_vector_field: If set, the name of the Elasticsearch field where sparse embeddings + will be stored using the `sparse_vector` field type. When not set, any `sparse_embedding` + data on Documents is dropped during writes and a warning is logged. :param **kwargs: Optional arguments that `Elasticsearch` takes. """ self._hosts = hosts @@ -131,10 +134,18 @@ def __init__( self._kwargs = kwargs self._initialized = False + if self._sparse_vector_field and self._sparse_vector_field in SPECIAL_FIELDS: + msg = f"sparse_vector_field '{self._sparse_vector_field}' conflicts with a reserved field name." 
+ raise ValueError(msg) + if self._custom_mapping and not isinstance(self._custom_mapping, dict): msg = "custom_mapping must be a dictionary" raise ValueError(msg) + if self._custom_mapping and self._sparse_vector_field: + self._custom_mapping.setdefault("properties", {}) + self._custom_mapping["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} + if not self._custom_mapping: self._default_mappings: dict[str, Any] = { "properties": { @@ -407,8 +418,7 @@ async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> documents = await self._search_documents_async(query=query) return documents - @staticmethod - def _deserialize_document(hit: dict[str, Any]) -> Document: + def _deserialize_document(self, hit: dict[str, Any]) -> Document: """ Creates a `Document` from the search hit provided. This is mostly useful in self.filter_documents(). @@ -421,8 +431,42 @@ def _deserialize_document(hit: dict[str, Any]) -> Document: data["metadata"]["highlighted"] = hit["highlight"] data["score"] = hit["_score"] + if self._sparse_vector_field and self._sparse_vector_field in data: + es_sparse = data.pop(self._sparse_vector_field) + sorted_items = sorted(es_sparse.items(), key=lambda x: int(x[0])) + data["sparse_embedding"] = { + "indices": [int(k) for k, _ in sorted_items], + "values": [v for _, v in sorted_items], + } + return Document.from_dict(data) + def _handle_sparse_embedding(self, doc_dict: dict[str, Any], doc_id: str) -> None: + """ + Extracts the sparse_embedding from a document dict and converts it to + the Elasticsearch sparse_vector format if sparse_vector_field is configured. + Otherwise logs a warning. + + :param doc_dict: The dictionary representation of the document. + :param doc_id: The document ID, used for warning messages. 
+ """ + if "sparse_embedding" not in doc_dict: + return + sparse_embedding = doc_dict.pop("sparse_embedding") + if not sparse_embedding: + return + if self._sparse_vector_field: + doc_dict[self._sparse_vector_field] = { + str(idx): val for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"], strict=True) + } + else: + logger.warning( + "Document {doc_id} has the `sparse_embedding` field set, " + "but `sparse_vector_field` is not configured for this ElasticsearchDocumentStore. " + "The `sparse_embedding` field will be ignored.", + doc_id=doc_id, + ) + def write_documents( self, documents: list[Document], @@ -458,22 +502,7 @@ def write_documents( elasticsearch_actions = [] for doc in documents: doc_dict = doc.to_dict() - - if "sparse_embedding" in doc_dict: - sparse_embedding = doc_dict.pop("sparse_embedding", None) - if sparse_embedding: - if self._sparse_vector_field: - doc_dict[self._sparse_vector_field] = { - str(idx): val - for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"], strict=True) - } - else: - logger.warning( - "Document {doc_id} has the `sparse_embedding` field set, " - "but `sparse_vector_field` is not configured for this ElasticsearchDocumentStore. " - "The `sparse_embedding` field will be ignored.", - doc_id=doc.id, - ) + self._handle_sparse_embedding(doc_dict, doc.id) elasticsearch_actions.append( { "_op_type": action, @@ -551,22 +580,7 @@ async def write_documents_async( actions = [] for doc in documents: doc_dict = doc.to_dict() - - if "sparse_embedding" in doc_dict: - sparse_embedding = doc_dict.pop("sparse_embedding", None) - if sparse_embedding: - if self._sparse_vector_field: - doc_dict[self._sparse_vector_field] = { - str(idx): val - for idx, val in zip(sparse_embedding["indices"], sparse_embedding["values"], strict=True) - } - else: - logger.warning( - "Document {doc_id} has the `sparse_embedding` field set, " - "but `sparse_vector_field` is not configured for this ElasticsearchDocumentStore. 
" - "The `sparse_embedding` field will be ignored.", - doc_id=doc.id, - ) + self._handle_sparse_embedding(doc_dict, doc.id) action = { "_op_type": "create" if policy == DuplicatePolicy.FAIL else "index", From 766def90d24b6fa657e7dd51670091f449a1ab74 Mon Sep 17 00:00:00 2001 From: GunaPalanivel Date: Sat, 21 Mar 2026 23:07:40 +0530 Subject: [PATCH 07/15] test: address PR review feedback for sparse vector tests - Add SPECIAL_FIELDS validation test - Add custom_mapping injection test - Add legacy from_dict backward compat test - Fix async test to use async_client for index deletion - Add retrieval reconstruction assertions to sync and async sparse tests --- .../tests/test_document_store.py | 36 +++++++++++++++++++ .../tests/test_document_store_async.py | 11 ++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 93aef6226c..ddb4644688 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -24,6 +24,18 @@ def test_init_is_lazy(_mock_es_client): _mock_es_client.assert_not_called() +def test_init_with_special_fields_raises_error(): + with pytest.raises(ValueError, match=r"sparse_vector_field 'content' conflicts with a reserved field name\."): + ElasticsearchDocumentStore(sparse_vector_field="content") + + +def test_init_with_custom_mapping_injects_sparse_vector(): + custom_mapping = {"properties": {"some_field": {"type": "text"}}} + store = ElasticsearchDocumentStore(custom_mapping=custom_mapping, sparse_vector_field="my_sparse_vec") + assert "my_sparse_vec" in store._custom_mapping["properties"] + assert store._custom_mapping["properties"]["my_sparse_vec"] == {"type": "sparse_vector"} + + @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_headers_are_supported(_mock_es_client): _ = ElasticsearchDocumentStore( @@ 
-159,6 +171,23 @@ def test_from_dict_with_api_keys_str(): assert document_store._api_key_id == "my_api_key_id" +def test_from_dict_without_sparse_vector_field(): + data = { + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", + "init_parameters": { + "hosts": "some hosts", + "custom_mapping": None, + "index": "default", + "api_key": "my_api_key", + "api_key_id": "my_api_key_id", + "embedding_similarity_function": "cosine", + }, + } + + document_store = ElasticsearchDocumentStore.from_dict(data) + assert document_store._sparse_vector_field is None + + def test_api_key_validation_only_api_key(): api_key = Secret.from_token("test_api_key") document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key) @@ -306,6 +335,13 @@ def test_write_documents_with_sparse_vectors(self): raw_doc = store.client.get(index="test_sync_sparse", id="1") assert raw_doc["_source"]["sparse_vec"] == {"0": 0.5, "1": 0.5} + # check retrieval reconstruction + results = store.filter_documents() + assert len(results) == 1 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [0, 1] + assert results[0].sparse_embedding.values == [0.5, 0.5] + store.client.indices.delete(index="test_sync_sparse") def test_write_documents_with_sparse_embedding_warning(self, document_store, caplog): diff --git a/integrations/elasticsearch/tests/test_document_store_async.py b/integrations/elasticsearch/tests/test_document_store_async.py index 5576b2c71a..c8a9131e7f 100644 --- a/integrations/elasticsearch/tests/test_document_store_async.py +++ b/integrations/elasticsearch/tests/test_document_store_async.py @@ -165,7 +165,7 @@ async def test_write_documents_async_with_sparse_vectors(self): store = ElasticsearchDocumentStore( hosts=["http://localhost:9200"], index="test_async_sparse", sparse_vector_field="sparse_vec" ) - store.client.options(ignore_status=[400, 
404]).indices.delete(index="test_async_sparse") + await store.async_client.options(ignore_status=[400, 404]).indices.delete(index="test_async_sparse") doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) await store.write_documents_async([doc]) @@ -174,7 +174,14 @@ async def test_write_documents_async_with_sparse_vectors(self): raw_doc = await store.async_client.get(index="test_async_sparse", id="1") assert raw_doc["_source"]["sparse_vec"] == {"0": 0.5, "1": 0.5} - store.client.indices.delete(index="test_async_sparse") + # check retrieval + results = await store.filter_documents_async() + assert len(results) == 1 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [0, 1] + assert results[0].sparse_embedding.values == [0.5, 0.5] + + await store.async_client.indices.delete(index="test_async_sparse") @pytest.mark.asyncio async def test_delete_all_documents_async(self, document_store): From a78cbdfaf2e24640ae83b4b2155267760aa951ea Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 24 Mar 2026 10:21:15 +0100 Subject: [PATCH 08/15] fixing docstrings --- .../document_stores/elasticsearch/document_store.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index 052b2470d1..f62477894e 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -446,9 +446,7 @@ def _deserialize_document(self, hit: dict[str, Any]) -> Document: def _handle_sparse_embedding(self, doc_dict: dict[str, Any], doc_id: str) -> None: """ - Extracts the sparse_embedding from a document dict and converts it to - the Elasticsearch sparse_vector format if sparse_vector_field is configured. - Otherwise logs a warning. + Extracts the sparse_embedding from a document dict and converts it to the Elasticsearch sparse_vector format. :param doc_dict: The dictionary representation of the document. :param doc_id: The document ID, used for warning messages. From 884e1c52b79b6a01bd2965c37df24e2effb73025 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 24 Mar 2026 10:44:58 +0100 Subject: [PATCH 09/15] just as a safeguard original custom_mapping dict is left unchanged --- .../document_stores/elasticsearch/document_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index f62477894e..de3328805b 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 - +import copy # ruff: noqa: FBT002, FBT001 boolean-type-hint-positional-argument and boolean-default-value-positional-argument # ruff: noqa: B008 function-call-in-default-argument # ruff: noqa: S101 disable checks for uses of the assert keyword @@ -143,6 +143,7 @@ def __init__( raise ValueError(msg) if self._custom_mapping and self._sparse_vector_field: + self._custom_mapping = copy.deepcopy(custom_mapping) # original custom_mapping dict is left unchanged self._custom_mapping.setdefault("properties", {}) self._custom_mapping["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} From 98c096d424b29da22a24bd8a2b18eacd66c3ff12 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 24 Mar 2026 10:46:43 +0100 Subject: [PATCH 10/15] organising imports --- .../document_stores/elasticsearch/document_store.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index de3328805b..43e4ba68b3 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -2,11 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 import copy + # ruff: noqa: FBT002, FBT001 boolean-type-hint-positional-argument and boolean-default-value-positional-argument # ruff: noqa: B008 function-call-in-default-argument # ruff: noqa: S101 disable checks for uses of the assert keyword - - from collections.abc import Mapping from typing import Any, Literal @@ -424,7 +423,7 @@ def _deserialize_document(self, hit: dict[str, Any]) -> Document: """ Creates a `Document` from the search hit provided. - This is mostly useful in self.filter_documents(). + This is mostly useful in self.filter_documents() and self.filter_documents_async(). :param hit: A search hit from Elasticsearch. :returns: `Document` created from the search hit. From 1187f8920714b7b23ab83b6d9c1fe5f1b98302f5 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 24 Mar 2026 10:48:15 +0100 Subject: [PATCH 11/15] formatting --- .../document_stores/elasticsearch/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index 43e4ba68b3..6fc88a850f 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -142,7 +142,7 @@ def __init__( raise ValueError(msg) if self._custom_mapping and self._sparse_vector_field: - self._custom_mapping = copy.deepcopy(custom_mapping) # original custom_mapping dict is left unchanged + self._custom_mapping = copy.deepcopy(custom_mapping) # original custom_mapping dict is left unchanged self._custom_mapping.setdefault("properties", {}) self._custom_mapping["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} From 6a60ccde22091180cdd5595c7a8351d2da2ebee6 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Tue, 24 Mar 2026 13:21:13 +0100 Subject: [PATCH 12/15] adding more tests + fixing typing issues --- .../elasticsearch/document_store.py | 4 +- .../tests/test_document_store.py | 41 +++++++++++++++++ .../tests/test_document_store_async.py | 45 +++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index 6fc88a850f..f4edc1b93c 100644 --- a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -143,8 +143,8 @@ def __init__( if self._custom_mapping and self._sparse_vector_field: self._custom_mapping = copy.deepcopy(custom_mapping) # original custom_mapping dict is left unchanged - self._custom_mapping.setdefault("properties", {}) - self._custom_mapping["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} + self._custom_mapping.setdefault("properties", {}) # type: ignore # can't be None here + self._custom_mapping["properties"][self._sparse_vector_field] = {"type": "sparse_vector"} # type: ignore # can't be None here if not self._custom_mapping: self._default_mappings: dict[str, Any] = { diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index f1a1cd519d..1de9909be4 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -358,6 +358,47 @@ def test_write_documents_with_sparse_vectors(self): store.client.indices.delete(index="test_sync_sparse") + def test_write_documents_with_non_contiguous_sparse_indices(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], 
index="test_sync_sparse_noncontiguous", sparse_vector_field="sparse_vec" + ) + store.client.options(ignore_status=[400, 404]).indices.delete(index="test_sync_sparse_noncontiguous") + + doc = Document( + id="1", content="test", sparse_embedding=SparseEmbedding(indices=[100, 5, 42], values=[0.1, 0.9, 0.5]) + ) + store.write_documents([doc]) + + results = store.filter_documents() + assert len(results) == 1 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [5, 42, 100] + assert results[0].sparse_embedding.values == [0.9, 0.5, 0.1] + + store.client.indices.delete(index="test_sync_sparse_noncontiguous") + + def test_write_documents_mixed_sparse_and_non_sparse(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_sync_sparse_mixed", sparse_vector_field="sparse_vec" + ) + store.client.options(ignore_status=[400, 404]).indices.delete(index="test_sync_sparse_mixed") + + docs = [ + Document( + id="1", content="with sparse", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5]) + ), + Document(id="2", content="without sparse"), + ] + store.write_documents(docs) + + results = sorted(store.filter_documents(), key=lambda d: d.id) + assert len(results) == 2 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [0, 1] + assert results[1].sparse_embedding is None + + store.client.indices.delete(index="test_sync_sparse_mixed") + def test_write_documents_with_sparse_embedding_warning(self, document_store, caplog): """Test write_documents with document containing sparse_embedding field""" doc = Document(id="1", content="test", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5])) diff --git a/integrations/elasticsearch/tests/test_document_store_async.py b/integrations/elasticsearch/tests/test_document_store_async.py index c8a9131e7f..a08cea8995 100644 --- a/integrations/elasticsearch/tests/test_document_store_async.py +++ 
b/integrations/elasticsearch/tests/test_document_store_async.py @@ -183,6 +183,51 @@ async def test_write_documents_async_with_sparse_vectors(self): await store.async_client.indices.delete(index="test_async_sparse") + @pytest.mark.asyncio + async def test_write_documents_async_with_non_contiguous_sparse_indices(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_async_sparse_noncontiguous", sparse_vector_field="sparse_vec" + ) + await store.async_client.options(ignore_status=[400, 404]).indices.delete( + index="test_async_sparse_noncontiguous" + ) + + doc = Document( + id="1", content="test", sparse_embedding=SparseEmbedding(indices=[100, 5, 42], values=[0.1, 0.9, 0.5]) + ) + await store.write_documents_async([doc]) + + results = await store.filter_documents_async() + assert len(results) == 1 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [5, 42, 100] + assert results[0].sparse_embedding.values == [0.9, 0.5, 0.1] + + await store.async_client.indices.delete(index="test_async_sparse_noncontiguous") + + @pytest.mark.asyncio + async def test_write_documents_async_mixed_sparse_and_non_sparse(self): + store = ElasticsearchDocumentStore( + hosts=["http://localhost:9200"], index="test_async_sparse_mixed", sparse_vector_field="sparse_vec" + ) + await store.async_client.options(ignore_status=[400, 404]).indices.delete(index="test_async_sparse_mixed") + + docs = [ + Document( + id="1", content="with sparse", sparse_embedding=SparseEmbedding(indices=[0, 1], values=[0.5, 0.5]) + ), + Document(id="2", content="without sparse"), + ] + await store.write_documents_async(docs) + + results = sorted(await store.filter_documents_async(), key=lambda d: d.id) + assert len(results) == 2 + assert results[0].sparse_embedding is not None + assert results[0].sparse_embedding.indices == [0, 1] + assert results[1].sparse_embedding is None + + await 
store.async_client.indices.delete(index="test_async_sparse_mixed") + @pytest.mark.asyncio async def test_delete_all_documents_async(self, document_store): docs = [ From 3dab6230123fbb5914af7e94253e76ffd27fa2ab Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 9 Apr 2026 14:37:40 +0200 Subject: [PATCH 13/15] formatting --- integrations/elasticsearch/tests/test_document_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 26e9aa9d7e..6f7f3325bb 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -617,7 +617,6 @@ def test_write_documents_different_embedding_sizes_fail(self, document_store: El with pytest.raises(DocumentStoreError): document_store.write_documents(docs) - def test_init_with_sparse_vector_field(self): store = ElasticsearchDocumentStore( hosts=["http://localhost:9200"], index="test_init_sparse", sparse_vector_field="sparse_vec" From e2cfd6f4a6ad721d6fab1976d38396d2cfa2c110 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Fri, 10 Apr 2026 13:58:57 +0200 Subject: [PATCH 14/15] updating unit tests --- .../elasticsearch/tests/test_elasticsearch_hybrid_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/elasticsearch/tests/test_elasticsearch_hybrid_retriever.py b/integrations/elasticsearch/tests/test_elasticsearch_hybrid_retriever.py index 4803ff5c64..1abd62fdb8 100644 --- a/integrations/elasticsearch/tests/test_elasticsearch_hybrid_retriever.py +++ b/integrations/elasticsearch/tests/test_elasticsearch_hybrid_retriever.py @@ -37,6 +37,7 @@ class TestElasticsearchHybridRetriever: "api_key": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY"], "strict": False}, "api_key_id": {"type": "env_var", "env_vars": ["ELASTIC_API_KEY_ID"], "strict": False}, "embedding_similarity_function": "cosine", + "sparse_vector_field": None, }, }, "embedder": { From 82530e18e60b0e35d33a52d18ea53e4cb17a96ae Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Fri, 10 Apr 2026 14:55:04 +0200 Subject: [PATCH 15/15] adding unit tests for _handle_sparse_embedding function --- .../tests/test_document_store.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 6f7f3325bb..c37a7518a7 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -43,6 +43,36 @@ def test_init_with_custom_mapping_injects_sparse_vector(): assert store._custom_mapping["properties"]["my_sparse_vec"] == {"type": "sparse_vector"} +def test_handle_sparse_embedding_no_op_when_absent(): + store = ElasticsearchDocumentStore(hosts="testhost") + doc_dict = {"id": "doc-1", "content": "hello"} + store._handle_sparse_embedding(doc_dict, "doc-1") + assert doc_dict == {"id": "doc-1", "content": "hello"} + + +def test_handle_sparse_embedding_converts_to_es_format(): + store = 
ElasticsearchDocumentStore(hosts="testhost", sparse_vector_field="my_sparse") + doc_dict = { + "id": "doc-1", + "sparse_embedding": {"indices": [0, 5], "values": [0.3, 0.7]}, + } + store._handle_sparse_embedding(doc_dict, "doc-1") + assert "sparse_embedding" not in doc_dict + assert doc_dict["my_sparse"] == {"0": 0.3, "5": 0.7} + + +def test_handle_sparse_embedding_warns_when_no_field_configured(caplog): + store = ElasticsearchDocumentStore(hosts="testhost") + doc_dict = { + "id": "doc-1", + "content": "hello", + "sparse_embedding": {"indices": [0, 1], "values": [0.5, 0.5]}, + } + store._handle_sparse_embedding(doc_dict, "doc-1") + assert "but `sparse_vector_field` is not configured" in caplog.text + assert "sparse_embedding" not in doc_dict + + @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_headers_are_supported(_mock_es_client): _ = ElasticsearchDocumentStore(