Skip to content

Commit b62b7d4

Browse files
committed
increasing test coverage and dealing with none fields
1 parent c2c7b04 commit b62b7d4

5 files changed

Lines changed: 61 additions & 22 deletions

File tree

integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -373,11 +373,15 @@ def _search_documents(self, **kwargs: Any) -> list[Document]:
373373
if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
374374
top_k = kwargs["knn"]["k"]
375375

376-
# sparse_vector data written by an ingest pipeline is not stored in _source,
377-
# but is retrievable via the fields API. Request it explicitly so that
378-
# _deserialize_document can populate Document.sparse_embedding correctly.
379-
if self._sparse_vector_field and "fields" not in kwargs:
380-
kwargs["fields"] = [self._sparse_vector_field]
376+
# When an ingest pipeline is configured, sparse_vector data is not stored in _source
377+
# (ES indexes it but omits it from the stored document). Request it via the fields API
378+
# so that _deserialize_document can populate Document.sparse_embedding correctly.
379+
# Merge rather than replace: a caller may already pass fields=["title", ...] for a custom
380+
# projection — dropping their list would silently hide sparse embeddings on those docs.
381+
if self._ingest_pipeline and self._sparse_vector_field:
382+
existing = list(kwargs.get("fields") or [])
383+
if self._sparse_vector_field not in existing:
384+
kwargs["fields"] = [*existing, self._sparse_vector_field]
381385

382386
documents: list[Document] = []
383387
from_ = 0
@@ -406,11 +410,15 @@ async def _search_documents_async(self, **kwargs: Any) -> list[Document]:
406410
if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
407411
top_k = kwargs["knn"]["k"]
408412

409-
# sparse_vector data written by an ingest pipeline is not stored in _source,
410-
# but is retrievable via the fields API. Request it explicitly so that
411-
# _deserialize_document can populate Document.sparse_embedding correctly.
412-
if self._sparse_vector_field and "fields" not in kwargs:
413-
kwargs["fields"] = [self._sparse_vector_field]
413+
# When an ingest pipeline is configured, sparse_vector data is not stored in _source
414+
# (ES indexes it but omits it from the stored document). Request it via the fields API
415+
# so that _deserialize_document can populate Document.sparse_embedding correctly.
416+
# Merge rather than replace: a caller may already pass fields=["title", ...] for a custom
417+
# projection — dropping their list would silently hide sparse embeddings on those docs.
418+
if self._ingest_pipeline and self._sparse_vector_field:
419+
existing = list(kwargs.get("fields") or [])
420+
if self._sparse_vector_field not in existing:
421+
kwargs["fields"] = [*existing, self._sparse_vector_field]
414422

415423
documents: list[Document] = []
416424
from_ = 0
@@ -680,10 +688,11 @@ def write_documents(
680688
for doc in documents:
681689
doc_dict = doc.to_dict()
682690
# ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
683-
# index mapping carries explicit configuration such as `dims`. A missing field is
684-
# always valid — it lets ingest pipelines populate the value at index time, and for
685-
# ordinary writes it simply means no value is stored. We only strip the known
686-
# Haystack document fields here; metadata values are left untouched intentionally.
691+
# index mapping carries explicit configuration such as `dims`. This applies to all
692+
# writes, not just ingest pipeline writes: any index with a custom_mapping that
693+
# declares explicit field types will reject null values. A missing field is always
694+
# valid — ES treats it as "no value stored". We only strip the known Haystack
695+
# document fields here; metadata values are left untouched intentionally.
687696
for field in ("embedding", "blob", "score"):
688697
if doc_dict.get(field) is None:
689698
doc_dict.pop(field, None)
@@ -770,10 +779,11 @@ async def write_documents_async(
770779
for doc in documents:
771780
doc_dict = doc.to_dict()
772781
# ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
773-
# index mapping carries explicit configuration such as `dims`. A missing field is
774-
# always valid — it lets ingest pipelines populate the value at index time, and for
775-
# ordinary writes it simply means no value is stored. We only strip the known
776-
# Haystack document fields here; metadata values are left untouched intentionally.
782+
# index mapping carries explicit configuration such as `dims`. This applies to all
783+
# writes, not just ingest pipeline writes: any index with a custom_mapping that
784+
# declares explicit field types will reject null values. A missing field is always
785+
# valid — ES treats it as "no value stored". We only strip the known Haystack
786+
# document fields here; metadata values are left untouched intentionally.
777787
for field in ("embedding", "blob", "score"):
778788
if doc_dict.get(field) is None:
779789
doc_dict.pop(field, None)

integrations/elasticsearch/tests/test_cloud_hybrid_retriever.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ def _index_documents_with_inference(client, index: str, inference_id: str, docs:
241241

242242

243243
@pytest.mark.integration
244-
class TestElasticsearchInferenceHybridRetrieverIntegration:
244+
class TestElasticsearchInferenceHybridRetriever:
245245
"""
246246
End-to-end tests against a real Elastic Cloud cluster with a deployed ELSER endpoint.
247247
Run with: pytest -m integration

integrations/elasticsearch/tests/test_cloud_ingest_pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def _get_dense_query_embedding(client, inference_id: str, text: str) -> list[flo
4545

4646

4747
@pytest.mark.integration
48-
class TestIngestPipelineDense:
48+
class TestElasticSearchIngestPipelineDense:
4949
"""
5050
End-to-end integration tests for ElasticsearchDocumentStore with an ingest pipeline
5151
that generates dense embeddings at index time.
@@ -192,7 +192,7 @@ async def test_async_write_documents_via_pipeline(self, ingest_pipeline_dense_do
192192

193193

194194
@pytest.mark.integration
195-
class TestIngestPipelineSparse:
195+
class TestElasticSearchIngestPipelineSparse:
196196
"""
197197
End-to-end integration tests for ElasticsearchDocumentStore with an ingest pipeline
198198
that generates ELSER sparse embeddings at index time.

integrations/elasticsearch/tests/test_cloud_sparse_retriever.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ def _index_documents_with_inference(client, index: str, inference_id: str, docum
304304

305305

306306
@pytest.mark.integration
307-
class TestElasticsearchInferenceSparseRetrieverIntegration:
307+
class TestElasticsearchInferenceSparseRetriever:
308308
"""
309309
End-to-end integration tests for ElasticsearchInferenceSparseRetriever.
310310

integrations/elasticsearch/tests/test_document_store.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,13 @@ def test_write_documents_bulk_passes_pipeline_when_configured(mock_es, _mock_asy
280280
mock_bulk.assert_called_once()
281281
assert mock_bulk.call_args.kwargs["pipeline"] == "my-ingest"
282282

283+
call_actions = mock_bulk.call_args.kwargs["actions"]
284+
assert len(call_actions) == 1
285+
source = call_actions[0]["_source"]
286+
assert "embedding" not in source
287+
assert "blob" not in source
288+
assert "score" not in source
289+
283290

284291
@patch("haystack_integrations.document_stores.elasticsearch.document_store.helpers.bulk")
285292
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
@@ -325,6 +332,28 @@ async def test_write_documents_async_bulk_passes_pipeline_when_configured(mock_e
325332
assert mock_async_bulk.call_args.kwargs["pipeline"] == "pipe-async"
326333

327334

335+
@pytest.mark.asyncio
336+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.helpers.async_bulk")
337+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
338+
@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
339+
async def test_write_documents_async_bulk_omits_pipeline_when_not_configured(mock_es, mock_async_es_cls, mock_async_bulk):
340+
mock_client = Mock()
341+
mock_client.info.return_value = {"version": {"number": "8.0.0"}}
342+
mock_client.indices.exists.return_value = True
343+
mock_es.return_value = mock_client
344+
345+
mock_async_es_cls.return_value = AsyncMock()
346+
347+
mock_async_bulk.return_value = (1, [])
348+
349+
store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="idx_async_no_pipeline")
350+
_ = store.client
351+
await store.write_documents_async([Document(id="1", content="a")])
352+
353+
mock_async_bulk.assert_called_once()
354+
assert "pipeline" not in mock_async_bulk.call_args.kwargs
355+
356+
328357
def test_api_key_validation_only_api_key():
329358
api_key = Secret.from_token("test_api_key")
330359
document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key)

0 commit comments

Comments
 (0)