Increase test coverage and handle None-valued fields

davidsbatista · davidsbatista · commit b62b7d49d38a · 2026-05-04T14:03:15.000+02:00
diff --git a/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py
@@ -373,11 +373,15 @@ def _search_documents(self, **kwargs: Any) -> list[Document]:
         if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
             top_k = kwargs["knn"]["k"]
 
-        # sparse_vector data written by an ingest pipeline is not stored in _source,
-        # but is retrievable via the fields API. Request it explicitly so that
-        # _deserialize_document can populate Document.sparse_embedding correctly.
-        if self._sparse_vector_field and "fields" not in kwargs:
-            kwargs["fields"] = [self._sparse_vector_field]
+        # When an ingest pipeline is configured, sparse_vector data is not stored in _source
+        # (ES indexes it but omits it from the stored document). Request it via the fields API
+        # so that _deserialize_document can populate Document.sparse_embedding correctly.
+        # Merge into any caller-supplied fields list (e.g. fields=["title", ...]) instead of skipping
+        # or replacing it, so neither the caller's projection nor the sparse embedding is lost.
+        if self._ingest_pipeline and self._sparse_vector_field:
+            existing = list(kwargs.get("fields") or [])
+            if self._sparse_vector_field not in existing:
+                kwargs["fields"] = [*existing, self._sparse_vector_field]
 
         documents: list[Document] = []
         from_ = 0
@@ -406,11 +410,15 @@ async def _search_documents_async(self, **kwargs: Any) -> list[Document]:
         if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]:
             top_k = kwargs["knn"]["k"]
 
-        # sparse_vector data written by an ingest pipeline is not stored in _source,
-        # but is retrievable via the fields API. Request it explicitly so that
-        # _deserialize_document can populate Document.sparse_embedding correctly.
-        if self._sparse_vector_field and "fields" not in kwargs:
-            kwargs["fields"] = [self._sparse_vector_field]
+        # When an ingest pipeline is configured, sparse_vector data is not stored in _source
+        # (ES indexes it but omits it from the stored document). Request it via the fields API
+        # so that _deserialize_document can populate Document.sparse_embedding correctly.
+        # Merge into any caller-supplied fields list (e.g. fields=["title", ...]) instead of skipping
+        # or replacing it, so neither the caller's projection nor the sparse embedding is lost.
+        if self._ingest_pipeline and self._sparse_vector_field:
+            existing = list(kwargs.get("fields") or [])
+            if self._sparse_vector_field not in existing:
+                kwargs["fields"] = [*existing, self._sparse_vector_field]
 
         documents: list[Document] = []
         from_ = 0
@@ -680,10 +688,11 @@ def write_documents(
         for doc in documents:
             doc_dict = doc.to_dict()
             # ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
-            # index mapping carries explicit configuration such as `dims`. A missing field is
-            # always valid — it lets ingest pipelines populate the value at index time, and for
-            # ordinary writes it simply means no value is stored. We only strip the known
-            # Haystack document fields here; metadata values are left untouched intentionally.
+            # index mapping carries explicit configuration such as `dims`. This applies to all
+            # writes, not just ingest pipeline writes: any index with a custom_mapping that
+            # declares explicit field types will reject null values. A missing field is always
+            # valid — ES treats it as "no value stored". We only strip the known Haystack
+            # document fields here; metadata values are left untouched intentionally.
             for field in ("embedding", "blob", "score"):
                 if doc_dict.get(field) is None:
                     doc_dict.pop(field, None)
@@ -770,10 +779,11 @@ async def write_documents_async(
         for doc in documents:
             doc_dict = doc.to_dict()
             # ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
-            # index mapping carries explicit configuration such as `dims`. A missing field is
-            # always valid — it lets ingest pipelines populate the value at index time, and for
-            # ordinary writes it simply means no value is stored. We only strip the known
-            # Haystack document fields here; metadata values are left untouched intentionally.
+            # index mapping carries explicit configuration such as `dims`. This applies to all
+            # writes, not just ingest pipeline writes: any index with a custom_mapping that
+            # declares explicit field types will reject null values. A missing field is always
+            # valid — ES treats it as "no value stored". We only strip the known Haystack
+            # document fields here; metadata values are left untouched intentionally.
             for field in ("embedding", "blob", "score"):
                 if doc_dict.get(field) is None:
                     doc_dict.pop(field, None)
diff --git a/integrations/elasticsearch/tests/test_cloud_hybrid_retriever.py b/integrations/elasticsearch/tests/test_cloud_hybrid_retriever.py
@@ -241,7 +241,7 @@ def _index_documents_with_inference(client, index: str, inference_id: str, docs:
 
 
 @pytest.mark.integration
-class TestElasticsearchInferenceHybridRetrieverIntegration:
+class TestElasticsearchInferenceHybridRetriever:
     """
     End-to-end tests against a real Elastic Cloud cluster with a deployed ELSER endpoint.
     Run with: pytest -m integration
diff --git a/integrations/elasticsearch/tests/test_cloud_ingest_pipeline.py b/integrations/elasticsearch/tests/test_cloud_ingest_pipeline.py
@@ -45,7 +45,7 @@ def _get_dense_query_embedding(client, inference_id: str, text: str) -> list[flo
 
 
 @pytest.mark.integration
-class TestIngestPipelineDense:
+class TestElasticSearchIngestPipelineDense:
     """
     End-to-end integration tests for ElasticsearchDocumentStore with an ingest pipeline
     that generates dense embeddings at index time.
@@ -192,7 +192,7 @@ async def test_async_write_documents_via_pipeline(self, ingest_pipeline_dense_do
 
 
 @pytest.mark.integration
-class TestIngestPipelineSparse:
+class TestElasticSearchIngestPipelineSparse:
     """
     End-to-end integration tests for ElasticsearchDocumentStore with an ingest pipeline
     that generates ELSER sparse embeddings at index time.
diff --git a/integrations/elasticsearch/tests/test_cloud_sparse_retriever.py b/integrations/elasticsearch/tests/test_cloud_sparse_retriever.py
@@ -304,7 +304,7 @@ def _index_documents_with_inference(client, index: str, inference_id: str, docum
 
 
 @pytest.mark.integration
-class TestElasticsearchInferenceSparseRetrieverIntegration:
+class TestElasticsearchInferenceSparseRetriever:
     """
     End-to-end integration tests for ElasticsearchInferenceSparseRetriever.
 
diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py
@@ -280,6 +280,13 @@ def test_write_documents_bulk_passes_pipeline_when_configured(mock_es, _mock_asy
     mock_bulk.assert_called_once()
     assert mock_bulk.call_args.kwargs["pipeline"] == "my-ingest"
 
+    call_actions = mock_bulk.call_args.kwargs["actions"]
+    assert len(call_actions) == 1
+    source = call_actions[0]["_source"]
+    assert "embedding" not in source
+    assert "blob" not in source
+    assert "score" not in source
+
 
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.helpers.bulk")
 @patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
@@ -325,6 +332,28 @@ async def test_write_documents_async_bulk_passes_pipeline_when_configured(mock_e
     assert mock_async_bulk.call_args.kwargs["pipeline"] == "pipe-async"
 
 
+@pytest.mark.asyncio
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.helpers.async_bulk")
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.AsyncElasticsearch")
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch")
+async def test_write_documents_async_bulk_omits_pipeline_when_not_configured(mock_es, mock_async_es_cls, mock_async_bulk):
+    mock_client = Mock()
+    mock_client.info.return_value = {"version": {"number": "8.0.0"}}
+    mock_client.indices.exists.return_value = True
+    mock_es.return_value = mock_client
+
+    mock_async_es_cls.return_value = AsyncMock()
+
+    mock_async_bulk.return_value = (1, [])
+
+    store = ElasticsearchDocumentStore(hosts="http://localhost:9200", index="idx_async_no_pipeline")
+    _ = store.client
+    await store.write_documents_async([Document(id="1", content="a")])
+
+    mock_async_bulk.assert_called_once()
+    assert "pipeline" not in mock_async_bulk.call_args.kwargs
+
+
 def test_api_key_validation_only_api_key():
     api_key = Secret.from_token("test_api_key")
     document_store = ElasticsearchDocumentStore(hosts="https://localhost:9200", api_key=api_key)