@@ -373,11 +373,15 @@ def _search_documents(self, **kwargs: Any) -> list[Document]:
373373 if top_k is None and "knn" in kwargs and "k" in kwargs ["knn" ]:
374374 top_k = kwargs ["knn" ]["k" ]
375375
376- # sparse_vector data written by an ingest pipeline is not stored in _source,
377- # but is retrievable via the fields API. Request it explicitly so that
378- # _deserialize_document can populate Document.sparse_embedding correctly.
379- if self ._sparse_vector_field and "fields" not in kwargs :
380- kwargs ["fields" ] = [self ._sparse_vector_field ]
376+ # When an ingest pipeline is configured, sparse_vector data is not stored in _source
377+ # (ES indexes it but omits it from the stored document). Request it via the fields API
378+ # so that _deserialize_document can populate Document.sparse_embedding correctly.
379+ # Merge rather than replace or skip: a caller may already pass fields=["title", ...] for a custom
380+ # projection — overwriting would drop their fields; skipping would leave sparse_embedding unset.
381+ if self ._ingest_pipeline and self ._sparse_vector_field :
382+ existing = list (kwargs .get ("fields" ) or [])
383+ if self ._sparse_vector_field not in existing :
384+ kwargs ["fields" ] = [* existing , self ._sparse_vector_field ]
381385
382386 documents : list [Document ] = []
383387 from_ = 0
@@ -406,11 +410,15 @@ async def _search_documents_async(self, **kwargs: Any) -> list[Document]:
406410 if top_k is None and "knn" in kwargs and "k" in kwargs ["knn" ]:
407411 top_k = kwargs ["knn" ]["k" ]
408412
409- # sparse_vector data written by an ingest pipeline is not stored in _source,
410- # but is retrievable via the fields API. Request it explicitly so that
411- # _deserialize_document can populate Document.sparse_embedding correctly.
412- if self ._sparse_vector_field and "fields" not in kwargs :
413- kwargs ["fields" ] = [self ._sparse_vector_field ]
413+ # When an ingest pipeline is configured, sparse_vector data is not stored in _source
414+ # (ES indexes it but omits it from the stored document). Request it via the fields API
415+ # so that _deserialize_document can populate Document.sparse_embedding correctly.
416+ # Merge rather than replace or skip: a caller may already pass fields=["title", ...] for a custom
417+ # projection — overwriting would drop their fields; skipping would leave sparse_embedding unset.
418+ if self ._ingest_pipeline and self ._sparse_vector_field :
419+ existing = list (kwargs .get ("fields" ) or [])
420+ if self ._sparse_vector_field not in existing :
421+ kwargs ["fields" ] = [* existing , self ._sparse_vector_field ]
414422
415423 documents : list [Document ] = []
416424 from_ = 0
@@ -680,10 +688,11 @@ def write_documents(
680688 for doc in documents :
681689 doc_dict = doc .to_dict ()
682690 # ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
683- # index mapping carries explicit configuration such as `dims`. A missing field is
684- # always valid — it lets ingest pipelines populate the value at index time, and for
685- # ordinary writes it simply means no value is stored. We only strip the known
686- # Haystack document fields here; metadata values are left untouched intentionally.
691+ # index mapping carries explicit configuration such as `dims`. This applies to all
692+ # writes, not just ingest pipeline writes: any index with a custom_mapping that
693+ # declares explicit field types will reject null values. A missing field is always
694+ # valid — ES treats it as "no value stored". We only strip the known Haystack
695+ # document fields here; metadata values are left untouched intentionally.
687696 for field in ("embedding" , "blob" , "score" ):
688697 if doc_dict .get (field ) is None :
689698 doc_dict .pop (field , None )
@@ -770,10 +779,11 @@ async def write_documents_async(
770779 for doc in documents :
771780 doc_dict = doc .to_dict ()
772781 # ES rejects null for strongly-typed fields (dense_vector, sparse_vector) when the
773- # index mapping carries explicit configuration such as `dims`. A missing field is
774- # always valid — it lets ingest pipelines populate the value at index time, and for
775- # ordinary writes it simply means no value is stored. We only strip the known
776- # Haystack document fields here; metadata values are left untouched intentionally.
782+ # index mapping carries explicit configuration such as `dims`. This applies to all
783+ # writes, not just ingest pipeline writes: any index with a custom_mapping that
784+ # declares explicit field types will reject null values. A missing field is always
785+ # valid — ES treats it as "no value stored". We only strip the known Haystack
786+ # document fields here; metadata values are left untouched intentionally.
777787 for field in ("embedding" , "blob" , "score" ):
778788 if doc_dict .get (field ) is None :
779789 doc_dict .pop (field , None )
0 commit comments