Skip to content

Commit ab18eff

Browse files
authored
fix: Weaviate - stop ignoring _split_overlap meta field (#2966)
* stop ignoring _split_overlap meta field
* rename test
1 parent 3d1c245 commit ab18eff

2 files changed

Lines changed: 20 additions & 26 deletions

File tree

integrations/weaviate/src/haystack_integrations/document_stores/weaviate/document_store.py

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -745,15 +745,6 @@ def _to_data_object(document: Document) -> dict[str, Any]:
745745
# The embedding vector is stored separately from the rest of the data
746746
del data["embedding"]
747747

748-
# _split_overlap meta field is unsupported because of a bug
749-
# https://github.com/deepset-ai/haystack-core-integrations/issues/1172
750-
if "_split_overlap" in data:
751-
data.pop("_split_overlap")
752-
logger.warning(
753-
"Document {id} has the unsupported `_split_overlap` meta field. It will be ignored.",
754-
id=data["_original_id"],
755-
)
756-
757748
if "sparse_embedding" in data:
758749
sparse_embedding = data.pop("sparse_embedding", None)
759750
if sparse_embedding:

integrations/weaviate/tests/test_document_store.py

Lines changed: 20 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -464,29 +464,32 @@ def test_comparison_less_than_equal_with_iso_date(self, document_store, filterab
464464
],
465465
)
466466

467-
def test_meta_split_overlap_is_skipped(self, document_store):
467+
def test_split_overlap_preserved(self, document_store):
468+
"""Split overlap meta is written and read back correctly."""
469+
overlap = [
470+
{"range": [3.0, 13.0], "doc_id": "34326b7e6be489cb4c031152fc378cb50479ca5fcc3861e7e61dfb2e4e4e968b"},
471+
{"range": [0.0, 13.0], "doc_id": "780f791c09d499c0bf01f87bce047b45c44224d36c79f0c9d8c1405a3197fc1a"},
472+
]
468473
doc = Document(
469-
content="The moonlight shimmered ",
474+
id="6edd24e8b01f3cd6e4b71fef7d57b52f17664e14db5ab01b8ef429f97add3620",
475+
content="an eighth test. ",
470476
meta={
471-
"source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
472-
"page_number": 1,
473-
"split_id": 0,
474-
"split_idx_start": 0,
475-
"_split_overlap": [
476-
{"doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65", "range": (0, 20)}
477-
],
477+
"_split_overlap": overlap,
478+
"page_number": 1.0,
479+
"split_id": 33.0,
480+
"split_idx_start": 159.0,
481+
"source_id": "fdbde6d217f04d3dd60c01f36541794f3153a61f13b4ca669655f4c5610c1664",
478482
},
479483
)
480484
document_store.write_documents([doc])
481-
482485
written_doc = document_store.filter_documents()[0]
483-
484-
assert written_doc.content == "The moonlight shimmered "
485-
assert written_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
486-
assert written_doc.meta["page_number"] == 1.0
487-
assert written_doc.meta["split_id"] == 0.0
488-
assert written_doc.meta["split_idx_start"] == 0.0
489-
assert "_split_overlap" not in written_doc.meta
486+
assert "_split_overlap" in written_doc.meta
487+
written_overlap = written_doc.meta["_split_overlap"]
488+
assert len(written_overlap) == 2
489+
assert written_overlap[0]["doc_id"] == overlap[0]["doc_id"]
490+
assert list(written_overlap[0]["range"]) == [3, 13]
491+
assert written_overlap[1]["doc_id"] == overlap[1]["doc_id"]
492+
assert list(written_overlap[1]["range"]) == [0, 13]
490493

491494
def test_bm25_retrieval(self, document_store):
492495
document_store.write_documents(

0 commit comments

Comments
 (0)