Skip to content

Commit 6b2081b

Browse files
committed
adding roundtrip tests to assert that document metadata is correctly written and retrieved
1 parent 98ddcf3 commit 6b2081b

4 files changed

Lines changed: 104 additions & 23 deletions

File tree

integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -352,29 +352,10 @@ async def count_documents_async(self) -> int:
352352
@staticmethod
353353
def _deserialize_search_hits(hits: list[dict[str, Any]]) -> list[Document]:
354354
out = []
355-
356355
for hit in hits:
357-
data = hit["_source"].copy()
358-
359-
# Reconstruct metadata dict from flattened fields
360-
meta = {}
361-
fields_to_remove = []
362-
for key, value in data.items():
363-
if key not in SPECIAL_FIELDS:
364-
meta[key] = value
365-
fields_to_remove.append(key)
366-
367-
# Remove metadata fields from top level and add them to meta
368-
for key in fields_to_remove:
369-
data.pop(key, None)
370-
371-
if meta:
372-
data["meta"] = meta
373-
356+
data = hit["_source"]
374357
if "highlight" in hit:
375-
if "meta" not in data:
376-
data["meta"] = {}
377-
data["meta"]["highlighted"] = hit["highlight"]
358+
data["metadata"]["highlighted"] = hit["highlight"]
378359
data["score"] = hit["_score"]
379360
out.append(Document.from_dict(data))
380361

integrations/opensearch/tests/test_bm25_retriever.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,60 @@ def test_bm25_retriever_runtime_document_store_switching(
424424
assert len(results_1_again["documents"]) == 1
425425

426426

427+
@pytest.mark.integration
428+
def test_bm25_retriever_document_structure_with_metadata(document_store):
429+
"""
430+
Test document structure with complex metadata (nested values, lists, etc.)
431+
"""
432+
docs = [
433+
Document(
434+
content="Python is versatile",
435+
meta={
436+
"category": "programming",
437+
"tags": ["python", "general-purpose"],
438+
"rating": 4.5,
439+
"active": True,
440+
"author": {"name": "John", "role": "developer"},
441+
},
442+
id="python_doc",
443+
),
444+
Document(
445+
content="JavaScript is dynamic",
446+
meta={
447+
"category": "programming",
448+
"tags": ["javascript", "web"],
449+
"rating": 4.8,
450+
"active": True,
451+
},
452+
id="js_doc",
453+
),
454+
]
455+
document_store.write_documents(docs, refresh=True)
456+
retriever = OpenSearchBM25Retriever(document_store=document_store)
457+
458+
results = retriever.run(query="programming", top_k=2)
459+
assert len(results["documents"]) == 2
460+
461+
for doc in results["documents"]:
462+
# Verify structure
463+
assert hasattr(doc, "content")
464+
assert hasattr(doc, "meta")
465+
assert isinstance(doc.meta, dict)
466+
467+
# Verify complex metadata is preserved
468+
assert "category" in doc.meta
469+
assert "tags" in doc.meta
470+
assert isinstance(doc.meta["tags"], list)
471+
assert "rating" in doc.meta
472+
473+
# Verify document can be serialized/deserialized
474+
doc_dict = doc.to_dict()
475+
doc_from_dict = Document.from_dict(doc_dict)
476+
assert doc_from_dict.content == doc.content
477+
assert doc_from_dict.meta == doc.meta
478+
assert doc_from_dict.id == doc.id
479+
480+
427481
@pytest.mark.asyncio
428482
@pytest.mark.integration
429483
async def test_bm25_retriever_async_runtime_document_store_switching(

integrations/opensearch/tests/test_document_store.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,6 @@ def test_update_by_filter(self, document_store: OpenSearchDocumentStore):
615615
assert len(draft_docs) == 1
616616
assert draft_docs[0].meta["category"] == "B"
617617

618-
619618
def test_count_documents_by_filter(self, document_store: OpenSearchDocumentStore):
620619
docs = [
621620
Document(content="Doc 1", meta={"category": "A", "status": "active"}),
@@ -914,4 +913,3 @@ def test_delete_with_routing(self, document_store: OpenSearchDocumentStore):
914913
document_store.delete_documents(["1", "2"], routing=routing_map)
915914

916915
assert document_store.count_documents() == 1
917-

integrations/opensearch/tests/test_embedding_retriever.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,3 +404,51 @@ async def test_embedding_retriever_runtime_document_store_switching_async(
404404
python_query_embedding = [0.4, 0.4, 0.4] + [0.0] * 765
405405
results_1_again = await retriever.run_async(query_embedding=python_query_embedding)
406406
assert "Python" in results_1_again["documents"][0].content
407+
408+
409+
@pytest.mark.integration
410+
def test_embedding_retriever_document_structure_with_metadata(document_store, test_documents_with_embeddings_1):
411+
"""
412+
Test that documents returned by embedding retriever have correct structure:
413+
- Metadata fields are in doc.meta (not at top level)
414+
- Special fields (content, embedding, id, score) are at top level
415+
- All original metadata is preserved
416+
"""
417+
document_store.write_documents(test_documents_with_embeddings_1, refresh=True)
418+
retriever = OpenSearchEmbeddingRetriever(document_store=document_store)
419+
420+
# Query embedding to match functional programming languages
421+
query_embedding = [0.2, 0.3, 0.4] + [0.0] * 765
422+
results = retriever.run(query_embedding=query_embedding, top_k=5)
423+
424+
assert len(results["documents"]) > 0
425+
426+
for doc in results["documents"]:
427+
# Verify special fields are at top level
428+
assert hasattr(doc, "content")
429+
assert isinstance(doc.content, str)
430+
assert hasattr(doc, "id")
431+
assert isinstance(doc.id, str)
432+
assert hasattr(doc, "score")
433+
assert doc.score is not None
434+
assert hasattr(doc, "embedding")
435+
assert isinstance(doc.embedding, list)
436+
assert len(doc.embedding) == 768
437+
438+
# Verify metadata fields are in meta dict (not at top level)
439+
assert hasattr(doc, "meta")
440+
assert isinstance(doc.meta, dict)
441+
442+
# Verify original metadata is preserved
443+
assert "likes" in doc.meta
444+
assert "language_type" in doc.meta
445+
assert isinstance(doc.meta["likes"], int)
446+
assert isinstance(doc.meta["language_type"], str)
447+
448+
# Verify document can be serialized/deserialized
449+
doc_dict = doc.to_dict()
450+
doc_from_dict = Document.from_dict(doc_dict)
451+
assert doc_from_dict.content == doc.content
452+
assert doc_from_dict.meta == doc.meta
453+
assert doc_from_dict.id == doc.id
454+
assert doc_from_dict.embedding == doc.embedding

0 commit comments

Comments
 (0)