Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 21 additions & 12 deletions haystack/components/extractors/llm_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import copy
import json
from concurrent.futures import ThreadPoolExecutor
from dataclasses import replace
from typing import Any, Dict, List, Optional, Union

from jinja2 import meta
Expand Down Expand Up @@ -319,23 +320,31 @@ def run(self, documents: List[Document], page_range: Optional[List[Union[str, in
failed_documents = []
for document, result in zip(documents, results):
if "error" in result:
document.meta["metadata_extraction_error"] = result["error"]
document.meta["metadata_extraction_response"] = None
failed_documents.append(document)
new_meta = {
**document.meta,
"metadata_extraction_error": result["error"],
"metadata_extraction_response": None,
}
# We use replace to ensure we don't modify the original document
failed_documents.append(replace(document, meta=new_meta))
continue

parsed_metadata = self._extract_metadata(result["replies"][0].text)
if "error" in parsed_metadata:
document.meta["metadata_extraction_error"] = parsed_metadata["error"]
document.meta["metadata_extraction_response"] = result["replies"][0]
failed_documents.append(document)
new_meta = {
**document.meta,
"metadata_extraction_error": parsed_metadata["error"],
"metadata_extraction_response": result["replies"][0],
}
# We use replace to ensure we don't modify the original document
failed_documents.append(replace(document, meta=new_meta))
continue

for key in parsed_metadata:
document.meta[key] = parsed_metadata[key]
# Remove metadata_extraction_error and metadata_extraction_response if present from previous runs
document.meta.pop("metadata_extraction_error", None)
document.meta.pop("metadata_extraction_response", None)
successful_documents.append(document)
new_meta = {**document.meta, **parsed_metadata}
# Remove metadata_extraction_error and metadata_extraction_response if present from previous runs
new_meta.pop("metadata_extraction_error", None)
new_meta.pop("metadata_extraction_response", None)
# We use replace to ensure we don't modify the original document
successful_documents.append(replace(document, meta=new_meta))

return {"documents": successful_documents, "failed_documents": failed_documents}
8 changes: 8 additions & 0 deletions test/components/extractors/test_llm_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,13 @@ def test_run_with_document_content_none(self, monkeypatch):
assert len(result["failed_documents"]) == 2

failed_doc_none = result["failed_documents"][0]
assert failed_doc_none.id
assert failed_doc_none.id == doc_with_none_content.id
assert "metadata_extraction_error" in failed_doc_none.meta
assert failed_doc_none.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."

failed_doc_empty = result["failed_documents"][1]
assert failed_doc_empty.id
assert failed_doc_empty.id == doc_with_empty_content.id
assert "metadata_extraction_error" in failed_doc_empty.meta
assert failed_doc_empty.meta["metadata_extraction_error"] == "Document has no content, skipping LLM call."
Expand Down Expand Up @@ -322,3 +324,9 @@ def test_live_run(self):
assert len(doc_store_docs) == 2
assert "entities" in doc_store_docs[0].meta
assert "entities" in doc_store_docs[1].meta

# Check that IDs of documents in doc store are the same as the original documents
doc_store_doc_ids = {doc.id for doc in doc_store_docs}
assert len(doc_store_doc_ids) == 2
original_doc_ids = {doc.id for doc in docs}
assert doc_store_doc_ids == original_doc_ids
Loading