Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ class MetaExtractor(BaseMetaExtractor):

def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
    """Extract chunk meta.

    :param chunk: The chunk to describe. It must provide ``export_json_dict()`` and a
        ``meta`` attribute; page provenance is read from
        ``chunk.meta.doc_items[*].prov[*].page_no`` when those attributes are present.
    :returns: A dict holding the exported chunk under ``"dl_meta"`` and, if at least one
        provenance entry is found, the lowest page number under ``"page_number"``.
    """
    meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()}
    # `or ()` also covers attributes that exist but are None, which a plain
    # getattr default would let through and then fail on iteration.
    doc_items = getattr(chunk.meta, "doc_items", None) or ()
    page_nos = {
        prov.page_no
        for item in doc_items
        for prov in getattr(item, "prov", None) or ()
    }
    if page_nos:
        # A chunk can span several pages; the smallest page number is reported.
        meta["page_number"] = min(page_nos)
    return meta

def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
"""Extract Docling document meta."""
Expand All @@ -97,7 +102,7 @@ def __init__(
self,
converter: DocumentConverter | None = None,
convert_kwargs: dict[str, Any] | None = None,
export_type: ExportType = ExportType.DOC_CHUNKS,
export_type: ExportType = ExportType.MARKDOWN,
md_export_kwargs: dict[str, Any] | None = None,
chunker: BaseChunker | None = None,
meta_extractor: BaseMetaExtractor | None = None,
Expand All @@ -110,10 +115,10 @@ def __init__(
:param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a
system default is used.
:param export_type: The export mode to use:
* `ExportType.MARKDOWN` captures each input document as a single
* `ExportType.MARKDOWN` (default) captures each input document as a single
markdown `Document`.
* `ExportType.DOC_CHUNKS` (default) first chunks each input document
and then returns one `Document` per chunk.
* `ExportType.DOC_CHUNKS` first chunks each input document and then returns
one `Document` per chunk.
* `ExportType.JSON` serializes the full Docling document to a JSON string.
:param md_export_kwargs: Any parameters to pass to Markdown export (applicable in
case of `ExportType.MARKDOWN`).
Expand Down Expand Up @@ -234,15 +239,17 @@ def run(
merged_meta = source_meta

if self.export_type == ExportType.DOC_CHUNKS:
chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc)
hs_docs = [
Document(
content=self._chunker_instance.contextualize(chunk=chunk),
meta={**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), **merged_meta},
)
for chunk in chunk_iter
]
documents.extend(hs_docs)
split_idx_start = 0
for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
content = self._chunker_instance.contextualize(chunk=chunk)
meta = {
**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
"split_id": split_id,
"split_idx_start": split_idx_start,
**merged_meta,
}
documents.append(Document(content=content, meta=meta))
split_idx_start += len(chunk.text)
elif self.export_type == ExportType.MARKDOWN:
hs_doc = Document(
content=dl_doc.export_to_markdown(**self.md_export_kwargs),
Expand Down
110 changes: 107 additions & 3 deletions integrations/docling/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]:

assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas
assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)

# Ensure our collaborators were actually exercised.
assert converter_mock.convert.call_count == len(paths)
Expand Down Expand Up @@ -152,7 +152,7 @@ def test_component_to_dict_defaults() -> None:
"init_parameters": {
"converter": None,
"convert_kwargs": {},
"export_type": "doc_chunks",
"export_type": "markdown",
"md_export_kwargs": {"image_placeholder": ""},
"chunker": None,
"meta_extractor": None,
Expand Down Expand Up @@ -233,7 +233,7 @@ def test_component_from_dict_custom_params() -> None:


def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False))

assert converter.to_dict() == {
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
Expand Down Expand Up @@ -484,12 +484,43 @@ class TestMetaExtractor:
def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
    """A chunk without doc items yields only the exported dict under ``dl_meta``."""
    fake_chunk = MagicMock()
    fake_chunk.meta.doc_items = []
    fake_chunk.export_json_dict.return_value = {"some": "dict"}

    extracted = MetaExtractor().extract_chunk_meta(chunk=fake_chunk)

    assert extracted == {"dl_meta": {"some": "dict"}}
    # The export must be requested exactly once and without arguments.
    fake_chunk.export_json_dict.assert_called_once_with()

def test_extract_chunk_meta_includes_page_number(self) -> None:
    """The page number of a single provenance entry is exposed as ``page_number``."""
    provenance = MagicMock(page_no=3)
    item = MagicMock(prov=[provenance])

    fake_chunk = MagicMock()
    fake_chunk.meta.doc_items = [item]
    fake_chunk.export_json_dict.return_value = {"some": "dict"}

    extracted = MetaExtractor().extract_chunk_meta(chunk=fake_chunk)

    expected = {"dl_meta": {"some": "dict"}, "page_number": 3}
    assert extracted == expected

def test_extract_chunk_meta_page_number_uses_minimum(self) -> None:
    """When provenance entries name several pages, the smallest one is reported."""
    # Deliberately listed out of order so a "first entry wins" implementation would fail.
    provenances = [MagicMock(page_no=5), MagicMock(page_no=3)]
    item = MagicMock(prov=provenances)

    fake_chunk = MagicMock()
    fake_chunk.meta.doc_items = [item]
    fake_chunk.export_json_dict.return_value = {}

    extracted = MetaExtractor().extract_chunk_meta(chunk=fake_chunk)

    assert extracted["page_number"] == 3

def test_extract_dl_doc_meta_with_origin(self) -> None:
dl_doc = MagicMock()
dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
Expand All @@ -512,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None:
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
converter.run()


def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
    """Chunks of one document get consecutive ``split_id`` values and cumulative offsets."""
    first_text = "hello world"
    second_text = "foo bar baz"

    docling_converter = MagicMock()
    docling_converter.convert.return_value = SimpleNamespace(document="dl-doc")

    chunker = MagicMock()
    chunker.chunk.return_value = [
        SimpleNamespace(text=first_text),
        SimpleNamespace(text=second_text),
    ]
    # Parameter name stays `chunk`: the stub must accept it as a keyword argument.
    chunker.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"

    meta_extractor = MagicMock()
    meta_extractor.extract_chunk_meta.return_value = {}

    component = DoclingConverter(
        converter=docling_converter,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker,
        meta_extractor=meta_extractor,
    )

    documents = component.run(sources=["doc.pdf"])["documents"]

    assert len(documents) == 2
    observed = [(doc.meta["split_id"], doc.meta["split_idx_start"]) for doc in documents]
    # The second chunk starts where the first chunk's raw text ends.
    assert observed == [(0, 0), (1, len(first_text))]

def test_run_doc_chunks_split_id_resets_per_document() -> None:
    """``split_id`` and ``split_idx_start`` start over for every source document."""
    docling_converter = MagicMock()
    docling_converter.convert.side_effect = [
        SimpleNamespace(document="dl-doc-a"),
        SimpleNamespace(document="dl-doc-b"),
    ]

    def two_chunks(dl_doc: Any) -> list[SimpleNamespace]:
        # Parameter name stays `dl_doc`: the stub must accept it as a keyword argument.
        return [
            SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
            SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
        ]

    chunker = MagicMock()
    chunker.chunk.side_effect = two_chunks
    chunker.contextualize.side_effect = lambda chunk: chunk.text

    meta_extractor = MagicMock()
    meta_extractor.extract_chunk_meta.return_value = {}

    component = DoclingConverter(
        converter=docling_converter,
        export_type=ExportType.DOC_CHUNKS,
        chunker=chunker,
        meta_extractor=meta_extractor,
    )

    documents = component.run(sources=["a.pdf", "b.pdf"])["documents"]

    # Each source contributes two chunks; pair every slice with the text of its first chunk.
    expectations = (
        (documents[:2], "chunk-1-of-dl-doc-a"),
        (documents[2:], "chunk-1-of-dl-doc-b"),
    )
    for per_doc_chunks, first_chunk_text in expectations:
        assert per_doc_chunks[0].meta["split_id"] == 0
        assert per_doc_chunks[0].meta["split_idx_start"] == 0
        assert per_doc_chunks[1].meta["split_id"] == 1
        assert per_doc_chunks[1].meta["split_idx_start"] == len(first_chunk_text)
Loading