From b787be678478200a1bf1e78db2aff5bb0ad04664 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Wed, 6 May 2026 12:28:14 +0500 Subject: [PATCH 1/2] fix(docling): change default export type to MARKDOWN and add page_number to chunk metadata - ExportType.MARKDOWN is now the default (was DOC_CHUNKS), aligning with Haystack convention of separating conversion from chunking - MetaExtractor.extract_chunk_meta now extracts page_number from chunk provenance, making metadata consistent with other Haystack splitters --- integrations/docling/CHANGELOG.md | 10 ++++++ .../converters/docling/converter.py | 15 +++++--- integrations/docling/tests/test_converter.py | 35 +++++++++++++++++-- 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/integrations/docling/CHANGELOG.md b/integrations/docling/CHANGELOG.md index bdf7c9657c..523cc117e5 100644 --- a/integrations/docling/CHANGELOG.md +++ b/integrations/docling/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## [Unreleased] + +### 🚀 Features + +- Make `ExportType.MARKDOWN` the default export type; add `page_number` to chunk metadata (#3256) + +### ⚠ Breaking Changes + +- `DoclingConverter` now defaults to `ExportType.MARKDOWN` instead of `ExportType.DOC_CHUNKS` + ## [integrations/docling-v0.4.0] - 2026-05-04 ### 🚀 Features diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index eaa4958e2f..24d97a5fac 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -82,7 +82,12 @@ class MetaExtractor(BaseMetaExtractor): def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]: """Extract chunk meta.""" - return {"dl_meta": chunk.export_json_dict()} + meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()} + doc_items = getattr(chunk.meta, "doc_items", []) + page_nos = {prov.page_no for item in doc_items for prov in getattr(item, "prov", [])} + if page_nos: + meta["page_number"] = min(page_nos) + return meta def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]: """Extract Docling document meta.""" @@ -97,7 +102,7 @@ def __init__( self, converter: DocumentConverter | None = None, convert_kwargs: dict[str, Any] | None = None, - export_type: ExportType = ExportType.DOC_CHUNKS, + export_type: ExportType = ExportType.MARKDOWN, md_export_kwargs: dict[str, Any] | None = None, chunker: BaseChunker | None = None, meta_extractor: BaseMetaExtractor | None = None, @@ -110,10 +115,10 @@ def __init__( :param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a system default is used. :param export_type: The export mode to use: - * `ExportType.MARKDOWN` captures each input document as a single + * `ExportType.MARKDOWN` (default) captures each input document as a single markdown `Document`. - * `ExportType.DOC_CHUNKS` (default) first chunks each input document - and then returns one `Document` per chunk. + * `ExportType.DOC_CHUNKS` first chunks each input document and then returns + one `Document` per chunk. * `ExportType.JSON` serializes the full Docling document to a JSON string. :param md_export_kwargs: Any parameters to pass to Markdown export (applicable in case of `ExportType.MARKDOWN`). diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 5ba9f9c4e4..eac251e91f 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -152,7 +152,7 @@ def test_component_to_dict_defaults() -> None: "init_parameters": { "converter": None, "convert_kwargs": {}, - "export_type": "doc_chunks", + "export_type": "markdown", "md_export_kwargs": {"image_placeholder": ""}, "chunker": None, "meta_extractor": None, @@ -233,7 +233,7 @@ def test_component_from_dict_custom_params() -> None: def test_component_to_dict_chunker_warns_and_is_dropped() -> None: - converter = DoclingConverter(chunker=HybridChunker(merge_peers=False)) + converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False)) assert converter.to_dict() == { "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", @@ -484,12 +484,43 @@ class TestMetaExtractor: def test_extract_chunk_meta_wraps_export_json_dict(self) -> None: chunk = MagicMock() chunk.export_json_dict.return_value = {"some": "dict"} + chunk.meta.doc_items = [] result = MetaExtractor().extract_chunk_meta(chunk=chunk) assert result == {"dl_meta": {"some": "dict"}} chunk.export_json_dict.assert_called_once_with() + def test_extract_chunk_meta_includes_page_number(self) -> None: + prov = MagicMock() + prov.page_no = 3 + doc_item = MagicMock() + doc_item.prov = [prov] + + chunk = MagicMock() + chunk.export_json_dict.return_value = {"some": "dict"} + chunk.meta.doc_items = [doc_item] + + result = MetaExtractor().extract_chunk_meta(chunk=chunk) + + assert result == {"dl_meta": {"some": "dict"}, "page_number": 3} + + def test_extract_chunk_meta_page_number_uses_minimum(self) -> None: + prov1 = MagicMock() + prov1.page_no = 5 + prov2 = MagicMock() + prov2.page_no = 3 + doc_item = MagicMock() + doc_item.prov = [prov1, prov2] + + chunk = MagicMock() + chunk.export_json_dict.return_value = {} + chunk.meta.doc_items = [doc_item] + + result = MetaExtractor().extract_chunk_meta(chunk=chunk) + + assert result["page_number"] == 3 + def test_extract_dl_doc_meta_with_origin(self) -> None: dl_doc = MagicMock() dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"} From ab4121c02e8940863e72c37d6170c38eed5e42d8 Mon Sep 17 00:00:00 2001 From: SyedShahmeerAli12 Date: Thu, 7 May 2026 01:01:38 +0500 Subject: [PATCH 2/2] fix(docling): add split_id and split_idx_start to chunk metadata; revert CHANGELOG --- integrations/docling/CHANGELOG.md | 10 --- .../converters/docling/converter.py | 20 ++--- integrations/docling/tests/test_converter.py | 75 ++++++++++++++++++- 3 files changed, 85 insertions(+), 20 deletions(-) diff --git a/integrations/docling/CHANGELOG.md b/integrations/docling/CHANGELOG.md index 523cc117e5..bdf7c9657c 100644 --- a/integrations/docling/CHANGELOG.md +++ b/integrations/docling/CHANGELOG.md @@ -1,15 +1,5 @@ # Changelog -## [Unreleased] - -### 🚀 Features - -- Make `ExportType.MARKDOWN` the default export type; add `page_number` to chunk metadata (#3256) - -### ⚠ Breaking Changes - -- `DoclingConverter` now defaults to `ExportType.MARKDOWN` instead of `ExportType.DOC_CHUNKS` - ## [integrations/docling-v0.4.0] - 2026-05-04 ### 🚀 Features diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index 24d97a5fac..0c69c5c600 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -239,15 +239,17 @@ def run( merged_meta = source_meta if self.export_type == ExportType.DOC_CHUNKS: - chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc) - hs_docs = [ - Document( - content=self._chunker_instance.contextualize(chunk=chunk), - meta={**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), **merged_meta}, - ) - for chunk in chunk_iter - ] - documents.extend(hs_docs) + split_idx_start = 0 + for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)): + content = self._chunker_instance.contextualize(chunk=chunk) + meta = { + **self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), + "split_id": split_id, + "split_idx_start": split_idx_start, + **merged_meta, + } + documents.append(Document(content=content, meta=meta)) + split_idx_start += len(chunk.text) elif self.export_type == ExportType.MARKDOWN: hs_doc = Document( content=dl_doc.export_to_markdown(**self.md_export_kwargs), diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index eac251e91f..e758b6ec15 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -59,7 +59,7 @@ def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]: assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents - assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas + assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas) # Ensure our collaborators were actually exercised. assert converter_mock.convert.call_count == len(paths) @@ -543,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None: converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock()) with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."): converter.run() + + +def test_run_doc_chunks_split_id_and_split_idx_start() -> None: + converter_mock = MagicMock() + chunker_mock = MagicMock() + meta_extractor_mock = MagicMock() + + converter_mock.convert.return_value = SimpleNamespace(document="dl-doc") + + chunks = [ + SimpleNamespace(text="hello world"), + SimpleNamespace(text="foo bar baz"), + ] + chunker_mock.chunk.return_value = chunks + chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}" + meta_extractor_mock.extract_chunk_meta.return_value = {} + + converter = DoclingConverter( + converter=converter_mock, + export_type=ExportType.DOC_CHUNKS, + chunker=chunker_mock, + meta_extractor=meta_extractor_mock, + ) + + result = converter.run(sources=["doc.pdf"]) + documents = result["documents"] + + assert len(documents) == 2 + assert documents[0].meta["split_id"] == 0 + assert documents[0].meta["split_idx_start"] == 0 + assert documents[1].meta["split_id"] == 1 + assert documents[1].meta["split_idx_start"] == len("hello world") + + +def test_run_doc_chunks_split_id_resets_per_document() -> None: + converter_mock = MagicMock() + chunker_mock = MagicMock() + meta_extractor_mock = MagicMock() + + converter_mock.convert.side_effect = [ + SimpleNamespace(document="dl-doc-a"), + SimpleNamespace(document="dl-doc-b"), + ] + chunker_mock.chunk.side_effect = lambda dl_doc: [ + SimpleNamespace(text=f"chunk-1-of-{dl_doc}"), + SimpleNamespace(text=f"chunk-2-of-{dl_doc}"), + ] + chunker_mock.contextualize.side_effect = lambda chunk: chunk.text + meta_extractor_mock.extract_chunk_meta.return_value = {} + + converter = DoclingConverter( + converter=converter_mock, + export_type=ExportType.DOC_CHUNKS, + chunker=chunker_mock, + meta_extractor=meta_extractor_mock, + ) + + result = converter.run(sources=["a.pdf", "b.pdf"]) + documents = result["documents"] + + # split_id and split_idx_start reset for each source document + doc_a_chunks = documents[:2] + doc_b_chunks = documents[2:] + + assert doc_a_chunks[0].meta["split_id"] == 0 + assert doc_a_chunks[0].meta["split_idx_start"] == 0 + assert doc_a_chunks[1].meta["split_id"] == 1 + assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a") + + assert doc_b_chunks[0].meta["split_id"] == 0 + assert doc_b_chunks[0].meta["split_idx_start"] == 0 + assert doc_b_chunks[1].meta["split_id"] == 1 + assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")