Skip to content

Commit ab4121c

Browse files
fix(docling): add split_id and split_idx_start to chunk metadata; revert CHANGELOG
1 parent b787be6 commit ab4121c

3 files changed

Lines changed: 85 additions & 20 deletions

File tree

integrations/docling/CHANGELOG.md

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,5 @@
11
# Changelog
22

3-
## [Unreleased]
4-
5-
### 🚀 Features
6-
7-
- Make `ExportType.MARKDOWN` the default export type; add `page_number` to chunk metadata (#3256)
8-
9-
### ⚠ Breaking Changes
10-
11-
- `DoclingConverter` now defaults to `ExportType.MARKDOWN` instead of `ExportType.DOC_CHUNKS`
12-
133
## [integrations/docling-v0.4.0] - 2026-05-04
144

155
### 🚀 Features

integrations/docling/src/haystack_integrations/components/converters/docling/converter.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -239,15 +239,17 @@ def run(
239239
merged_meta = source_meta
240240

241241
if self.export_type == ExportType.DOC_CHUNKS:
242-
chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc)
243-
hs_docs = [
244-
Document(
245-
content=self._chunker_instance.contextualize(chunk=chunk),
246-
meta={**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), **merged_meta},
247-
)
248-
for chunk in chunk_iter
249-
]
250-
documents.extend(hs_docs)
242+
split_idx_start = 0
243+
for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
244+
content = self._chunker_instance.contextualize(chunk=chunk)
245+
meta = {
246+
**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
247+
"split_id": split_id,
248+
"split_idx_start": split_idx_start,
249+
**merged_meta,
250+
}
251+
documents.append(Document(content=content, meta=meta))
252+
split_idx_start += len(chunk.text)
251253
elif self.export_type == ExportType.MARKDOWN:
252254
hs_doc = Document(
253255
content=dl_doc.export_to_markdown(**self.md_export_kwargs),

integrations/docling/tests/test_converter.py

Lines changed: 74 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]:
5959

6060
assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
6161
assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
62-
assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas
62+
assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)
6363

6464
# Ensure our collaborators were actually exercised.
6565
assert converter_mock.convert.call_count == len(paths)
@@ -543,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None:
543543
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
544544
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
545545
converter.run()
546+
547+
548+
def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
549+
converter_mock = MagicMock()
550+
chunker_mock = MagicMock()
551+
meta_extractor_mock = MagicMock()
552+
553+
converter_mock.convert.return_value = SimpleNamespace(document="dl-doc")
554+
555+
chunks = [
556+
SimpleNamespace(text="hello world"),
557+
SimpleNamespace(text="foo bar baz"),
558+
]
559+
chunker_mock.chunk.return_value = chunks
560+
chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"
561+
meta_extractor_mock.extract_chunk_meta.return_value = {}
562+
563+
converter = DoclingConverter(
564+
converter=converter_mock,
565+
export_type=ExportType.DOC_CHUNKS,
566+
chunker=chunker_mock,
567+
meta_extractor=meta_extractor_mock,
568+
)
569+
570+
result = converter.run(sources=["doc.pdf"])
571+
documents = result["documents"]
572+
573+
assert len(documents) == 2
574+
assert documents[0].meta["split_id"] == 0
575+
assert documents[0].meta["split_idx_start"] == 0
576+
assert documents[1].meta["split_id"] == 1
577+
assert documents[1].meta["split_idx_start"] == len("hello world")
578+
579+
580+
def test_run_doc_chunks_split_id_resets_per_document() -> None:
581+
converter_mock = MagicMock()
582+
chunker_mock = MagicMock()
583+
meta_extractor_mock = MagicMock()
584+
585+
converter_mock.convert.side_effect = [
586+
SimpleNamespace(document="dl-doc-a"),
587+
SimpleNamespace(document="dl-doc-b"),
588+
]
589+
chunker_mock.chunk.side_effect = lambda dl_doc: [
590+
SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
591+
SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
592+
]
593+
chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
594+
meta_extractor_mock.extract_chunk_meta.return_value = {}
595+
596+
converter = DoclingConverter(
597+
converter=converter_mock,
598+
export_type=ExportType.DOC_CHUNKS,
599+
chunker=chunker_mock,
600+
meta_extractor=meta_extractor_mock,
601+
)
602+
603+
result = converter.run(sources=["a.pdf", "b.pdf"])
604+
documents = result["documents"]
605+
606+
# split_id and split_idx_start reset for each source document
607+
doc_a_chunks = documents[:2]
608+
doc_b_chunks = documents[2:]
609+
610+
assert doc_a_chunks[0].meta["split_id"] == 0
611+
assert doc_a_chunks[0].meta["split_idx_start"] == 0
612+
assert doc_a_chunks[1].meta["split_id"] == 1
613+
assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")
614+
615+
assert doc_b_chunks[0].meta["split_id"] == 0
616+
assert doc_b_chunks[0].meta["split_idx_start"] == 0
617+
assert doc_b_chunks[1].meta["split_id"] == 1
618+
assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")

0 commit comments

Comments
 (0)