Skip to content

Commit 75644fe

Browse files
authored
feat: Add serialization/deserialization to DoclingConverter (#3267)
1 parent a402a09 commit 75644fe

2 files changed

Lines changed: 131 additions & 49 deletions

File tree

integrations/docling/src/haystack_integrations/components/converters/docling/converter.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,18 @@
1010
from typing import Any
1111

1212
from docling_core.types.io import DocumentStream
13-
from haystack import Document, component
13+
from haystack import Document, component, logging
1414
from haystack.components.converters.utils import normalize_metadata
15+
from haystack.core.serialization import default_from_dict, default_to_dict
1516
from haystack.dataclasses import ByteStream
17+
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
1618

1719
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
1820
from docling.datamodel.document import DoclingDocument
1921
from docling.document_converter import DocumentConverter
2022

23+
logger = logging.getLogger(__name__)
24+
2125

2226
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
2327
"""
@@ -63,6 +67,15 @@ def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
6367
"""Extract Docling document meta."""
6468
raise NotImplementedError()
6569

70+
def to_dict(self) -> dict[str, Any]:
    """
    Serialize this meta extractor to a dictionary.

    The base implementation carries no state, so the result is always an empty dictionary.

    :returns: An empty dictionary.
    """
    serialized: dict[str, Any] = {}
    return serialized
73+
74+
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
    """
    Deserialize a meta extractor from a dictionary.

    :param data: Serialized form; its contents are not read by this implementation.
    :returns: A new instance of `cls` built with no arguments.
    """
    instance = cls()
    return instance
78+
6679

6780
class MetaExtractor(BaseMetaExtractor):
6881
"""MetaExtractor."""
@@ -123,6 +136,53 @@ def __init__(
123136
self._chunker_instance = chunker or HybridChunker()
124137
self._meta_extractor_instance = meta_extractor or MetaExtractor()
125138

139+
def to_dict(self) -> dict[str, Any]:
    """
    Serialize this component to a dictionary.

    The `converter` and `chunker` init parameters are always written as `None`; a warning is logged for
    each of them that is currently set. The meta extractor, when set, is serialized with
    `serialize_class_instance`, and `export_type` is stored as its enum value.

    :returns: The dictionary built by `default_to_dict` from the init parameters described above.
    """
    # Parameters that are dropped on serialization, paired with the warning emitted when they are set.
    dropped_parameters = (
        (
            self.converter,
            "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
            "The component will use the default DocumentConverter when restored from the serialized form.",
        ),
        (
            self.chunker,
            "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
            "The component will use the default chunker when restored from the serialized form.",
        ),
    )
    for value, message in dropped_parameters:
        if value is not None:
            logger.warning(message)

    extractor = self.meta_extractor
    serialized_extractor = None if extractor is None else serialize_class_instance(extractor)

    init_parameters = {
        "converter": None,
        "convert_kwargs": self.convert_kwargs,
        "export_type": self.export_type.value,
        "md_export_kwargs": self.md_export_kwargs,
        "chunker": None,
        "meta_extractor": serialized_extractor,
    }
    return default_to_dict(self, **init_parameters)
165+
166+
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
    """
    Deserialize this component from a dictionary.

    The `converter` and `chunker` parameters are not serializable and are always ignored during
    deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
    respectively.

    The input dictionary is not modified, so the same serialized data can be passed to `from_dict`
    more than once.

    :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
    :returns: A new `DoclingConverter` instance.
    """
    # Copy the init parameters so that swapping the serialized meta extractor for a live instance
    # does not leak into the caller's dictionary.
    init_params = dict(data.get("init_parameters") or {})

    meta_extractor_data = init_params.get("meta_extractor")
    if meta_extractor_data is not None:
        init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)

    return default_from_dict(cls, {**data, "init_parameters": init_params})
185+
126186
@component.output_types(documents=list[Document])
127187
def run(
128188
self,

integrations/docling/tests/test_converter.py

Lines changed: 70 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
from unittest.mock import MagicMock
88

99
import pytest
10+
from docling.chunking import HybridChunker
11+
from docling.document_converter import DocumentConverter
1012
from docling_core.types.io import DocumentStream
11-
from haystack.core.serialization import component_from_dict, component_to_dict
1213
from haystack.dataclasses import ByteStream
1314

1415
from haystack_integrations.components.converters.docling import (
@@ -134,8 +135,6 @@ def test_run_json_minimal() -> None:
134135

135136

136137
def test_legacy_import_path() -> None:
137-
import warnings
138-
139138
with warnings.catch_warnings(record=True) as caught:
140139
warnings.simplefilter("always")
141140
from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
@@ -146,63 +145,59 @@ def test_legacy_import_path() -> None:
146145
)
147146

148147

149-
def test_component_from_dict_legacy_nulls() -> None:
150-
# Before the public-attribute refactor, default serialization couldn't find
151-
# the _-prefixed attributes and fell back to the init defaults, so
152-
# convert_kwargs and md_export_kwargs were always serialized as null.
153-
# Verify that such a serialized dict still deserializes correctly.
154-
legacy_data = {
148+
def test_component_to_dict_defaults() -> None:
149+
converter = DoclingConverter()
150+
assert converter.to_dict() == {
155151
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
156152
"init_parameters": {
157153
"converter": None,
158-
"convert_kwargs": None,
154+
"convert_kwargs": {},
159155
"export_type": "doc_chunks",
160-
"md_export_kwargs": None,
156+
"md_export_kwargs": {"image_placeholder": ""},
161157
"chunker": None,
162158
"meta_extractor": None,
163159
},
164160
}
165-
restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
166-
167-
assert restored.convert_kwargs == {}
168-
assert restored.md_export_kwargs == {"image_placeholder": ""}
169-
assert restored.export_type == ExportType.DOC_CHUNKS
170-
assert restored.converter is None
171-
assert restored.chunker is None
172-
assert restored.meta_extractor is None
173-
174-
175-
def test_component_to_dict_defaults() -> None:
176-
converter = DoclingConverter()
177-
data = component_to_dict(converter, "docling_converter")
178-
179-
init_params = data["init_parameters"]
180-
assert init_params["converter"] is None
181-
assert init_params["convert_kwargs"] == {}
182-
assert init_params["export_type"] == ExportType.DOC_CHUNKS
183-
assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
184-
assert init_params["chunker"] is None
185-
assert init_params["meta_extractor"] is None
186161

187162

188163
def test_component_to_dict_custom_params() -> None:
189164
converter = DoclingConverter(
165+
converter=DocumentConverter(),
190166
convert_kwargs={"raises_on_error": False},
191167
export_type=ExportType.MARKDOWN,
192168
md_export_kwargs={"image_placeholder": "[img]"},
169+
meta_extractor=MetaExtractor(),
193170
)
194-
data = component_to_dict(converter, "docling_converter")
195-
196-
init_params = data["init_parameters"]
197-
assert init_params["convert_kwargs"] == {"raises_on_error": False}
198-
assert init_params["export_type"] == ExportType.MARKDOWN
199-
assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
171+
assert converter.to_dict() == {
172+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
173+
"init_parameters": {
174+
"converter": None,
175+
"convert_kwargs": {"raises_on_error": False},
176+
"export_type": "markdown",
177+
"md_export_kwargs": {"image_placeholder": "[img]"},
178+
"chunker": None,
179+
"meta_extractor": {
180+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
181+
"data": {},
182+
},
183+
},
184+
}
200185

201186

202187
def test_component_from_dict_defaults() -> None:
203-
converter = DoclingConverter()
204-
data = component_to_dict(converter, "docling_converter")
205-
restored = component_from_dict(DoclingConverter, data, "docling_converter")
188+
# null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
189+
data = {
190+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
191+
"init_parameters": {
192+
"converter": None,
193+
"convert_kwargs": None,
194+
"export_type": "doc_chunks",
195+
"md_export_kwargs": None,
196+
"chunker": None,
197+
"meta_extractor": None,
198+
},
199+
}
200+
restored = DoclingConverter.from_dict(data)
206201

207202
assert restored.converter is None
208203
assert restored.convert_kwargs == {}
@@ -213,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
213208

214209

215210
def test_component_from_dict_custom_params() -> None:
216-
converter = DoclingConverter(
217-
convert_kwargs={"raises_on_error": False},
218-
export_type=ExportType.JSON,
219-
md_export_kwargs={"image_placeholder": "[img]"},
220-
)
221-
data = component_to_dict(converter, "docling_converter")
222-
restored = component_from_dict(DoclingConverter, data, "docling_converter")
211+
data = {
212+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
213+
"init_parameters": {
214+
"converter": None,
215+
"convert_kwargs": {"raises_on_error": False},
216+
"export_type": "json",
217+
"md_export_kwargs": {"image_placeholder": "[img]"},
218+
"chunker": None,
219+
"meta_extractor": {
220+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
221+
"data": {},
222+
},
223+
},
224+
}
225+
restored = DoclingConverter.from_dict(data)
223226

227+
assert restored.converter is None
224228
assert restored.convert_kwargs == {"raises_on_error": False}
225229
assert restored.export_type == ExportType.JSON
226230
assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
231+
assert restored.chunker is None
232+
assert isinstance(restored.meta_extractor, MetaExtractor)
233+
234+
235+
def test_component_to_dict_chunker_warns_and_is_dropped(caplog) -> None:
    """A custom chunker is replaced by `None` in the serialized form and a warning is logged."""
    converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))

    # Capture the log output so the "warns" half of the test name is actually verified.
    with caplog.at_level("WARNING"):
        data = converter.to_dict()

    assert "the 'chunker' parameter cannot be serialized and will be dropped" in caplog.text
    assert data == {
        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
        "init_parameters": {
            "converter": None,
            "convert_kwargs": {},
            "export_type": "doc_chunks",
            "md_export_kwargs": {"image_placeholder": ""},
            "chunker": None,
            "meta_extractor": None,
        },
    }
227249

228250

229251
def test_run_with_sources_parameter() -> None:

0 commit comments

Comments
 (0)