diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py
index f7581e9c66..eaa4958e2f 100644
--- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py
+++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py
@@ -10,14 +10,18 @@
 from typing import Any
 
 from docling_core.types.io import DocumentStream
-from haystack import Document, component
+from haystack import Document, component, logging
 from haystack.components.converters.utils import normalize_metadata
+from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.dataclasses import ByteStream
+from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
 
 from docling.chunking import BaseChunk, BaseChunker, HybridChunker
 from docling.datamodel.document import DoclingDocument
 from docling.document_converter import DocumentConverter
 
+logger = logging.getLogger(__name__)
+
 
 def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
     """
@@ -63,6 +67,15 @@ def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
         """Extract Docling document meta."""
         raise NotImplementedError()
 
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize to a dictionary."""
+        return {}
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
+        """Deserialize from a dictionary."""
+        return cls()
+
 
 class MetaExtractor(BaseMetaExtractor):
     """MetaExtractor."""
@@ -123,6 +136,65 @@ def __init__(
         self._chunker_instance = chunker or HybridChunker()
         self._meta_extractor_instance = meta_extractor or MetaExtractor()
 
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        The `converter` and `chunker` instances cannot be serialized: they are always written as `None`,
+        and a warning is logged whenever a custom instance is dropped this way.
+
+        :returns: Dictionary with keys `type` and `init_parameters`.
+        """
+        if self.converter is not None:
+            logger.warning(
+                "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
+                "The component will use the default DocumentConverter when restored from the serialized form."
+            )
+        if self.chunker is not None:
+            logger.warning(
+                "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
+                "The component will use the default chunker when restored from the serialized form."
+            )
+
+        meta_extractor_data = None
+        if self.meta_extractor is not None:
+            meta_extractor_data = serialize_class_instance(self.meta_extractor)
+
+        return default_to_dict(
+            self,
+            converter=None,
+            convert_kwargs=self.convert_kwargs,
+            export_type=self.export_type.value,
+            md_export_kwargs=self.md_export_kwargs,
+            chunker=None,
+            meta_extractor=meta_extractor_data,
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
+        """
+        Deserialize this component from a dictionary.
+
+        The `converter` and `chunker` parameters are not serializable and are always ignored during
+        deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
+        respectively.
+
+        :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`. It is not modified.
+        :returns: A new `DoclingConverter` instance.
+        """
+        # Work on a copy so the caller's dictionary is left untouched and can be deserialized again.
+        init_params = dict(data.get("init_parameters") or {})
+
+        # `converter` and `chunker` are never serialized; drop stray values so the init defaults apply.
+        init_params.pop("converter", None)
+        init_params.pop("chunker", None)
+
+        meta_extractor_data = init_params.get("meta_extractor")
+        if meta_extractor_data is not None:
+            init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
+
+        return default_from_dict(cls, {**data, "init_parameters": init_params})
+
     @component.output_types(documents=list[Document])
     def run(
         self,
diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py
index 778bff36fd..5ba9f9c4e4 100644
--- a/integrations/docling/tests/test_converter.py
+++ b/integrations/docling/tests/test_converter.py
@@ -7,8 +7,9 @@
 from unittest.mock import MagicMock
 
 import pytest
+from docling.chunking import HybridChunker
+from docling.document_converter import DocumentConverter
 from docling_core.types.io import DocumentStream
-from haystack.core.serialization import component_from_dict, component_to_dict
 from haystack.dataclasses import ByteStream
 
 from haystack_integrations.components.converters.docling import (
@@ -134,8 +135,6 @@ def test_run_json_minimal() -> None:
 
 
 def test_legacy_import_path() -> None:
-    import warnings
-
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter("always")
         from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
@@ -146,63 +145,59 @@ def test_legacy_import_path() -> None:
     )
 
 
-def test_component_from_dict_legacy_nulls() -> None:
-    # Before the public-attribute refactor, default serialization couldn't find
-    # the _-prefixed attributes and fell back to the init defaults, so
-    # convert_kwargs and md_export_kwargs were always serialized as null.
-    # Verify that such a serialized dict still deserializes correctly.
-    legacy_data = {
+def test_component_to_dict_defaults() -> None:
+    converter = DoclingConverter()
+    assert converter.to_dict() == {
         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
         "init_parameters": {
             "converter": None,
-            "convert_kwargs": None,
+            "convert_kwargs": {},
             "export_type": "doc_chunks",
-            "md_export_kwargs": None,
+            "md_export_kwargs": {"image_placeholder": ""},
             "chunker": None,
             "meta_extractor": None,
         },
     }
-    restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
-
-    assert restored.convert_kwargs == {}
-    assert restored.md_export_kwargs == {"image_placeholder": ""}
-    assert restored.export_type == ExportType.DOC_CHUNKS
-    assert restored.converter is None
-    assert restored.chunker is None
-    assert restored.meta_extractor is None
-
-
-def test_component_to_dict_defaults() -> None:
-    converter = DoclingConverter()
-    data = component_to_dict(converter, "docling_converter")
-
-    init_params = data["init_parameters"]
-    assert init_params["converter"] is None
-    assert init_params["convert_kwargs"] == {}
-    assert init_params["export_type"] == ExportType.DOC_CHUNKS
-    assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
-    assert init_params["chunker"] is None
-    assert init_params["meta_extractor"] is None
 
 
 def test_component_to_dict_custom_params() -> None:
     converter = DoclingConverter(
+        converter=DocumentConverter(),
         convert_kwargs={"raises_on_error": False},
         export_type=ExportType.MARKDOWN,
         md_export_kwargs={"image_placeholder": "[img]"},
+        meta_extractor=MetaExtractor(),
     )
-    data = component_to_dict(converter, "docling_converter")
-
-    init_params = data["init_parameters"]
-    assert init_params["convert_kwargs"] == {"raises_on_error": False}
-    assert init_params["export_type"] == ExportType.MARKDOWN
-    assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
+    assert converter.to_dict() == {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {"raises_on_error": False},
+            "export_type": "markdown",
+            "md_export_kwargs": {"image_placeholder": "[img]"},
+            "chunker": None,
+            "meta_extractor": {
+                "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                "data": {},
+            },
+        },
+    }
 
 
 def test_component_from_dict_defaults() -> None:
-    converter = DoclingConverter()
-    data = component_to_dict(converter, "docling_converter")
-    restored = component_from_dict(DoclingConverter, data, "docling_converter")
+    # null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
+    data = {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": None,
+            "export_type": "doc_chunks",
+            "md_export_kwargs": None,
+            "chunker": None,
+            "meta_extractor": None,
+        },
+    }
+    restored = DoclingConverter.from_dict(data)
 
     assert restored.converter is None
     assert restored.convert_kwargs == {}
@@ -213,17 +208,50 @@ def test_component_from_dict_defaults() -> None:
 
 
 def test_component_from_dict_custom_params() -> None:
-    converter = DoclingConverter(
-        convert_kwargs={"raises_on_error": False},
-        export_type=ExportType.JSON,
-        md_export_kwargs={"image_placeholder": "[img]"},
-    )
-    data = component_to_dict(converter, "docling_converter")
-    restored = component_from_dict(DoclingConverter, data, "docling_converter")
+    data = {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {"raises_on_error": False},
+            "export_type": "json",
+            "md_export_kwargs": {"image_placeholder": "[img]"},
+            "chunker": None,
+            "meta_extractor": {
+                "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                "data": {},
+            },
+        },
+    }
+    restored = DoclingConverter.from_dict(data)
 
+    assert restored.converter is None
     assert restored.convert_kwargs == {"raises_on_error": False}
     assert restored.export_type == ExportType.JSON
     assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
+    assert restored.chunker is None
+    assert isinstance(restored.meta_extractor, MetaExtractor)
+    # from_dict must leave the caller's dictionary untouched
+    assert isinstance(data["init_parameters"]["meta_extractor"], dict)
+
+
+def test_component_to_dict_chunker_warns_and_is_dropped(caplog: pytest.LogCaptureFixture) -> None:
+    converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
+
+    with caplog.at_level("WARNING"):
+        data = converter.to_dict()
+
+    assert "the 'chunker' parameter cannot be serialized" in caplog.text
+    assert data == {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {},
+            "export_type": "doc_chunks",
+            "md_export_kwargs": {"image_placeholder": ""},
+            "chunker": None,
+            "meta_extractor": None,
+        },
+    }
 
 
 def test_run_with_sources_parameter() -> None: