From c26058cddd90b664f4c8e848f452c97aeb300e1e Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Thu, 30 Apr 2026 13:54:42 +0200 Subject: [PATCH 1/6] Update se/de --- .../converters/docling/converter.py | 63 ++++++++++++++ integrations/docling/tests/test_converter.py | 83 +++++++++---------- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index f7581e9c66..9049193c34 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -12,7 +12,14 @@ from docling_core.types.io import DocumentStream from haystack import Document, component from haystack.components.converters.utils import normalize_metadata +from haystack.core.serialization import ( + default_from_dict, + default_to_dict, + generate_qualified_class_name, + import_class_by_name, +) from haystack.dataclasses import ByteStream +from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance from docling.chunking import BaseChunk, BaseChunker, HybridChunker from docling.datamodel.document import DoclingDocument @@ -63,6 +70,15 @@ def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]: """Extract Docling document meta.""" raise NotImplementedError() + def to_dict(self) -> dict[str, Any]: + """Serialize to a dictionary.""" + return {} + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": + """Deserialize from a dictionary.""" + return cls() + class MetaExtractor(BaseMetaExtractor): """MetaExtractor.""" @@ -123,6 +139,53 @@ def __init__( self._chunker_instance = chunker or HybridChunker() self._meta_extractor_instance = meta_extractor or MetaExtractor() + def to_dict(self) -> dict[str, Any]: + """Serialize this component to a dictionary.""" + chunker_data = None + if self.chunker is not None: + try: + pydantic_data = self.chunker.model_dump(mode="json") + except Exception: + # Fall back to primitive fields only when complex nested objects + # (e.g. non-Pydantic serializer providers) block full JSON serialization. + raw = self.chunker.model_dump() + pydantic_data = { + k: v + for k, v in raw.items() + if not k.startswith("_") and isinstance(v, (str, int, float, bool, type(None))) + } + chunker_data = {"type": generate_qualified_class_name(type(self.chunker)), "data": pydantic_data} + + meta_extractor_data = None + if self.meta_extractor is not None: + meta_extractor_data = serialize_class_instance(self.meta_extractor) + + return default_to_dict( + self, + converter=None, + convert_kwargs=self.convert_kwargs, + export_type=self.export_type.value, + md_export_kwargs=self.md_export_kwargs, + chunker=chunker_data, + meta_extractor=meta_extractor_data, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter": + """Deserialize this component from a dictionary.""" + init_params = data.get("init_parameters", {}) + + chunker_data = init_params.get("chunker") + if chunker_data is not None: + chunker_cls = import_class_by_name(chunker_data["type"]) + init_params["chunker"] = chunker_cls.model_validate(chunker_data["data"]) + + meta_extractor_data = init_params.get("meta_extractor") + if meta_extractor_data is not None: + init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data) + + return default_from_dict(cls, data) + @component.output_types(documents=list[Document]) def run( self, diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 778bff36fd..c00525ad53 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -146,43 +146,18 @@ def test_legacy_import_path() -> None: ) -def test_component_from_dict_legacy_nulls() -> None: - # Before the public-attribute refactor, default serialization couldn't find - # the _-prefixed attributes and fell back to the init defaults, so - # convert_kwargs and md_export_kwargs were always serialized as null. - # Verify that such a serialized dict still deserializes correctly. - legacy_data = { - "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", - "init_parameters": { - "converter": None, - "convert_kwargs": None, - "export_type": "doc_chunks", - "md_export_kwargs": None, - "chunker": None, - "meta_extractor": None, - }, - } - restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter") - - assert restored.convert_kwargs == {} - assert restored.md_export_kwargs == {"image_placeholder": ""} - assert restored.export_type == ExportType.DOC_CHUNKS - assert restored.converter is None - assert restored.chunker is None - assert restored.meta_extractor is None - - def test_component_to_dict_defaults() -> None: converter = DoclingConverter() data = component_to_dict(converter, "docling_converter") - init_params = data["init_parameters"] - assert init_params["converter"] is None - assert init_params["convert_kwargs"] == {} - assert init_params["export_type"] == ExportType.DOC_CHUNKS - assert init_params["md_export_kwargs"] == {"image_placeholder": ""} - assert init_params["chunker"] is None - assert init_params["meta_extractor"] is None + assert data["init_parameters"] == { + "converter": None, + "convert_kwargs": {}, + "export_type": "doc_chunks", + "md_export_kwargs": {"image_placeholder": ""}, + "chunker": None, + "meta_extractor": None, + } def test_component_to_dict_custom_params() -> None: @@ -190,18 +165,33 @@ def test_component_to_dict_custom_params() -> None: convert_kwargs={"raises_on_error": False}, export_type=ExportType.MARKDOWN, md_export_kwargs={"image_placeholder": "[img]"}, + meta_extractor=MetaExtractor(), ) data = component_to_dict(converter, "docling_converter") init_params = data["init_parameters"] assert init_params["convert_kwargs"] == {"raises_on_error": False} - assert init_params["export_type"] == ExportType.MARKDOWN + assert init_params["export_type"] == "markdown" assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"} + assert init_params["meta_extractor"] == { + "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor", + "data": {}, + } def test_component_from_dict_defaults() -> None: - converter = DoclingConverter() - data = component_to_dict(converter, "docling_converter") + # null kwargs mirror the pre-refactor serialization format and must still deserialize correctly + data = { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": None, + "export_type": "doc_chunks", + "md_export_kwargs": None, + "chunker": None, + "meta_extractor": None, + }, + } restored = component_from_dict(DoclingConverter, data, "docling_converter") assert restored.converter is None @@ -213,17 +203,26 @@ def test_component_from_dict_defaults() -> None: def test_component_from_dict_custom_params() -> None: - converter = DoclingConverter( - convert_kwargs={"raises_on_error": False}, - export_type=ExportType.JSON, - md_export_kwargs={"image_placeholder": "[img]"}, - ) - data = component_to_dict(converter, "docling_converter") + data = { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": {"raises_on_error": False}, + "export_type": "json", + "md_export_kwargs": {"image_placeholder": "[img]"}, + "chunker": None, + "meta_extractor": { + "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor", + "data": {}, + }, + }, + } restored = component_from_dict(DoclingConverter, data, "docling_converter") assert restored.convert_kwargs == {"raises_on_error": False} assert restored.export_type == ExportType.JSON assert restored.md_export_kwargs == {"image_placeholder": "[img]"} + assert isinstance(restored.meta_extractor, MetaExtractor) def test_run_with_sources_parameter() -> None: From e78f6efc53a160fd9972c106d22e40fd05cd96a0 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 4 May 2026 09:23:26 +0200 Subject: [PATCH 2/6] give up on trying to se/de the chunker it is a generic class and a pydantic class which makes it hard --- .../converters/docling/converter.py | 33 +++++-------------- integrations/docling/tests/test_converter.py | 10 ++++++ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index 9049193c34..5d5cc2d61f 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -12,12 +12,7 @@ from docling_core.types.io import DocumentStream from haystack import Document, component from haystack.components.converters.utils import normalize_metadata -from haystack.core.serialization import ( - default_from_dict, - default_to_dict, - generate_qualified_class_name, - import_class_by_name, -) +from haystack.core.serialization import default_from_dict, default_to_dict from haystack.dataclasses import ByteStream from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance @@ -141,20 +136,13 @@ def __init__( def to_dict(self) -> dict[str, Any]: """Serialize this component to a dictionary.""" - chunker_data = None if self.chunker is not None: - try: - pydantic_data = self.chunker.model_dump(mode="json") - except Exception: - # Fall back to primitive fields only when complex nested objects - # (e.g. non-Pydantic serializer providers) block full JSON serialization. - raw = self.chunker.model_dump() - pydantic_data = { - k: v - for k, v in raw.items() - if not k.startswith("_") and isinstance(v, (str, int, float, bool, type(None))) - } - chunker_data = {"type": generate_qualified_class_name(type(self.chunker)), "data": pydantic_data} + warnings.warn( + "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. " + "The converter will use the default chunker when restored from the serialized form.", + UserWarning, + stacklevel=2, + ) meta_extractor_data = None if self.meta_extractor is not None: @@ -166,7 +154,7 @@ def to_dict(self) -> dict[str, Any]: convert_kwargs=self.convert_kwargs, export_type=self.export_type.value, md_export_kwargs=self.md_export_kwargs, - chunker=chunker_data, + chunker=None, meta_extractor=meta_extractor_data, ) @@ -175,11 +163,6 @@ def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter": """Deserialize this component from a dictionary.""" init_params = data.get("init_parameters", {}) - chunker_data = init_params.get("chunker") - if chunker_data is not None: - chunker_cls = import_class_by_name(chunker_data["type"]) - init_params["chunker"] = chunker_cls.model_validate(chunker_data["data"]) - meta_extractor_data = init_params.get("meta_extractor") if meta_extractor_data is not None: init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data) diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index c00525ad53..321caba9ef 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -7,6 +7,7 @@ from unittest.mock import MagicMock import pytest +from docling.chunking import HybridChunker from docling_core.types.io import DocumentStream from haystack.core.serialization import component_from_dict, component_to_dict from haystack.dataclasses import ByteStream @@ -225,6 +226,15 @@ def test_component_from_dict_custom_params() -> None: assert isinstance(restored.meta_extractor, MetaExtractor) +def test_component_to_dict_chunker_warns_and_is_dropped() -> None: + converter = DoclingConverter(chunker=HybridChunker(merge_peers=False)) + + with pytest.warns(UserWarning, match="chunker"): + data = component_to_dict(converter, "docling_converter") + + assert data["init_parameters"]["chunker"] is None + + def test_run_with_sources_parameter() -> None: converter_mock = MagicMock() chunker_mock = MagicMock() From d4d1abe3cad868a4d03d4a881c24129ccb0926bd Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 4 May 2026 10:08:59 +0200 Subject: [PATCH 3/6] update tests --- integrations/docling/tests/test_converter.py | 65 ++++++++++++-------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 321caba9ef..700024185f 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -9,7 +9,7 @@ import pytest from docling.chunking import HybridChunker from docling_core.types.io import DocumentStream -from haystack.core.serialization import component_from_dict, component_to_dict + from haystack.dataclasses import ByteStream from haystack_integrations.components.converters.docling import ( @@ -135,8 +135,6 @@ def test_run_json_minimal() -> None: def test_legacy_import_path() -> None: - import warnings - with warnings.catch_warnings(record=True) as caught: warnings.simplefilter("always") from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter @@ -149,15 +147,16 @@ def test_legacy_import_path() -> None: def test_component_to_dict_defaults() -> None: converter = DoclingConverter() - data = component_to_dict(converter, "docling_converter") - - assert data["init_parameters"] == { - "converter": None, - "convert_kwargs": {}, - "export_type": "doc_chunks", - "md_export_kwargs": {"image_placeholder": ""}, - "chunker": None, - "meta_extractor": None, + assert converter.to_dict() == { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": {}, + "export_type": "doc_chunks", + "md_export_kwargs": {"image_placeholder": ""}, + "chunker": None, + "meta_extractor": None, + }, } @@ -168,15 +167,19 @@ def test_component_to_dict_custom_params() -> None: md_export_kwargs={"image_placeholder": "[img]"}, meta_extractor=MetaExtractor(), ) - data = component_to_dict(converter, "docling_converter") - - init_params = data["init_parameters"] - assert init_params["convert_kwargs"] == {"raises_on_error": False} - assert init_params["export_type"] == "markdown" - assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"} - assert init_params["meta_extractor"] == { - "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor", - "data": {}, + assert converter.to_dict() == { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": {"raises_on_error": False}, + "export_type": "markdown", + "md_export_kwargs": {"image_placeholder": "[img]"}, + "chunker": None, + "meta_extractor": { + "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor", + "data": {}, + }, + }, } @@ -193,7 +196,7 @@ def test_component_from_dict_defaults() -> None: "meta_extractor": None, }, } - restored = component_from_dict(DoclingConverter, data, "docling_converter") + restored = DoclingConverter.from_dict(data) assert restored.converter is None assert restored.convert_kwargs == {} @@ -218,11 +221,13 @@ def test_component_from_dict_custom_params() -> None: }, }, } - restored = component_from_dict(DoclingConverter, data, "docling_converter") + restored = DoclingConverter.from_dict(data) + assert restored.converter is None assert restored.convert_kwargs == {"raises_on_error": False} assert restored.export_type == ExportType.JSON assert restored.md_export_kwargs == {"image_placeholder": "[img]"} + assert restored.chunker is None assert isinstance(restored.meta_extractor, MetaExtractor) @@ -230,9 +235,19 @@ def test_component_to_dict_chunker_warns_and_is_dropped() -> None: converter = DoclingConverter(chunker=HybridChunker(merge_peers=False)) with pytest.warns(UserWarning, match="chunker"): - data = component_to_dict(converter, "docling_converter") + data = converter.to_dict() - assert data["init_parameters"]["chunker"] is None + assert data == { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": {}, + "export_type": "doc_chunks", + "md_export_kwargs": {"image_placeholder": ""}, + "chunker": None, + "meta_extractor": None, + }, + } def test_run_with_sources_parameter() -> None: From 0292be2de19d60c84d6b0fb9130749d7ad5c9482 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 4 May 2026 10:18:38 +0200 Subject: [PATCH 4/6] Add warning if converter is specified and then component serialized --- .../components/converters/docling/converter.py | 15 ++++++++++----- integrations/docling/tests/test_converter.py | 7 +++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index 5d5cc2d61f..3d52523d30 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -10,12 +10,14 @@ from typing import Any from docling_core.types.io import DocumentStream -from haystack import Document, component +from haystack import Document, component, logging from haystack.components.converters.utils import normalize_metadata from haystack.core.serialization import default_from_dict, default_to_dict from haystack.dataclasses import ByteStream from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance +logger = logging.getLogger(__name__) + from docling.chunking import BaseChunk, BaseChunker, HybridChunker from docling.datamodel.document import DoclingDocument from docling.document_converter import DocumentConverter @@ -136,12 +138,15 @@ def __init__( def to_dict(self) -> dict[str, Any]: """Serialize this component to a dictionary.""" + if self.converter is not None: + logger.warning( + "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. " + "The component will use the default DocumentConverter when restored from the serialized form." + ) if self.chunker is not None: - warnings.warn( + logger.warning( "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. " - "The converter will use the default chunker when restored from the serialized form.", - UserWarning, - stacklevel=2, + "The component will use the default chunker when restored from the serialized form." ) meta_extractor_data = None diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 700024185f..a2684218ec 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -8,6 +8,7 @@ import pytest from docling.chunking import HybridChunker +from docling.document_converter import DocumentConverter from docling_core.types.io import DocumentStream from haystack.dataclasses import ByteStream @@ -162,6 +163,7 @@ def test_component_to_dict_defaults() -> None: def test_component_to_dict_custom_params() -> None: converter = DoclingConverter( + converter=DocumentConverter(), convert_kwargs={"raises_on_error": False}, export_type=ExportType.MARKDOWN, md_export_kwargs={"image_placeholder": "[img]"}, @@ -234,10 +236,7 @@ def test_component_from_dict_custom_params() -> None: def test_component_to_dict_chunker_warns_and_is_dropped() -> None: converter = DoclingConverter(chunker=HybridChunker(merge_peers=False)) - with pytest.warns(UserWarning, match="chunker"): - data = converter.to_dict() - - assert data == { + assert converter.to_dict() == { "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", "init_parameters": { "converter": None, From 99e0f4df970c01cda8d79cc27706d5b239fc1099 Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 4 May 2026 10:20:37 +0200 Subject: [PATCH 5/6] Update docstrings --- .../components/converters/docling/converter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index 3d52523d30..f747d6c7c7 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -165,7 +165,16 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter": - """Deserialize this component from a dictionary.""" + """ + Deserialize this component from a dictionary. + + The `converter` and `chunker` parameters are not serializable and are always ignored during + deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker` + respectively. + + :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`. + :returns: A new `DoclingConverter` instance. + """ init_params = data.get("init_parameters", {}) meta_extractor_data = init_params.get("meta_extractor") From b5c913ec84cb9a0876d7b67c6700068d6b842e7d Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Mon, 4 May 2026 10:25:42 +0200 Subject: [PATCH 6/6] formatting --- .../components/converters/docling/converter.py | 6 +++--- integrations/docling/tests/test_converter.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index f747d6c7c7..eaa4958e2f 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -16,12 +16,12 @@ from haystack.dataclasses import ByteStream from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance -logger = logging.getLogger(__name__) - from docling.chunking import BaseChunk, BaseChunker, HybridChunker from docling.datamodel.document import DoclingDocument from docling.document_converter import DocumentConverter +logger = logging.getLogger(__name__) + def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream: """ @@ -72,7 +72,7 @@ def to_dict(self) -> dict[str, Any]: return {} @classmethod - def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": + def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": # noqa: ARG003 """Deserialize from a dictionary.""" return cls() diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index a2684218ec..5ba9f9c4e4 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -10,7 +10,6 @@ from docling.chunking import HybridChunker from docling.document_converter import DocumentConverter from docling_core.types.io import DocumentStream - from haystack.dataclasses import ByteStream from haystack_integrations.components.converters.docling import (