Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,18 @@
from typing import Any

from docling_core.types.io import DocumentStream
from haystack import Document, component
from haystack import Document, component, logging
from haystack.components.converters.utils import normalize_metadata
from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses import ByteStream
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance

from docling.chunking import BaseChunk, BaseChunker, HybridChunker
from docling.datamodel.document import DoclingDocument
from docling.document_converter import DocumentConverter

logger = logging.getLogger(__name__)


def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
"""
Expand Down Expand Up @@ -63,6 +67,15 @@ def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
"""Extract Docling document meta."""
raise NotImplementedError()

def to_dict(self) -> dict[str, Any]:
    """
    Serialize this extractor to a dictionary.

    :returns: A new, empty dictionary on every call.
    """
    serialized: dict[str, Any] = {}
    return serialized

@classmethod
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
    """
    Deserialize an extractor from a dictionary.

    :param data: Serialized form; this implementation does not read its contents.
    :returns: A new instance obtained by calling `cls` with no arguments.
    """
    instance = cls()
    return instance


class MetaExtractor(BaseMetaExtractor):
"""MetaExtractor."""
Expand Down Expand Up @@ -123,6 +136,53 @@ def __init__(
self._chunker_instance = chunker or HybridChunker()
self._meta_extractor_instance = meta_extractor or MetaExtractor()

def to_dict(self) -> dict[str, Any]:
    """
    Serialize this component to a dictionary.

    `converter` and `chunker` are always written as `None`; a warning is logged for each of them that is
    currently set. `meta_extractor`, when set, is serialized with `serialize_class_instance`.

    :returns: The dictionary built by `default_to_dict` from this component's init parameters.
    """
    if self.converter is not None:
        converter_dropped_msg = (
            "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
            "The component will use the default DocumentConverter when restored from the serialized form."
        )
        logger.warning(converter_dropped_msg)
    if self.chunker is not None:
        chunker_dropped_msg = (
            "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
            "The component will use the default chunker when restored from the serialized form."
        )
        logger.warning(chunker_dropped_msg)

    # Only the meta extractor is persisted as a nested object; an unset extractor stays None.
    extractor = self.meta_extractor
    serialized_extractor = None if extractor is None else serialize_class_instance(extractor)

    init_parameters: dict[str, Any] = {
        "converter": None,
        "convert_kwargs": self.convert_kwargs,
        "export_type": self.export_type.value,
        "md_export_kwargs": self.md_export_kwargs,
        "chunker": None,
        "meta_extractor": serialized_extractor,
    }
    return default_to_dict(self, **init_parameters)

@classmethod
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
    """
    Deserialize this component from a dictionary.

    The `converter` and `chunker` parameters are not serializable and are always ignored during
    deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
    respectively.

    The input dictionary is left unmodified, so the same serialized form can be deserialized more than once.

    :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
    :returns: A new `DoclingConverter` instance.
    """
    # Work on a copy of the init parameters: writing the live meta extractor instance back into the
    # caller's dictionary would break a second `from_dict` call on the same data.
    init_params = dict(data.get("init_parameters") or {})

    # Enforce the documented contract: these two parameters are never restored from serialized data.
    for dropped in ("converter", "chunker"):
        if dropped in init_params:
            init_params[dropped] = None

    meta_extractor_data = init_params.get("meta_extractor")
    if meta_extractor_data is not None:
        init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)

    return default_from_dict(cls, {**data, "init_parameters": init_params})

@component.output_types(documents=list[Document])
def run(
self,
Expand Down
118 changes: 70 additions & 48 deletions integrations/docling/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from unittest.mock import MagicMock

import pytest
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from docling_core.types.io import DocumentStream
from haystack.core.serialization import component_from_dict, component_to_dict
from haystack.dataclasses import ByteStream

from haystack_integrations.components.converters.docling import (
Expand Down Expand Up @@ -134,8 +135,6 @@ def test_run_json_minimal() -> None:


def test_legacy_import_path() -> None:
import warnings

with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
Expand All @@ -146,63 +145,59 @@ def test_legacy_import_path() -> None:
)


def test_component_from_dict_legacy_nulls() -> None:
# Before the public-attribute refactor, default serialization couldn't find
# the _-prefixed attributes and fell back to the init defaults, so
# convert_kwargs and md_export_kwargs were always serialized as null.
# Verify that such a serialized dict still deserializes correctly.
legacy_data = {
def test_component_to_dict_defaults() -> None:
converter = DoclingConverter()
assert converter.to_dict() == {
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
"init_parameters": {
"converter": None,
"convert_kwargs": None,
"convert_kwargs": {},
"export_type": "doc_chunks",
"md_export_kwargs": None,
"md_export_kwargs": {"image_placeholder": ""},
"chunker": None,
"meta_extractor": None,
},
}
restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")

assert restored.convert_kwargs == {}
assert restored.md_export_kwargs == {"image_placeholder": ""}
assert restored.export_type == ExportType.DOC_CHUNKS
assert restored.converter is None
assert restored.chunker is None
assert restored.meta_extractor is None


def test_component_to_dict_defaults() -> None:
converter = DoclingConverter()
data = component_to_dict(converter, "docling_converter")

init_params = data["init_parameters"]
assert init_params["converter"] is None
assert init_params["convert_kwargs"] == {}
assert init_params["export_type"] == ExportType.DOC_CHUNKS
assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
assert init_params["chunker"] is None
assert init_params["meta_extractor"] is None


def test_component_to_dict_custom_params() -> None:
converter = DoclingConverter(
converter=DocumentConverter(),
convert_kwargs={"raises_on_error": False},
export_type=ExportType.MARKDOWN,
md_export_kwargs={"image_placeholder": "[img]"},
meta_extractor=MetaExtractor(),
)
data = component_to_dict(converter, "docling_converter")

init_params = data["init_parameters"]
assert init_params["convert_kwargs"] == {"raises_on_error": False}
assert init_params["export_type"] == ExportType.MARKDOWN
assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
assert converter.to_dict() == {
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
"init_parameters": {
"converter": None,
"convert_kwargs": {"raises_on_error": False},
"export_type": "markdown",
"md_export_kwargs": {"image_placeholder": "[img]"},
"chunker": None,
"meta_extractor": {
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
"data": {},
},
},
}


def test_component_from_dict_defaults() -> None:
converter = DoclingConverter()
data = component_to_dict(converter, "docling_converter")
restored = component_from_dict(DoclingConverter, data, "docling_converter")
# null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
data = {
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
"init_parameters": {
"converter": None,
"convert_kwargs": None,
"export_type": "doc_chunks",
"md_export_kwargs": None,
"chunker": None,
"meta_extractor": None,
},
}
restored = DoclingConverter.from_dict(data)

assert restored.converter is None
assert restored.convert_kwargs == {}
Expand All @@ -213,17 +208,44 @@ def test_component_from_dict_defaults() -> None:


def test_component_from_dict_custom_params() -> None:
converter = DoclingConverter(
convert_kwargs={"raises_on_error": False},
export_type=ExportType.JSON,
md_export_kwargs={"image_placeholder": "[img]"},
)
data = component_to_dict(converter, "docling_converter")
restored = component_from_dict(DoclingConverter, data, "docling_converter")
data = {
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
"init_parameters": {
"converter": None,
"convert_kwargs": {"raises_on_error": False},
"export_type": "json",
"md_export_kwargs": {"image_placeholder": "[img]"},
"chunker": None,
"meta_extractor": {
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
"data": {},
},
},
}
restored = DoclingConverter.from_dict(data)

assert restored.converter is None
assert restored.convert_kwargs == {"raises_on_error": False}
assert restored.export_type == ExportType.JSON
assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
assert restored.chunker is None
assert isinstance(restored.meta_extractor, MetaExtractor)


def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
    """A custom chunker is not serialized: `to_dict` reports `chunker` as `None`."""
    component = DoclingConverter(chunker=HybridChunker(merge_peers=False))

    expected_init_parameters = {
        "converter": None,
        "convert_kwargs": {},
        "export_type": "doc_chunks",
        "md_export_kwargs": {"image_placeholder": ""},
        "chunker": None,
        "meta_extractor": None,
    }
    expected = {
        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
        "init_parameters": expected_init_parameters,
    }

    serialized = component.to_dict()
    assert serialized == expected


def test_run_with_sources_parameter() -> None:
Expand Down
Loading