Skip to content

Commit e462250

Browse files
authored
test: docling - add a few unit tests (#3212)
1 parent aae36f1 commit e462250

1 file changed

Lines changed: 50 additions & 1 deletion

File tree

integrations/docling/tests/test_converter.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
from haystack.core.serialization import component_from_dict, component_to_dict
1212
from haystack.dataclasses import ByteStream
1313

14-
from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
14+
from haystack_integrations.components.converters.docling import (
15+
DoclingConverter,
16+
ExportType,
17+
MetaExtractor,
18+
)
1519
from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
1620

1721

@@ -441,3 +445,48 @@ def test_returns_document_stream_with_bytesio(self) -> None:
441445
ds = _bytestream_to_document_stream(bs)
442446
assert isinstance(ds, DocumentStream)
443447
assert isinstance(ds.stream, BytesIO)
448+
449+
def test_unknown_mime_type_keeps_base_name(self) -> None:
    """A MIME type with no known extension leaves the stream name unchanged."""
    made_up_mime = "application/x-totally-made-up-type"

    # Precondition: mimetypes.guess_extension returns None for unknown types,
    # so the name is expected to stay as-is.
    assert mimetypes.guess_extension(made_up_mime) is None

    source = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=made_up_mime)
    doc_stream = _bytestream_to_document_stream(source)

    assert doc_stream.name == "report"
459+
460+
461+
class TestMetaExtractor:
    """Unit tests for ``MetaExtractor``, driven by ``MagicMock`` stand-ins."""

    def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
        """The chunk's ``export_json_dict()`` result is returned under ``"dl_meta"``."""
        fake_chunk = MagicMock()
        fake_chunk.export_json_dict.return_value = {"some": "dict"}

        extractor = MetaExtractor()
        meta = extractor.extract_chunk_meta(chunk=fake_chunk)

        assert meta == {"dl_meta": {"some": "dict"}}
        # The export must be requested exactly once, with no arguments.
        fake_chunk.export_json_dict.assert_called_once_with()

    def test_extract_dl_doc_meta_with_origin(self) -> None:
        """A document with an origin yields the dumped origin under ``dl_meta.origin``."""
        origin_fields = {"filename": "foo.pdf", "mimetype": "application/pdf"}
        fake_doc = MagicMock()
        # Hand the mock its own copy so the expected value below stays pristine.
        fake_doc.origin.model_dump.return_value = dict(origin_fields)

        extractor = MetaExtractor()
        meta = extractor.extract_dl_doc_meta(dl_doc=fake_doc)

        assert meta == {"dl_meta": {"origin": origin_fields}}
        fake_doc.origin.model_dump.assert_called_once_with(exclude_none=True)

    def test_extract_dl_doc_meta_without_origin(self) -> None:
        """A document whose ``origin`` is ``None`` yields an empty meta dict."""
        fake_doc = MagicMock()
        fake_doc.origin = None

        extractor = MetaExtractor()
        meta = extractor.extract_dl_doc_meta(dl_doc=fake_doc)

        assert meta == {}
487+
488+
489+
def test_run_without_sources_or_paths_raises_value_error() -> None:
    """Calling ``run()`` with no arguments raises ``ValueError`` with the expected message."""
    expected_message = r"Either 'sources' or the deprecated 'paths' parameter must be provided."
    # Both collaborators are mocked; only the argument validation in run() is exercised.
    component = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())

    with pytest.raises(ValueError, match=expected_message):
        component.run()

0 commit comments

Comments
 (0)