|
11 | 11 | from haystack.core.serialization import component_from_dict, component_to_dict |
12 | 12 | from haystack.dataclasses import ByteStream |
13 | 13 |
|
14 | | -from haystack_integrations.components.converters.docling import DoclingConverter, ExportType |
| 14 | +from haystack_integrations.components.converters.docling import ( |
| 15 | + DoclingConverter, |
| 16 | + ExportType, |
| 17 | + MetaExtractor, |
| 18 | +) |
15 | 19 | from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream |
16 | 20 |
|
17 | 21 |
|
@@ -441,3 +445,48 @@ def test_returns_document_stream_with_bytesio(self) -> None: |
441 | 445 | ds = _bytestream_to_document_stream(bs) |
442 | 446 | assert isinstance(ds, DocumentStream) |
443 | 447 | assert isinstance(ds.stream, BytesIO) |
| 448 | + |
def test_unknown_mime_type_keeps_base_name(self) -> None:
    """Check that a MIME type with no known extension leaves the stream name as the bare file path."""
    made_up_mime = "application/x-totally-made-up-type"
    # Precondition: the stdlib has no extension registered for this MIME type,
    # so there is nothing that could be appended to the base name.
    assert mimetypes.guess_extension(made_up_mime) is None

    source = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=made_up_mime)
    document_stream = _bytestream_to_document_stream(source)

    assert document_stream.name == "report"
| 459 | + |
| 460 | + |
class TestMetaExtractor:
    """Tests for MetaExtractor, driven entirely by mocked chunk and document objects."""

    def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
        """The chunk's exported JSON dict must come back nested under ``dl_meta``."""
        expected_meta = {"dl_meta": {"some": "dict"}}
        fake_chunk = MagicMock(**{"export_json_dict.return_value": {"some": "dict"}})

        chunk_meta = MetaExtractor().extract_chunk_meta(chunk=fake_chunk)

        assert chunk_meta == expected_meta
        # The export must be requested exactly once, with no arguments.
        fake_chunk.export_json_dict.assert_called_once_with()

    def test_extract_dl_doc_meta_with_origin(self) -> None:
        """A document with an origin yields the origin's dump under ``dl_meta.origin``."""
        expected_meta = {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
        fake_doc = MagicMock(
            **{"origin.model_dump.return_value": {"filename": "foo.pdf", "mimetype": "application/pdf"}}
        )

        doc_meta = MetaExtractor().extract_dl_doc_meta(dl_doc=fake_doc)

        assert doc_meta == expected_meta
        # The origin must be dumped exactly once, with ``exclude_none=True``.
        fake_doc.origin.model_dump.assert_called_once_with(exclude_none=True)

    def test_extract_dl_doc_meta_without_origin(self) -> None:
        """A document whose origin is ``None`` produces an empty metadata dict."""
        fake_doc = MagicMock(origin=None)

        doc_meta = MetaExtractor().extract_dl_doc_meta(dl_doc=fake_doc)

        assert doc_meta == {}
| 487 | + |
| 488 | + |
def test_run_without_sources_or_paths_raises_value_error() -> None:
    """Check that ``run()`` called with no arguments raises ``ValueError`` with the expected message."""
    # Passed to ``pytest.raises(match=...)``, so it is treated as a regular expression.
    expected_message = r"Either 'sources' or the deprecated 'paths' parameter must be provided."
    docling_converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())

    with pytest.raises(ValueError, match=expected_message):
        docling_converter.run()
0 commit comments