From a55748478f5e6a35a15e9ad360284e3f8c7d0604 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 31 Mar 2026 13:03:35 +0200 Subject: [PATCH 01/12] feat: add Docling document converter Signed-off-by: Panos Vagenas --- .github/labeler.yml | 5 + .github/workflows/docling.yml | 79 +++++++ README.md | 1 + integrations/docling/LICENSE.txt | 201 ++++++++++++++++++ integrations/docling/README.md | 14 ++ integrations/docling/docling.md | 112 ++++++++++ .../docling/pydoc/config_docusaurus.yml | 13 ++ integrations/docling/pyproject.toml | 171 +++++++++++++++ .../components/converters/docling/__init__.py | 7 + .../converters/docling/converter.py | 139 ++++++++++++ .../components/converters/py.typed | 0 integrations/docling/tests/__init__.py | 3 + integrations/docling/tests/test_converter.py | 122 +++++++++++ 13 files changed, 867 insertions(+) create mode 100644 .github/workflows/docling.yml create mode 100644 integrations/docling/LICENSE.txt create mode 100644 integrations/docling/README.md create mode 100644 integrations/docling/docling.md create mode 100644 integrations/docling/pydoc/config_docusaurus.yml create mode 100644 integrations/docling/pyproject.toml create mode 100644 integrations/docling/src/haystack_integrations/components/converters/docling/__init__.py create mode 100644 integrations/docling/src/haystack_integrations/components/converters/docling/converter.py create mode 100644 integrations/docling/src/haystack_integrations/components/converters/py.typed create mode 100644 integrations/docling/tests/__init__.py create mode 100644 integrations/docling/tests/test_converter.py diff --git a/.github/labeler.yml b/.github/labeler.yml index 5db4a70fb4..90f9264e6c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -59,6 +59,11 @@ integration:deepeval: - any-glob-to-any-file: "integrations/deepeval/**/*" - any-glob-to-any-file: ".github/workflows/deepeval.yml" +integration:docling: + - changed-files: + - any-glob-to-any-file: "integrations/docling/**/*" 
+ - any-glob-to-any-file: ".github/workflows/docling.yml" + integration:elasticsearch: - changed-files: - any-glob-to-any-file: "integrations/elasticsearch/**/*" diff --git a/.github/workflows/docling.yml b/.github/workflows/docling.yml new file mode 100644 index 0000000000..38fbbbc3b1 --- /dev/null +++ b/.github/workflows/docling.yml @@ -0,0 +1,79 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / docling + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/docling/**" + - "!integrations/docling/*.md" + - ".github/workflows/docling.yml" + +defaults: + run: + working-directory: integrations/docling + +concurrency: + group: docling-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.14"] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:cov-retry + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 299182c4c6..66346dbee0 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | 
[cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | | [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | +| [docling-haystack](integrations/docling/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-haystack.svg)](https://pypi.org/project/docling-haystack) | [![Test / docling](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | | [faiss-haystack](integrations/faiss/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/faiss-haystack.svg)](https://pypi.org/project/faiss-haystack) | [![Test / faiss](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml) | | 
[fastembed-haystack](integrations/fastembed/) | Embedder, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/fastembed-haystack.svg)](https://pypi.org/project/fastembed-haystack/) | [![Test / fastembed](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/fastembed.yml) | diff --git a/integrations/docling/LICENSE.txt b/integrations/docling/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/docling/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/docling/README.md b/integrations/docling/README.md new file mode 100644 index 0000000000..4748ba6593 --- /dev/null +++ b/integrations/docling/README.md @@ -0,0 +1,14 @@ +# docling-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/docling-haystack.svg)](https://pypi.org/project/docling-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-haystack.svg)](https://pypi.org/project/docling-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/docling/CHANGELOG.md) + +Haystack integration of [Docling](https://www.docling.ai) for document conversion. + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). diff --git a/integrations/docling/docling.md b/integrations/docling/docling.md new file mode 100644 index 0000000000..5f68e53b59 --- /dev/null +++ b/integrations/docling/docling.md @@ -0,0 +1,112 @@ +--- +title: "Docling" +id: integrations-docling +description: "Docling integration for Haystack" +slug: "/integrations-docling" +--- + + +## haystack_integrations.components.converters.docling.converter + +Docling Haystack converter module. + +### ExportType + +Bases: str, Enum + +Enumeration of available export types. + +### BaseMetaExtractor + +Bases: ABC + +BaseMetaExtractor. 
+ +#### extract_chunk_meta + +```python +extract_chunk_meta(chunk: BaseChunk) -> dict[str, Any] +``` + +Extract chunk meta. + +#### extract_dl_doc_meta + +```python +extract_dl_doc_meta(dl_doc: DoclingDocument) -> dict[str, Any] +``` + +Extract Docling document meta. + +### MetaExtractor + +Bases: BaseMetaExtractor + +MetaExtractor. + +#### extract_chunk_meta + +```python +extract_chunk_meta(chunk: BaseChunk) -> dict[str, Any] +``` + +Extract chunk meta. + +#### extract_dl_doc_meta + +```python +extract_dl_doc_meta(dl_doc: DoclingDocument) -> dict[str, Any] +``` + +Extract Docling document meta. + +### DoclingConverter + +Docling Haystack converter. + +#### __init__ + +```python +__init__( + converter: DocumentConverter | None = None, + convert_kwargs: dict[str, Any] | None = None, + export_type: ExportType = ExportType.DOC_CHUNKS, + md_export_kwargs: dict[str, Any] | None = None, + chunker: BaseChunker | None = None, + meta_extractor: BaseMetaExtractor | None = None, +) -> None +``` + +Create a Docling Haystack converter. + +Args: +converter: The Docling `DocumentConverter` to use; if not set, a system +default is used. +convert_kwargs: Any parameters to pass to Docling conversion; if not set, a +system default is used. +export_type: The export mode to use: +\* `ExportType.MARKDOWN` captures each input document as a single +markdown `Document`. +\* `ExportType.DOC_CHUNKS` (default) first chunks each input document +and then returns one `Document` per chunk. +\* `ExportType.JSON` serializes the full Docling document to a JSON string. +md_export_kwargs: Any parameters to pass to Markdown export (applicable in +case of `ExportType.MARKDOWN`). +chunker: The Docling chunker instance to use; if not set, a system default +is used. +meta_extractor: The extractor instance to use for populating the output +document metadata; if not set, a system default is used. 
+ +#### run + +```python +run(paths: Iterable[Path | str]) -> dict[str, list[Document]] +``` + +Run the DoclingConverter. + +Args: +paths: The input document locations, either as local paths or URLs. + +Returns: +list\[Document\]: The output Haystack Documents. diff --git a/integrations/docling/pydoc/config_docusaurus.yml b/integrations/docling/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..5089a59a29 --- /dev/null +++ b/integrations/docling/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.docling.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Docling integration for Haystack + id: integrations-docling + filename: docling.md + title: Docling diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml new file mode 100644 index 0000000000..6d21a8bb4c --- /dev/null +++ b/integrations/docling/pyproject.toml @@ -0,0 +1,171 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "docling-haystack" +dynamic = ["version"] +description = "Haystack integration for docling" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [ + "Docling", + "Documents", + "Unstructured Data", + "Document Intelligence", + "Haystack", + "OCR", + "PDF", + "Document Converter", +] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python 
:: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.8.0,<3.0.0", + "docling>=2.32.0,<3.0.0", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/docling-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/docling-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +types = "mypy -p haystack_integrations.components.converters.docling {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the 
second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/__init__.py b/integrations/docling/src/haystack_integrations/components/converters/docling/__init__.py new file mode 100644 index 0000000000..e6aaf916ec --- /dev/null +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .converter import 
BaseMetaExtractor, DoclingConverter, ExportType, MetaExtractor + +__all__ = ["BaseMetaExtractor", "DoclingConverter", "ExportType", "MetaExtractor"] diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py new file mode 100644 index 0000000000..088d5179e8 --- /dev/null +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -0,0 +1,139 @@ +"""Docling Haystack converter module.""" + +import json +from abc import ABC, abstractmethod +from collections.abc import Iterable +from enum import Enum +from pathlib import Path +from typing import Any + +from haystack import Document, component + +from docling.chunking import BaseChunk, BaseChunker, HybridChunker +from docling.datamodel.document import DoclingDocument +from docling.document_converter import DocumentConverter + + +class ExportType(str, Enum): + """Enumeration of available export types.""" + + MARKDOWN = "markdown" + DOC_CHUNKS = "doc_chunks" + JSON = "json" + + +class BaseMetaExtractor(ABC): + """BaseMetaExtractor.""" + + @abstractmethod + def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]: + """Extract chunk meta.""" + raise NotImplementedError() + + @abstractmethod + def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]: + """Extract Docling document meta.""" + raise NotImplementedError() + + +class MetaExtractor(BaseMetaExtractor): + """MetaExtractor.""" + + def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]: + """Extract chunk meta.""" + return {"dl_meta": chunk.export_json_dict()} + + def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]: + """Extract Docling document meta.""" + return {"dl_meta": {"origin": dl_doc.origin.model_dump(exclude_none=True)}} if dl_doc.origin else {} + + +@component +class DoclingConverter: + """Docling Haystack converter.""" + + def 
__init__( + self, + converter: DocumentConverter | None = None, + convert_kwargs: dict[str, Any] | None = None, + export_type: ExportType = ExportType.DOC_CHUNKS, + md_export_kwargs: dict[str, Any] | None = None, + chunker: BaseChunker | None = None, + meta_extractor: BaseMetaExtractor | None = None, + ) -> None: + """ + Create a Docling Haystack converter. + + Args: + converter: The Docling `DocumentConverter` to use; if not set, a system + default is used. + convert_kwargs: Any parameters to pass to Docling conversion; if not set, a + system default is used. + export_type: The export mode to use: + * `ExportType.MARKDOWN` captures each input document as a single + markdown `Document`. + * `ExportType.DOC_CHUNKS` (default) first chunks each input document + and then returns one `Document` per chunk. + * `ExportType.JSON` serializes the full Docling document to a JSON string. + md_export_kwargs: Any parameters to pass to Markdown export (applicable in + case of `ExportType.MARKDOWN`). + chunker: The Docling chunker instance to use; if not set, a system default + is used. + meta_extractor: The extractor instance to use for populating the output + document metadata; if not set, a system default is used. + """ + self._converter = converter or DocumentConverter() + self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {} + self._export_type = export_type + self._md_export_kwargs = md_export_kwargs if md_export_kwargs is not None else {"image_placeholder": ""} + if self._export_type == ExportType.DOC_CHUNKS: + self._chunker = chunker or HybridChunker() + self._meta_extractor = meta_extractor or MetaExtractor() + + @component.output_types(documents=list[Document]) + def run( + self, + paths: Iterable[Path | str], + ) -> dict[str, list[Document]]: + """ + Run the DoclingConverter. + + Args: + paths: The input document locations, either as local paths or URLs. + + Returns: + list[Document]: The output Haystack Documents. 
+ """ + documents: list[Document] = [] + for filepath in paths: + dl_doc = self._converter.convert( + source=filepath, + **self._convert_kwargs, + ).document + + if self._export_type == ExportType.DOC_CHUNKS: + chunk_iter = self._chunker.chunk(dl_doc=dl_doc) + hs_docs = [ + Document( + content=self._chunker.contextualize(chunk=chunk), + meta=self._meta_extractor.extract_chunk_meta(chunk=chunk), + ) + for chunk in chunk_iter + ] + documents.extend(hs_docs) + elif self._export_type == ExportType.MARKDOWN: + hs_doc = Document( + content=dl_doc.export_to_markdown(**self._md_export_kwargs), + meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc), + ) + documents.append(hs_doc) + elif self._export_type == ExportType.JSON: + hs_doc = Document( + content=json.dumps(dl_doc.export_to_dict()), + meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc), + ) + documents.append(hs_doc) + else: + err_msg = f"Unexpected export type: {self._export_type}" + raise RuntimeError(err_msg) + return {"documents": documents} diff --git a/integrations/docling/src/haystack_integrations/components/converters/py.typed b/integrations/docling/src/haystack_integrations/components/converters/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/docling/tests/__init__.py b/integrations/docling/tests/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/docling/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py new file mode 100644 index 0000000000..209fddd4bc --- /dev/null +++ b/integrations/docling/tests/test_converter.py @@ -0,0 +1,122 @@ +import json +from typing import Any +from types import SimpleNamespace +from unittest.mock import MagicMock + +from haystack_integrations.components.converters.docling import DoclingConverter, 
ExportType + + +def test_run_doc_chunks_minimal() -> None: + paths = ["file-a.pdf", "file-b.pdf"] + converter_mock = MagicMock() + chunker_mock = MagicMock() + meta_extractor_mock = MagicMock() + + converter_mock.convert.side_effect = [ + SimpleNamespace(document="dl-doc-for-file-a.pdf"), + SimpleNamespace(document="dl-doc-for-file-b.pdf"), + ] + + def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]: + return [ + SimpleNamespace(text=f"chunk-1-of-{dl_doc}"), + SimpleNamespace(text=f"chunk-2-of-{dl_doc}"), + ] + + chunker_mock.chunk.side_effect = chunk_side_effect + chunker_mock.contextualize.side_effect = lambda chunk: f"contextualized-{chunk.text}" + + meta_extractor_mock.extract_chunk_meta.side_effect = ( + lambda chunk: {"chunk_id": chunk.text} + ) + + converter = DoclingConverter( + converter=converter_mock, + export_type=ExportType.DOC_CHUNKS, + chunker=chunker_mock, + meta_extractor=meta_extractor_mock, + ) + + result = converter.run(paths=paths) + documents = result["documents"] + + # Two chunks per input path from our mocked implementation. + assert len(documents) == 4 + contents = [doc.content for doc in documents] + metas = [doc.meta for doc in documents] + + assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents + assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents + assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas + + # Ensure our collaborators were actually exercised. 
+ assert converter_mock.convert.call_count == len(paths) + assert chunker_mock.chunk.call_count == len(paths) + assert meta_extractor_mock.extract_chunk_meta.call_count == len(documents) + + +def test_run_markdown_minimal() -> None: + paths = ["doc-1.json"] + converter_mock = MagicMock() + meta_extractor_mock = MagicMock() + + dl_doc = MagicMock() + dl_doc.export_to_markdown.return_value = "markdown-for-doc-1.json-image_placeholder=[img]" + converter_mock.convert.return_value = SimpleNamespace(document=dl_doc) + meta_extractor_mock.extract_dl_doc_meta.return_value = { + "doc_id": "DummyMarkdownDoc(name='doc-1.json')", + } + + converter = DoclingConverter( + converter=converter_mock, + export_type=ExportType.MARKDOWN, + meta_extractor=meta_extractor_mock, + md_export_kwargs={"image_placeholder": "[img]"}, + ) + + result = converter.run(paths=paths) + documents = result["documents"] + + assert len(documents) == 1 + doc = documents[0] + + # Content and meta are derived entirely from our mocked implementations. 
+ assert doc.content == "markdown-for-doc-1.json-image_placeholder=[img]" + assert doc.meta == {"doc_id": "DummyMarkdownDoc(name='doc-1.json')"} + + converter_mock.convert.assert_called_once() + dl_doc.export_to_markdown.assert_called_once_with(image_placeholder="[img]") + meta_extractor_mock.extract_dl_doc_meta.assert_called_once_with(dl_doc=dl_doc) + + +def test_run_json_minimal() -> None: + paths = ["doc-1.json"] + converter_mock = MagicMock() + meta_extractor_mock = MagicMock() + + dl_doc = MagicMock() + dl_doc.export_to_dict.return_value = {"name": "doc-1.json", "kind": "dummy-json"} + converter_mock.convert.return_value = SimpleNamespace(document=dl_doc) + meta_extractor_mock.extract_dl_doc_meta.return_value = { + "doc_id": "DummyJsonDoc(name='doc-1.json')", + } + + converter = DoclingConverter( + converter=converter_mock, + export_type=ExportType.JSON, + meta_extractor=meta_extractor_mock, + ) + + result = converter.run(paths=paths) + documents = result["documents"] + + assert len(documents) == 1 + doc = documents[0] + + # Content is JSON-encoded export dict from our mocked implementation. 
+ assert json.loads(doc.content) == {"name": "doc-1.json", "kind": "dummy-json"} + assert doc.meta == {"doc_id": "DummyJsonDoc(name='doc-1.json')"} + + converter_mock.convert.assert_called_once() + dl_doc.export_to_dict.assert_called_once_with() + meta_extractor_mock.extract_dl_doc_meta.assert_called_once_with(dl_doc=dl_doc) From 67a6169450debddca685f884d1c0aa490b37c045 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 31 Mar 2026 13:14:49 +0200 Subject: [PATCH 02/12] fixing formatting issue Signed-off-by: Panos Vagenas --- integrations/docling/tests/test_converter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 209fddd4bc..8f6e8aabb7 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -1,6 +1,6 @@ import json -from typing import Any from types import SimpleNamespace +from typing import Any from unittest.mock import MagicMock from haystack_integrations.components.converters.docling import DoclingConverter, ExportType @@ -26,9 +26,7 @@ def chunk_side_effect(dl_doc: Any) -> list[SimpleNamespace]: chunker_mock.chunk.side_effect = chunk_side_effect chunker_mock.contextualize.side_effect = lambda chunk: f"contextualized-{chunk.text}" - meta_extractor_mock.extract_chunk_meta.side_effect = ( - lambda chunk: {"chunk_id": chunk.text} - ) + meta_extractor_mock.extract_chunk_meta.side_effect = lambda chunk: {"chunk_id": chunk.text} converter = DoclingConverter( converter=converter_mock, From 39c0c59783f1fc03cf33ecdb8fd689184ed8d70a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 2 Apr 2026 16:28:29 +0200 Subject: [PATCH 03/12] pin lxml dependency for python 3.14 compatibility --- integrations/docling/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 6d21a8bb4c..6ab90bf00b 100644 --- 
a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -35,6 +35,7 @@ classifiers = [ dependencies = [ "haystack-ai>=2.8.0,<3.0.0", "docling>=2.32.0,<3.0.0", + "lxml>=6.0.2", ] [project.urls] From 042b068875507fea9672f16c84bb1d20b2a90ccb Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Thu, 2 Apr 2026 16:43:00 +0200 Subject: [PATCH 04/12] adjust workflow incl code coverage tests --- .github/workflows/docling.yml | 67 ++++++++++++++++++++++++++--- integrations/docling/pyproject.toml | 2 + 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docling.yml b/.github/workflows/docling.yml index 38fbbbc3b1..00d5db95ad 100644 --- a/.github/workflows/docling.yml +++ b/.github/workflows/docling.yml @@ -10,28 +10,55 @@ on: - "integrations/docling/**" - "!integrations/docling/*.md" - ".github/workflows/docling.yml" + push: + branches: + - main + paths: + - "integrations/docling/**" + - "!integrations/docling/*.md" + - ".github/workflows/docling.yml" defaults: run: working-directory: integrations/docling concurrency: - group: docling-${{ github.head_ref }} + group: docling-${{ github.head_ref || github.sha }} cancel-in-progress: true env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]' + TEST_MATRIX_PYTHON: '["3.10", "3.14"]' jobs: + compute-test-matrix: + runs-on: ubuntu-slim + defaults: + run: + working-directory: . 
+ outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" + run: name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix runs-on: ${{ matrix.os }} + permissions: + contents: write + pull-requests: write strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.10", "3.14"] + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} steps: - name: Support longpaths @@ -48,14 +75,42 @@ jobs: - name: Install Hatch run: pip install --upgrade hatch + - name: Lint if: matrix.python-version == '3.10' && runner.os == 'Linux' run: hatch run fmt-check && hatch run test:types - - name: Run tests - run: hatch run test:cov-retry + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: generates coverage comment artifact. On push to main: stores coverage baseline on data branch. 
+ - name: Store unit tests coverage + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/docling + SUBPROJECT_ID: docling + COMMENT_ARTIFACT_NAME: coverage-comment-docling + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run integration tests + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/docling + SUBPROJECT_ID: docling-combined + COMMENT_ARTIFACT_NAME: coverage-comment-docling-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' run: | hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt @@ -66,7 +121,7 @@ jobs: run: | hatch env prune hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main - hatch run test:cov-retry + hatch run test:unit-cov-retry notify-slack-on-failure: diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 6ab90bf00b..49a100eb64 100644 --- a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -78,6 +78,8 @@ unit = 'pytest -m "not integration" {args:tests}' integration = 'pytest -m "integration" {args:tests}' all = 'pytest {args:tests}' cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' 
+integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' types = "mypy -p haystack_integrations.components.converters.docling {args}" [tool.mypy] From 2a2317c3c0ac468eb32c2a3414f536e920874c12 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:11:35 +0200 Subject: [PATCH 05/12] fix code coverage tracking --- integrations/docling/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 49a100eb64..896b60af00 100644 --- a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -159,6 +159,7 @@ ban-relative-imports = "parents" source = ["haystack_integrations"] branch = true parallel = false +relative_files = true [tool.coverage.report] omit = ["*/tests/*", "*/__init__.py"] From dc639256f347b01155daadf6fe76acfbf31f1a9a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:22:19 +0200 Subject: [PATCH 06/12] adjust docstring style --- .../converters/docling/converter.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index 088d5179e8..ad104f7031 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -64,23 +64,22 @@ def __init__( """ Create a Docling Haystack converter. - Args: - converter: The Docling `DocumentConverter` to use; if not set, a system - default is used. - convert_kwargs: Any parameters to pass to Docling conversion; if not set, a - system default is used. 
- export_type: The export mode to use: - * `ExportType.MARKDOWN` captures each input document as a single - markdown `Document`. - * `ExportType.DOC_CHUNKS` (default) first chunks each input document - and then returns one `Document` per chunk. - * `ExportType.JSON` serializes the full Docling document to a JSON string. - md_export_kwargs: Any parameters to pass to Markdown export (applicable in - case of `ExportType.MARKDOWN`). - chunker: The Docling chunker instance to use; if not set, a system default - is used. - meta_extractor: The extractor instance to use for populating the output - document metadata; if not set, a system default is used. + :param converter: The Docling `DocumentConverter` to use; if not set, a system + default is used. + :param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a + system default is used. + :param export_type: The export mode to use: + * `ExportType.MARKDOWN` captures each input document as a single + markdown `Document`. + * `ExportType.DOC_CHUNKS` (default) first chunks each input document + and then returns one `Document` per chunk. + * `ExportType.JSON` serializes the full Docling document to a JSON string. + :param md_export_kwargs: Any parameters to pass to Markdown export (applicable in + case of `ExportType.MARKDOWN`). + :param chunker: The Docling chunker instance to use; if not set, a system default + is used. + :param meta_extractor: The extractor instance to use for populating the output + document metadata; if not set, a system default is used. """ self._converter = converter or DocumentConverter() self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {} @@ -98,11 +97,10 @@ def run( """ Run the DoclingConverter. - Args: - paths: The input document locations, either as local paths or URLs. - - Returns: - list[Document]: The output Haystack Documents. + :param paths: The input document locations, either as local paths or URLs. 
+ :returns: + A dictionary with key `"documents"` containing the output Haystack Documents. + :raises RuntimeError: If an unexpected `export_type` is encountered. """ documents: list[Document] = [] for filepath in paths: From 17e15647fb8b2966770ed786e35b9ff286c036d7 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:24:26 +0200 Subject: [PATCH 07/12] remove docling.md file --- integrations/docling/docling.md | 112 -------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 integrations/docling/docling.md diff --git a/integrations/docling/docling.md b/integrations/docling/docling.md deleted file mode 100644 index 5f68e53b59..0000000000 --- a/integrations/docling/docling.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -title: "Docling" -id: integrations-docling -description: "Docling integration for Haystack" -slug: "/integrations-docling" ---- - - -## haystack_integrations.components.converters.docling.converter - -Docling Haystack converter module. - -### ExportType - -Bases: str, Enum - -Enumeration of available export types. - -### BaseMetaExtractor - -Bases: ABC - -BaseMetaExtractor. - -#### extract_chunk_meta - -```python -extract_chunk_meta(chunk: BaseChunk) -> dict[str, Any] -``` - -Extract chunk meta. - -#### extract_dl_doc_meta - -```python -extract_dl_doc_meta(dl_doc: DoclingDocument) -> dict[str, Any] -``` - -Extract Docling document meta. - -### MetaExtractor - -Bases: BaseMetaExtractor - -MetaExtractor. - -#### extract_chunk_meta - -```python -extract_chunk_meta(chunk: BaseChunk) -> dict[str, Any] -``` - -Extract chunk meta. - -#### extract_dl_doc_meta - -```python -extract_dl_doc_meta(dl_doc: DoclingDocument) -> dict[str, Any] -``` - -Extract Docling document meta. - -### DoclingConverter - -Docling Haystack converter. 
- -#### __init__ - -```python -__init__( - converter: DocumentConverter | None = None, - convert_kwargs: dict[str, Any] | None = None, - export_type: ExportType = ExportType.DOC_CHUNKS, - md_export_kwargs: dict[str, Any] | None = None, - chunker: BaseChunker | None = None, - meta_extractor: BaseMetaExtractor | None = None, -) -> None -``` - -Create a Docling Haystack converter. - -Args: -converter: The Docling `DocumentConverter` to use; if not set, a system -default is used. -convert_kwargs: Any parameters to pass to Docling conversion; if not set, a -system default is used. -export_type: The export mode to use: -\* `ExportType.MARKDOWN` captures each input document as a single -markdown `Document`. -\* `ExportType.DOC_CHUNKS` (default) first chunks each input document -and then returns one `Document` per chunk. -\* `ExportType.JSON` serializes the full Docling document to a JSON string. -md_export_kwargs: Any parameters to pass to Markdown export (applicable in -case of `ExportType.MARKDOWN`). -chunker: The Docling chunker instance to use; if not set, a system default -is used. -meta_extractor: The extractor instance to use for populating the output -document metadata; if not set, a system default is used. - -#### run - -```python -run(paths: Iterable[Path | str]) -> dict[str, list[Document]] -``` - -Run the DoclingConverter. - -Args: -paths: The input document locations, either as local paths or URLs. - -Returns: -list\[Document\]: The output Haystack Documents. 
From ed8acf1bda13b5e9e746e99a70ba5933228bbfbc Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:39:53 +0200 Subject: [PATCH 08/12] change private to public attributes for default serialization --- .../converters/docling/converter.py | 44 ++++++++------- integrations/docling/tests/test_converter.py | 56 +++++++++++++++++++ 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py index ad104f7031..c0cf7f3a70 100644 --- a/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py +++ b/integrations/docling/src/haystack_integrations/components/converters/docling/converter.py @@ -81,13 +81,19 @@ def __init__( :param meta_extractor: The extractor instance to use for populating the output document metadata; if not set, a system default is used. """ - self._converter = converter or DocumentConverter() - self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {} - self._export_type = export_type - self._md_export_kwargs = md_export_kwargs if md_export_kwargs is not None else {"image_placeholder": ""} - if self._export_type == ExportType.DOC_CHUNKS: - self._chunker = chunker or HybridChunker() - self._meta_extractor = meta_extractor or MetaExtractor() + # Public attributes match init parameter names 1:1 for default serialization. + self.converter = converter + self.convert_kwargs = convert_kwargs if convert_kwargs is not None else {} + self.export_type = ExportType(export_type) + self.md_export_kwargs = md_export_kwargs if md_export_kwargs is not None else {"image_placeholder": ""} + self.chunker = chunker + self.meta_extractor = meta_extractor + + # Resolved instances used internally at runtime. 
+ self._converter_instance = converter or DocumentConverter() + if self.export_type == ExportType.DOC_CHUNKS: + self._chunker_instance = chunker or HybridChunker() + self._meta_extractor_instance = meta_extractor or MetaExtractor() @component.output_types(documents=list[Document]) def run( @@ -104,34 +110,34 @@ def run( """ documents: list[Document] = [] for filepath in paths: - dl_doc = self._converter.convert( + dl_doc = self._converter_instance.convert( source=filepath, - **self._convert_kwargs, + **self.convert_kwargs, ).document - if self._export_type == ExportType.DOC_CHUNKS: - chunk_iter = self._chunker.chunk(dl_doc=dl_doc) + if self.export_type == ExportType.DOC_CHUNKS: + chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc) hs_docs = [ Document( - content=self._chunker.contextualize(chunk=chunk), - meta=self._meta_extractor.extract_chunk_meta(chunk=chunk), + content=self._chunker_instance.contextualize(chunk=chunk), + meta=self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), ) for chunk in chunk_iter ] documents.extend(hs_docs) - elif self._export_type == ExportType.MARKDOWN: + elif self.export_type == ExportType.MARKDOWN: hs_doc = Document( - content=dl_doc.export_to_markdown(**self._md_export_kwargs), - meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc), + content=dl_doc.export_to_markdown(**self.md_export_kwargs), + meta=self._meta_extractor_instance.extract_dl_doc_meta(dl_doc=dl_doc), ) documents.append(hs_doc) - elif self._export_type == ExportType.JSON: + elif self.export_type == ExportType.JSON: hs_doc = Document( content=json.dumps(dl_doc.export_to_dict()), - meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc), + meta=self._meta_extractor_instance.extract_dl_doc_meta(dl_doc=dl_doc), ) documents.append(hs_doc) else: - err_msg = f"Unexpected export type: {self._export_type}" + err_msg = f"Unexpected export type: {self.export_type}" raise RuntimeError(err_msg) return {"documents": documents} diff --git 
a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 8f6e8aabb7..7389c0f2fa 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -3,6 +3,8 @@ from typing import Any from unittest.mock import MagicMock +from haystack.core.serialization import component_from_dict, component_to_dict + from haystack_integrations.components.converters.docling import DoclingConverter, ExportType @@ -118,3 +120,57 @@ def test_run_json_minimal() -> None: converter_mock.convert.assert_called_once() dl_doc.export_to_dict.assert_called_once_with() meta_extractor_mock.extract_dl_doc_meta.assert_called_once_with(dl_doc=dl_doc) + + +def test_component_to_dict_defaults() -> None: + converter = DoclingConverter() + data = component_to_dict(converter, "docling_converter") + + init_params = data["init_parameters"] + assert init_params["converter"] is None + assert init_params["convert_kwargs"] == {} + assert init_params["export_type"] == ExportType.DOC_CHUNKS + assert init_params["md_export_kwargs"] == {"image_placeholder": ""} + assert init_params["chunker"] is None + assert init_params["meta_extractor"] is None + + +def test_component_to_dict_custom_params() -> None: + converter = DoclingConverter( + convert_kwargs={"raises_on_error": False}, + export_type=ExportType.MARKDOWN, + md_export_kwargs={"image_placeholder": "[img]"}, + ) + data = component_to_dict(converter, "docling_converter") + + init_params = data["init_parameters"] + assert init_params["convert_kwargs"] == {"raises_on_error": False} + assert init_params["export_type"] == ExportType.MARKDOWN + assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"} + + +def test_component_from_dict_defaults() -> None: + converter = DoclingConverter() + data = component_to_dict(converter, "docling_converter") + restored = component_from_dict(DoclingConverter, data, "docling_converter") + + assert restored.converter is None + 
assert restored.convert_kwargs == {} + assert restored.export_type == ExportType.DOC_CHUNKS + assert restored.md_export_kwargs == {"image_placeholder": ""} + assert restored.chunker is None + assert restored.meta_extractor is None + + +def test_component_from_dict_custom_params() -> None: + converter = DoclingConverter( + convert_kwargs={"raises_on_error": False}, + export_type=ExportType.JSON, + md_export_kwargs={"image_placeholder": "[img]"}, + ) + data = component_to_dict(converter, "docling_converter") + restored = component_from_dict(DoclingConverter, data, "docling_converter") + + assert restored.convert_kwargs == {"raises_on_error": False} + assert restored.export_type == ExportType.JSON + assert restored.md_export_kwargs == {"image_placeholder": "[img]"} From 174df02d00f8c66620fbad584a9b7e700e818fe7 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:40:32 +0200 Subject: [PATCH 09/12] test backward compatibility of deserialization --- integrations/docling/tests/test_converter.py | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 7389c0f2fa..652bb6607e 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -122,6 +122,32 @@ def test_run_json_minimal() -> None: meta_extractor_mock.extract_dl_doc_meta.assert_called_once_with(dl_doc=dl_doc) +def test_component_from_dict_legacy_nulls() -> None: + # Before the public-attribute refactor, default serialization couldn't find + # the _-prefixed attributes and fell back to the init defaults, so + # convert_kwargs and md_export_kwargs were always serialized as null. + # Verify that such a serialized dict still deserializes correctly. 
+ legacy_data = { + "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter", + "init_parameters": { + "converter": None, + "convert_kwargs": None, + "export_type": "doc_chunks", + "md_export_kwargs": None, + "chunker": None, + "meta_extractor": None, + }, + } + restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter") + + assert restored.convert_kwargs == {} + assert restored.md_export_kwargs == {"image_placeholder": ""} + assert restored.export_type == ExportType.DOC_CHUNKS + assert restored.converter is None + assert restored.chunker is None + assert restored.meta_extractor is None + + def test_component_to_dict_defaults() -> None: converter = DoclingConverter() data = component_to_dict(converter, "docling_converter") From 1948bba259c3dc2b0d9119af8758aadc25ff5fc4 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:46:03 +0200 Subject: [PATCH 10/12] backward compatibility for old import paths --- integrations/docling/pyproject.toml | 2 +- .../docling/src/docling_haystack/__init__.py | 3 +++ .../docling/src/docling_haystack/converter.py | 17 +++++++++++++++++ integrations/docling/tests/test_converter.py | 11 +++++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 integrations/docling/src/docling_haystack/__init__.py create mode 100644 integrations/docling/src/docling_haystack/converter.py diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 896b60af00..56f01ea74f 100644 --- a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -44,7 +44,7 @@ Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling" [tool.hatch.build.targets.wheel] -packages = ["src/haystack_integrations"] +packages = ["src/haystack_integrations", "src/docling_haystack"] [tool.hatch.version] source = "vcs" diff --git 
a/integrations/docling/src/docling_haystack/__init__.py b/integrations/docling/src/docling_haystack/__init__.py new file mode 100644 index 0000000000..ee6d47c5c5 --- /dev/null +++ b/integrations/docling/src/docling_haystack/__init__.py @@ -0,0 +1,3 @@ +# Backward-compatibility shim for the old docling-haystack distribution. +# The canonical import path is now: +# from haystack_integrations.components.converters.docling import DoclingConverter diff --git a/integrations/docling/src/docling_haystack/converter.py b/integrations/docling/src/docling_haystack/converter.py new file mode 100644 index 0000000000..e77bd1da51 --- /dev/null +++ b/integrations/docling/src/docling_haystack/converter.py @@ -0,0 +1,17 @@ +"""Backward-compatibility shim for the old docling-haystack import path.""" + +import warnings + +warnings.warn( + "Importing from 'docling_haystack.converter' is deprecated and will be removed in a future release. " + "Use 'from haystack_integrations.components.converters.docling import DoclingConverter' instead.", + DeprecationWarning, + stacklevel=2, +) + +from haystack_integrations.components.converters.docling.converter import ( # noqa: E402, F401 + BaseMetaExtractor, + DoclingConverter, + ExportType, + MetaExtractor, +) diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 652bb6607e..6dae4ca4e4 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -122,6 +122,17 @@ def test_run_json_minimal() -> None: meta_extractor_mock.extract_dl_doc_meta.assert_called_once_with(dl_doc=dl_doc) +def test_legacy_import_path() -> None: + import warnings + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter + + assert LegacyDoclingConverter is DoclingConverter + assert any(issubclass(w.category, DeprecationWarning) and 
"docling_haystack.converter" in str(w.message) for w in caught) + + def test_component_from_dict_legacy_nulls() -> None: # Before the public-attribute refactor, default serialization couldn't find # the _-prefixed attributes and fell back to the init defaults, so From 014cb6dde9324f2014aa67525a6939da5642411e Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:50:58 +0200 Subject: [PATCH 11/12] lint --- integrations/docling/pyproject.toml | 2 +- integrations/docling/tests/test_converter.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 56f01ea74f..13adef52a2 100644 --- a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -153,7 +153,7 @@ ban-relative-imports = "parents" [tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, relative imports, and don't need type annotations -"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN", "PLC0415"] [tool.coverage.run] source = ["haystack_integrations"] diff --git a/integrations/docling/tests/test_converter.py b/integrations/docling/tests/test_converter.py index 6dae4ca4e4..c561342de7 100644 --- a/integrations/docling/tests/test_converter.py +++ b/integrations/docling/tests/test_converter.py @@ -130,7 +130,9 @@ def test_legacy_import_path() -> None: from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter assert LegacyDoclingConverter is DoclingConverter - assert any(issubclass(w.category, DeprecationWarning) and "docling_haystack.converter" in str(w.message) for w in caught) + assert any( + issubclass(w.category, DeprecationWarning) and "docling_haystack.converter" in str(w.message) for w in caught + ) def test_component_from_dict_legacy_nulls() -> None: From 6ac49a57373816a58389d0eb7ac579c8c02f7fa4 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 3 Apr 2026 09:54:04 
+0200 Subject: [PATCH 12/12] dont treat 0 integration tests as error --- integrations/docling/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/docling/pyproject.toml b/integrations/docling/pyproject.toml index 13adef52a2..b990ceb8b3 100644 --- a/integrations/docling/pyproject.toml +++ b/integrations/docling/pyproject.toml @@ -79,7 +79,7 @@ integration = 'pytest -m "integration" {args:tests}' all = 'pytest {args:tests}' cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' -integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +integration-cov-append-retry = "bash -c 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m integration {args:tests}; ret=$?; [ $ret -eq 5 ] && exit 0 || exit $ret'" types = "mypy -p haystack_integrations.components.converters.docling {args}" [tool.mypy]