diff --git a/.github/labeler.yml b/.github/labeler.yml index 349edf670c..52e92c8e74 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -64,6 +64,11 @@ integration:docling: - any-glob-to-any-file: "integrations/docling/**/*" - any-glob-to-any-file: ".github/workflows/docling.yml" +integration:docling-serve: + - changed-files: + - any-glob-to-any-file: "integrations/docling_serve/**/*" + - any-glob-to-any-file: ".github/workflows/docling_serve.yml" + integration:elasticsearch: - changed-files: - any-glob-to-any-file: "integrations/elasticsearch/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index e4d682b7cf..e55b60e451 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -15,6 +15,7 @@ on: - "Test / cohere" - "Test / cometapi" - "Test / deepeval" + - "Test / docling_serve" - "Test / dspy" - "Test / elasticsearch" - "Test / faiss" diff --git a/.github/workflows/docling_serve.yml b/.github/workflows/docling_serve.yml new file mode 100644 index 0000000000..e3ef1467b9 --- /dev/null +++ b/.github/workflows/docling_serve.yml @@ -0,0 +1,157 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / docling_serve + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/docling_serve/**" + - "!integrations/docling_serve/*.md" + - ".github/workflows/docling_serve.yml" + push: + branches: + - main + paths: + - "integrations/docling_serve/**" + - "!integrations/docling_serve/*.md" + - ".github/workflows/docling_serve.yml" + +defaults: + run: + working-directory: integrations/docling_serve + +concurrency: + group: docling_serve-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", 
"macos-latest"]' + TEST_MATRIX_PYTHON: '["3.10", "3.14"]' + +jobs: + compute-test-matrix: + runs-on: ubuntu-slim + defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . + run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch. 
+ - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/docling_serve + SUBPROJECT_ID: docling_serve + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: coverage-comment-docling_serve + path: python-coverage-comment-action-docling_serve.txt + + - name: Start docling-serve + if: runner.os == 'Linux' + run: | + docker run -d --name docling-serve -p 5001:5001 ghcr.io/docling-project/docling-serve-cpu:latest + echo "Waiting for docling-serve to be ready..." + for i in $(seq 1 60); do + if curl -sf http://localhost:5001/health > /dev/null 2>&1; then + echo "docling-serve is ready" + break + fi + echo "Attempt $i/60 - waiting 10s..." 
+ sleep 10 + done + curl -sf http://localhost:5001/health || (echo "docling-serve failed to start" && docker logs docling-serve && exit 1) + + - name: Run integration tests + if: runner.os == 'Linux' + env: + DOCLING_SERVE_URL: http://localhost:5001 + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/docling_serve + SUBPROJECT_ID: docling_serve-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 42891b8360..1444b4e6b7 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / 
cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi-combined/htmlcov/index.html) | | [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval-combined/htmlcov/index.html) | | [docling-haystack](integrations/docling/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-haystack.svg)](https://pypi.org/project/docling-haystack) | [![Test / docling](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling-combined/htmlcov/index.html) | +| [docling-serve-haystack](integrations/docling_serve/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) | [![Test / docling_serve](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling_serve/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling_serve/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling_serve-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling_serve-combined/htmlcov/index.html) | | [dspy-haystack](integrations/dspy/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/dspy-haystack.svg)](https://pypi.org/project/dspy-haystack) | [![Test / dspy](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy-combined/htmlcov/index.html) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch-combined/htmlcov/index.html) | | [faiss-haystack](integrations/faiss/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/faiss-haystack.svg)](https://pypi.org/project/faiss-haystack) | [![Test / faiss](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss-combined/htmlcov/index.html) | diff --git a/integrations/docling_serve/LICENSE.txt b/integrations/docling_serve/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/docling_serve/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/docling_serve/README.md b/integrations/docling_serve/README.md new file mode 100644 index 0000000000..2f0b369379 --- /dev/null +++ b/integrations/docling_serve/README.md @@ -0,0 +1,12 @@ +# docling-serve-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/docling_serve/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). 
diff --git a/integrations/docling_serve/pydoc/config_docusaurus.yml b/integrations/docling_serve/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..4add9196df --- /dev/null +++ b/integrations/docling_serve/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.docling_serve.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Docling Serve integration for Haystack + id: integrations-docling_serve + filename: docling_serve.md + title: Docling Serve diff --git a/integrations/docling_serve/pyproject.toml b/integrations/docling_serve/pyproject.toml new file mode 100644 index 0000000000..c081d16831 --- /dev/null +++ b/integrations/docling_serve/pyproject.toml @@ -0,0 +1,161 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "docling-serve-haystack" +dynamic = ["version"] +description = "Haystack integration for docling_serve" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai>=2.12.0", "httpx>=0.27.0"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = 
"https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/docling_serve-v(?P.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/docling_serve-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = "mypy -p haystack_integrations.components.converters.docling_serve {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + 
"DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py new file mode 100644 index 0000000000..b14379eca4 --- /dev/null +++ b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.converters.docling_serve.converter import DoclingServeConverter + 
+__all__ = ["DoclingServeConverter"]
diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py
new file mode 100644
index 0000000000..56174a5697
--- /dev/null
+++ b/integrations/docling_serve/src/haystack_integrations/components/converters/docling_serve/converter.py
@@ -0,0 +1,333 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""Haystack converter component for docling-serve."""
+
+import mimetypes
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+
+import httpx
+from haystack import Document, component, default_from_dict, default_to_dict, logging
+from haystack.components.converters.utils import normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.utils import Secret, deserialize_secrets_inplace
+
+logger = logging.getLogger(__name__)
+
+_FILE_CONVERT_PATH = "/v1/convert/file"
+_SOURCE_CONVERT_PATH = "/v1/convert/source"
+
+
+def _is_url(source: str) -> bool:
+    """Check if a string looks like a URL."""
+    parsed = urlparse(source)
+    return parsed.scheme in ("http", "https")
+
+
+def _resolve_source_name(source: str | Path | ByteStream) -> str:
+    """Extract a human-readable name for a source."""
+    if isinstance(source, ByteStream):
+        meta = source.meta or {}
+        return meta.get("file_path") or meta.get("file_name") or meta.get("name") or "document"
+    return str(source)
+
+
+def _guess_mime_type(filename: str) -> str:
+    """Guess the MIME type of a file based on its name."""
+    return mimetypes.guess_type(filename)[0] or "application/octet-stream"
+
+
+def _build_file_upload(
+    source: str | Path | ByteStream,
+) -> tuple[str, bytes, str]:
+    """
+    Prepare file upload data from a source.
+
+    :returns: Tuple of (filename, file_bytes, mime_type).
+    """
+    if isinstance(source, ByteStream):
+        meta = source.meta or {}
+        filename = meta.get("file_name") or meta.get("file_path") or meta.get("name") or "document"
+        filename = Path(filename).name
+        mime_type = source.mime_type or _guess_mime_type(filename)
+        return filename, source.data, mime_type
+
+    file_path = Path(source)
+    return file_path.name, file_path.read_bytes(), _guess_mime_type(file_path.name)
+
+
+def _extract_document(response_json: dict[str, Any], source_name: str, extra_meta: dict[str, Any]) -> Document:
+    """
+    Extract a Haystack Document from a docling-serve response.
+
+    :param response_json: The parsed JSON response from docling-serve.
+    :param source_name: Human-readable name of the source.
+    :param extra_meta: Additional metadata to merge in.
+    :returns: A Haystack Document.
+    """
+    document_data = response_json.get("document") or {}  # the key may be present but null
+    content = document_data.get("md_content") or document_data.get("text_content") or ""
+
+    meta = {
+        "source_file": source_name,
+        "conversion_status": response_json.get("status", ""),
+        "processing_time": response_json.get("processing_time", 0.0),
+        **extra_meta,
+    }
+
+    return Document(content=content, meta=meta)
+
+
+@component
+class DoclingServeConverter:
+    """
+    Convert documents using a running docling-serve instance.
+
+    Sends files or URLs to a docling-serve API endpoint and converts the responses
+    into Haystack Document objects. Local files and ByteStreams are uploaded via the
+    ``/v1/convert/file`` endpoint, while URL strings are sent to ``/v1/convert/source``.
+
+    ### Usage example
+
+    ```python
+    from haystack_integrations.components.converters.docling_serve import DoclingServeConverter
+
+    converter = DoclingServeConverter(base_url="http://localhost:5001")
+    result = converter.run(sources=["path/to/document.pdf"])
+    documents = result["documents"]
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        base_url: str = "http://localhost:5001",
+        api_key: Secret | None = Secret.from_env_var("DOCLING_SERVE_API_KEY", strict=False),
+        timeout: int = 300,
+        convert_options: dict[str, Any] | None = None,
+    ) -> None:
+        """
+        Initialize the DoclingServeConverter.
+
+        :param base_url: Root URL of the docling-serve instance (e.g. ``http://localhost:5001``).
+        :param api_key: API key for authentication. Reads from the ``DOCLING_SERVE_API_KEY``
+            environment variable by default. Set to ``None`` to disable authentication.
+        :param timeout: Request timeout in seconds. Document conversion can be slow,
+            so the default is 300 seconds.
+        :param convert_options: Dictionary of conversion parameters passed to docling-serve.
+            Supports all parameters from docling-serve's ``ConvertDocumentsRequestOptions``,
+            such as ``from_formats``, ``to_formats``, ``do_ocr``, ``ocr_engine``,
+            ``table_mode``, etc. See the
+            `docling-serve documentation <https://github.com/docling-project/docling-serve>`_
+            for the full list.
+        """
+        self.base_url = base_url
+        self.api_key = api_key
+        self.timeout = timeout
+        self.convert_options = dict(convert_options) if convert_options else {}
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize the component to a dictionary."""
+        return default_to_dict(
+            self,
+            base_url=self.base_url,
+            api_key=self.api_key.to_dict() if self.api_key else None,
+            timeout=self.timeout,
+            convert_options=self.convert_options,
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "DoclingServeConverter":
+        """Deserialize the component from a dictionary."""
+        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
+        return default_from_dict(cls, data)
+
+    def _build_headers(self) -> dict[str, str]:
+        """Build HTTP headers including authentication if configured."""
+        headers: dict[str, str] = {"accept": "application/json"}
+        if self.api_key:
+            resolved = self.api_key.resolve_value()
+            if resolved:
+                headers["X-Api-Key"] = resolved
+        return headers
+
+    def _convert_file_sync(
+        self,
+        client: httpx.Client,
+        source: str | Path | ByteStream,
+        headers: dict[str, str],
+    ) -> dict[str, Any]:
+        """Convert a local file or ByteStream via the /v1/convert/file endpoint (sync)."""
+        filename, file_bytes, mime_type = _build_file_upload(source)
+        url = f"{self.base_url.rstrip('/')}{_FILE_CONVERT_PATH}"  # tolerate a trailing slash in base_url
+
+        response = client.post(
+            url,
+            files={"files": (filename, file_bytes, mime_type)},
+            data=self.convert_options,
+            headers=headers,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def _convert_url_sync(
+        self,
+        client: httpx.Client,
+        source_url: str,
+        headers: dict[str, str],
+    ) -> dict[str, Any]:
+        """Convert a URL source via the /v1/convert/source endpoint (sync)."""
+        url = f"{self.base_url.rstrip('/')}{_SOURCE_CONVERT_PATH}"  # tolerate a trailing slash in base_url
+        payload: dict[str, Any] = {
+            "options": self.convert_options,
+            "sources": [{"kind": "http", "url": source_url}],
+        }
+
+        response = client.post(url, json=payload, headers=headers)
+        response.raise_for_status()
+        return response.json()
+
+    async def _convert_file_async(
+        self,
+        client: httpx.AsyncClient,
+        source: str | Path | ByteStream,
+        headers: dict[str, str],
+    ) -> dict[str, Any]:
+        """Convert a local file or ByteStream via the /v1/convert/file endpoint (async)."""
+        filename, file_bytes, mime_type = _build_file_upload(source)
+        url = f"{self.base_url.rstrip('/')}{_FILE_CONVERT_PATH}"  # tolerate a trailing slash in base_url
+
+        response = await client.post(
+            url,
+            files={"files": (filename, file_bytes, mime_type)},
+            data=self.convert_options,
+            headers=headers,
+        )
+        response.raise_for_status()
+        return response.json()
+
+    async def _convert_url_async(
+        self,
+        client: httpx.AsyncClient,
+        source_url: str,
+        headers: dict[str, str],
+    ) -> dict[str, Any]:
+        """Convert a URL source via the /v1/convert/source endpoint (async)."""
+        url = f"{self.base_url.rstrip('/')}{_SOURCE_CONVERT_PATH}"  # tolerate a trailing slash in base_url
+        payload: dict[str, Any] = {
+            "options": self.convert_options,
+            "sources": [{"kind": "http", "url": source_url}],
+        }
+
+        response = await client.post(url, json=payload, headers=headers)
+        response.raise_for_status()
+        return response.json()
+
+    @component.output_types(documents=list[Document])
+    def run(
+        self,
+        sources: list[str | Path | ByteStream],
+        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
+    ) -> dict[str, list[Document]]:
+        """
+        Convert sources to Documents using docling-serve.
+
+        :param sources: List of file paths, URLs, or ByteStream objects to convert.
+            Strings starting with ``http://`` or ``https://`` are treated as URLs and sent
+            to the ``/v1/convert/source`` endpoint. All other sources are uploaded to
+            ``/v1/convert/file``. Sources that cannot be read or converted are logged and skipped.
+        :param meta: Optional metadata to attach to the Documents.
+            Can be a single dictionary (applied to all Documents) or a list of dictionaries
+            (one per source). If a source is a ByteStream, its metadata is also merged.
+        :returns:
+            A dictionary with key ``"documents"`` containing the output Documents.
+        """
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
+        headers = self._build_headers()
+        documents: list[Document] = []
+
+        with httpx.Client(timeout=self.timeout) as client:
+            for source, source_meta in zip(sources, meta_list, strict=True):
+                source_name = _resolve_source_name(source)
+                merged_meta = {**(source.meta if isinstance(source, ByteStream) else {}), **source_meta}
+
+                try:
+                    if isinstance(source, str) and _is_url(source):
+                        result = self._convert_url_sync(client, source, headers)
+                    else:
+                        result = self._convert_file_sync(client, source, headers)
+
+                    documents.append(_extract_document(result, source_name, merged_meta))
+
+                except httpx.HTTPStatusError as e:
+                    body = e.response.text
+                    logger.warning(
+                        "docling-serve returned HTTP {status} for {source}: {body}",
+                        status=e.response.status_code,
+                        source=source_name,
+                        body=body,
+                    )
+                except (httpx.HTTPError, OSError, ValueError) as e:  # transport, file-read or JSON errors
+                    logger.warning(
+                        "Failed to convert {source} with docling-serve: {error}",
+                        source=source_name,
+                        error=str(e),
+                    )
+
+        return {"documents": documents}
+
+    @component.output_types(documents=list[Document])
+    async def run_async(
+        self,
+        sources: list[str | Path | ByteStream],
+        meta: dict[str, Any] | list[dict[str, Any]] | None = None,
+    ) -> dict[str, list[Document]]:
+        """
+        Asynchronously convert sources to Documents using docling-serve.
+
+        :param sources: List of file paths, URLs, or ByteStream objects to convert.
+            Strings starting with ``http://`` or ``https://`` are treated as URLs and sent
+            to the ``/v1/convert/source`` endpoint. All other sources are uploaded to
+            ``/v1/convert/file``. Sources that cannot be read or converted are logged and skipped.
+        :param meta: Optional metadata to attach to the Documents.
+            Can be a single dictionary (applied to all Documents) or a list of dictionaries
+            (one per source). If a source is a ByteStream, its metadata is also merged.
+        :returns:
+            A dictionary with key ``"documents"`` containing the output Documents.
+        """
+        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))
+        headers = self._build_headers()
+        documents: list[Document] = []
+
+        async with httpx.AsyncClient(timeout=self.timeout) as client:
+            for source, source_meta in zip(sources, meta_list, strict=True):
+                source_name = _resolve_source_name(source)
+                merged_meta = {**(source.meta if isinstance(source, ByteStream) else {}), **source_meta}
+
+                try:
+                    if isinstance(source, str) and _is_url(source):
+                        result = await self._convert_url_async(client, source, headers)
+                    else:
+                        result = await self._convert_file_async(client, source, headers)
+
+                    documents.append(_extract_document(result, source_name, merged_meta))
+
+                except httpx.HTTPStatusError as e:
+                    body = e.response.text
+                    logger.warning(
+                        "docling-serve returned HTTP {status} for {source}: {body}",
+                        status=e.response.status_code,
+                        source=source_name,
+                        body=body,
+                    )
+                except (httpx.HTTPError, OSError, ValueError) as e:  # transport, file-read or JSON errors
+                    logger.warning(
+                        "Failed to convert {source} with docling-serve: {error}",
+                        source=source_name,
+                        error=str(e),
+                    )
+
+        return {"documents": documents}
diff --git a/integrations/docling_serve/src/haystack_integrations/components/converters/py.typed b/integrations/docling_serve/src/haystack_integrations/components/converters/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/integrations/docling_serve/tests/__init__.py b/integrations/docling_serve/tests/__init__.py
new file mode 100644
index 0000000000..c1764a6e03
--- /dev/null
+++ b/integrations/docling_serve/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/docling_serve/tests/test_converter.py b/integrations/docling_serve/tests/test_converter.py
new file mode 100644
index 0000000000..9596af17ec
--- /dev/null
+++ b/integrations/docling_serve/tests/test_converter.py
@@ -0,0 +1,520 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier:
Apache-2.0
+
+import os
+from unittest.mock import patch
+
+import httpx
+import pytest
+from haystack.dataclasses import ByteStream
+from haystack.utils import Secret
+
+from haystack_integrations.components.converters.docling_serve import DoclingServeConverter
+
+SAMPLE_RESPONSE = {  # canned successful response body returned by _mock_response() by default
+    "document": {
+        "md_content": "# Sample Document\n\nThis is the content.",
+    },
+    "status": "success",
+    "processing_time": 1.23,
+    "errors": [],
+    "timings": {},
+}
+
+
+def _mock_response(json_data=None, status_code=200):
+    """Create a mock httpx.Response."""
+    if json_data is None:
+        json_data = SAMPLE_RESPONSE
+    return httpx.Response(
+        status_code=status_code,
+        json=json_data,
+        request=httpx.Request("POST", "http://test"),  # a request must be attached for raise_for_status()
+    )
+
+
+class TestInit:  # constructor defaults and explicit overrides
+    def test_defaults(self):
+        converter = DoclingServeConverter()
+        assert converter.base_url == "http://localhost:5001"
+        assert converter.timeout == 300
+        assert converter.convert_options == {}
+        assert converter.api_key is not None
+
+    def test_custom_params(self):
+        converter = DoclingServeConverter(
+            base_url="http://myserver:8080",
+            api_key=Secret.from_token("test-key"),
+            timeout=60,
+            convert_options={"from_formats": ["pdf"], "do_ocr": True},
+        )
+        assert converter.base_url == "http://myserver:8080"
+        assert converter.api_key.resolve_value() == "test-key"
+        assert converter.timeout == 60
+        assert converter.convert_options == {"from_formats": ["pdf"], "do_ocr": True}
+
+    def test_api_key_none(self):
+        converter = DoclingServeConverter(api_key=None)
+        assert converter.api_key is None
+
+
+class TestSerialization:  # to_dict / from_dict, with and without an API key
+    def test_to_dict(self):
+        converter = DoclingServeConverter(
+            base_url="http://myserver:8080",
+            api_key=Secret.from_env_var("MY_KEY"),
+            timeout=60,
+            convert_options={"from_formats": ["pdf"]},
+        )
+        result = converter.to_dict()
+        assert result["type"] == (
+            "haystack_integrations.components.converters.docling_serve.converter.DoclingServeConverter"
+        )
+        assert result["init_parameters"]["base_url"] == "http://myserver:8080"
+        assert result["init_parameters"]["timeout"] == 60
+        assert result["init_parameters"]["convert_options"] == {"from_formats": ["pdf"]}
+        assert result["init_parameters"]["api_key"] == {"type": "env_var", "env_vars": ["MY_KEY"], "strict": True}
+
+    def test_to_dict_no_api_key(self):
+        converter = DoclingServeConverter(api_key=None)
+        result = converter.to_dict()
+        assert result["init_parameters"]["api_key"] is None
+
+    def test_from_dict(self):
+        data = {
+            "type": "haystack_integrations.components.converters.docling_serve.converter.DoclingServeConverter",
+            "init_parameters": {
+                "base_url": "http://myserver:8080",
+                "api_key": {"type": "env_var", "env_vars": ["MY_KEY"], "strict": True},
+                "timeout": 60,
+                "convert_options": {"do_ocr": False},
+            },
+        }
+        converter = DoclingServeConverter.from_dict(data)
+        assert converter.base_url == "http://myserver:8080"
+        assert converter.timeout == 60
+        assert converter.convert_options == {"do_ocr": False}
+        assert isinstance(converter.api_key, Secret)
+
+    def test_from_dict_no_api_key(self):
+        data = {
+            "type": "haystack_integrations.components.converters.docling_serve.converter.DoclingServeConverter",
+            "init_parameters": {
+                "base_url": "http://localhost:5001",
+                "api_key": None,
+                "timeout": 300,
+                "convert_options": {},
+            },
+        }
+        converter = DoclingServeConverter.from_dict(data)
+        assert converter.api_key is None
+
+    def test_round_trip(self):
+        converter = DoclingServeConverter(
+            base_url="http://myserver:8080",
+            api_key=Secret.from_env_var("MY_KEY"),
+            timeout=120,
+            convert_options={"from_formats": ["pdf", "docx"], "to_formats": ["md"]},
+        )
+        data = converter.to_dict()
+        restored = DoclingServeConverter.from_dict(data)
+        assert restored.base_url == converter.base_url
+        assert restored.timeout == converter.timeout
+        assert restored.convert_options == converter.convert_options
+
+
+class TestRunWithFilePath:  # str and Path sources are posted to /v1/convert/file
+    def test_converts_file(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"%PDF-1.4 fake content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            result = converter.run(sources=[str(test_file)])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.content == "# Sample Document\n\nThis is the content."
+        assert doc.meta["source_file"] == str(test_file)
+        assert doc.meta["conversion_status"] == "success"
+        assert doc.meta["processing_time"] == 1.23
+
+        # Verify the request was made to the file endpoint
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/file" in call_kwargs.args[0]
+
+    def test_converts_path_object(self, tmp_path):
+        test_file = tmp_path / "doc.docx"
+        test_file.write_bytes(b"fake docx")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()):
+            result = converter.run(sources=[test_file])
+
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].meta["source_file"] == str(test_file)
+
+
+class TestRunWithByteStream:  # ByteStream sources: upload endpoint and metadata merging
+    def test_converts_bytestream(self):
+        bs = ByteStream(data=b"fake pdf content", meta={"file_name": "report.pdf"}, mime_type="application/pdf")
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            result = converter.run(sources=[bs])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.content == "# Sample Document\n\nThis is the content."
+        assert doc.meta["source_file"] == "report.pdf"
+        assert doc.meta["file_name"] == "report.pdf"
+
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/file" in call_kwargs.args[0]
+
+    def test_bytestream_meta_merged(self):
+        bs = ByteStream(data=b"content", meta={"file_name": "doc.pdf", "custom_key": "custom_value"})
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()):
+            result = converter.run(sources=[bs], meta={"user_key": "user_value"})
+
+        doc = result["documents"][0]
+        assert doc.meta["custom_key"] == "custom_value"
+        assert doc.meta["user_key"] == "user_value"
+
+
+class TestRunWithURL:  # http:// and https:// strings are posted to /v1/convert/source
+    def test_converts_url(self):
+        converter = DoclingServeConverter(api_key=None)
+        url = "https://example.com/document.pdf"
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            result = converter.run(sources=[url])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.meta["source_file"] == url
+
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/source" in call_kwargs.args[0]
+        body = call_kwargs.kwargs["json"]
+        assert body["sources"] == [{"kind": "http", "url": url}]
+
+    def test_http_url_detected(self):
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            converter.run(sources=["http://example.com/doc.pdf"])
+
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/source" in call_kwargs.args[0]
+
+
+class TestRunWithMeta:  # meta passed as a single dict or as one dict per source
+    def test_single_dict_meta(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()):
+            result = converter.run(sources=[str(test_file)], meta={"category": "report"})
+
+        assert result["documents"][0].meta["category"] == "report"
+
+    def test_list_of_dicts_meta(self, tmp_path):
+        f1 = tmp_path / "a.pdf"
+        f2 = tmp_path / "b.pdf"
+        f1.write_bytes(b"a")
+        f2.write_bytes(b"b")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()):
+            result = converter.run(
+                sources=[str(f1), str(f2)],
+                meta=[{"category": "report"}, {"category": "invoice"}],
+            )
+
+        assert result["documents"][0].meta["category"] == "report"
+        assert result["documents"][1].meta["category"] == "invoice"
+
+    def test_meta_list_length_mismatch_raises(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with pytest.raises(ValueError, match="metadata"):
+            converter.run(sources=[str(test_file)], meta=[{"a": 1}, {"b": 2}])
+
+
+class TestRunWithConvertOptions:  # convert_options go out as form data (file) or as JSON "options" (URL)
+    def test_options_passed_to_file_endpoint(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(
+            api_key=None,
+            convert_options={"from_formats": ["pdf"], "do_ocr": True},
+        )
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            converter.run(sources=[str(test_file)])
+
+        call_kwargs = mock_post.call_args
+        assert call_kwargs.kwargs["data"] == {"from_formats": ["pdf"], "do_ocr": True}
+
+    def test_options_passed_to_source_endpoint(self):
+        converter = DoclingServeConverter(
+            api_key=None,
+            convert_options={"to_formats": ["json"], "do_ocr": False},
+        )
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            converter.run(sources=["https://example.com/doc.pdf"])
+
+        call_kwargs = mock_post.call_args
+        body = call_kwargs.kwargs["json"]
+        assert body["options"] == {"to_formats": ["json"], "do_ocr": False}
+
+
+class TestRunWithAuth:  # presence or absence of the X-Api-Key request header
+    def test_api_key_sent_in_headers(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=Secret.from_token("my-secret-key"))
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            converter.run(sources=[str(test_file)])
+
+        call_kwargs = mock_post.call_args
+        headers = call_kwargs.kwargs["headers"]
+        assert headers["X-Api-Key"] == "my-secret-key"
+
+    def test_no_api_key_header_when_none(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response()) as mock_post:
+            converter.run(sources=[str(test_file)])
+
+        call_kwargs = mock_post.call_args
+        headers = call_kwargs.kwargs["headers"]
+        assert "X-Api-Key" not in headers
+
+
+class TestRunErrorHandling:  # failing sources yield no Document and do not raise
+    def test_http_error_logged_not_raised(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        error_response = httpx.Response(
+            status_code=500,
+            text="Internal Server Error",
+            request=httpx.Request("POST", "http://test"),
+        )
+
+        with patch("httpx.Client.post", return_value=error_response):
+            result = converter.run(sources=[str(test_file)])
+
+        # Should return empty list, not raise
+        assert result["documents"] == []
+
+    def test_connection_error_logged_not_raised(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", side_effect=httpx.ConnectError("Connection refused")):
+            result = converter.run(sources=[str(test_file)])
+
+        assert result["documents"] == []
+
+    def test_partial_failure(self, tmp_path):
+        """When one source fails, others should still be converted."""
+        good_file = tmp_path / "good.pdf"
+        bad_file = tmp_path / "bad.pdf"
+        good_file.write_bytes(b"good")
+        bad_file.write_bytes(b"bad")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        call_count = 0
+
+        def mock_post(*_args, **_kwargs):  # first call fails with HTTP 500, later calls succeed
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                return httpx.Response(
+                    status_code=500,
+                    text="Error",
+                    request=httpx.Request("POST", "http://test"),
+                )
+            return _mock_response()
+
+        with patch("httpx.Client.post", side_effect=mock_post):
+            result = converter.run(sources=[str(bad_file), str(good_file)])
+
+        assert len(result["documents"]) == 1
+        assert result["documents"][0].meta["source_file"] == str(good_file)
+
+
+class TestRunWithTextContentFallback:  # content falls back from md_content to text_content to ""
+    def test_falls_back_to_text_content(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        response_data = {
+            "document": {
+                "md_content": None,
+                "text_content": "Plain text fallback.",
+            },
+            "status": "success",
+            "processing_time": 0.5,
+        }
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response(response_data)):
+            result = converter.run(sources=[str(test_file)])
+
+        assert result["documents"][0].content == "Plain text fallback."
+
+    def test_empty_string_when_no_content(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        response_data = {
+            "document": {},
+            "status": "success",
+            "processing_time": 0.1,
+        }
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.Client.post", return_value=_mock_response(response_data)):
+            result = converter.run(sources=[str(test_file)])
+
+        assert result["documents"][0].content == ""
+
+
+class TestRunAsync:  # run_async with httpx.AsyncClient.post patched
+    @pytest.mark.asyncio
+    async def test_converts_file(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.AsyncClient.post", return_value=_mock_response()) as mock_post:
+            result = await converter.run_async(sources=[str(test_file)])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.content == "# Sample Document\n\nThis is the content."
+        assert doc.meta["conversion_status"] == "success"
+
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/file" in call_kwargs.args[0]
+
+    @pytest.mark.asyncio
+    async def test_converts_url(self):
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.AsyncClient.post", return_value=_mock_response()) as mock_post:
+            result = await converter.run_async(sources=["https://example.com/doc.pdf"])
+
+        assert len(result["documents"]) == 1
+
+        call_kwargs = mock_post.call_args
+        assert "/v1/convert/source" in call_kwargs.args[0]
+
+    @pytest.mark.asyncio
+    async def test_error_handling(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        with patch("httpx.AsyncClient.post", side_effect=httpx.ConnectError("Connection refused")):
+            result = await converter.run_async(sources=[str(test_file)])
+
+        assert result["documents"] == []
+
+
+class TestMixedSources:  # a local file and a URL in the same run() call
+    def test_file_and_url_together(self, tmp_path):
+        test_file = tmp_path / "test.pdf"
+        test_file.write_bytes(b"content")
+
+        converter = DoclingServeConverter(api_key=None)
+
+        call_count = 0
+
+        def mock_post(_url, **_kwargs):
+            nonlocal call_count
+            call_count += 1
+            return _mock_response()
+
+        with patch("httpx.Client.post", side_effect=mock_post):
+            result = converter.run(
+                sources=[str(test_file), "https://example.com/doc.pdf"],
+                meta=[{"type": "local"}, {"type": "remote"}],
+            )
+
+        assert len(result["documents"]) == 2
+        assert result["documents"][0].meta["type"] == "local"
+        assert result["documents"][1].meta["type"] == "remote"
+
+
+@pytest.mark.integration
+class TestIntegration:  # each test is skipped unless DOCLING_SERVE_URL is set
+    @pytest.mark.skipif(
+        not os.environ.get("DOCLING_SERVE_URL"),
+        reason="Set DOCLING_SERVE_URL to run integration tests (e.g. http://localhost:5001)",
+    )
+    def test_convert_file(self, tmp_path):
+        """Convert a simple text file against a running docling-serve instance."""
+        test_file = tmp_path / "hello.md"
+        test_file.write_text("# Hello\n\nThis is a test document.")
+
+        url = os.environ["DOCLING_SERVE_URL"]
+        converter = DoclingServeConverter(base_url=url, api_key=None)
+        result = converter.run(sources=[str(test_file)])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.content
+        assert doc.meta["conversion_status"] == "success"
+        assert doc.meta["processing_time"] > 0
+
+    @pytest.mark.skipif(
+        not os.environ.get("DOCLING_SERVE_URL"),
+        reason="Set DOCLING_SERVE_URL to run integration tests (e.g. http://localhost:5001)",
+    )
+    def test_convert_url(self):
+        """Convert a URL source against a running docling-serve instance.
+
+        This tests the /v1/convert/source endpoint with the v1 sources format
+        (discriminated union with "kind": "http").
+        """
+        url = os.environ["DOCLING_SERVE_URL"]
+        converter = DoclingServeConverter(base_url=url, api_key=None)
+        result = converter.run(sources=["https://raw.githubusercontent.com/deepset-ai/haystack/main/README.md"])
+
+        assert len(result["documents"]) == 1
+        doc = result["documents"][0]
+        assert doc.content
+        assert doc.meta["conversion_status"] == "success"
+        assert doc.meta["processing_time"] > 0