Skip to content

Commit 3599976

Browse files
feat(docling-serve): add DoclingServeConverter integration
Adds a new `docling-serve-haystack` integration with a `DoclingServeConverter` component that converts documents via a remote DoclingServe HTTP server instead of loading heavy ML dependencies locally (no PyTorch required).

- Supports URLs, local file paths, and ByteStream sources
- Export formats: Markdown (default), plain text, JSON
- Both sync `run()` and async `arun()` methods
- Configurable conversion options, timeout, and optional API key auth
- Full unit test suite (mocked httpx) + integration test markers
- CI workflow, labeler, coverage comment, and root README table entry

Closes #2960
1 parent f6f7d8e commit 3599976

9 files changed

Lines changed: 930 additions & 0 deletions

File tree

.github/labeler.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ integration:docling:
6464
- any-glob-to-any-file: "integrations/docling/**/*"
6565
- any-glob-to-any-file: ".github/workflows/docling.yml"
6666

# Apply the "integration:docling-serve" label when a change touches the
# docling_serve integration sources or its dedicated CI workflow.
integration:docling-serve:
  - changed-files:
      - any-glob-to-any-file: "integrations/docling_serve/**/*"
      - any-glob-to-any-file: ".github/workflows/docling_serve.yml"

6772
integration:elasticsearch:
6873
- changed-files:
6974
- any-glob-to-any-file: "integrations/elasticsearch/**/*"

.github/workflows/CI_coverage_comment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ on:
1515
- "Test / cohere"
1616
- "Test / cometapi"
1717
- "Test / deepeval"
18+
- "Test / docling_serve"
1819
- "Test / dspy"
1920
- "Test / elasticsearch"
2021
- "Test / faiss"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
# Unit-test workflow for the docling_serve integration.
# The workflow name is also listed in CI_coverage_comment.yml, so keep the
# two in sync when renaming.
name: Test / docling_serve

# Triggers:
#   - nightly at 00:00 (GitHub evaluates cron schedules in UTC);
#   - pull requests that touch the integration (Markdown files directly under
#     integrations/docling_serve are excluded) or this workflow file.
on:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - "integrations/docling_serve/**"
      - "!integrations/docling_serve/*.md"
      - ".github/workflows/docling_serve.yml"

# Every `run:` step executes from the integration's own directory.
defaults:
  run:
    working-directory: integrations/docling_serve

# github.head_ref is only populated for pull_request events. Without a
# fallback, every scheduled run would land in the same "docling_serve-" group
# and could cancel a still-running nightly; github.run_id keeps those runs
# independent while PR pushes still cancel their superseded runs.
concurrency:
  group: docling_serve-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  # Lint, type-check and unit-test the integration for every matrix entry.
  run:
    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python-version:
          - "3.10"
          - "3.14"

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Hatch
        run: pip install hatch

      # Formatting and type checks run once, on the Python 3.10 / Linux entry.
      - name: Lint
        if: matrix.python-version == '3.10' && runner.os == 'Linux'
        run: hatch run fmt-check && hatch run test:types

      - name: Run unit tests
        run: hatch run test:unit-cov-retry

      # Coverage is processed once, on the Python 3.10 / Linux entry.
      - name: Store unit tests coverage
        if: matrix.python-version == '3.10' && runner.os == 'Linux'
        id: coverage_comment
        uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683  # v3.40
        with:
          GITHUB_TOKEN: ${{ github.token }}
          COVERAGE_PATH: integrations/docling_serve
          SUBPROJECT_ID: docling_serve
          MINIMUM_GREEN: 90
          MINIMUM_ORANGE: 60

      # Only on pull requests, and only when the previous step reported that
      # it wrote a comment file.
      - name: Upload coverage comment to be posted
        if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: coverage-comment-docling_serve
          path: python-coverage-comment-action-docling_serve.txt

      # Re-run the unit tests against the lowest allowed versions of the
      # direct dependencies declared in pyproject.toml.
      - name: Run unit tests with lowest direct dependencies
        run: |
          hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
          hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
          hatch run test:unit

      # Scheduled runs only: test against Haystack installed from its main branch.
      - name: Nightly - run unit tests with Haystack main branch
        if: github.event_name == 'schedule'
        run: |
          hatch env prune
          hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
          hatch run test:unit

  # Send a Slack notification when a scheduled run of the `run` job fails.
  notify-slack-on-failure:
    needs: run
    if: failure() && github.event_name == 'schedule'
    runs-on: ubuntu-latest
    steps:
      - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b  # v1
        with:
          slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
3838
| [cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi-combined/htmlcov/index.html) |
3939
| [deepeval-haystack](integrations/deepeval/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/deepeval-haystack.svg)](https://pypi.org/project/deepeval-haystack) | [![Test / deepeval](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/deepeval.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-deepeval-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-deepeval-combined/htmlcov/index.html) |
4040
| [docling-haystack](integrations/docling/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-haystack.svg)](https://pypi.org/project/docling-haystack) | [![Test / docling](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling-combined/htmlcov/index.html) |
41+
| [docling-serve-haystack](integrations/docling_serve/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack) | [![Test / docling_serve](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/docling_serve.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-docling_serve/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-docling_serve/htmlcov/index.html) | N/A |
4142
| [dspy-haystack](integrations/dspy/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/dspy-haystack.svg)](https://pypi.org/project/dspy-haystack) | [![Test / dspy](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/dspy.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-dspy-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-dspy-combined/htmlcov/index.html) |
4243
| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-elasticsearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-elasticsearch-combined/htmlcov/index.html) |
4344
| [faiss-haystack](integrations/faiss/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/faiss-haystack.svg)](https://pypi.org/project/faiss-haystack) | [![Test / faiss](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/faiss.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-faiss-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-faiss-combined/htmlcov/index.html) |
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# docling-serve-haystack
2+
3+
[![PyPI - Version](https://img.shields.io/pypi/v/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack)
4+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-serve-haystack.svg)](https://pypi.org/project/docling-serve-haystack)
5+
6+
- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/docling_serve/CHANGELOG.md)
7+
8+
---
9+
10+
Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
# Built with hatchling; hatch-vcs is required for the VCS-based version
# configured under [tool.hatch.version].
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling", "hatch-vcs"]

[project]
name = "docling-serve-haystack"
description = "Haystack converter component for DoclingServe — document conversion via HTTP"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.10"
# The version is not hard-coded here; it is resolved at build time.
dynamic = ["version"]
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
keywords = [
  "Haystack",
  "Docling",
  "DoclingServe",
  "document conversion",
  "PDF",
  "OCR",
]
classifiers = [
  "License :: OSI Approved :: Apache Software License",
  "Development Status :: 4 - Beta",
  "Programming Language :: Python",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12",
  "Programming Language :: Python :: 3.13",
  "Programming Language :: Python :: 3.14",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: Implementation :: PyPy",
]
# Runtime dependencies: Haystack itself plus the HTTP client.
dependencies = ["haystack-ai>=2.9.0", "httpx>=0.27.0"]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling_serve"

[tool.hatch.build.targets.wheel]
packages = ["src/haystack_integrations"]

# The package version comes from git tags shaped like
# "integrations/docling_serve-v<version>".
[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/docling_serve-v(?P<version>.*)'

[tool.hatch.version.raw-options]
# Path from this package directory to the repository root.
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/docling_serve-v[0-9]*"'

# Default environment: documentation generation and formatting/linting.
[tool.hatch.envs.default]
installer = "uv"
dependencies = [
  "haystack-pydoc-tools",
  "ruff",
]

[tool.hatch.envs.default.scripts]
docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
# `fmt` rewrites files in place; `fmt-check` only reports problems.
fmt = "ruff check --fix {args}; ruff format {args}"
fmt-check = "ruff check {args} && ruff format --check {args}"

# Test environment: pytest with its plugins, plus mypy for type checking.
[tool.hatch.envs.test]
dependencies = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-rerunfailures", "mypy", "pip"]

[tool.hatch.envs.test.scripts]
# Tests are split by the "integration" pytest marker.
unit = 'pytest -m "not integration" {args:tests}'
integration = 'pytest -m "integration" {args:tests}'
all = 'pytest {args:tests}'
# Unit tests with coverage; failed tests are re-run up to 3 times, 30 s apart.
unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
types = "mypy -p haystack_integrations.components.converters.docling_serve {args}"

# Type-checking settings used by the `test:types` script.
[tool.mypy]
# Install any missing stub packages automatically, without prompting.
install_types = true
non_interactive = true
# Also check the bodies of functions that have no annotations.
check_untyped_defs = true
# Reject functions that are only partially annotated.
disallow_incomplete_defs = true

# NOTE(review): no per-module `ignore_missing_imports` override is kept for
# httpx. It is a declared runtime dependency and distributes inline type
# information, so mypy should resolve it; silencing the import would only hide
# a broken environment from the type check.

[tool.ruff]
line-length = 120

[tool.ruff.lint]
# Enabled rule families, grouped by prefix.
select = [
  "A", "ANN", "ARG",
  "B",
  "C",
  "D102", "D103", "D205", "D209", "D213", "D417", "D419",
  "DTZ",
  "E", "EM",
  "F",
  "I", "ICN", "ISC",
  "N",
  "PLC", "PLE", "PLR", "PLW",
  "Q",
  "RUF",
  "S",
  "T", "TID",
  "UP",
  "W",
  "YTT",
]
ignore = [
  # flake8-bugbear: empty non-abstract methods in ABCs, calls in argument defaults
  "B027", "B008",
  # flake8-bandit: possible hard-coded password checks
  "S105", "S106", "S107",
  # complexity / size limits
  "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
  # allow `Any` in annotations
  "ANN401",
]

[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "parents"

[tool.ruff.lint.per-file-ignores]
# Relaxed rules for test code.
"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]

# Coverage measurement: branch coverage over the integration package.
[tool.coverage.run]
source = ["haystack_integrations"]
branch = true
parallel = false
relative_files = true

[tool.coverage.report]
show_missing = true
omit = ["*/tests/*", "*/__init__.py"]
exclude_lines = [
  "no cov",
  "if __name__ == .__main__.:",
  "if TYPE_CHECKING:",
]

[tool.pytest.ini_options]
# Unregistered markers are an error, so every marker must be declared below.
addopts = "--strict-markers"
markers = ["integration: integration tests"]
log_cli = true
# pytest-asyncio settings.
asyncio_mode = "auto"
asyncio_default_fixture_loop_scope = "function"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from haystack_integrations.components.converters.docling_serve.converter import DoclingServeConverter, ExportType
6+
7+
# Public names of this package: the two symbols imported above from the
# `converter` module.
__all__ = ["DoclingServeConverter", "ExportType"]

0 commit comments

Comments
 (0)