Skip to content

Commit 7873a6b

Browse files
Merge branch 'main' into feat/elasticsearch-inference-hybrid-retriever
2 parents 196a0a6 + 498fafb commit 7873a6b

22 files changed

Lines changed: 2015 additions & 0 deletions

.github/labeler.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ integration:azure-doc-intelligence:
3939
- any-glob-to-any-file: "integrations/azure_doc_intelligence/**/*"
4040
- any-glob-to-any-file: ".github/workflows/azure_doc_intelligence.yml"
4141

42+
# Label PRs that touch the chonkie integration sources or its CI workflow.
integration:chonkie:
  - changed-files:
      - any-glob-to-any-file: 'integrations/chonkie/**/*'
      - any-glob-to-any-file: '.github/workflows/chonkie.yml'
46+
4247
integration:chroma:
4348
- changed-files:
4449
- any-glob-to-any-file: "integrations/chroma/**/*"
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Leaves a sticky comment on fork PRs that touch integrations whose workflow
# references secrets, and removes that comment when it no longer applies.
name: Core / Notify maintainers on fork PRs not running integration tests

on:
  pull_request_target:
    types:
      - opened
      - reopened
      - synchronize

permissions:
  pull-requests: write  # needed to create, update and delete the sticky comment
  contents: read

env:
  # Whitespace-separated secret names that are not test credentials; the
  # detection script ignores them when deciding whether API keys are needed.
  NON_TEST_SECRETS: 'SLACK_WEBHOOK_URL_NOTIFICATIONS'
13+
14+
jobs:
  notify:
    runs-on: ubuntu-slim
    # Only PRs whose head repository is a fork are of interest.
    if: github.event.pull_request.head.repo.fork == true
    steps:
      # Check out the base commit (not the PR head): the script below reads
      # the workflow files from this checkout.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          ref: ${{ github.event.pull_request.base.sha }}

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
          python-version: '3.13'

      # Writes the step output `list`: a Markdown bullet list of affected
      # integrations, or an empty string when there are none.
      - id: affected
        name: Detect integrations requiring API keys for integration tests
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GH_TOKEN: ${{ github.token }}
        shell: python
        run: |
34+
import os
35+
import re
36+
import subprocess
37+
from pathlib import Path
38+
39+
# Directory, relative to the current working directory, that holds one
# `<integration>.yml` workflow per integration.
WORKFLOWS_DIR = Path(".github/workflows")

# Secret names that are not test credentials (whitespace-separated in the
# NON_TEST_SECRETS environment variable). Stored upper-cased so comparisons
# are case-insensitive. A missing variable means "no exclusions", which can
# only cause extra notifications, never a missed one.
NON_TEST_SECRETS = {
    name.upper() for name in os.environ.get("NON_TEST_SECRETS", "").split()
}

# Matches both `secrets.NAME` and `secrets['NAME']`, in any letter case.
# Over-matching is the safe direction here: it can only add a notification.
SECRET_REF = re.compile(r"\bsecrets(?:\.|\[\s*')([A-Z0-9_]+)", re.IGNORECASE)


def needs_api_key(integration: str) -> bool:
    """Tell whether an integration's workflow references any test secret.

    Args:
        integration: Integration name; its workflow is looked up as
            ``WORKFLOWS_DIR / f"{integration}.yml"``.

    Returns:
        ``True`` when the workflow file references at least one secret that
        is not listed in ``NON_TEST_SECRETS``; ``False`` when it references
        none, or when no such workflow file exists.
    """
    wf = WORKFLOWS_DIR / f"{integration}.yml"
    if not wf.is_file():
        return False
    text = wf.read_text(encoding="utf-8")
    # Normalise to upper case so `secrets.foo` and `secrets.FOO` are one name.
    referenced = {name.upper() for name in SECRET_REF.findall(text)}
    return bool(referenced - NON_TEST_SECRETS)
50+
51+
52+
# 1. PR file list
# `gh pr view --json files` only returns the first page of changed files
# (100 entries), so integrations touched further down a large PR would be
# missed. The REST endpoint is paginated instead: `--paginate` follows every
# page and `--jq` prints one path per line for each page.
# GITHUB_REPOSITORY is the `owner/name` variable set by the Actions runner.
repository = os.environ["GITHUB_REPOSITORY"]
pr_number = os.environ["PR_NUMBER"]
raw_paths = subprocess.check_output(
    [
        "gh", "api", "--paginate",
        f"repos/{repository}/pulls/{pr_number}/files?per_page=100",
        "--jq", ".[].filename",
    ],
    text=True,
)
paths = [line for line in raw_paths.splitlines() if line.strip()]

# 2. Integrations touched by this PR
touched = set()
for p in paths:
    parts = Path(p).parts
    if len(parts) >= 2 and parts[0] == "integrations":
        # integrations/<name>/... -> <name>
        touched.add(parts[1])
    elif (
        len(parts) == 3
        and parts[0] == ".github"
        and parts[1] == "workflows"
        and parts[2].endswith(".yml")
        and not parts[2].startswith("CI_")
    ):
        # .github/workflows/<name>.yml -> <name>; CI_* workflows are skipped
        touched.add(parts[2].removesuffix(".yml"))

# 3. Of those, which need API keys
affected = sorted(t for t in touched if needs_api_key(t))
print(f"touched = {sorted(touched)}")
print(f"affected = {affected}")

# Publish the result as the multi-line step output `list`, one Markdown
# bullet per affected integration (empty when nothing is affected).
list_value = "\n".join(f"- {name}" for name in affected)
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
    f.write(f"list<<EOF\n{list_value}\nEOF\n")
84+
85+
# Runs only when the detection step produced a non-empty list.
- name: Post or update sticky comment
  uses: marocchino/sticky-pull-request-comment@0ea0beb66eb9baf113663a64ec522f60e49231c0  # v3.0.4
  if: steps.affected.outputs.list != ''
  with:
    number: ${{ github.event.pull_request.number }}
    # Same header as the removal step, so both act on the same comment.
    header: fork-pr-api-keys
    message: |
      **Heads-up for maintainers**

      This PR is from a fork and touches integrations whose integration tests require API keys.
      Those tests are **skipped** in CI because fork PRs don't have access to repo secrets for security reasons.

      Affected integrations:
      ${{ steps.affected.outputs.list }}

      Please run the integration tests locally (`hatch run test:integration` inside each folder) before approving.
101+
102+
# Counterpart of the posting step: runs when the detection step's list is
# empty and deletes the comment carrying the same header.
- name: Remove stale comment
  uses: marocchino/sticky-pull-request-comment@0ea0beb66eb9baf113663a64ec522f60e49231c0  # v3.0.4
  if: steps.affected.outputs.list == ''
  with:
    number: ${{ github.event.pull_request.number }}
    header: fork-pr-api-keys
    delete: true

.github/workflows/CI_coverage_comment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ on:
1111
- "Test / astra"
1212
- "Test / azure_ai_search"
1313
- "Test / azure_doc_intelligence"
14+
- "Test / chonkie"
1415
- "Test / chroma"
1516
- "Test / cohere"
1617
- "Test / cometapi"

.github/workflows/chonkie.yml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / chonkie

on:
  # PRs and pushes to main run only when the integration (excluding its
  # top-level Markdown files) or this workflow file changes.
  pull_request:
    paths:
      - 'integrations/chonkie/**'
      - '!integrations/chonkie/*.md'
      - '.github/workflows/chonkie.yml'
  push:
    branches:
      - main
    paths:
      - 'integrations/chonkie/**'
      - '!integrations/chonkie/*.md'
      - '.github/workflows/chonkie.yml'
  # Nightly run at 00:00.
  schedule:
    - cron: '0 0 * * *'

# Every `run` step executes inside the integration folder unless overridden.
defaults:
  run:
    working-directory: integrations/chonkie

# One active run per PR branch (or per commit when there is no head ref).
concurrency:
  cancel-in-progress: true
  group: chonkie-${{ github.head_ref || github.sha }}

env:
  FORCE_COLOR: '1'
  PYTHONUNBUFFERED: '1'
  # JSON arrays consumed by the compute-test-matrix job.
  TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
  TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
34+
35+
jobs:
  # Chooses the OS / Python matrix: a single cell on push events, the full
  # matrix from the workflow-level env otherwise.
  compute-test-matrix:
    runs-on: ubuntu-slim
    outputs:
      os: ${{ steps.set.outputs.os }}
      python-version: ${{ steps.set.outputs.python-version }}
    # This job has no checkout step, so it overrides the workflow-level
    # default working directory with the workspace root.
    defaults:
      run:
        working-directory: .
    steps:
      - id: set
        run: |
          {
            echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}'
            echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}'
          } >> "$GITHUB_OUTPUT"

  run:
    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
    needs: compute-test-matrix
    runs-on: ${{ matrix.os }}
    permissions:
      pull-requests: write
      contents: write
    strategy:
      fail-fast: false
      matrix:
        os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
        python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}

    steps:
      # Runs before checkout, hence the explicit workspace-root directory.
      - name: Support longpaths
        if: matrix.os == 'windows-latest'
        run: git config --system core.longpaths true
        working-directory: .

      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6.2.0
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Hatch
        run: pip install --upgrade hatch

      # Formatting and type checks run in a single matrix cell only.
      - name: Lint
        if: matrix.python-version == '3.10' && runner.os == 'Linux'
        run: hatch run fmt-check && hatch run test:types

      - name: Run unit tests
        run: hatch run test:unit-cov-retry

      # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch.
      - id: coverage_comment
        name: Store unit tests coverage
        if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
        uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683  # v3.40
        with:
          GITHUB_TOKEN: ${{ github.token }}
          SUBPROJECT_ID: chonkie
          COVERAGE_PATH: integrations/chonkie
          MINIMUM_GREEN: 90
          MINIMUM_ORANGE: 60

      # Only when the previous step reports that it wrote a comment file.
      - name: Upload coverage comment to be posted
        if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f  # v7.0.0
        with:
          name: coverage-comment-chonkie
          path: python-coverage-comment-action-chonkie.txt

      - name: Run integration tests
        run: hatch run test:integration-cov-append-retry

      # Push events only; stored under a separate "-combined" subproject id.
      - name: Store combined coverage
        if: github.event_name == 'push'
        uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683  # v3.40
        with:
          GITHUB_TOKEN: ${{ github.token }}
          SUBPROJECT_ID: chonkie-combined
          COVERAGE_PATH: integrations/chonkie
          MINIMUM_GREEN: 90
          MINIMUM_ORANGE: 60

      # Re-runs the unit tests against the lowest direct dependency versions.
      - name: Run unit tests with lowest direct dependencies
        if: github.event_name != 'push'
        run: |
          hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
          hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
          hatch run test:unit

      # Scheduled runs only: unit tests against Haystack's main branch.
      - name: Nightly - run unit tests with Haystack main branch
        if: github.event_name == 'schedule'
        run: |
          hatch env prune
          hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
          hatch run test:unit

  # Sends a Slack notification when a scheduled run of the `run` job fails.
  notify-slack-on-failure:
    runs-on: ubuntu-slim
    needs: run
    if: failure() && github.event_name == 'schedule'
    steps:
      - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b  # v1
        with:
          slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
3333
| [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra-combined/htmlcov/index.html) |
3434
| [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_ai_search/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_ai_search/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_ai_search-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_ai_search-combined/htmlcov/index.html) |
3535
| [azure-doc-intelligence-haystack](integrations/azure_doc_intelligence/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/azure-doc-intelligence-haystack.svg)](https://pypi.org/project/azure-doc-intelligence-haystack) | [![Test / azure_doc_intelligence](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_doc_intelligence/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_doc_intelligence/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_doc_intelligence-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_doc_intelligence-combined/htmlcov/index.html) |
36+
| [chonkie-haystack](integrations/chonkie/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/chonkie-haystack.svg)](https://pypi.org/project/chonkie-haystack) | [![Test / chonkie](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chonkie.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chonkie.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chonkie/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chonkie/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chonkie-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chonkie-combined/htmlcov/index.html) |
3637
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chroma/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chroma/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chroma-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chroma-combined/htmlcov/index.html) |
3738
| [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cohere/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cohere/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cohere-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cohere-combined/htmlcov/index.html) |
3839
| [cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi-combined/htmlcov/index.html) |

integrations/chonkie/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Changelog
2+
3+
## [integrations/chonkie-v1.0.0] - 2026-04-27
4+
5+
### 🚀 Features
6+
7+
- Implement chonkie integration with four chunkers (#3223)
8+
9+
<!-- generated by git-cliff -->

0 commit comments

Comments
 (0)