Skip to content

Commit 702983d

Browse files
authored
feat: implement chonkie integration with four chunkers (#3223)
1 parent bc3284f commit 702983d

19 files changed

Lines changed: 1847 additions & 0 deletions

.github/labeler.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ integration:azure-doc-intelligence:
3939
- any-glob-to-any-file: "integrations/azure_doc_intelligence/**/*"
4040
- any-glob-to-any-file: ".github/workflows/azure_doc_intelligence.yml"
4141

42+
# Labeler rule for the chonkie integration: the key below is the label name, and the
# two globs cover the integration's sources and its dedicated CI workflow file.
# NOTE(review): matching semantics are inferred from the key name "any-glob-to-any-file"
# (any listed glob matching any changed file) — confirm against the labeler action in use.
integration:chonkie:
43+
- changed-files:
44+
- any-glob-to-any-file: "integrations/chonkie/**/*"
45+
- any-glob-to-any-file: ".github/workflows/chonkie.yml"
46+
4247
integration:chroma:
4348
- changed-files:
4449
- any-glob-to-any-file: "integrations/chroma/**/*"

.github/workflows/CI_coverage_comment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ on:
1111
- "Test / astra"
1212
- "Test / azure_ai_search"
1313
- "Test / azure_doc_intelligence"
14+
- "Test / chonkie"
1415
- "Test / chroma"
1516
- "Test / cohere"
1617
- "Test / cometapi"

.github/workflows/chonkie.yml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# This workflow comes from https://github.com/ofek/hatch-mypyc
2+
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
3+
# NOTE(review): the exact string "Test / chonkie" also appears in the workflow list of
# .github/workflows/CI_coverage_comment.yml and in the README badge; presumably a rename
# here requires updating both — confirm.
name: Test / chonkie
4+
5+
# Triggers: a daily schedule, pull requests, and pushes to main.
on:
6+
schedule:
7+
# "0 0 * * *" fires once a day at 00:00 (GitHub evaluates cron schedules in UTC).
- cron: "0 0 * * *"
8+
pull_request:
9+
# Path filter: only changes under integrations/chonkie/ or to this workflow file
# trigger a run. The "!" pattern excludes Markdown files located directly in
# integrations/chonkie/ ("*" does not cross "/" in GitHub path filters).
paths:
10+
- "integrations/chonkie/**"
11+
- "!integrations/chonkie/*.md"
12+
- ".github/workflows/chonkie.yml"
13+
push:
14+
branches:
15+
- main
16+
# Same path filter as for pull_request above.
paths:
17+
- "integrations/chonkie/**"
18+
- "!integrations/chonkie/*.md"
19+
- ".github/workflows/chonkie.yml"
20+
21+
# Every "run" step executes from the integration's directory unless a job or a step
# sets its own working-directory (the compute-test-matrix job and the "Support
# longpaths" step both override it with ".").
defaults:
22+
run:
23+
working-directory: integrations/chonkie
24+
25+
# Runs are grouped by github.head_ref; when that value is empty the expression falls
# back to github.sha (head_ref is only populated for pull request events). With
# cancel-in-progress, a newer run in the same group cancels the one still running.
concurrency:
26+
group: chonkie-${{ github.head_ref || github.sha }}
27+
cancel-in-progress: true
28+
29+
# Workflow-wide environment variables.
env:
30+
PYTHONUNBUFFERED: "1"
31+
FORCE_COLOR: "1"
# The two TEST_MATRIX_* values are JSON arrays stored as strings. The
# compute-test-matrix job forwards them as outputs for non-push events, and the
# "run" job parses them with fromJSON to build its matrix.
32+
TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
33+
TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
34+
35+
jobs:
36+
# Chooses the OS / Python matrix for the "run" job: push events get the single
# combination ubuntu-latest + 3.10, every other event gets the full lists from
# env.TEST_MATRIX_OS and env.TEST_MATRIX_PYTHON.
compute-test-matrix:
37+
runs-on: ubuntu-slim
38+
# Overrides the workflow-level working-directory. This job has no checkout step;
# presumably integrations/chonkie would not exist here — confirm.
defaults:
39+
run:
40+
working-directory: .
41+
# Both outputs are JSON array strings written by the "set" step below.
outputs:
42+
os: ${{ steps.set.outputs.os }}
43+
python-version: ${{ steps.set.outputs.python-version }}
44+
steps:
45+
- id: set
46+
run: |
47+
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
48+
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
49+
50+
# Test job: one run per (os, python-version) pair produced by compute-test-matrix.
run:
51+
# Display name maps the runner label prefix to "macOS", "Windows" or "Linux".
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
52+
needs: compute-test-matrix
53+
# Write scopes for the job token.
# NOTE(review): presumably required by the coverage comment action used in the steps
# (PR comments and the coverage data branch) — confirm.
permissions:
54+
contents: write
55+
pull-requests: write
56+
runs-on: ${{ matrix.os }}
57+
strategy:
58+
# A failing combination does not cancel the remaining matrix jobs.
fail-fast: false
59+
matrix:
60+
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
61+
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
62+
63+
steps:
64+
# Windows only: enables Git long-path support system-wide. The step runs before the
# checkout and from the workspace root; presumably the default working directory
# does not exist yet at this point — confirm.
- name: Support longpaths
65+
if: matrix.os == 'windows-latest'
66+
working-directory: .
67+
run: git config --system core.longpaths true
68+
69+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
70+
71+
- name: Set up Python ${{ matrix.python-version }}
72+
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
73+
with:
74+
python-version: ${{ matrix.python-version }}
75+
76+
- name: Install Hatch
77+
run: pip install --upgrade hatch
78+
# Format check and type check run on a single combination only: Python 3.10 on Linux.
- name: Lint
79+
if: matrix.python-version == '3.10' && runner.os == 'Linux'
80+
run: hatch run fmt-check && hatch run test:types
81+
82+
# NOTE(review): "test:unit-cov-retry" is a hatch script defined outside this file;
# judging by its name it runs the unit tests with coverage and retries — confirm.
- name: Run unit tests
83+
run: hatch run test:unit-cov-retry
84+
85+
# On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch.
86+
# Runs on the Python 3.10 / Linux combination only and is skipped for scheduled runs.
- name: Store unit tests coverage
87+
id: coverage_comment
88+
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
89+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
90+
with:
91+
GITHUB_TOKEN: ${{ github.token }}
92+
COVERAGE_PATH: integrations/chonkie
93+
SUBPROJECT_ID: chonkie
94+
MINIMUM_GREEN: 90
95+
MINIMUM_ORANGE: 60
96+
97+
# Pull requests only, and only when the previous step reported
# COMMENT_FILE_WRITTEN == 'true'.
# NOTE(review): the artifact is presumably picked up by CI_coverage_comment.yml, which
# lists this workflow by name — confirm.
- name: Upload coverage comment to be posted
98+
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
99+
uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
100+
with:
101+
name: coverage-comment-chonkie
102+
path: python-coverage-comment-action-chonkie.txt
103+
104+
# No "if" on this step: integration tests run for every matrix combination and event.
# NOTE(review): "test:integration-cov-append-retry" is a hatch script defined outside
# this file; judging by its name it appends to the unit-test coverage data — confirm.
- name: Run integration tests
105+
run: hatch run test:integration-cov-append-retry
106+
107+
# Push events only. Uses SUBPROJECT_ID "chonkie-combined", distinct from the "chonkie"
# id used by the unit-test coverage step, so the two results are stored separately.
- name: Store combined coverage
108+
if: github.event_name == 'push'
109+
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
110+
with:
111+
GITHUB_TOKEN: ${{ github.token }}
112+
COVERAGE_PATH: integrations/chonkie
113+
SUBPROJECT_ID: chonkie-combined
114+
MINIMUM_GREEN: 90
115+
MINIMUM_ORANGE: 60
116+
117+
# Skipped on push. Compiles pyproject.toml with uv's "lowest-direct" resolution into
# requirements_lowest_direct.txt, installs that file into the hatch "test"
# environment, then reruns the unit tests against those versions.
- name: Run unit tests with lowest direct dependencies
118+
if: github.event_name != 'push'
119+
run: |
120+
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
121+
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
122+
hatch run test:unit
123+
124+
# Scheduled runs only. Prunes the hatch environments, installs Haystack from the main
# branch of its Git repository into the "test" environment, then reruns the unit tests.
- name: Nightly - run unit tests with Haystack main branch
125+
if: github.event_name == 'schedule'
126+
run: |
127+
hatch env prune
128+
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
129+
hatch run test:unit
130+
131+
132+
# Runs only for scheduled runs in which the "run" job failed, and passes the webhook
# URL from the SLACK_WEBHOOK_URL_NOTIFICATIONS secret to the Slack notification action.
notify-slack-on-failure:
133+
needs: run
134+
if: failure() && github.event_name == 'schedule'
135+
runs-on: ubuntu-slim
136+
steps:
137+
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
138+
with:
139+
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
3333
| [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra-combined/htmlcov/index.html) |
3434
| [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_ai_search/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_ai_search/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_ai_search-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_ai_search-combined/htmlcov/index.html) |
3535
| [azure-doc-intelligence-haystack](integrations/azure_doc_intelligence/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/azure-doc-intelligence-haystack.svg)](https://pypi.org/project/azure-doc-intelligence-haystack) | [![Test / azure_doc_intelligence](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_doc_intelligence/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_doc_intelligence/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-azure_doc_intelligence-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-azure_doc_intelligence-combined/htmlcov/index.html) |
36+
| [chonkie-haystack](integrations/chonkie/) | Preprocessor | [![PyPI - Version](https://img.shields.io/pypi/v/chonkie-haystack.svg)](https://pypi.org/project/chonkie-haystack) | [![Test / chonkie](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chonkie.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chonkie.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chonkie/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chonkie/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chonkie-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chonkie-combined/htmlcov/index.html) |
3637
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chroma/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chroma/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-chroma-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-chroma-combined/htmlcov/index.html) |
3738
| [cohere-haystack](integrations/cohere/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cohere/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cohere/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cohere-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cohere-combined/htmlcov/index.html) |
3839
| [cometapi-haystack](integrations/cometapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cometapi-haystack.svg)](https://pypi.org/project/cometapi-haystack) | [![Test / cometapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cometapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-cometapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-cometapi-combined/htmlcov/index.html) |

0 commit comments

Comments
 (0)