diff --git a/.github/labeler.yml b/.github/labeler.yml index c9cd838a2e..143181b1a9 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -213,6 +213,11 @@ integration:ollama: - any-glob-to-any-file: "integrations/ollama/**/*" - any-glob-to-any-file: ".github/workflows/ollama.yml" +integration:olostep: + - changed-files: + - any-glob-to-any-file: "integrations/olostep/**/*" + - any-glob-to-any-file: ".github/workflows/olostep.yml" + integration:openrouter: - changed-files: - any-glob-to-any-file: "integrations/openrouter/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index 73c3dbabee..ed1728855d 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -42,6 +42,7 @@ on: - "Test / mongodb_atlas" - "Test / nvidia" - "Test / ollama" + - "Test / olostep" - "Test / openrouter" - "Test / opensearch" - "Test / optimum" diff --git a/.github/workflows/olostep.yml b/.github/workflows/olostep.yml new file mode 100644 index 0000000000..eb8dee2906 --- /dev/null +++ b/.github/workflows/olostep.yml @@ -0,0 +1,144 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / olostep + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/olostep/**" + - "!integrations/olostep/*.md" + - ".github/workflows/olostep.yml" + push: + branches: + - main + paths: + - "integrations/olostep/**" + - "!integrations/olostep/*.md" + - ".github/workflows/olostep.yml" + +defaults: + run: + working-directory: integrations/olostep + +concurrency: + group: olostep-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]' + TEST_MATRIX_PYTHON: '["3.11", "3.14"]' + +jobs: + compute-test-matrix: + runs-on: 
ubuntu-slim + defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + if [ "${{ github.event_name }}" = "push" ]; then + echo 'os=["ubuntu-latest"]' >> "$GITHUB_OUTPUT" + echo 'python-version=["3.11"]' >> "$GITHUB_OUTPUT" + else + echo "os=${TEST_MATRIX_OS}" >> "$GITHUB_OUTPUT" + echo "python-version=${TEST_MATRIX_PYTHON}" >> "$GITHUB_OUTPUT" + fi + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . + run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.11' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch. 
+ - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.11' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/olostep + SUBPROJECT_ID: olostep + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.11' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: coverage-comment-olostep + path: python-coverage-comment-action-olostep.txt + + - name: Run integration tests + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/olostep + SUBPROJECT_ID: olostep-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + 
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index be79b323e5..018bba5d32 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [mongodb-atlas-haystack](integrations/mongodb_atlas/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg?color=orange)](https://pypi.org/project/mongodb-atlas-haystack) | [![Test / mongodb-atlas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-mongodb_atlas/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-mongodb_atlas/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-mongodb_atlas-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-mongodb_atlas-combined/htmlcov/index.html) | | [nvidia-haystack](integrations/nvidia/) | Embedder, Generator, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/nvidia-haystack.svg?color=orange)](https://pypi.org/project/nvidia-haystack) | [![Test / nvidia](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/nvidia.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-nvidia/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-nvidia/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-nvidia-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-nvidia-combined/htmlcov/index.html) | | [ollama-haystack](integrations/ollama/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ollama/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ollama/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-ollama-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-ollama-combined/htmlcov/index.html) | +| [olostep-haystack](integrations/olostep/) | Tool | [![PyPI - Version](https://img.shields.io/pypi/v/olostep-haystack.svg)](https://pypi.org/project/olostep-haystack) | 
[![Test / olostep](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/olostep.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/olostep.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-olostep/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-olostep/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-olostep-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-olostep-combined/htmlcov/index.html) | | [openrouter-haystack](integrations/openrouter/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/openrouter-haystack.svg)](https://pypi.org/project/openrouter-haystack) | [![Test / openrouter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/openrouter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/openrouter.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-openrouter/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-openrouter/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-openrouter-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-openrouter-combined/htmlcov/index.html) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-opensearch-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-opensearch-combined/htmlcov/index.html) | | [optimum-haystack](integrations/optimum/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/optimum-haystack.svg)](https://pypi.org/project/optimum-haystack) | [![Test / optimum](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/optimum.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-optimum-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-optimum-combined/htmlcov/index.html) | diff --git a/integrations/olostep/LICENSE.txt b/integrations/olostep/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/olostep/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/olostep/README.md b/integrations/olostep/README.md new file mode 100644 index 0000000000..82edc25144 --- /dev/null +++ b/integrations/olostep/README.md @@ -0,0 +1,12 @@ +# olostep-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/olostep-haystack.svg)](https://pypi.org/project/olostep-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/olostep-haystack.svg)](https://pypi.org/project/olostep-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/olostep/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). diff --git a/integrations/olostep/olostep.md b/integrations/olostep/olostep.md new file mode 100644 index 0000000000..a2628a7e7d --- /dev/null +++ b/integrations/olostep/olostep.md @@ -0,0 +1,119 @@ +--- +title: "Olostep" +id: integrations-olostep +description: "Olostep integration for Haystack" +slug: "/integrations-olostep" +--- + + +## olostep_haystack.fetcher + +### OlostepFetcherError + +Bases: Exception + +Raised when Olostep fetching fails. + +### OlostepFetcher + +Fetch and convert web pages to Markdown using Olostep's scrape API. + +Uses SyncOlostepClient (the current Olostep Python SDK). +Do NOT use the legacy Olostep class with client.scrapes.create(). 
+ +Usage: +from olostep_haystack import OlostepFetcher +fetcher = OlostepFetcher(api_key=Secret.from_env_var("OLOSTEP_API_KEY")) +result = fetcher.run(urls=["https://example.com"]) +\# result["documents"] -> List[Document] + +#### run + +```python +run(urls: list[str]) -> dict[str, Any] +``` + +Fetch one or more URLs and return their content as Documents. + +**Parameters:** + +- **urls** (list\[str\]) – list of URLs to scrape + +**Returns:** + +- dict\[str, Any\] – dict with 'documents' (List[Document]) + +**Raises:** + +- OlostepFetcherError – on API failure + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serialize the component to a dictionary. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> OlostepFetcher +``` + +Deserialize a component from a dictionary. + +## olostep_haystack.web_search + +### OlostepSearchError + +Bases: Exception + +Raised when Olostep search fails. + +### OlostepWebSearch + +Search the web using Olostep's /searches endpoint. + +Usage: +from olostep_haystack import OlostepWebSearch +search = OlostepWebSearch(api_key=Secret.from_env_var("OLOSTEP_API_KEY")) +result = search.run(query="what is haystack?") +\# result["documents"] -> List[Document] +\# result["links"] -> List[str] + +#### run + +```python +run(query: str) -> dict[str, Any] +``` + +Search the web using Olostep. + +**Parameters:** + +- **query** (str) – the search query string + +**Returns:** + +- dict\[str, Any\] – dict with 'documents' (List[Document]) and 'links' (List[str]) + +**Raises:** + +- OlostepSearchError – on API failure + +#### to_dict + +```python +to_dict() -> dict[str, Any] +``` + +Serialize the component to a dictionary. + +#### from_dict + +```python +from_dict(data: dict[str, Any]) -> OlostepWebSearch +``` + +Deserialize a component from a dictionary. 
diff --git a/integrations/olostep/pydoc/config_docusaurus.yml b/integrations/olostep/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..160c195852 --- /dev/null +++ b/integrations/olostep/pydoc/config_docusaurus.yml @@ -0,0 +1,14 @@ +loaders: + - modules: + - olostep_haystack.web_search + - olostep_haystack.fetcher + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Olostep integration for Haystack + id: integrations-olostep + filename: olostep.md + title: Olostep diff --git a/integrations/olostep/pyproject.toml b/integrations/olostep/pyproject.toml new file mode 100644 index 0000000000..a9a832705d --- /dev/null +++ b/integrations/olostep/pyproject.toml @@ -0,0 +1,164 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "olostep-haystack" +dynamic = ["version"] +description = "Haystack components for Olostep web search and URL scraping" +readme = "README.md" +requires-python = ">=3.11" +license = "Apache-2.0" +keywords = ["olostep", "web-search", "scraping", "haystack"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai>=2.24.1", "olostep>=0.1.0", "requests>=2.28.0"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/olostep#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = 
"https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/olostep" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations", "src/olostep_haystack"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/olostep-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/olostep-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=olostep_haystack --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=olostep_haystack --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = "mypy -p olostep_haystack {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = ["olostep.*"] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring 
+ "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["olostep_haystack"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/olostep/src/haystack_integrations/components/tools/olostep/__init__.py b/integrations/olostep/src/haystack_integrations/components/tools/olostep/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/olostep/src/haystack_integrations/components/tools/olostep/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/olostep/src/haystack_integrations/components/tools/py.typed 
# File: integrations/olostep/src/haystack_integrations/components/tools/py.typed (empty marker file)

# ---------------------------------------------------------------------------
# File: integrations/olostep/src/olostep_haystack/__init__.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2024-present Olostep
#
# SPDX-License-Identifier: Apache-2.0

from olostep_haystack.fetcher import OlostepFetcher
from olostep_haystack.web_search import OlostepWebSearch

__all__ = ["OlostepFetcher", "OlostepWebSearch"]

# ---------------------------------------------------------------------------
# File: integrations/olostep/src/olostep_haystack/fetcher.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2024-present Olostep
#
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Any

from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)


class OlostepFetcherError(Exception):
    """Raised when Olostep fetching fails."""


@component
class OlostepFetcher:
    """
    Fetch web pages as Markdown or HTML using Olostep's scrape API.

    Uses SyncOlostepClient (the current Olostep Python SDK).
    Do NOT use the legacy Olostep class with client.scrapes.create().

    Usage:
        from olostep_haystack import OlostepFetcher
        fetcher = OlostepFetcher(api_key=Secret.from_env_var("OLOSTEP_API_KEY"))
        result = fetcher.run(urls=["https://example.com"])
        # result["documents"] -> List[Document]
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("OLOSTEP_API_KEY"),
        format: str = "markdown",  # noqa: A002
    ) -> None:
        """
        Create the fetcher.

        :param api_key: secret holding the Olostep API key; read from ``OLOSTEP_API_KEY`` by default
        :param format: content format to request for every page, either ``"markdown"`` or ``"html"``
        :raises ValueError: if ``format`` is neither ``"markdown"`` nor ``"html"``
        """
        if format not in ("markdown", "html"):
            msg = "format must be 'markdown' or 'html'"
            raise ValueError(msg)
        self.api_key = api_key
        self.format = format

    @component.output_types(documents=list[Document])
    def run(self, urls: list[str]) -> dict[str, Any]:
        """
        Fetch one or more URLs and return their content as Documents.

        A URL is skipped, with a WARNING log entry, when Olostep returns no content in the configured
        format or when the call fails with a non-authentication Olostep error.

        :param urls: list of URLs to scrape
        :returns: dict with 'documents' (List[Document]); each Document stores its source in ``meta["url"]``
        :raises OlostepFetcherError: if the API key resolves to an empty value or authentication fails
        """
        # The Olostep SDK is imported at call time, not at module import time.
        from olostep import SyncOlostepClient  # noqa: PLC0415
        from olostep.errors import Olostep_BaseError, OlostepServerError_AuthFailed  # noqa: PLC0415

        resolved_key = self.api_key.resolve_value()
        if not resolved_key:
            msg = "OLOSTEP_API_KEY is not set. Set it in your environment or pass it explicitly."
            raise OlostepFetcherError(msg)

        client = SyncOlostepClient(api_key=resolved_key)
        documents: list[Document] = []

        for url in urls:
            try:
                scrape_result = client.scrape(url)
                content_obj = scrape_result.retrieve([self.format])
                content = content_obj.markdown_content if self.format == "markdown" else content_obj.html_content
                if content:
                    documents.append(Document(content=content, meta={"url": url}))
                else:
                    logger.warning("Olostep returned no %s content for %s", self.format, url)
            except OlostepServerError_AuthFailed as e:
                # A rejected key would fail for every remaining URL too, so abort the whole run.
                msg = "Olostep authentication failed — check your API key."
                raise OlostepFetcherError(msg) from e
            except Olostep_BaseError as e:
                # Best effort: one failing page must not discard the pages that were fetched successfully.
                logger.warning("Olostep error for %s: %s", url, e)

        return {"documents": documents}

    def to_dict(self) -> dict[str, Any]:
        """Serialize the component to a dictionary."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            format=self.format,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "OlostepFetcher":
        """Deserialize a component from a dictionary."""
        # `.get` keeps a payload without "init_parameters" from failing with KeyError.
        deserialize_secrets_inplace(data.get("init_parameters", {}), keys=["api_key"])
        return default_from_dict(cls, data)


# File: integrations/olostep/src/olostep_haystack/py.typed (empty marker file)

# ---------------------------------------------------------------------------
# File: integrations/olostep/src/olostep_haystack/web_search.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2024-present Olostep
#
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Any
from urllib.parse import urlparse

import requests
from haystack import Document, component, default_from_dict, default_to_dict
from haystack.utils import Secret, deserialize_secrets_inplace

logger = logging.getLogger(__name__)


class OlostepSearchError(Exception):
    """Raised when Olostep search fails."""


def _hostname(value: str) -> str:
    """Return the lower-cased host of a URL or bare domain, or an empty string if none can be parsed."""
    candidate = value.strip()
    if "://" not in candidate:
        # urlparse only fills in the host when the value starts with a scheme or with "//".
        candidate = f"//{candidate}"
    try:
        host = urlparse(candidate).hostname
    except ValueError:
        # Raised for malformed values such as an unbalanced IPv6 bracket.
        return ""
    return (host or "").lower().lstrip(".")


def _is_allowed(url: str, allowed_hosts: list[str]) -> bool:
    """Tell whether the host of `url` equals one of `allowed_hosts` or is a subdomain of one."""
    host = _hostname(url)
    if not host:
        return False
    return any(host == allowed or host.endswith(f".{allowed}") for allowed in allowed_hosts)


@component
class OlostepWebSearch:
    """
    Search the web using Olostep's /searches endpoint.

    Usage:
        from olostep_haystack import OlostepWebSearch
        search = OlostepWebSearch(api_key=Secret.from_env_var("OLOSTEP_API_KEY"))
        result = search.run(query="what is haystack?")
        # result["documents"] -> List[Document]
        # result["links"] -> List[str]
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("OLOSTEP_API_KEY"),
        top_k: int = 5,
        allowed_domains: list[str] | None = None,
        search_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Create the search component.

        :param api_key: secret holding the Olostep API key; read from ``OLOSTEP_API_KEY`` by default
        :param top_k: maximum number of results to return; applied after domain filtering
        :param allowed_domains: if non-empty, keep only results whose host is one of these domains or a
            subdomain of one
        :param search_params: extra key/value pairs merged into the JSON request body
        """
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains or []
        self.search_params = search_params or {}

    @component.output_types(documents=list[Document], links=list[str])
    def run(self, query: str) -> dict[str, Any]:
        """
        Search the web using Olostep.

        :param query: the search query string
        :returns: dict with 'documents' (List[Document]) and 'links' (List[str])
        :raises OlostepSearchError: if the API key is empty, the request fails, or the response is not JSON
        """
        resolved_key = self.api_key.resolve_value()
        if not resolved_key:
            msg = "OLOSTEP_API_KEY is not set. Set it in your environment or pass it explicitly."
            raise OlostepSearchError(msg)

        try:
            response = requests.post(
                "https://api.olostep.com/v1/searches",
                headers={
                    "Authorization": f"Bearer {resolved_key}",
                    "Content-Type": "application/json",
                },
                json={"query": query, **self.search_params},
                timeout=30,
            )
            response.raise_for_status()
        except requests.HTTPError as e:
            msg = f"Olostep /searches request failed: {e.response.status_code} {e.response.text}"
            raise OlostepSearchError(msg) from e
        except requests.RequestException as e:
            msg = f"Olostep /searches network error: {e}"
            raise OlostepSearchError(msg) from e

        # Decode separately so a non-JSON body surfaces as OlostepSearchError, not a raw decode error.
        try:
            data = response.json()
        except ValueError as e:
            msg = "Olostep /searches returned a response that is not valid JSON"
            raise OlostepSearchError(msg) from e

        # Tolerate a missing or null "result" / "links" instead of failing on attribute access.
        result = data.get("result") if isinstance(data, dict) else None
        links_data = (result.get("links") if isinstance(result, dict) else None) or []

        if self.allowed_domains:
            # Compare parsed hosts: a substring test on the whole URL would also accept
            # "notexample.com" or a URL that merely mentions the domain in its path or query.
            allowed_hosts = [host for host in (_hostname(domain) for domain in self.allowed_domains) if host]
            links_data = [link for link in links_data if _is_allowed(link.get("url") or "", allowed_hosts)]

        links_data = links_data[: self.top_k]

        documents = [
            Document(
                content=link.get("description", ""),
                meta={
                    "title": link.get("title", ""),
                    "link": link.get("url", ""),
                },
            )
            for link in links_data
        ]
        links = [link.get("url", "") for link in links_data]

        return {"documents": documents, "links": links}

    def to_dict(self) -> dict[str, Any]:
        """Serialize the component to a dictionary."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            top_k=self.top_k,
            allowed_domains=self.allowed_domains,
            search_params=self.search_params,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "OlostepWebSearch":
        """Deserialize a component from a dictionary."""
        # `.get` keeps a payload without "init_parameters" from failing with KeyError.
        deserialize_secrets_inplace(data.get("init_parameters", {}), keys=["api_key"])
        return default_from_dict(cls, data)


# ---------------------------------------------------------------------------
# File: integrations/olostep/tests/__init__.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0
# ---------------------------------------------------------------------------
# File: integrations/olostep/tests/test_olostep_fetcher.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2024-present Olostep
#
# SPDX-License-Identifier: Apache-2.0

import logging
import os
from unittest.mock import MagicMock, patch

import pytest
from haystack.utils import Secret
from olostep.errors import OlostepServerError_AuthFailed

from olostep_haystack.fetcher import OlostepFetcher, OlostepFetcherError


class TestOlostepFetcher:
    """Unit tests for OlostepFetcher with the Olostep SDK client mocked, plus one live integration test."""

    def test_run_returns_documents(self):
        mock_client = MagicMock()
        mock_scrape_result = MagicMock()
        mock_content = MagicMock()
        mock_content.markdown_content = "# Hello World"
        mock_scrape_result.retrieve.return_value = mock_content
        mock_client.scrape.return_value = mock_scrape_result

        # run() imports SyncOlostepClient from `olostep` at call time, so patching the attribute on the
        # package is enough to intercept it.
        with patch("olostep.SyncOlostepClient", return_value=mock_client, create=True):
            fetcher = OlostepFetcher(api_key=Secret.from_token("test-key"))
            result = fetcher.run(urls=["https://example.com"])

        assert len(result["documents"]) == 1
        assert result["documents"][0].content == "# Hello World"
        assert result["documents"][0].meta["url"] == "https://example.com"

    def test_run_empty_content_logs_warning(self, caplog):
        mock_client = MagicMock()
        mock_scrape_result = MagicMock()
        mock_content = MagicMock()
        mock_content.markdown_content = None
        mock_scrape_result.retrieve.return_value = mock_content
        mock_client.scrape.return_value = mock_scrape_result

        with patch("olostep.SyncOlostepClient", return_value=mock_client, create=True):
            fetcher = OlostepFetcher(api_key=Secret.from_token("test-key"))
            with caplog.at_level(logging.WARNING, logger="olostep_haystack.fetcher"):
                result = fetcher.run(urls=["https://example.com"])

        assert result["documents"] == []
        assert "Olostep returned no markdown content for https://example.com" in caplog.text

    def test_auth_error_raises(self):
        mock_client = MagicMock()
        mock_client.scrape.side_effect = OlostepServerError_AuthFailed("auth failed")

        with patch("olostep.SyncOlostepClient", return_value=mock_client, create=True):
            fetcher = OlostepFetcher(api_key=Secret.from_token("test-key"))
            with pytest.raises(OlostepFetcherError, match="authentication failed"):
                fetcher.run(urls=["https://example.com"])

    def test_to_dict_from_dict_round_trip(self, monkeypatch):
        monkeypatch.setenv("OLOSTEP_API_KEY", "test-key")
        fetcher = OlostepFetcher(api_key=Secret.from_env_var("OLOSTEP_API_KEY"), format="markdown")

        data = fetcher.to_dict()
        restored = OlostepFetcher.from_dict(data)

        assert restored.format == "markdown"
        assert restored.api_key.resolve_value() == "test-key"

    @pytest.mark.skipif(
        not os.environ.get("OLOSTEP_API_KEY"),
        reason="Export OLOSTEP_API_KEY to run integration tests.",
    )
    @pytest.mark.integration
    def test_run_integration(self):
        fetcher = OlostepFetcher(api_key=Secret.from_env_var("OLOSTEP_API_KEY"), format="markdown")
        result = fetcher.run(urls=["https://example.com"])
        assert len(result["documents"]) > 0


# ---------------------------------------------------------------------------
# File: integrations/olostep/tests/test_olostep_web_search.py
# ---------------------------------------------------------------------------
# SPDX-FileCopyrightText: 2024-present Olostep
#
# SPDX-License-Identifier: Apache-2.0

import os
from unittest.mock import MagicMock, patch

import pytest
from haystack import Document
from haystack.utils import Secret

from olostep_haystack.web_search import OlostepSearchError, OlostepWebSearch

# Canned /searches payload used as the mocked response body in the unit tests below.
MOCK_RESPONSE = {
    "result": {
        "links": [
            {"url": "https://example.com", "title": "Example", "description": "An example site"},
            {"url": "https://another.com", "title": "Another", "description": "Another site"},
        ]
    }
}


class TestOlostepWebSearch:
    """Unit tests for OlostepWebSearch with `requests.post` mocked, plus one live integration test."""

    def test_run_returns_documents_and_links(self):
        ws = OlostepWebSearch(api_key=Secret.from_token("test-key"), top_k=5)

        mock_response = MagicMock()
        mock_response.json.return_value = MOCK_RESPONSE

        with patch("olostep_haystack.web_search.requests.post", return_value=mock_response):
            result = ws.run(query="test")

        assert len(result["documents"]) == 2
        assert isinstance(result["documents"][0], Document)
        assert result["documents"][0].content == "An example site"
        assert result["documents"][0].meta["title"] == "Example"
        assert result["documents"][0].meta["link"] == "https://example.com"
        assert result["links"] == ["https://example.com", "https://another.com"]

    def test_run_top_k_limits_results(self):
        ws = OlostepWebSearch(api_key=Secret.from_token("test-key"), top_k=1)

        mock_response = MagicMock()
        mock_response.json.return_value = MOCK_RESPONSE

        with patch("olostep_haystack.web_search.requests.post", return_value=mock_response):
            result = ws.run(query="test")

        assert len(result["documents"]) == 1
        assert result["links"] == ["https://example.com"]

    def test_missing_api_key_raises(self, monkeypatch):
        # Clear the variable first: where OLOSTEP_API_KEY is exported (as the integration test below
        # expects), the non-strict secret would resolve and the component would send a real request.
        monkeypatch.delenv("OLOSTEP_API_KEY", raising=False)
        ws = OlostepWebSearch(api_key=Secret.from_env_var("OLOSTEP_API_KEY", strict=False))

        with pytest.raises(OlostepSearchError, match="OLOSTEP_API_KEY is not set"):
            ws.run(query="test")

    def test_to_dict_from_dict_round_trip(self, monkeypatch):
        monkeypatch.setenv("OLOSTEP_API_KEY", "test-key")
        ws = OlostepWebSearch(
            api_key=Secret.from_env_var("OLOSTEP_API_KEY"),
            top_k=3,
            allowed_domains=["example.com"],
            search_params={"foo": "bar"},
        )

        data = ws.to_dict()
        restored = OlostepWebSearch.from_dict(data)

        assert restored.top_k == 3
        assert restored.allowed_domains == ["example.com"]
        assert restored.search_params == {"foo": "bar"}
        assert restored.api_key.resolve_value() == "test-key"

    @pytest.mark.skipif(
        not os.environ.get("OLOSTEP_API_KEY"),
        reason="Export OLOSTEP_API_KEY to run integration tests.",
    )
    @pytest.mark.integration
    def test_run_integration(self):
        ws = OlostepWebSearch(api_key=Secret.from_env_var("OLOSTEP_API_KEY"), top_k=3)
        result = ws.run(query="What is Haystack by deepset?")
        assert len(result["documents"]) > 0
        assert len(result["links"]) > 0
        assert isinstance(result["documents"][0], Document)