diff --git a/.github/labeler.yml b/.github/labeler.yml index e16d1c6ad3..349edf670c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -253,6 +253,11 @@ integration:valkey: - any-glob-to-any-file: "integrations/valkey/**/*" - any-glob-to-any-file: ".github/workflows/valkey.yml" +integration:vllm: + - changed-files: + - any-glob-to-any-file: "integrations/vllm/**/*" + - any-glob-to-any-file: ".github/workflows/vllm.yml" + integration:watsonx: - changed-files: - any-glob-to-any-file: "integrations/watsonx/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index f4b83385a5..e4d682b7cf 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -1,4 +1,4 @@ -name: Add comment about test coverage to PRs +name: Core / Add comment about test coverage to PRs on: workflow_run: @@ -52,6 +52,7 @@ on: - "Test / togetherai" - "Test / unstructured" - "Test / valkey" + - "Test / vllm" - "Test / watsonx" - "Test / weave" - "Test / weaviate" diff --git a/.github/workflows/CI_workflows_linting.yml b/.github/workflows/CI_workflows_linting.yml index 7f19a515d8..deca4f1e95 100644 --- a/.github/workflows/CI_workflows_linting.yml +++ b/.github/workflows/CI_workflows_linting.yml @@ -1,4 +1,4 @@ -name: Github workflows linter +name: Core / Github workflows linter on: pull_request: diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml new file mode 100644 index 0000000000..4fcb2e4992 --- /dev/null +++ b/.github/workflows/vllm.yml @@ -0,0 +1,180 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / vllm + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/vllm/**" + - "!integrations/vllm/*.md" + - ".github/workflows/vllm.yml" + push: + branches: + - main + paths: + - "integrations/vllm/**" + - "!integrations/vllm/*.md" + - ".github/workflows/vllm.yml" + +defaults: + run: + working-directory: integrations/vllm + +concurrency: + group: vllm-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + VLLM_MODEL: "Qwen/Qwen3-0.6B" + # we only test on Ubuntu to keep vLLM server running simple + TEST_MATRIX_OS: '["ubuntu-latest"]' + # vLLM is not compatible with Python 3.14. https://github.com/vllm-project/vllm/issues/34096 + TEST_MATRIX_PYTHON: '["3.10", "3.13"]' + +jobs: + compute-test-matrix: + runs-on: ubuntu-slim + defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch + + - name: Install vLLM (CPU) + run: | + # vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems. + # CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases. + # We fetch the latest release and install the appropriate x86 CPU wheel. + # The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index, + # since the required torch+cpu builds are also not available on PyPI. + VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')" + export VLLM_VERSION + echo "Installing vLLM ${VLLM_VERSION} (CPU)" + hatch run -- uv pip install \ + "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \ + --torch-backend cpu + + - name: Start vLLM server + env: + VLLM_TARGET_DEVICE: "cpu" + VLLM_CPU_KVCACHE_SPACE: "4" + run: | + nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \ + --reasoning-parser qwen3 \ + --max-model-len 1024 \ + --enforce-eager \ + --dtype bfloat16 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes \ + --max-num-seqs 1 & + + # Wait for the vLLM server to be ready with a timeout of 300 seconds + timeout=300 + while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do + echo "Waiting for vLLM server to start..." + sleep 10 + ((timeout-=10)) + done + + if [ $timeout -eq 0 ]; then + echo "Timed out waiting for vLLM server to start." + exit 1 + fi + + echo "vLLM server started successfully." + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch. + - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/vllm + SUBPROJECT_ID: vllm + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: coverage-comment-vllm + path: python-coverage-comment-action-vllm.txt + + - name: Run integration tests + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/vllm + SUBPROJECT_ID: vllm-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 146ccb334d..42891b8360 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [togetherai-haystack](integrations/togetherai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/togetherai-haystack.svg)](https://pypi.org/project/togetherai-haystack) | [![Test / togetherai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai-combined/htmlcov/index.html) | | [unstructured-fileconverter-haystack](integrations/unstructured/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured-combined/htmlcov/index.html) | | [valkey-haystack](integrations/valkey/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/valkey-haystack.svg)](https://pypi.org/project/valkey-haystack) | [![Test / valkey](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/valkey.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/valkey.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-valkey/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-valkey/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-valkey-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-valkey-combined/htmlcov/index.html) | +| [vllm-haystack](integrations/vllm/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack) | [![Test / vllm](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/vllm.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/vllm.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-vllm/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-vllm/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-vllm-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-vllm-combined/htmlcov/index.html) | | [watsonx-haystack](integrations/watsonx/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/watsonx-haystack.svg?color=orange)](https://pypi.org/project/watsonx-haystack) | [![Test / watsonx](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/watsonx.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/watsonx.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-watsonx/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-watsonx/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-watsonx-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-watsonx-combined/htmlcov/index.html) | | [weave-haystack](integrations/weave/) | Tracer | [![PyPI - Version](https://img.shields.io/pypi/v/weave-haystack.svg)](https://pypi.org/project/weave-haystack) | [![Test / weave](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weave.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weave.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weave/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weave/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weave-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weave-combined/htmlcov/index.html) | | [weaviate-haystack](integrations/weaviate/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/weaviate-haystack.svg)](https://pypi.org/project/weaviate-haystack) | [![Test / weaviate](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weaviate/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weaviate/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weaviate-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weaviate-combined/htmlcov/index.html) | diff --git a/integrations/vllm/LICENSE.txt b/integrations/vllm/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/vllm/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/vllm/README.md b/integrations/vllm/README.md new file mode 100644 index 0000000000..3d987bbefb --- /dev/null +++ b/integrations/vllm/README.md @@ -0,0 +1,20 @@ +# vllm-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/vllm/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). + +To run integration tests locally, you need to have a running vLLM server. Refer to the [workflow file](https://github.com/deepset-ai/haystack-core-integrations/blob/main/.github/workflows/vllm.yml) for more details. + +For example, on macOs, you can install [vLLM-metal](https://github.com/vllm-project/vllm-metal) and run the server with: + +```bash +source ~/.venv-vllm-metal/bin/activate && vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3 --max-model-len 1024 --enforce-eager --enable-auto-tool-choice --tool-call-parser hermes +``` \ No newline at end of file diff --git a/integrations/vllm/pydoc/config_docusaurus.yml b/integrations/vllm/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..13c26dd968 --- /dev/null +++ b/integrations/vllm/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.generators.vllm.chat.chat_generator + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: vLLM integration for Haystack + id: integrations-vllm + filename: vllm.md + title: vLLM diff --git a/integrations/vllm/pyproject.toml b/integrations/vllm/pyproject.toml new file mode 100644 index 0000000000..53cec11e33 --- /dev/null +++ b/integrations/vllm/pyproject.toml @@ -0,0 +1,161 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "vllm-haystack" +dynamic = ["version"] +description = "Haystack integration for vllm" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai>=2.23.0", "openai"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/vllm-v(?P.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/vllm-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", + "Pillow", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = "mypy -p haystack_integrations.components.generators.vllm {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/vllm/src/haystack_integrations/components/generators/py.typed b/integrations/vllm/src/haystack_integrations/components/generators/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py new file mode 100644 index 0000000000..c929b5cd12 --- /dev/null +++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .chat.chat_generator import VLLMChatGenerator + +__all__ = ["VLLMChatGenerator"] diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py new file mode 100644 index 0000000000..5d21c3abaa --- /dev/null +++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py @@ -0,0 +1,549 @@ +import asyncio +import dataclasses +import json +from typing import Any + +from haystack import default_from_dict, default_to_dict, logging +from haystack.components.generators.chat.openai import ( + _check_finish_reason, + _convert_chat_completion_chunk_to_streaming_chunk, +) +from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, _serialize_object +from haystack.core.component import component +from haystack.dataclasses import ChatMessage, ToolCall +from haystack.dataclasses.chat_message import ReasoningContent +from haystack.dataclasses.streaming_chunk import ( + AsyncStreamingCallbackT, + ComponentInfo, + StreamingCallbackT, + StreamingChunk, + SyncStreamingCallbackT, + select_streaming_callback, +) +from haystack.tools import ( + ToolsType, + _check_duplicate_tool_names, + deserialize_tools_or_toolset_inplace, + flatten_tools_or_toolsets, + serialize_tools_or_toolset, + warm_up_tools, +) +from haystack.utils import Secret, deserialize_callable, serialize_callable +from haystack.utils.http_client import init_http_client +from openai import AsyncOpenAI, AsyncStream, OpenAI, Stream +from openai.types.chat import ChatCompletion, ChatCompletionChunk +from openai.types.chat.chat_completion import Choice + +logger = logging.getLogger(__name__) + + +def _convert_chat_completion_to_chat_message(completion: ChatCompletion, choice: Choice) -> ChatMessage: + """ + Convert a vLLM chat completion response to a ChatMessage, including reasoning content if present. + + :param completion: The completion returned by the vLLM API. + :param choice: The choice returned by the vLLM API. + :return: The ChatMessage. + """ + message = choice.message + text = message.content + + tool_calls = [] + if message.tool_calls: + for tc in message.tool_calls: + if not hasattr(tc, "function"): + continue + try: + arguments = json.loads(tc.function.arguments) + tool_calls.append(ToolCall(id=tc.id, tool_name=tc.function.name, arguments=arguments)) + except json.JSONDecodeError: + logger.warning( + "vLLM returned a malformed JSON string for tool call arguments. This tool call " + "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}", + _id=tc.id, + _name=tc.function.name, + _arguments=tc.function.arguments, + ) + + meta: dict[str, Any] = { + "model": completion.model, + "index": choice.index, + "finish_reason": choice.finish_reason, + "usage": _serialize_object(completion.usage), + } + + reasoning_text = getattr(message, "reasoning", None) + reasoning = ReasoningContent(reasoning_text=reasoning_text) if reasoning_text else None + + return ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta, reasoning=reasoning) + + +@component +class VLLMChatGenerator: + """ + A component for generating chat completions using models served with [vLLM](https://docs.vllm.ai/). + + It expects a vLLM server to be running and accessible at the `api_base_url` parameter. + + ### Starting the vLLM server + + Before using this component, start a vLLM server: + + ```bash + vllm serve Qwen/Qwen3-4B-Instruct-2507 + ``` + + For reasoning models, start the server with the appropriate reasoning parser: + + ```bash + vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3 + ``` + + For tool calling, the server must be started with `--enable-auto-tool-choice` and `--tool-call-parser`: + + ```bash + vllm serve Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes + ``` + + The available tool call parsers depend on the model. See the + [vLLM tool calling docs](https://docs.vllm.ai/en/stable/features/tool_calling/) for the full list. + + For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/). + + ### Usage example + + ```python + from haystack.dataclasses import ChatMessage + from haystack_integrations.components.generators.vllm import VLLMChatGenerator + + generator = VLLMChatGenerator( + model="Qwen/Qwen3-0.6B", + generation_kwargs={"max_tokens": 512, "temperature": 0.7}, + ) + + messages = [ChatMessage.from_user("What's Natural Language Processing?")] + response = generator.run(messages=messages) + print(response["replies"][0].text) + ``` + + ### Usage example with vLLM-specific parameters + + Pass the vLLM-specific parameters inside the `generation_kwargs`["extra_body"] dictionary. + + ```python + from haystack_integrations.components.generators.vllm import VLLMChatGenerator + + generator = VLLMChatGenerator( + model="Qwen/Qwen3-0.6B", + generation_kwargs={ + "max_tokens": 512, + "extra_body": { + "top_k": 50, + "min_tokens": 10, + "repetition_penalty": 1.1, + }, + }, + ) + ``` + + ### Usage example with tool calling + + To use tool calling, start the vLLM server with `--enable-auto-tool-choice` and `--tool-call-parser`. + + ```python + from haystack.dataclasses import ChatMessage + from haystack.tools import tool + from haystack_integrations.components.generators.vllm import VLLMChatGenerator + + @tool + def weather(city: str) -> str: + \"\"\"Get the weather in a given city.\"\"\" + return f"The weather in {city} is sunny" + + generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B", tools=[weather]) + + messages = [ChatMessage.from_user("What is the weather in Paris?")] + response = generator.run(messages=messages) + print(response["replies"][0].tool_calls) + ``` + + ### Usage example with reasoning models + + To use reasoning models, start the vLLM server with `--reasoning-parser`. + + ```python + from haystack.dataclasses import ChatMessage + from haystack_integrations.components.generators.vllm import VLLMChatGenerator + + generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B") + + messages = [ChatMessage.from_user("Solve step by step: what is 15 * 37?")] + response = generator.run(messages=messages) + reply = response["replies"][0] + if reply.reasoning: + print("Reasoning:", reply.reasoning.reasoning_text) + print("Answer:", reply.text) + ``` + """ + + def __init__( + self, + *, + model: str, + api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False), + streaming_callback: StreamingCallbackT | None = None, + api_base_url: str = "http://localhost:8000/v1", + generation_kwargs: dict[str, Any] | None = None, + timeout: float | None = None, + max_retries: int | None = None, + tools: ToolsType | None = None, + http_client_kwargs: dict[str, Any] | None = None, + ) -> None: + """ + Creates an instance of VLLMChatGenerator. + + :param model: The name of the model served by vLLM (e.g., "Qwen/Qwen3-0.6B"). + :param api_key: The vLLM API key. Defaults to the `VLLM_API_KEY` environment variable. + Only required if the vLLM server was started with `--api-key`. + :param streaming_callback: A callback function that is called when a new token is received from the stream. + The callback function accepts + [StreamingChunk](https://docs.haystack.deepset.ai/docs/data-classes#streamingchunk) + as an argument. + :param api_base_url: The base URL of the vLLM server. + :param generation_kwargs: Additional parameters for text generation. These parameters are sent directly to + the vLLM OpenAI-compatible endpoint. See + [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/) + for more details. + Some of the supported parameters: + - `max_tokens`: Maximum number of tokens to generate. + - `temperature`: Sampling temperature. + - `top_p`: Nucleus sampling parameter. + - `n`: Number of completions to generate for each prompt. + - `stop`: One or more sequences after which the model should stop generating tokens. + - `response_format`: A JSON schema or a Pydantic model that enforces the structure of the response. + - `extra_body`: A dictionary of vLLM-specific parameters not part of the standard OpenAI API + (e.g., `top_k`, `min_tokens`, `repetition_penalty`). + :param timeout: + Timeout for vLLM client calls. If not set, it defaults to the default set by the OpenAI client. + :param max_retries: + Maximum number of retries to attempt for failed requests. If not set, it defaults to the default + set by the OpenAI client. + :param tools: + A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls. + Each tool should have a unique name. Not all models support tools. + :param http_client_kwargs: + A dictionary of keyword arguments to configure a custom `httpx.Client` or `httpx.AsyncClient`. + For more information, see the [HTTPX documentation](https://www.python-httpx.org/api/#client). + """ + + self.model = model + self.api_key = api_key + self.streaming_callback = streaming_callback + self.api_base_url = api_base_url + self.generation_kwargs = generation_kwargs or {} + self.timeout = timeout + self.max_retries = max_retries + self.tools = tools + self.http_client_kwargs = http_client_kwargs + + _check_duplicate_tool_names(flatten_tools_or_toolsets(self.tools)) + + self._client: OpenAI | None = None + self._async_client: AsyncOpenAI | None = None + self._is_warmed_up = False + + def warm_up(self) -> None: + """Create the OpenAI clients and warm up tools.""" + if self._is_warmed_up: + return + + api_key = "placeholder-api-key" + if self.api_key and (resolved_value := self.api_key.resolve_value()): + api_key = resolved_value + + client_kwargs: dict[str, Any] = { + "api_key": api_key, + "base_url": self.api_base_url, + } + if self.timeout is not None: + client_kwargs["timeout"] = self.timeout + if self.max_retries is not None: + client_kwargs["max_retries"] = self.max_retries + + self._client = OpenAI( + http_client=init_http_client(self.http_client_kwargs, async_client=False), **client_kwargs + ) + self._async_client = AsyncOpenAI( + http_client=init_http_client(self.http_client_kwargs, async_client=True), **client_kwargs + ) + warm_up_tools(self.tools) + self._is_warmed_up = True + + def to_dict(self) -> dict[str, Any]: + """ + Serialize this component to a dictionary. + + :returns: + The serialized component as a dictionary. + """ + callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None + return default_to_dict( + self, + model=self.model, + streaming_callback=callback_name, + api_base_url=self.api_base_url, + generation_kwargs=self.generation_kwargs, + api_key=self.api_key, + timeout=self.timeout, + max_retries=self.max_retries, + tools=serialize_tools_or_toolset(self.tools), + http_client_kwargs=self.http_client_kwargs, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator": + """ + Deserialize this component from a dictionary. + + :param data: The dictionary representation of this component. + :returns: + The deserialized component instance. + """ + deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools") + init_params = data.get("init_parameters", {}) + serialized_callback_handler = init_params.get("streaming_callback") + if serialized_callback_handler: + data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler) + return default_from_dict(cls, data) + + def _prepare_api_call( + self, + messages: list[ChatMessage], + streaming_callback: StreamingCallbackT | None, + generation_kwargs: dict[str, Any] | None, + tools: ToolsType | None, + ) -> dict[str, Any]: + """Build the kwargs dict for the OpenAI chat completions API call.""" + generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} + + openai_formatted_messages = [message.to_openai_dict_format() for message in messages] + + flattened_tools = flatten_tools_or_toolsets(tools or self.tools) + _check_duplicate_tool_names(flattened_tools) + tool_definitions = None + if flattened_tools: + tool_definitions = [{"type": "function", "function": t.tool_spec} for t in flattened_tools] + + api_kwargs: dict[str, Any] = { + "model": self.model, + "messages": openai_formatted_messages, + "stream": streaming_callback is not None, + **generation_kwargs, + } + if tool_definitions: + api_kwargs["tools"] = tool_definitions + + return api_kwargs + + def _handle_stream_response(self, chat_completion: Stream, callback: SyncStreamingCallbackT) -> list[ChatMessage]: + """Handle a synchronous streaming response, extracting reasoning content from vLLM's reasoning chunks.""" + component_info = ComponentInfo.from_component(self) + chunks: list[StreamingChunk] = [] + + # track reasoning and content blocks. We use these flags to detect the transition and mark start=True + # on the first chunk of each block + reasoning_started = False + content_started = False + + for chunk in chat_completion: + assert len(chunk.choices) <= 1 # noqa: S101 + + reasoning_text = None + if chunk.choices: + reasoning_text = getattr(chunk.choices[0].delta, "reasoning", None) + + if reasoning_text: + streaming_chunk = StreamingChunk( + content="", + reasoning=ReasoningContent(reasoning_text=reasoning_text), + index=0, + start=not reasoning_started, + component_info=component_info, + meta={ + "model": chunk.model, + "index": chunk.choices[0].index, + "finish_reason": chunk.choices[0].finish_reason, + }, + ) + reasoning_started = True + else: + streaming_chunk = _convert_chat_completion_chunk_to_streaming_chunk( + chunk=chunk, previous_chunks=chunks, component_info=component_info + ) + # _convert_chat_completion_chunk_to_streaming_chunk doesn't know about reasoning chunks + # We set start=True on the first content chunk after reasoning. + if reasoning_started and not content_started: + streaming_chunk = dataclasses.replace(streaming_chunk, start=True) + content_started = True + + chunks.append(streaming_chunk) + callback(streaming_chunk) + + return [_convert_streaming_chunks_to_chat_message(chunks=chunks)] + + async def _handle_async_stream_response( + self, chat_completion: AsyncStream[ChatCompletionChunk], callback: AsyncStreamingCallbackT + ) -> list[ChatMessage]: + """Handle an asynchronous streaming response, extracting reasoning content from vLLM's reasoning chunks.""" + component_info = ComponentInfo.from_component(self) + chunks: list[StreamingChunk] = [] + reasoning_started = False + content_started = False + try: + async for chunk in chat_completion: + assert len(chunk.choices) <= 1 # noqa: S101 + + reasoning_text = None + if chunk.choices: + reasoning_text = getattr(chunk.choices[0].delta, "reasoning", None) + + if reasoning_text: + streaming_chunk = StreamingChunk( + content="", + reasoning=ReasoningContent(reasoning_text=reasoning_text), + index=0, + start=not reasoning_started, + component_info=component_info, + meta={ + "model": chunk.model, + "index": chunk.choices[0].index, + "finish_reason": chunk.choices[0].finish_reason, + }, + ) + reasoning_started = True + else: + streaming_chunk = _convert_chat_completion_chunk_to_streaming_chunk( + chunk=chunk, previous_chunks=chunks, component_info=component_info + ) + if reasoning_started and not content_started: + streaming_chunk = dataclasses.replace(streaming_chunk, start=True) + content_started = True + + chunks.append(streaming_chunk) + await callback(streaming_chunk) + except asyncio.CancelledError: + await asyncio.shield(chat_completion.close()) + raise + + return [_convert_streaming_chunks_to_chat_message(chunks=chunks)] + + @component.output_types(replies=list[ChatMessage]) + def run( + self, + messages: list[ChatMessage], + streaming_callback: StreamingCallbackT | None = None, + generation_kwargs: dict[str, Any] | None = None, + *, + tools: ToolsType | None = None, + ) -> dict[str, list[ChatMessage]]: + """ + Run the VLLM chat generator on the given input data. + + :param messages: + A list of ChatMessage instances representing the input messages. + :param streaming_callback: + A callback function that is called when a new token is received from the stream. + :param generation_kwargs: + Additional keyword arguments for text generation. These parameters will + override the parameters passed during component initialization. + For details on vLLM API parameters, see + [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/). + :param tools: + A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls. + If set, it will override the `tools` parameter provided during initialization. + + :returns: + A dictionary with the following key: + - `replies`: A list containing the generated responses as ChatMessage instances. + """ + if not self._is_warmed_up: + self.warm_up() + + if len(messages) == 0: + return {"replies": []} + + streaming_callback = select_streaming_callback( + init_callback=self.streaming_callback, runtime_callback=streaming_callback, requires_async=False + ) + + api_kwargs = self._prepare_api_call(messages, streaming_callback, generation_kwargs, tools) + assert self._client is not None # noqa: S101 + chat_completion = self._client.chat.completions.create(**api_kwargs) + + if streaming_callback is not None: + completions = self._handle_stream_response(chat_completion, streaming_callback) + else: + completions = [ + _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices + ] + + for message in completions: + _check_finish_reason(message.meta) + + return {"replies": completions} + + @component.output_types(replies=list[ChatMessage]) + async def run_async( + self, + messages: list[ChatMessage], + streaming_callback: StreamingCallbackT | None = None, + generation_kwargs: dict[str, Any] | None = None, + *, + tools: ToolsType | None = None, + ) -> dict[str, list[ChatMessage]]: + """ + Run the VLLM chat generator on the given input data asynchronously. + + :param messages: + A list of ChatMessage instances representing the input messages. + :param streaming_callback: + A callback function that is called when a new token is received from the stream. + Must be a coroutine. + :param generation_kwargs: + Additional keyword arguments for text generation. These parameters will + override the parameters passed during component initialization. + For details on vLLM API parameters, see + [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/). + :param tools: + A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls. + If set, it will override the `tools` parameter provided during initialization. + + :returns: + A dictionary with the following key: + - `replies`: A list containing the generated responses as ChatMessage instances. + """ + if not self._is_warmed_up: + self.warm_up() + + if len(messages) == 0: + return {"replies": []} + + streaming_callback = select_streaming_callback( + init_callback=self.streaming_callback, runtime_callback=streaming_callback, requires_async=True + ) + + api_kwargs = self._prepare_api_call(messages, streaming_callback, generation_kwargs, tools) + assert self._async_client is not None # noqa: S101 + chat_completion = await self._async_client.chat.completions.create(**api_kwargs) + + if streaming_callback is not None: + completions = await self._handle_async_stream_response(chat_completion, streaming_callback) + else: + completions = [ + _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices + ] + + for message in completions: + _check_finish_reason(message.meta) + + return {"replies": completions} diff --git a/integrations/vllm/tests/__init__.py b/integrations/vllm/tests/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/vllm/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/vllm/tests/test_chat_generator.py b/integrations/vllm/tests/test_chat_generator.py new file mode 100644 index 0000000000..dffee6e5ac --- /dev/null +++ b/integrations/vllm/tests/test_chat_generator.py @@ -0,0 +1,634 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import json +from collections.abc import Iterator +from datetime import datetime, timezone +from typing import Annotated +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from haystack.components.generators.utils import print_streaming_chunk +from haystack.dataclasses import ChatMessage +from haystack.tools import tool +from haystack.utils.auth import Secret +from openai import AsyncStream, Stream +from openai.types.chat import ChatCompletion, ChatCompletionChunk, chat_completion_chunk +from openai.types.chat.chat_completion import Choice +from openai.types.chat.chat_completion_message import ChatCompletionMessage + +from haystack_integrations.components.generators.vllm.chat.chat_generator import ( + VLLMChatGenerator, + _convert_chat_completion_to_chat_message, +) + +MODEL = "Qwen/Qwen3-0.6B" + + +class MockStream(Stream[ChatCompletionChunk]): + def __init__(self, mock_chunks, client=None, *args, **kwargs): + client = client or MagicMock() + super().__init__(client=client, *args, **kwargs) # noqa: B026 + self.mock_chunks = mock_chunks + + def __stream__(self) -> Iterator[ChatCompletionChunk]: + yield from self.mock_chunks + + +class AsyncMockStream(AsyncStream[ChatCompletionChunk]): + def __init__(self, mock_chunks): + self.mock_chunks = mock_chunks + self._index = 0 + + def __aiter__(self): + return self + + async def __anext__(self): + if self._index < len(self.mock_chunks): + chunk = self.mock_chunks[self._index] + self._index += 1 + return chunk + raise StopAsyncIteration + + +def _make_reasoning_chunk(text): + chunk = ChatCompletionChunk( + id="test-id", + model=MODEL, + object="chat.completion.chunk", + choices=[ + chat_completion_chunk.Choice( + finish_reason=None, + logprobs=None, + index=0, + delta=chat_completion_chunk.ChoiceDelta(content=None, role=None), + ) + ], + created=int(datetime.now(tz=timezone.utc).timestamp()), + usage=None, + ) + chunk.choices[0].delta.reasoning = text + return chunk + + +def _make_content_chunk(text, finish_reason=None): + return ChatCompletionChunk( + id="test-id", + model=MODEL, + object="chat.completion.chunk", + choices=[ + chat_completion_chunk.Choice( + finish_reason=finish_reason, + logprobs=None, + index=0, + delta=chat_completion_chunk.ChoiceDelta(content=text, role=None), + ) + ], + created=int(datetime.now(tz=timezone.utc).timestamp()), + usage=None, + ) + + +@pytest.fixture +def completion(): + return ChatCompletion( + id="test-id", + model=MODEL, + object="chat.completion", + choices=[ + { + "finish_reason": "stop", + "logprobs": None, + "index": 0, + "message": {"content": "Paris is the capital of France.", "role": "assistant"}, + } + ], + created=int(datetime.now(tz=timezone.utc).timestamp()), + usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97}, + ) + + +@pytest.fixture +def completion_with_reasoning(completion): + completion.choices[0].message.reasoning = "The user asked about the capital of France. I know it's Paris." + return completion + + +@pytest.fixture +def mock_chat_completion(completion): + with patch("openai.resources.chat.completions.Completions.create") as mock: + mock.return_value = completion + yield mock + + +@pytest.fixture +def mock_chat_completion_with_reasoning(completion_with_reasoning): + with patch("openai.resources.chat.completions.Completions.create") as mock: + mock.return_value = completion_with_reasoning + yield mock + + +@pytest.fixture +def mock_async_chat_completion(completion): + with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock: + mock.return_value = completion + yield mock + + +@pytest.fixture +def mock_async_chat_completion_with_reasoning(completion_with_reasoning): + with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock: + mock.return_value = completion_with_reasoning + yield mock + + +class TestConvertChatCompletionToChatMessage: + def test_without_reasoning(self, completion): + message = _convert_chat_completion_to_chat_message(completion, completion.choices[0]) + + assert message.text == "Paris is the capital of France." + assert message.meta["model"] == MODEL + assert message.reasoning is None + + def test_with_reasoning(self, completion_with_reasoning): + message = _convert_chat_completion_to_chat_message( + completion_with_reasoning, completion_with_reasoning.choices[0] + ) + + assert message.text == "Paris is the capital of France." + assert message.reasoning is not None + assert "capital of France" in message.reasoning.reasoning_text + + def test_preserves_tool_calls(self): + completion = ChatCompletion( + id="test-id", + model=MODEL, + object="chat.completion", + choices=[ + Choice( + finish_reason="tool_calls", + logprobs=None, + index=0, + message=ChatCompletionMessage( + role="assistant", + tool_calls=[ + { + "id": "call_123", + "type": "function", + "function": {"name": "weather", "arguments": '{"city": "Paris"}'}, + } + ], + ), + ) + ], + created=int(datetime.now(tz=timezone.utc).timestamp()), + usage={"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, + ) + + message = _convert_chat_completion_to_chat_message(completion, completion.choices[0]) + + assert len(message.tool_calls) == 1 + assert message.tool_calls[0].tool_name == "weather" + assert message.tool_calls[0].arguments == {"city": "Paris"} + + def test_skips_malformed_tool_call_arguments(self): + completion = ChatCompletion( + id="test-id", + model=MODEL, + object="chat.completion", + choices=[ + Choice( + finish_reason="tool_calls", + logprobs=None, + index=0, + message=ChatCompletionMessage( + role="assistant", + tool_calls=[ + { + "id": "call_bad", + "type": "function", + "function": {"name": "weather", "arguments": "not-valid-json"}, + } + ], + ), + ) + ], + created=int(datetime.now(tz=timezone.utc).timestamp()), + usage={"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, + ) + + message = _convert_chat_completion_to_chat_message(completion, completion.choices[0]) + + assert len(message.tool_calls) == 0 + + +class TestVLLMChatGeneratorInit: + def test_init_default(self): + component = VLLMChatGenerator(model=MODEL) + + assert component.model == MODEL + assert component.streaming_callback is None + assert not component.generation_kwargs + assert component.api_base_url == "http://localhost:8000/v1" + assert component._client is None + + def test_init_with_api_key_from_env(self, monkeypatch): + monkeypatch.setenv("VLLM_API_KEY", "test-vllm-key") + component = VLLMChatGenerator(model=MODEL) + + assert component.api_key.resolve_value() == "test-vllm-key" + + def test_init_with_parameters(self): + component = VLLMChatGenerator( + model=MODEL, + api_key=Secret.from_token("my-key"), + api_base_url="http://my-server:8000/v1", + generation_kwargs={"max_tokens": 512, "temperature": 0.7}, + timeout=60.0, + max_retries=3, + ) + + assert component._client is None + assert component._async_client is None + assert not component._is_warmed_up + assert component.tools is None + assert component.http_client_kwargs is None + assert component.streaming_callback is None + assert component.api_base_url == "http://my-server:8000/v1" + assert component.model == MODEL + assert component.api_key.resolve_value() == "my-key" + assert component.generation_kwargs == {"max_tokens": 512, "temperature": 0.7} + assert component.timeout == 60.0 + assert component.max_retries == 3 + + +class TestVLLMChatGeneratorWarmUp: + def test_warm_up_creates_clients(self): + component = VLLMChatGenerator(model=MODEL) + assert component._client is None + + component.warm_up() + + assert component._client is not None + assert component._async_client is not None + assert component._is_warmed_up is True + + def test_warm_up_is_idempotent(self): + component = VLLMChatGenerator(model=MODEL) + component.warm_up() + first_client = component._client + + component.warm_up() + + assert component._client is first_client + + +class TestVLLMChatGeneratorSerde: + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("VLLM_API_KEY", "test-key") + component = VLLMChatGenerator( + model=MODEL, + generation_kwargs={"max_tokens": 512}, + ) + data = component.to_dict() + + assert data["init_parameters"]["model"] == MODEL + assert data["init_parameters"]["api_key"] == {"env_vars": ["VLLM_API_KEY"], "strict": False, "type": "env_var"} + assert data["init_parameters"]["api_base_url"] == "http://localhost:8000/v1" + assert data["init_parameters"]["generation_kwargs"] == {"max_tokens": 512} + assert data["init_parameters"]["http_client_kwargs"] is None + assert data["init_parameters"]["tools"] is None + assert data["init_parameters"]["timeout"] is None + assert data["init_parameters"]["max_retries"] is None + assert data["init_parameters"]["streaming_callback"] is None + + def test_from_dict(self): + data = { + "type": "haystack_integrations.components.generators.vllm.chat.chat_generator.VLLMChatGenerator", + "init_parameters": { + "model": MODEL, + "api_key": {"type": "env_var", "env_vars": ["VLLM_API_KEY"], "strict": False}, + "api_base_url": "http://my-server:8000/v1", + "generation_kwargs": {"max_tokens": 512}, + "streaming_callback": "haystack.components.generators.utils.print_streaming_chunk", + "tools": None, + "timeout": 30.0, + "max_retries": 5, + "http_client_kwargs": None, + }, + } + component = VLLMChatGenerator.from_dict(data) + + assert isinstance(component, VLLMChatGenerator) + assert component.model == MODEL + assert component.generation_kwargs == {"max_tokens": 512} + assert component.streaming_callback is print_streaming_chunk + assert component.api_base_url == "http://my-server:8000/v1" + assert component.api_key == Secret.from_env_var("VLLM_API_KEY", strict=False) + assert component.timeout == 30.0 + assert component.max_retries == 5 + assert component.http_client_kwargs is None + assert component.tools is None + + def test_from_dict_with_streaming_callback(self): + data = { + "type": "haystack_integrations.components.generators.vllm.chat.chat_generator.VLLMChatGenerator", + "init_parameters": { + "model": MODEL, + "api_key": {"type": "env_var", "env_vars": ["VLLM_API_KEY"], "strict": False}, + "api_base_url": "http://localhost:8000/v1", + "generation_kwargs": {}, + "streaming_callback": "haystack.components.generators.utils.print_streaming_chunk", + "tools": None, + "timeout": None, + "max_retries": None, + "http_client_kwargs": None, + }, + } + component = VLLMChatGenerator.from_dict(data) + + assert component.streaming_callback is not None + + +class TestVLLMChatGeneratorRun: + def test_run(self, mock_chat_completion): # noqa: ARG002 + component = VLLMChatGenerator(model=MODEL) + response = component.run([ChatMessage.from_user("What's the capital of France")]) + + assert len(response["replies"]) == 1 + assert response["replies"][0].text == "Paris is the capital of France." + assert response["replies"][0].reasoning is None + + def test_run_with_reasoning(self, mock_chat_completion_with_reasoning): # noqa: ARG002 + component = VLLMChatGenerator(model=MODEL) + response = component.run([ChatMessage.from_user("What's the capital of France")]) + + reply = response["replies"][0] + assert reply.text == "Paris is the capital of France." + assert reply.reasoning is not None + assert "capital of France" in reply.reasoning.reasoning_text + + def test_run_passes_generation_kwargs(self, mock_chat_completion): + component = VLLMChatGenerator( + model=MODEL, + generation_kwargs={"max_tokens": 100, "temperature": 0.5}, + ) + component.run([ChatMessage.from_user("Hello")]) + + _, kwargs = mock_chat_completion.call_args + assert kwargs["max_tokens"] == 100 + assert kwargs["temperature"] == 0.5 + + def test_run_empty_messages(self): + component = VLLMChatGenerator(model=MODEL) + assert component.run([]) == {"replies": []} + + def test_run_streaming(self): + openai_chunks = [ + _make_content_chunk("Hello", finish_reason="stop"), + ] + + chunks_received = [] + component = VLLMChatGenerator(model=MODEL, streaming_callback=chunks_received.append) + component.warm_up() + + with patch("openai.resources.chat.completions.Completions.create") as mock: + mock.return_value = MockStream(openai_chunks, cast_to=None, response=None, client=None) + response = component.run([ChatMessage.from_user("Hello")]) + + assert len(chunks_received) > 0 + assert len(response["replies"]) == 1 + assert response["replies"][0].text == "Hello" + + def test_run_streaming_with_reasoning(self): + """Test that streaming with reasoning correctly sets start=True on the first reasoning and content chunks.""" + openai_chunks = [ + _make_reasoning_chunk("Okay"), + _make_reasoning_chunk(", the"), + _make_reasoning_chunk(" capital"), + _make_reasoning_chunk(" is Paris.\n"), + _make_content_chunk("\n\n"), + _make_content_chunk("Paris"), + _make_content_chunk(None, finish_reason="stop"), + ] + + streaming_chunks = [] + component = VLLMChatGenerator(model=MODEL, streaming_callback=streaming_chunks.append) + component.warm_up() + + with patch("openai.resources.chat.completions.Completions.create") as mock: + mock.return_value = MockStream(openai_chunks, cast_to=None, response=None, client=None) + result = component.run([ChatMessage.from_user("Hello")]) + + assert result["replies"][0].text == "\n\nParis" + assert result["replies"][0].reasoning.reasoning_text == "Okay, the capital is Paris.\n" + + assert len(streaming_chunks) == 7 + # chunk 0: first reasoning chunk -> start=True + assert streaming_chunks[0].reasoning is not None + assert streaming_chunks[0].start is True + # chunks 1-3: subsequent reasoning chunks -> start=False + for i in range(1, 4): + assert streaming_chunks[i].reasoning is not None + assert streaming_chunks[i].start is False + # chunk 4: first content chunk after reasoning -> start=True + assert streaming_chunks[4].content == "\n\n" + assert streaming_chunks[4].start is True + # chunks 5-6: subsequent content chunks -> start=False + assert streaming_chunks[5].start is False + assert streaming_chunks[6].start is False + + +@pytest.mark.asyncio +class TestVLLMChatGeneratorRunAsync: + async def test_run_async_empty_messages(self): + component = VLLMChatGenerator(model=MODEL) + assert await component.run_async([]) == {"replies": []} + + async def test_run_async(self, mock_async_chat_completion): # noqa: ARG002 + component = VLLMChatGenerator(model=MODEL) + response = await component.run_async([ChatMessage.from_user("Hello")]) + + assert len(response["replies"]) == 1 + assert response["replies"][0].text == "Paris is the capital of France." + assert response["replies"][0].reasoning is None + + async def test_run_async_with_reasoning(self, mock_async_chat_completion_with_reasoning): # noqa: ARG002 + component = VLLMChatGenerator(model=MODEL) + response = await component.run_async([ChatMessage.from_user("Hello")]) + + reply = response["replies"][0] + assert reply.reasoning is not None + assert "capital of France" in reply.reasoning.reasoning_text + + async def test_run_async_streaming(self): + openai_chunks = [ + _make_content_chunk("Hello", finish_reason="stop"), + ] + + chunks_received = [] + + async def callback(chunk): + chunks_received.append(chunk) + + component = VLLMChatGenerator(model=MODEL, streaming_callback=callback) + component.warm_up() + + with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock: + mock.return_value = AsyncMockStream(openai_chunks) + response = await component.run_async([ChatMessage.from_user("Hello")]) + + assert len(chunks_received) > 0 + assert len(response["replies"]) == 1 + assert response["replies"][0].text == "Hello" + + async def test_run_async_streaming_with_reasoning(self): + """Test that async streaming with reasoning sets start=True on first reasoning and content chunks.""" + openai_chunks = [ + _make_reasoning_chunk("Okay"), + _make_reasoning_chunk(", the"), + _make_reasoning_chunk(" capital"), + _make_reasoning_chunk(" is Paris.\n"), + _make_content_chunk("\n\n"), + _make_content_chunk("Paris"), + _make_content_chunk(None, finish_reason="stop"), + ] + + streaming_chunks = [] + + async def callback(chunk): + streaming_chunks.append(chunk) + + component = VLLMChatGenerator(model=MODEL, streaming_callback=callback) + component.warm_up() + + with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock: + mock.return_value = AsyncMockStream(openai_chunks) + result = await component.run_async([ChatMessage.from_user("Hello")]) + + assert result["replies"][0].text == "\n\nParis" + assert result["replies"][0].reasoning.reasoning_text == "Okay, the capital is Paris.\n" + + assert len(streaming_chunks) == 7 + assert streaming_chunks[0].reasoning is not None + assert streaming_chunks[0].start is True + for i in range(1, 4): + assert streaming_chunks[i].reasoning is not None + assert streaming_chunks[i].start is False + assert streaming_chunks[4].content == "\n\n" + assert streaming_chunks[4].start is True + assert streaming_chunks[5].start is False + assert streaming_chunks[6].start is False + + +NO_THINKING_KWARGS = {"extra_body": {"chat_template_kwargs": {"enable_thinking": False}}} +THINKING_KWARGS = {"extra_body": {"chat_template_kwargs": {"enable_thinking": True}}} + + +@pytest.mark.integration +class TestVLLMChatGeneratorLiveRun: + @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS]) + def test_live_run(self, generation_kwargs): + component = VLLMChatGenerator(model=MODEL, generation_kwargs=generation_kwargs) + response = component.run([ChatMessage.from_user("What is the capital of France?")]) + + assert len(response["replies"]) == 1 + reply = response["replies"][0] + assert "paris" in reply.text.lower() + + if generation_kwargs == NO_THINKING_KWARGS: + assert reply.reasoning is None + else: + assert reply.reasoning is not None + assert len(reply.reasoning.reasoning_text) > 0 + + @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS]) + def test_live_run_streaming(self, generation_kwargs): + chunks_received = [] + component = VLLMChatGenerator( + model=MODEL, + generation_kwargs=generation_kwargs, + streaming_callback=chunks_received.append, + ) + response = component.run([ChatMessage.from_user("What is the capital of France?")]) + + assert len(chunks_received) > 0 + assert len(response["replies"]) == 1 + assert "paris" in response["replies"][0].text.lower() + if generation_kwargs == THINKING_KWARGS: + assert response["replies"][0].reasoning is not None + assert len(response["replies"][0].reasoning.reasoning_text) > 0 + else: + assert response["replies"][0].reasoning is None + + def test_live_run_with_reasoning_and_parallel_tool_calls(self): + + @tool + def weather(city: Annotated[str, "The city to get the weather for"]) -> str: + """Get the weather in a given city.""" + return f"The weather in {city} is sunny" + + component = VLLMChatGenerator( + model=MODEL, + tools=[weather], + generation_kwargs=THINKING_KWARGS, + ) + response = component.run([ChatMessage.from_user("What is the weather in Paris? And in Berlin?")]) + + assert len(response["replies"]) == 1 + message = response["replies"][0] + assert message.reasoning is not None + assert len(message.reasoning.reasoning_text) > 0 + + tool_calls = message.tool_calls + assert tool_calls[0].tool_name == "weather" + assert tool_calls[0].arguments == {"city": "Paris"} + assert tool_calls[1].tool_name == "weather" + assert tool_calls[1].arguments == {"city": "Berlin"} + + def test_live_run_with_structured_output(self): + response_format = { + "type": "json_schema", + "json_schema": { + "name": "capital_info", + "strict": True, + "schema": { + "type": "object", + "properties": {"capital": {"type": "string"}, "population": {"type": "number"}}, + "required": ["capital", "population"], + }, + }, + } + component = VLLMChatGenerator( + model=MODEL, + # reasoning produces more reliable JSON output + generation_kwargs={**THINKING_KWARGS, "response_format": response_format, "temperature": 0.0}, + ) + response = component.run( + [ChatMessage.from_user("What's the capital of France and its population? Respond in JSON.")] + ) + + assert len(response["replies"]) == 1 + response_data = json.loads(response["replies"][0].text) + assert isinstance(response_data, dict) + assert "capital" in response_data + assert "paris" in response_data["capital"].lower() + assert "population" in response_data + + @pytest.mark.asyncio + @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS]) + async def test_live_run_async(self, generation_kwargs): + component = VLLMChatGenerator(model=MODEL, generation_kwargs=generation_kwargs) + response = await component.run_async( + [ChatMessage.from_user("What is the capital of France? Answer in one word.")] + ) + + assert len(response["replies"]) == 1 + reply = response["replies"][0] + assert "paris" in reply.text.lower() + if generation_kwargs == THINKING_KWARGS: + assert reply.reasoning is not None + assert len(reply.reasoning.reasoning_text) > 0 + else: + assert reply.reasoning is None