diff --git a/.github/labeler.yml b/.github/labeler.yml
index e16d1c6ad3..349edf670c 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -253,6 +253,11 @@ integration:valkey:
       - any-glob-to-any-file: "integrations/valkey/**/*"
       - any-glob-to-any-file: ".github/workflows/valkey.yml"
 
+integration:vllm:
+  - changed-files:
+      - any-glob-to-any-file: "integrations/vllm/**/*"
+      - any-glob-to-any-file: ".github/workflows/vllm.yml"
+
 integration:watsonx:
   - changed-files:
       - any-glob-to-any-file: "integrations/watsonx/**/*"
diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml
index f4b83385a5..e4d682b7cf 100644
--- a/.github/workflows/CI_coverage_comment.yml
+++ b/.github/workflows/CI_coverage_comment.yml
@@ -1,4 +1,4 @@
-name: Add comment about test coverage to PRs
+name: Core / Add comment about test coverage to PRs
 
 on:
   workflow_run:
@@ -52,6 +52,7 @@ on:
       - "Test / togetherai"
       - "Test / unstructured"
       - "Test / valkey"
+      - "Test / vllm"
       - "Test / watsonx"
       - "Test / weave"
       - "Test / weaviate"
diff --git a/.github/workflows/CI_workflows_linting.yml b/.github/workflows/CI_workflows_linting.yml
index 7f19a515d8..deca4f1e95 100644
--- a/.github/workflows/CI_workflows_linting.yml
+++ b/.github/workflows/CI_workflows_linting.yml
@@ -1,4 +1,4 @@
-name: Github workflows linter
+name: Core / Github workflows linter
 
 on:
   pull_request:
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
new file mode 100644
index 0000000000..4fcb2e4992
--- /dev/null
+++ b/.github/workflows/vllm.yml
@@ -0,0 +1,180 @@
+# This workflow comes from https://github.com/ofek/hatch-mypyc
+# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
+name: Test / vllm
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - "integrations/vllm/**"
+      - "!integrations/vllm/*.md"
+      - ".github/workflows/vllm.yml"
+  push:
+    branches:
+      - main
+    paths:
+      - "integrations/vllm/**"
+      - "!integrations/vllm/*.md"
+      - ".github/workflows/vllm.yml"
+
+defaults:
+  run:
+    working-directory: integrations/vllm
+
+concurrency:
+  group: vllm-${{ github.head_ref || github.sha }}
+  cancel-in-progress: true
+
+env:
+  PYTHONUNBUFFERED: "1"
+  FORCE_COLOR: "1"
+  VLLM_MODEL: "Qwen/Qwen3-0.6B"
+  # we only test on Ubuntu to keep vLLM server running simple
+  TEST_MATRIX_OS: '["ubuntu-latest"]'
+  # vLLM is not compatible with Python 3.14. https://github.com/vllm-project/vllm/issues/34096
+  TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
+
+jobs:
+  compute-test-matrix:
+    runs-on: ubuntu-slim
+    defaults:
+      run:
+        working-directory: .
+    outputs:
+      os: ${{ steps.set.outputs.os }}
+      python-version: ${{ steps.set.outputs.python-version }}
+    steps:
+      - id: set
+        run: |
+          echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
+          echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
+
+  run:
+    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
+    needs: compute-test-matrix
+    permissions:
+      contents: write
+      pull-requests: write
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
+        python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Hatch
+        run: pip install hatch
+
+      - name: Install vLLM (CPU)
+        run: |
+          # vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems.
+          # CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases.
+          # We fetch the latest release and install the appropriate x86 CPU wheel.
+          # The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index,
+          # since the required torch+cpu builds are also not available on PyPI.
+          VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')"
+          export VLLM_VERSION
+          echo "Installing vLLM ${VLLM_VERSION} (CPU)"
+          hatch run -- uv pip install \
+            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
+            --torch-backend cpu
+
+      - name: Start vLLM server
+        env:
+          VLLM_TARGET_DEVICE: "cpu"
+          VLLM_CPU_KVCACHE_SPACE: "4"
+        run: |
+          nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
+            --reasoning-parser qwen3 \
+            --max-model-len 1024 \
+            --enforce-eager \
+            --dtype bfloat16 \
+            --enable-auto-tool-choice \
+            --tool-call-parser hermes \
+            --max-num-seqs 1 &
+
+          # Wait for the vLLM server to be ready with a timeout of 300 seconds
+          timeout=300
+          while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
+            echo "Waiting for vLLM server to start..."
+            sleep 10
+            ((timeout-=10))
+          done
+
+          if [ $timeout -eq 0 ]; then
+            echo "Timed out waiting for vLLM server to start."
+            exit 1
+          fi
+
+          echo "vLLM server started successfully."
+
+      - name: Lint
+        if: matrix.python-version == '3.10' && runner.os == 'Linux'
+        run: hatch run fmt-check && hatch run test:types
+
+      - name: Run unit tests
+        run: hatch run test:unit-cov-retry
+
+      # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch.
+      - name: Store unit tests coverage
+        id: coverage_comment
+        if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
+        uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
+        with:
+          GITHUB_TOKEN: ${{ github.token }}
+          COVERAGE_PATH: integrations/vllm
+          SUBPROJECT_ID: vllm
+          MINIMUM_GREEN: 90
+          MINIMUM_ORANGE: 60
+
+      - name: Upload coverage comment to be posted
+        if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        with:
+          name: coverage-comment-vllm
+          path: python-coverage-comment-action-vllm.txt
+
+      - name: Run integration tests
+        run: hatch run test:integration-cov-append-retry
+
+      - name: Store combined coverage
+        if: github.event_name == 'push'
+        uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
+        with:
+          GITHUB_TOKEN: ${{ github.token }}
+          COVERAGE_PATH: integrations/vllm
+          SUBPROJECT_ID: vllm-combined
+          MINIMUM_GREEN: 90
+          MINIMUM_ORANGE: 60
+
+      - name: Run unit tests with lowest direct dependencies
+        if: github.event_name != 'push'
+        run: |
+          hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
+          hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
+          hatch run test:unit
+
+      - name: Nightly - run unit tests with Haystack main branch
+        if: github.event_name == 'schedule'
+        run: |
+          hatch env prune
+          hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
+          hatch run test:unit
+
+  notify-slack-on-failure:
+    needs: run
+    if: failure() && github.event_name == 'schedule'
+    runs-on: ubuntu-slim
+    steps:
+      - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
+        with:
+          slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}
diff --git a/README.md b/README.md
index 146ccb334d..42891b8360 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
 | [togetherai-haystack](integrations/togetherai/)                         | Generator         | [![PyPI - Version](https://img.shields.io/pypi/v/togetherai-haystack.svg)](https://pypi.org/project/togetherai-haystack)                                 | [![Test / togetherai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/togetherai.yml)                                     | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-togetherai-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-togetherai-combined/htmlcov/index.html) |
 | [unstructured-fileconverter-haystack](integrations/unstructured/)       | File converter              | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml)                               | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-unstructured-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-unstructured-combined/htmlcov/index.html) |
 | [valkey-haystack](integrations/valkey/)                                 | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/valkey-haystack.svg)](https://pypi.org/project/valkey-haystack)                                         | [![Test / valkey](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/valkey.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/valkey.yml)                                                 | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-valkey/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-valkey/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-valkey-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-valkey-combined/htmlcov/index.html) |
+| [vllm-haystack](integrations/vllm/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack) | [![Test / vllm](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/vllm.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/vllm.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-vllm/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-vllm/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-vllm-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-vllm-combined/htmlcov/index.html) |
 | [watsonx-haystack](integrations/watsonx/)                               | Embedder, Generator         | [![PyPI - Version](https://img.shields.io/pypi/v/watsonx-haystack.svg?color=orange)](https://pypi.org/project/watsonx-haystack)                          | [![Test / watsonx](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/watsonx.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/watsonx.yml)                                              | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-watsonx/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-watsonx/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-watsonx-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-watsonx-combined/htmlcov/index.html) |
 | [weave-haystack](integrations/weave/)                                   | Tracer                      | [![PyPI - Version](https://img.shields.io/pypi/v/weave-haystack.svg)](https://pypi.org/project/weave-haystack)                                           | [![Test / weave](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weave.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weave.yml)                                                    | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weave/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weave/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weave-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weave-combined/htmlcov/index.html) |
 | [weaviate-haystack](integrations/weaviate/)                             | Document Store              | [![PyPI - Version](https://img.shields.io/pypi/v/weaviate-haystack.svg)](https://pypi.org/project/weaviate-haystack)                                     | [![Test / weaviate](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml)                                           | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weaviate/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weaviate/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-weaviate-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-weaviate-combined/htmlcov/index.html) |
diff --git a/integrations/vllm/LICENSE.txt b/integrations/vllm/LICENSE.txt
new file mode 100644
index 0000000000..6134ab324f
--- /dev/null
+++ b/integrations/vllm/LICENSE.txt
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023-present deepset GmbH
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/integrations/vllm/README.md b/integrations/vllm/README.md
new file mode 100644
index 0000000000..3d987bbefb
--- /dev/null
+++ b/integrations/vllm/README.md
@@ -0,0 +1,20 @@
+# vllm-haystack
+
+[![PyPI - Version](https://img.shields.io/pypi/v/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/vllm-haystack.svg)](https://pypi.org/project/vllm-haystack)
+
+- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/vllm/CHANGELOG.md)
+
+---
+
+## Contributing
+
+Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md).
+
+To run integration tests locally, you need to have a running vLLM server. Refer to the [workflow file](https://github.com/deepset-ai/haystack-core-integrations/blob/main/.github/workflows/vllm.yml) for more details.
+
+For example, on macOs, you can install [vLLM-metal](https://github.com/vllm-project/vllm-metal) and run the server with:
+
+```bash
+source ~/.venv-vllm-metal/bin/activate && vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3 --max-model-len 1024 --enforce-eager  --enable-auto-tool-choice --tool-call-parser hermes
+```
\ No newline at end of file
diff --git a/integrations/vllm/pydoc/config_docusaurus.yml b/integrations/vllm/pydoc/config_docusaurus.yml
new file mode 100644
index 0000000000..13c26dd968
--- /dev/null
+++ b/integrations/vllm/pydoc/config_docusaurus.yml
@@ -0,0 +1,13 @@
+loaders:
+  - modules:
+      - haystack_integrations.components.generators.vllm.chat.chat_generator
+    search_path: [../src]
+processors:
+  - type: filter
+    documented_only: true
+    skip_empty_modules: true
+renderer:
+  description: vLLM integration for Haystack
+  id: integrations-vllm
+  filename: vllm.md
+  title: vLLM
diff --git a/integrations/vllm/pyproject.toml b/integrations/vllm/pyproject.toml
new file mode 100644
index 0000000000..53cec11e33
--- /dev/null
+++ b/integrations/vllm/pyproject.toml
@@ -0,0 +1,161 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "vllm-haystack"
+dynamic = ["version"]
+description = "Haystack integration for vllm"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "Apache-2.0"
+keywords = []
+authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }]
+classifiers = [
+  "License :: OSI Approved :: Apache Software License",
+  "Development Status :: 4 - Beta",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: Implementation :: CPython",
+  "Programming Language :: Python :: Implementation :: PyPy",
+]
+dependencies = ["haystack-ai>=2.23.0", "openai"]
+
+[project.urls]
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/vllm"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/haystack_integrations"]
+
+[tool.hatch.version]
+source = "vcs"
+tag-pattern = 'integrations\/vllm-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/vllm-v[0-9]*"'
+
+[tool.hatch.envs.default]
+installer = "uv"
+dependencies = ["haystack-pydoc-tools", "ruff"]
+
+[tool.hatch.envs.default.scripts]
+docs = ["haystack-pydoc pydoc/config_docusaurus.yml"]
+fmt = "ruff check --fix {args}; ruff format {args}"
+fmt-check = "ruff check {args} && ruff format --check {args}"
+
+[tool.hatch.envs.test]
+dependencies = [
+    "pytest",
+    "pytest-asyncio",
+    "pytest-cov",
+    "pytest-rerunfailures",
+    "mypy",
+    "pip",
+    "Pillow",
+]
+
+[tool.hatch.envs.test.scripts]
+unit = 'pytest -m "not integration" {args:tests}'
+integration = 'pytest -m "integration" {args:tests}'
+all = 'pytest {args:tests}'
+unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}'
+integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}'
+types = "mypy -p haystack_integrations.components.generators.vllm {args}"
+
+[tool.mypy]
+install_types = true
+non_interactive = true
+check_untyped_defs = true
+disallow_incomplete_defs = true
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+select = [
+    "A",
+    "ANN",
+    "ARG",
+    "B",
+    "C",
+    "D102",   # Missing docstring in public method
+    "D103",   # Missing docstring in public function
+    "D205",   # 1 blank line required between summary line and description
+    "D209",   # Closing triple quotes go to new line
+    "D213",   # summary lines must be positioned on the second physical line of the docstring
+    "D417",   # Missing argument descriptions in the docstring
+    "D419",   # Docstring is empty
+    "DTZ",
+    "E",
+    "EM",
+    "F",
+    "I",
+    "ICN",
+    "ISC",
+    "N",
+    "PLC",
+    "PLE",
+    "PLR",
+    "PLW",
+    "Q",
+    "RUF",
+    "S",
+    "T",
+    "TID",
+    "UP",
+    "W",
+    "YTT",
+]
+ignore = [
+    # Allow non-abstract empty methods in abstract base classes
+    "B027",
+    # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var)
+    "B008",
+    # Ignore checks for possible passwords
+    "S105",
+    "S106",
+    "S107",
+    # Ignore complexity
+    "C901",
+    "PLR0911",
+    "PLR0912",
+    "PLR0913",
+    "PLR0915",
+    # Allow `Any` type - used legitimately for dynamic types and SDK boundaries
+    "ANN401",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["haystack_integrations"]
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "parents"
+
+[tool.ruff.lint.per-file-ignores]
+# Tests can use magic values, assertions, relative imports, and don't need type annotations
+"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]
+
+[tool.coverage.run]
+source = ["haystack_integrations"]
+branch = true
+parallel = false
+relative_files = true
+
+[tool.coverage.report]
+omit = ["*/tests/*", "*/__init__.py"]
+show_missing = true
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
+
+[tool.pytest.ini_options]
+addopts = "--strict-markers"
+markers = [
+  "integration: integration tests",
+]
+log_cli = true
+asyncio_default_fixture_loop_scope = "function"
diff --git a/integrations/vllm/src/haystack_integrations/components/generators/py.typed b/integrations/vllm/src/haystack_integrations/components/generators/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py
new file mode 100644
index 0000000000..c929b5cd12
--- /dev/null
+++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .chat.chat_generator import VLLMChatGenerator
+
+__all__ = ["VLLMChatGenerator"]
diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py
new file mode 100644
index 0000000000..c1764a6e03
--- /dev/null
+++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py
new file mode 100644
index 0000000000..5d21c3abaa
--- /dev/null
+++ b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py
@@ -0,0 +1,549 @@
+import asyncio
+import dataclasses
+import json
+from typing import Any
+
+from haystack import default_from_dict, default_to_dict, logging
+from haystack.components.generators.chat.openai import (
+    _check_finish_reason,
+    _convert_chat_completion_chunk_to_streaming_chunk,
+)
+from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, _serialize_object
+from haystack.core.component import component
+from haystack.dataclasses import ChatMessage, ToolCall
+from haystack.dataclasses.chat_message import ReasoningContent
+from haystack.dataclasses.streaming_chunk import (
+    AsyncStreamingCallbackT,
+    ComponentInfo,
+    StreamingCallbackT,
+    StreamingChunk,
+    SyncStreamingCallbackT,
+    select_streaming_callback,
+)
+from haystack.tools import (
+    ToolsType,
+    _check_duplicate_tool_names,
+    deserialize_tools_or_toolset_inplace,
+    flatten_tools_or_toolsets,
+    serialize_tools_or_toolset,
+    warm_up_tools,
+)
+from haystack.utils import Secret, deserialize_callable, serialize_callable
+from haystack.utils.http_client import init_http_client
+from openai import AsyncOpenAI, AsyncStream, OpenAI, Stream
+from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat.chat_completion import Choice
+
+logger = logging.getLogger(__name__)
+
+
+def _convert_chat_completion_to_chat_message(completion: ChatCompletion, choice: Choice) -> ChatMessage:
+    """
+    Convert a vLLM chat completion response to a ChatMessage, including reasoning content if present.
+
+    :param completion: The completion returned by the vLLM API.
+    :param choice: The choice returned by the vLLM API.
+    :return: The ChatMessage.
+    """
+    message = choice.message
+    text = message.content
+
+    tool_calls = []
+    if message.tool_calls:
+        for tc in message.tool_calls:
+            if not hasattr(tc, "function"):
+                continue
+            try:
+                arguments = json.loads(tc.function.arguments)
+                tool_calls.append(ToolCall(id=tc.id, tool_name=tc.function.name, arguments=arguments))
+            except json.JSONDecodeError:
+                logger.warning(
+                    "vLLM returned a malformed JSON string for tool call arguments. This tool call "
+                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
+                    _id=tc.id,
+                    _name=tc.function.name,
+                    _arguments=tc.function.arguments,
+                )
+
+    meta: dict[str, Any] = {
+        "model": completion.model,
+        "index": choice.index,
+        "finish_reason": choice.finish_reason,
+        "usage": _serialize_object(completion.usage),
+    }
+
+    reasoning_text = getattr(message, "reasoning", None)
+    reasoning = ReasoningContent(reasoning_text=reasoning_text) if reasoning_text else None
+
+    return ChatMessage.from_assistant(text=text, tool_calls=tool_calls, meta=meta, reasoning=reasoning)
+
+
+@component
+class VLLMChatGenerator:
+    """
+    A component for generating chat completions using models served with [vLLM](https://docs.vllm.ai/).
+
+    It expects a vLLM server to be running and accessible at the `api_base_url` parameter.
+
+    ### Starting the vLLM server
+
+    Before using this component, start a vLLM server:
+
+    ```bash
+    vllm serve Qwen/Qwen3-4B-Instruct-2507
+    ```
+
+    For reasoning models, start the server with the appropriate reasoning parser:
+
+    ```bash
+    vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3
+    ```
+
+    For tool calling, the server must be started with `--enable-auto-tool-choice` and `--tool-call-parser`:
+
+    ```bash
+    vllm serve Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes
+    ```
+
+    The available tool call parsers depend on the model. See the
+    [vLLM tool calling docs](https://docs.vllm.ai/en/stable/features/tool_calling/) for the full list.
+
+    For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
+
+    ### Usage example
+
+    ```python
+    from haystack.dataclasses import ChatMessage
+    from haystack_integrations.components.generators.vllm import VLLMChatGenerator
+
+    generator = VLLMChatGenerator(
+        model="Qwen/Qwen3-0.6B",
+        generation_kwargs={"max_tokens": 512, "temperature": 0.7},
+    )
+
+    messages = [ChatMessage.from_user("What's Natural Language Processing?")]
+    response = generator.run(messages=messages)
+    print(response["replies"][0].text)
+    ```
+
+    ### Usage example with vLLM-specific parameters
+
+    Pass the vLLM-specific parameters inside the `generation_kwargs`["extra_body"] dictionary.
+
+    ```python
+    from haystack_integrations.components.generators.vllm import VLLMChatGenerator
+
+    generator = VLLMChatGenerator(
+        model="Qwen/Qwen3-0.6B",
+        generation_kwargs={
+            "max_tokens": 512,
+            "extra_body": {
+                "top_k": 50,
+                "min_tokens": 10,
+                "repetition_penalty": 1.1,
+            },
+        },
+    )
+    ```
+
+    ### Usage example with tool calling
+
+    To use tool calling, start the vLLM server with `--enable-auto-tool-choice` and `--tool-call-parser`.
+
+    ```python
+    from haystack.dataclasses import ChatMessage
+    from haystack.tools import tool
+    from haystack_integrations.components.generators.vllm import VLLMChatGenerator
+
+    @tool
+    def weather(city: str) -> str:
+        \"\"\"Get the weather in a given city.\"\"\"
+        return f"The weather in {city} is sunny"
+
+    generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B", tools=[weather])
+
+    messages = [ChatMessage.from_user("What is the weather in Paris?")]
+    response = generator.run(messages=messages)
+    print(response["replies"][0].tool_calls)
+    ```
+
+    ### Usage example with reasoning models
+
+    To use reasoning models, start the vLLM server with `--reasoning-parser`.
+
+    ```python
+    from haystack.dataclasses import ChatMessage
+    from haystack_integrations.components.generators.vllm import VLLMChatGenerator
+
+    generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B")
+
+    messages = [ChatMessage.from_user("Solve step by step: what is 15 * 37?")]
+    response = generator.run(messages=messages)
+    reply = response["replies"][0]
+    if reply.reasoning:
+        print("Reasoning:", reply.reasoning.reasoning_text)
+    print("Answer:", reply.text)
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False),
+        streaming_callback: StreamingCallbackT | None = None,
+        api_base_url: str = "http://localhost:8000/v1",
+        generation_kwargs: dict[str, Any] | None = None,
+        timeout: float | None = None,
+        max_retries: int | None = None,
+        tools: ToolsType | None = None,
+        http_client_kwargs: dict[str, Any] | None = None,
+    ) -> None:
+        """
+        Creates an instance of VLLMChatGenerator.
+
+        :param model: The name of the model served by vLLM (e.g., "Qwen/Qwen3-0.6B").
+        :param api_key: The vLLM API key. Defaults to the `VLLM_API_KEY` environment variable.
+            Only required if the vLLM server was started with `--api-key`.
+        :param streaming_callback: A callback function that is called when a new token is received from the stream.
+            The callback function accepts
+            [StreamingChunk](https://docs.haystack.deepset.ai/docs/data-classes#streamingchunk)
+            as an argument.
+        :param api_base_url: The base URL of the vLLM server.
+        :param generation_kwargs: Additional parameters for text generation. These parameters are sent directly to
+            the vLLM OpenAI-compatible endpoint. See
+            [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/)
+            for more details.
+            Some of the supported parameters:
+            - `max_tokens`: Maximum number of tokens to generate.
+            - `temperature`: Sampling temperature.
+            - `top_p`: Nucleus sampling parameter.
+            - `n`: Number of completions to generate for each prompt.
+            - `stop`: One or more sequences after which the model should stop generating tokens.
+            - `response_format`: A JSON schema or a Pydantic model that enforces the structure of the response.
+            - `extra_body`: A dictionary of vLLM-specific parameters not part of the standard OpenAI API
+              (e.g., `top_k`, `min_tokens`, `repetition_penalty`).
+        :param timeout:
+            Timeout for vLLM client calls. If not set, it defaults to the default set by the OpenAI client.
+        :param max_retries:
+            Maximum number of retries to attempt for failed requests. If not set, it defaults to the default
+            set by the OpenAI client.
+        :param tools:
+            A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls.
+            Each tool should have a unique name. Not all models support tools.
+        :param http_client_kwargs:
+            A dictionary of keyword arguments to configure a custom `httpx.Client` or `httpx.AsyncClient`.
+            For more information, see the [HTTPX documentation](https://www.python-httpx.org/api/#client).
+        """
+
+        self.model = model
+        self.api_key = api_key
+        self.streaming_callback = streaming_callback
+        self.api_base_url = api_base_url
+        self.generation_kwargs = generation_kwargs or {}
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.tools = tools
+        self.http_client_kwargs = http_client_kwargs
+
+        _check_duplicate_tool_names(flatten_tools_or_toolsets(self.tools))
+
+        self._client: OpenAI | None = None
+        self._async_client: AsyncOpenAI | None = None
+        self._is_warmed_up = False
+
+    def warm_up(self) -> None:
+        """Create the OpenAI clients and warm up tools."""
+        if self._is_warmed_up:
+            return
+
+        api_key = "placeholder-api-key"
+        if self.api_key and (resolved_value := self.api_key.resolve_value()):
+            api_key = resolved_value
+
+        client_kwargs: dict[str, Any] = {
+            "api_key": api_key,
+            "base_url": self.api_base_url,
+        }
+        if self.timeout is not None:
+            client_kwargs["timeout"] = self.timeout
+        if self.max_retries is not None:
+            client_kwargs["max_retries"] = self.max_retries
+
+        self._client = OpenAI(
+            http_client=init_http_client(self.http_client_kwargs, async_client=False), **client_kwargs
+        )
+        self._async_client = AsyncOpenAI(
+            http_client=init_http_client(self.http_client_kwargs, async_client=True), **client_kwargs
+        )
+        warm_up_tools(self.tools)
+        self._is_warmed_up = True
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+
+        :returns:
+            The serialized component as a dictionary.
+        """
+        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
+        return default_to_dict(
+            self,
+            model=self.model,
+            streaming_callback=callback_name,
+            api_base_url=self.api_base_url,
+            generation_kwargs=self.generation_kwargs,
+            api_key=self.api_key,
+            timeout=self.timeout,
+            max_retries=self.max_retries,
+            tools=serialize_tools_or_toolset(self.tools),
+            http_client_kwargs=self.http_client_kwargs,
+        )
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
+        """
+        Deserialize this component from a dictionary.
+
+        :param data: The dictionary representation of this component.
+        :returns:
+            The deserialized component instance.
+        """
+        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
+        init_params = data.get("init_parameters", {})
+        serialized_callback_handler = init_params.get("streaming_callback")
+        if serialized_callback_handler:
+            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
+        return default_from_dict(cls, data)
+
+    def _prepare_api_call(
+        self,
+        messages: list[ChatMessage],
+        streaming_callback: StreamingCallbackT | None,
+        generation_kwargs: dict[str, Any] | None,
+        tools: ToolsType | None,
+    ) -> dict[str, Any]:
+        """Build the kwargs dict for the OpenAI chat completions API call."""
+        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
+
+        openai_formatted_messages = [message.to_openai_dict_format() for message in messages]
+
+        flattened_tools = flatten_tools_or_toolsets(tools or self.tools)
+        _check_duplicate_tool_names(flattened_tools)
+        tool_definitions = None
+        if flattened_tools:
+            tool_definitions = [{"type": "function", "function": t.tool_spec} for t in flattened_tools]
+
+        api_kwargs: dict[str, Any] = {
+            "model": self.model,
+            "messages": openai_formatted_messages,
+            "stream": streaming_callback is not None,
+            **generation_kwargs,
+        }
+        if tool_definitions:
+            api_kwargs["tools"] = tool_definitions
+
+        return api_kwargs
+
+    def _handle_stream_response(self, chat_completion: Stream, callback: SyncStreamingCallbackT) -> list[ChatMessage]:
+        """Handle a synchronous streaming response, extracting reasoning content from vLLM's reasoning chunks."""
+        component_info = ComponentInfo.from_component(self)
+        chunks: list[StreamingChunk] = []
+
+        # track reasoning and content blocks. We use these flags to detect the transition and mark start=True
+        # on the first chunk of each block
+        reasoning_started = False
+        content_started = False
+
+        for chunk in chat_completion:
+            assert len(chunk.choices) <= 1  # noqa: S101
+
+            reasoning_text = None
+            if chunk.choices:
+                reasoning_text = getattr(chunk.choices[0].delta, "reasoning", None)
+
+            if reasoning_text:
+                streaming_chunk = StreamingChunk(
+                    content="",
+                    reasoning=ReasoningContent(reasoning_text=reasoning_text),
+                    index=0,
+                    start=not reasoning_started,
+                    component_info=component_info,
+                    meta={
+                        "model": chunk.model,
+                        "index": chunk.choices[0].index,
+                        "finish_reason": chunk.choices[0].finish_reason,
+                    },
+                )
+                reasoning_started = True
+            else:
+                streaming_chunk = _convert_chat_completion_chunk_to_streaming_chunk(
+                    chunk=chunk, previous_chunks=chunks, component_info=component_info
+                )
+                # _convert_chat_completion_chunk_to_streaming_chunk doesn't know about reasoning chunks
+                # We set start=True on the first content chunk after reasoning.
+                if reasoning_started and not content_started:
+                    streaming_chunk = dataclasses.replace(streaming_chunk, start=True)
+                    content_started = True
+
+            chunks.append(streaming_chunk)
+            callback(streaming_chunk)
+
+        return [_convert_streaming_chunks_to_chat_message(chunks=chunks)]
+
+    async def _handle_async_stream_response(
+        self, chat_completion: AsyncStream[ChatCompletionChunk], callback: AsyncStreamingCallbackT
+    ) -> list[ChatMessage]:
+        """Handle an asynchronous streaming response, extracting reasoning content from vLLM's reasoning chunks."""
+        component_info = ComponentInfo.from_component(self)
+        chunks: list[StreamingChunk] = []
+        reasoning_started = False
+        content_started = False
+        try:
+            async for chunk in chat_completion:
+                assert len(chunk.choices) <= 1  # noqa: S101
+
+                reasoning_text = None
+                if chunk.choices:
+                    reasoning_text = getattr(chunk.choices[0].delta, "reasoning", None)
+
+                if reasoning_text:
+                    streaming_chunk = StreamingChunk(
+                        content="",
+                        reasoning=ReasoningContent(reasoning_text=reasoning_text),
+                        index=0,
+                        start=not reasoning_started,
+                        component_info=component_info,
+                        meta={
+                            "model": chunk.model,
+                            "index": chunk.choices[0].index,
+                            "finish_reason": chunk.choices[0].finish_reason,
+                        },
+                    )
+                    reasoning_started = True
+                else:
+                    streaming_chunk = _convert_chat_completion_chunk_to_streaming_chunk(
+                        chunk=chunk, previous_chunks=chunks, component_info=component_info
+                    )
+                    if reasoning_started and not content_started:
+                        streaming_chunk = dataclasses.replace(streaming_chunk, start=True)
+                        content_started = True
+
+                chunks.append(streaming_chunk)
+                await callback(streaming_chunk)
+        except asyncio.CancelledError:
+            await asyncio.shield(chat_completion.close())
+            raise
+
+        return [_convert_streaming_chunks_to_chat_message(chunks=chunks)]
+
+    @component.output_types(replies=list[ChatMessage])
+    def run(
+        self,
+        messages: list[ChatMessage],
+        streaming_callback: StreamingCallbackT | None = None,
+        generation_kwargs: dict[str, Any] | None = None,
+        *,
+        tools: ToolsType | None = None,
+    ) -> dict[str, list[ChatMessage]]:
+        """
+        Run the VLLM chat generator on the given input data.
+
+        :param messages:
+            A list of ChatMessage instances representing the input messages.
+        :param streaming_callback:
+            A callback function that is called when a new token is received from the stream.
+        :param generation_kwargs:
+            Additional keyword arguments for text generation. These parameters will
+            override the parameters passed during component initialization.
+            For details on vLLM API parameters, see
+            [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/).
+        :param tools:
+            A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls.
+            If set, it will override the `tools` parameter provided during initialization.
+
+        :returns:
+            A dictionary with the following key:
+            - `replies`: A list containing the generated responses as ChatMessage instances.
+        """
+        if not self._is_warmed_up:
+            self.warm_up()
+
+        if len(messages) == 0:
+            return {"replies": []}
+
+        streaming_callback = select_streaming_callback(
+            init_callback=self.streaming_callback, runtime_callback=streaming_callback, requires_async=False
+        )
+
+        api_kwargs = self._prepare_api_call(messages, streaming_callback, generation_kwargs, tools)
+        assert self._client is not None  # noqa: S101
+        chat_completion = self._client.chat.completions.create(**api_kwargs)
+
+        if streaming_callback is not None:
+            completions = self._handle_stream_response(chat_completion, streaming_callback)
+        else:
+            completions = [
+                _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
+            ]
+
+        for message in completions:
+            _check_finish_reason(message.meta)
+
+        return {"replies": completions}
+
+    @component.output_types(replies=list[ChatMessage])
+    async def run_async(
+        self,
+        messages: list[ChatMessage],
+        streaming_callback: StreamingCallbackT | None = None,
+        generation_kwargs: dict[str, Any] | None = None,
+        *,
+        tools: ToolsType | None = None,
+    ) -> dict[str, list[ChatMessage]]:
+        """
+        Run the VLLM chat generator on the given input data asynchronously.
+
+        :param messages:
+            A list of ChatMessage instances representing the input messages.
+        :param streaming_callback:
+            A callback function that is called when a new token is received from the stream.
+            Must be a coroutine.
+        :param generation_kwargs:
+            Additional keyword arguments for text generation. These parameters will
+            override the parameters passed during component initialization.
+            For details on vLLM API parameters, see
+            [vLLM documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server/).
+        :param tools:
+            A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls.
+            If set, it will override the `tools` parameter provided during initialization.
+
+        :returns:
+            A dictionary with the following key:
+            - `replies`: A list containing the generated responses as ChatMessage instances.
+        """
+        if not self._is_warmed_up:
+            self.warm_up()
+
+        if len(messages) == 0:
+            return {"replies": []}
+
+        streaming_callback = select_streaming_callback(
+            init_callback=self.streaming_callback, runtime_callback=streaming_callback, requires_async=True
+        )
+
+        api_kwargs = self._prepare_api_call(messages, streaming_callback, generation_kwargs, tools)
+        assert self._async_client is not None  # noqa: S101
+        chat_completion = await self._async_client.chat.completions.create(**api_kwargs)
+
+        if streaming_callback is not None:
+            completions = await self._handle_async_stream_response(chat_completion, streaming_callback)
+        else:
+            completions = [
+                _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
+            ]
+
+        for message in completions:
+            _check_finish_reason(message.meta)
+
+        return {"replies": completions}
diff --git a/integrations/vllm/tests/__init__.py b/integrations/vllm/tests/__init__.py
new file mode 100644
index 0000000000..c1764a6e03
--- /dev/null
+++ b/integrations/vllm/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/vllm/tests/test_chat_generator.py b/integrations/vllm/tests/test_chat_generator.py
new file mode 100644
index 0000000000..dffee6e5ac
--- /dev/null
+++ b/integrations/vllm/tests/test_chat_generator.py
@@ -0,0 +1,634 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+from collections.abc import Iterator
+from datetime import datetime, timezone
+from typing import Annotated
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from haystack.components.generators.utils import print_streaming_chunk
+from haystack.dataclasses import ChatMessage
+from haystack.tools import tool
+from haystack.utils.auth import Secret
+from openai import AsyncStream, Stream
+from openai.types.chat import ChatCompletion, ChatCompletionChunk, chat_completion_chunk
+from openai.types.chat.chat_completion import Choice
+from openai.types.chat.chat_completion_message import ChatCompletionMessage
+
+from haystack_integrations.components.generators.vllm.chat.chat_generator import (
+    VLLMChatGenerator,
+    _convert_chat_completion_to_chat_message,
+)
+
+MODEL = "Qwen/Qwen3-0.6B"
+
+
+class MockStream(Stream[ChatCompletionChunk]):
+    def __init__(self, mock_chunks, client=None, *args, **kwargs):
+        client = client or MagicMock()
+        super().__init__(client=client, *args, **kwargs)  # noqa: B026
+        self.mock_chunks = mock_chunks
+
+    def __stream__(self) -> Iterator[ChatCompletionChunk]:
+        yield from self.mock_chunks
+
+
+class AsyncMockStream(AsyncStream[ChatCompletionChunk]):
+    def __init__(self, mock_chunks):
+        self.mock_chunks = mock_chunks
+        self._index = 0
+
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self._index < len(self.mock_chunks):
+            chunk = self.mock_chunks[self._index]
+            self._index += 1
+            return chunk
+        raise StopAsyncIteration
+
+
+def _make_reasoning_chunk(text):
+    chunk = ChatCompletionChunk(
+        id="test-id",
+        model=MODEL,
+        object="chat.completion.chunk",
+        choices=[
+            chat_completion_chunk.Choice(
+                finish_reason=None,
+                logprobs=None,
+                index=0,
+                delta=chat_completion_chunk.ChoiceDelta(content=None, role=None),
+            )
+        ],
+        created=int(datetime.now(tz=timezone.utc).timestamp()),
+        usage=None,
+    )
+    chunk.choices[0].delta.reasoning = text
+    return chunk
+
+
+def _make_content_chunk(text, finish_reason=None):
+    return ChatCompletionChunk(
+        id="test-id",
+        model=MODEL,
+        object="chat.completion.chunk",
+        choices=[
+            chat_completion_chunk.Choice(
+                finish_reason=finish_reason,
+                logprobs=None,
+                index=0,
+                delta=chat_completion_chunk.ChoiceDelta(content=text, role=None),
+            )
+        ],
+        created=int(datetime.now(tz=timezone.utc).timestamp()),
+        usage=None,
+    )
+
+
+@pytest.fixture
+def completion():
+    return ChatCompletion(
+        id="test-id",
+        model=MODEL,
+        object="chat.completion",
+        choices=[
+            {
+                "finish_reason": "stop",
+                "logprobs": None,
+                "index": 0,
+                "message": {"content": "Paris is the capital of France.", "role": "assistant"},
+            }
+        ],
+        created=int(datetime.now(tz=timezone.utc).timestamp()),
+        usage={"prompt_tokens": 57, "completion_tokens": 40, "total_tokens": 97},
+    )
+
+
+@pytest.fixture
+def completion_with_reasoning(completion):
+    completion.choices[0].message.reasoning = "The user asked about the capital of France. I know it's Paris."
+    return completion
+
+
+@pytest.fixture
+def mock_chat_completion(completion):
+    with patch("openai.resources.chat.completions.Completions.create") as mock:
+        mock.return_value = completion
+        yield mock
+
+
+@pytest.fixture
+def mock_chat_completion_with_reasoning(completion_with_reasoning):
+    with patch("openai.resources.chat.completions.Completions.create") as mock:
+        mock.return_value = completion_with_reasoning
+        yield mock
+
+
+@pytest.fixture
+def mock_async_chat_completion(completion):
+    with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock:
+        mock.return_value = completion
+        yield mock
+
+
+@pytest.fixture
+def mock_async_chat_completion_with_reasoning(completion_with_reasoning):
+    with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock:
+        mock.return_value = completion_with_reasoning
+        yield mock
+
+
+class TestConvertChatCompletionToChatMessage:
+    def test_without_reasoning(self, completion):
+        message = _convert_chat_completion_to_chat_message(completion, completion.choices[0])
+
+        assert message.text == "Paris is the capital of France."
+        assert message.meta["model"] == MODEL
+        assert message.reasoning is None
+
+    def test_with_reasoning(self, completion_with_reasoning):
+        message = _convert_chat_completion_to_chat_message(
+            completion_with_reasoning, completion_with_reasoning.choices[0]
+        )
+
+        assert message.text == "Paris is the capital of France."
+        assert message.reasoning is not None
+        assert "capital of France" in message.reasoning.reasoning_text
+
+    def test_preserves_tool_calls(self):
+        completion = ChatCompletion(
+            id="test-id",
+            model=MODEL,
+            object="chat.completion",
+            choices=[
+                Choice(
+                    finish_reason="tool_calls",
+                    logprobs=None,
+                    index=0,
+                    message=ChatCompletionMessage(
+                        role="assistant",
+                        tool_calls=[
+                            {
+                                "id": "call_123",
+                                "type": "function",
+                                "function": {"name": "weather", "arguments": '{"city": "Paris"}'},
+                            }
+                        ],
+                    ),
+                )
+            ],
+            created=int(datetime.now(tz=timezone.utc).timestamp()),
+            usage={"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+        )
+
+        message = _convert_chat_completion_to_chat_message(completion, completion.choices[0])
+
+        assert len(message.tool_calls) == 1
+        assert message.tool_calls[0].tool_name == "weather"
+        assert message.tool_calls[0].arguments == {"city": "Paris"}
+
+    def test_skips_malformed_tool_call_arguments(self):
+        completion = ChatCompletion(
+            id="test-id",
+            model=MODEL,
+            object="chat.completion",
+            choices=[
+                Choice(
+                    finish_reason="tool_calls",
+                    logprobs=None,
+                    index=0,
+                    message=ChatCompletionMessage(
+                        role="assistant",
+                        tool_calls=[
+                            {
+                                "id": "call_bad",
+                                "type": "function",
+                                "function": {"name": "weather", "arguments": "not-valid-json"},
+                            }
+                        ],
+                    ),
+                )
+            ],
+            created=int(datetime.now(tz=timezone.utc).timestamp()),
+            usage={"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+        )
+
+        message = _convert_chat_completion_to_chat_message(completion, completion.choices[0])
+
+        assert len(message.tool_calls) == 0
+
+
+class TestVLLMChatGeneratorInit:
+    def test_init_default(self):
+        component = VLLMChatGenerator(model=MODEL)
+
+        assert component.model == MODEL
+        assert component.streaming_callback is None
+        assert not component.generation_kwargs
+        assert component.api_base_url == "http://localhost:8000/v1"
+        assert component._client is None
+
+    def test_init_with_api_key_from_env(self, monkeypatch):
+        monkeypatch.setenv("VLLM_API_KEY", "test-vllm-key")
+        component = VLLMChatGenerator(model=MODEL)
+
+        assert component.api_key.resolve_value() == "test-vllm-key"
+
+    def test_init_with_parameters(self):
+        component = VLLMChatGenerator(
+            model=MODEL,
+            api_key=Secret.from_token("my-key"),
+            api_base_url="http://my-server:8000/v1",
+            generation_kwargs={"max_tokens": 512, "temperature": 0.7},
+            timeout=60.0,
+            max_retries=3,
+        )
+
+        assert component._client is None
+        assert component._async_client is None
+        assert not component._is_warmed_up
+        assert component.tools is None
+        assert component.http_client_kwargs is None
+        assert component.streaming_callback is None
+        assert component.api_base_url == "http://my-server:8000/v1"
+        assert component.model == MODEL
+        assert component.api_key.resolve_value() == "my-key"
+        assert component.generation_kwargs == {"max_tokens": 512, "temperature": 0.7}
+        assert component.timeout == 60.0
+        assert component.max_retries == 3
+
+
+class TestVLLMChatGeneratorWarmUp:
+    def test_warm_up_creates_clients(self):
+        component = VLLMChatGenerator(model=MODEL)
+        assert component._client is None
+
+        component.warm_up()
+
+        assert component._client is not None
+        assert component._async_client is not None
+        assert component._is_warmed_up is True
+
+    def test_warm_up_is_idempotent(self):
+        component = VLLMChatGenerator(model=MODEL)
+        component.warm_up()
+        first_client = component._client
+
+        component.warm_up()
+
+        assert component._client is first_client
+
+
+class TestVLLMChatGeneratorSerde:
+    def test_to_dict(self, monkeypatch):
+        monkeypatch.setenv("VLLM_API_KEY", "test-key")
+        component = VLLMChatGenerator(
+            model=MODEL,
+            generation_kwargs={"max_tokens": 512},
+        )
+        data = component.to_dict()
+
+        assert data["init_parameters"]["model"] == MODEL
+        assert data["init_parameters"]["api_key"] == {"env_vars": ["VLLM_API_KEY"], "strict": False, "type": "env_var"}
+        assert data["init_parameters"]["api_base_url"] == "http://localhost:8000/v1"
+        assert data["init_parameters"]["generation_kwargs"] == {"max_tokens": 512}
+        assert data["init_parameters"]["http_client_kwargs"] is None
+        assert data["init_parameters"]["tools"] is None
+        assert data["init_parameters"]["timeout"] is None
+        assert data["init_parameters"]["max_retries"] is None
+        assert data["init_parameters"]["streaming_callback"] is None
+
+    def test_from_dict(self):
+        data = {
+            "type": "haystack_integrations.components.generators.vllm.chat.chat_generator.VLLMChatGenerator",
+            "init_parameters": {
+                "model": MODEL,
+                "api_key": {"type": "env_var", "env_vars": ["VLLM_API_KEY"], "strict": False},
+                "api_base_url": "http://my-server:8000/v1",
+                "generation_kwargs": {"max_tokens": 512},
+                "streaming_callback": "haystack.components.generators.utils.print_streaming_chunk",
+                "tools": None,
+                "timeout": 30.0,
+                "max_retries": 5,
+                "http_client_kwargs": None,
+            },
+        }
+        component = VLLMChatGenerator.from_dict(data)
+
+        assert isinstance(component, VLLMChatGenerator)
+        assert component.model == MODEL
+        assert component.generation_kwargs == {"max_tokens": 512}
+        assert component.streaming_callback is print_streaming_chunk
+        assert component.api_base_url == "http://my-server:8000/v1"
+        assert component.api_key == Secret.from_env_var("VLLM_API_KEY", strict=False)
+        assert component.timeout == 30.0
+        assert component.max_retries == 5
+        assert component.http_client_kwargs is None
+        assert component.tools is None
+
+    def test_from_dict_with_streaming_callback(self):
+        data = {
+            "type": "haystack_integrations.components.generators.vllm.chat.chat_generator.VLLMChatGenerator",
+            "init_parameters": {
+                "model": MODEL,
+                "api_key": {"type": "env_var", "env_vars": ["VLLM_API_KEY"], "strict": False},
+                "api_base_url": "http://localhost:8000/v1",
+                "generation_kwargs": {},
+                "streaming_callback": "haystack.components.generators.utils.print_streaming_chunk",
+                "tools": None,
+                "timeout": None,
+                "max_retries": None,
+                "http_client_kwargs": None,
+            },
+        }
+        component = VLLMChatGenerator.from_dict(data)
+
+        assert component.streaming_callback is not None
+
+
+class TestVLLMChatGeneratorRun:
+    def test_run(self, mock_chat_completion):  # noqa: ARG002
+        component = VLLMChatGenerator(model=MODEL)
+        response = component.run([ChatMessage.from_user("What's the capital of France")])
+
+        assert len(response["replies"]) == 1
+        assert response["replies"][0].text == "Paris is the capital of France."
+        assert response["replies"][0].reasoning is None
+
+    def test_run_with_reasoning(self, mock_chat_completion_with_reasoning):  # noqa: ARG002
+        component = VLLMChatGenerator(model=MODEL)
+        response = component.run([ChatMessage.from_user("What's the capital of France")])
+
+        reply = response["replies"][0]
+        assert reply.text == "Paris is the capital of France."
+        assert reply.reasoning is not None
+        assert "capital of France" in reply.reasoning.reasoning_text
+
+    def test_run_passes_generation_kwargs(self, mock_chat_completion):
+        component = VLLMChatGenerator(
+            model=MODEL,
+            generation_kwargs={"max_tokens": 100, "temperature": 0.5},
+        )
+        component.run([ChatMessage.from_user("Hello")])
+
+        _, kwargs = mock_chat_completion.call_args
+        assert kwargs["max_tokens"] == 100
+        assert kwargs["temperature"] == 0.5
+
+    def test_run_empty_messages(self):
+        component = VLLMChatGenerator(model=MODEL)
+        assert component.run([]) == {"replies": []}
+
+    def test_run_streaming(self):
+        openai_chunks = [
+            _make_content_chunk("Hello", finish_reason="stop"),
+        ]
+
+        chunks_received = []
+        component = VLLMChatGenerator(model=MODEL, streaming_callback=chunks_received.append)
+        component.warm_up()
+
+        with patch("openai.resources.chat.completions.Completions.create") as mock:
+            mock.return_value = MockStream(openai_chunks, cast_to=None, response=None, client=None)
+            response = component.run([ChatMessage.from_user("Hello")])
+
+        assert len(chunks_received) > 0
+        assert len(response["replies"]) == 1
+        assert response["replies"][0].text == "Hello"
+
+    def test_run_streaming_with_reasoning(self):
+        """Test that streaming with reasoning correctly sets start=True on the first reasoning and content chunks."""
+        openai_chunks = [
+            _make_reasoning_chunk("Okay"),
+            _make_reasoning_chunk(", the"),
+            _make_reasoning_chunk(" capital"),
+            _make_reasoning_chunk(" is Paris.\n"),
+            _make_content_chunk("\n\n"),
+            _make_content_chunk("Paris"),
+            _make_content_chunk(None, finish_reason="stop"),
+        ]
+
+        streaming_chunks = []
+        component = VLLMChatGenerator(model=MODEL, streaming_callback=streaming_chunks.append)
+        component.warm_up()
+
+        with patch("openai.resources.chat.completions.Completions.create") as mock:
+            mock.return_value = MockStream(openai_chunks, cast_to=None, response=None, client=None)
+            result = component.run([ChatMessage.from_user("Hello")])
+
+        assert result["replies"][0].text == "\n\nParis"
+        assert result["replies"][0].reasoning.reasoning_text == "Okay, the capital is Paris.\n"
+
+        assert len(streaming_chunks) == 7
+        # chunk 0: first reasoning chunk -> start=True
+        assert streaming_chunks[0].reasoning is not None
+        assert streaming_chunks[0].start is True
+        # chunks 1-3: subsequent reasoning chunks -> start=False
+        for i in range(1, 4):
+            assert streaming_chunks[i].reasoning is not None
+            assert streaming_chunks[i].start is False
+        # chunk 4: first content chunk after reasoning -> start=True
+        assert streaming_chunks[4].content == "\n\n"
+        assert streaming_chunks[4].start is True
+        # chunks 5-6: subsequent content chunks -> start=False
+        assert streaming_chunks[5].start is False
+        assert streaming_chunks[6].start is False
+
+
+@pytest.mark.asyncio
+class TestVLLMChatGeneratorRunAsync:
+    async def test_run_async_empty_messages(self):
+        component = VLLMChatGenerator(model=MODEL)
+        assert await component.run_async([]) == {"replies": []}
+
+    async def test_run_async(self, mock_async_chat_completion):  # noqa: ARG002
+        component = VLLMChatGenerator(model=MODEL)
+        response = await component.run_async([ChatMessage.from_user("Hello")])
+
+        assert len(response["replies"]) == 1
+        assert response["replies"][0].text == "Paris is the capital of France."
+        assert response["replies"][0].reasoning is None
+
+    async def test_run_async_with_reasoning(self, mock_async_chat_completion_with_reasoning):  # noqa: ARG002
+        component = VLLMChatGenerator(model=MODEL)
+        response = await component.run_async([ChatMessage.from_user("Hello")])
+
+        reply = response["replies"][0]
+        assert reply.reasoning is not None
+        assert "capital of France" in reply.reasoning.reasoning_text
+
+    async def test_run_async_streaming(self):
+        openai_chunks = [
+            _make_content_chunk("Hello", finish_reason="stop"),
+        ]
+
+        chunks_received = []
+
+        async def callback(chunk):
+            chunks_received.append(chunk)
+
+        component = VLLMChatGenerator(model=MODEL, streaming_callback=callback)
+        component.warm_up()
+
+        with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock:
+            mock.return_value = AsyncMockStream(openai_chunks)
+            response = await component.run_async([ChatMessage.from_user("Hello")])
+
+        assert len(chunks_received) > 0
+        assert len(response["replies"]) == 1
+        assert response["replies"][0].text == "Hello"
+
+    async def test_run_async_streaming_with_reasoning(self):
+        """Test that async streaming with reasoning sets start=True on first reasoning and content chunks."""
+        openai_chunks = [
+            _make_reasoning_chunk("Okay"),
+            _make_reasoning_chunk(", the"),
+            _make_reasoning_chunk(" capital"),
+            _make_reasoning_chunk(" is Paris.\n"),
+            _make_content_chunk("\n\n"),
+            _make_content_chunk("Paris"),
+            _make_content_chunk(None, finish_reason="stop"),
+        ]
+
+        streaming_chunks = []
+
+        async def callback(chunk):
+            streaming_chunks.append(chunk)
+
+        component = VLLMChatGenerator(model=MODEL, streaming_callback=callback)
+        component.warm_up()
+
+        with patch("openai.resources.chat.completions.AsyncCompletions.create", new_callable=AsyncMock) as mock:
+            mock.return_value = AsyncMockStream(openai_chunks)
+            result = await component.run_async([ChatMessage.from_user("Hello")])
+
+        assert result["replies"][0].text == "\n\nParis"
+        assert result["replies"][0].reasoning.reasoning_text == "Okay, the capital is Paris.\n"
+
+        assert len(streaming_chunks) == 7
+        assert streaming_chunks[0].reasoning is not None
+        assert streaming_chunks[0].start is True
+        for i in range(1, 4):
+            assert streaming_chunks[i].reasoning is not None
+            assert streaming_chunks[i].start is False
+        assert streaming_chunks[4].content == "\n\n"
+        assert streaming_chunks[4].start is True
+        assert streaming_chunks[5].start is False
+        assert streaming_chunks[6].start is False
+
+
+NO_THINKING_KWARGS = {"extra_body": {"chat_template_kwargs": {"enable_thinking": False}}}
+THINKING_KWARGS = {"extra_body": {"chat_template_kwargs": {"enable_thinking": True}}}
+
+
+@pytest.mark.integration
+class TestVLLMChatGeneratorLiveRun:
+    @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS])
+    def test_live_run(self, generation_kwargs):
+        component = VLLMChatGenerator(model=MODEL, generation_kwargs=generation_kwargs)
+        response = component.run([ChatMessage.from_user("What is the capital of France?")])
+
+        assert len(response["replies"]) == 1
+        reply = response["replies"][0]
+        assert "paris" in reply.text.lower()
+
+        if generation_kwargs == NO_THINKING_KWARGS:
+            assert reply.reasoning is None
+        else:
+            assert reply.reasoning is not None
+            assert len(reply.reasoning.reasoning_text) > 0
+
+    @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS])
+    def test_live_run_streaming(self, generation_kwargs):
+        chunks_received = []
+        component = VLLMChatGenerator(
+            model=MODEL,
+            generation_kwargs=generation_kwargs,
+            streaming_callback=chunks_received.append,
+        )
+        response = component.run([ChatMessage.from_user("What is the capital of France?")])
+
+        assert len(chunks_received) > 0
+        assert len(response["replies"]) == 1
+        assert "paris" in response["replies"][0].text.lower()
+        if generation_kwargs == THINKING_KWARGS:
+            assert response["replies"][0].reasoning is not None
+            assert len(response["replies"][0].reasoning.reasoning_text) > 0
+        else:
+            assert response["replies"][0].reasoning is None
+
+    def test_live_run_with_reasoning_and_parallel_tool_calls(self):
+
+        @tool
+        def weather(city: Annotated[str, "The city to get the weather for"]) -> str:
+            """Get the weather in a given city."""
+            return f"The weather in {city} is sunny"
+
+        component = VLLMChatGenerator(
+            model=MODEL,
+            tools=[weather],
+            generation_kwargs=THINKING_KWARGS,
+        )
+        response = component.run([ChatMessage.from_user("What is the weather in Paris? And in Berlin?")])
+
+        assert len(response["replies"]) == 1
+        message = response["replies"][0]
+        assert message.reasoning is not None
+        assert len(message.reasoning.reasoning_text) > 0
+
+        tool_calls = message.tool_calls
+        assert tool_calls[0].tool_name == "weather"
+        assert tool_calls[0].arguments == {"city": "Paris"}
+        assert tool_calls[1].tool_name == "weather"
+        assert tool_calls[1].arguments == {"city": "Berlin"}
+
+    def test_live_run_with_structured_output(self):
+        response_format = {
+            "type": "json_schema",
+            "json_schema": {
+                "name": "capital_info",
+                "strict": True,
+                "schema": {
+                    "type": "object",
+                    "properties": {"capital": {"type": "string"}, "population": {"type": "number"}},
+                    "required": ["capital", "population"],
+                },
+            },
+        }
+        component = VLLMChatGenerator(
+            model=MODEL,
+            # reasoning produces more reliable JSON output
+            generation_kwargs={**THINKING_KWARGS, "response_format": response_format, "temperature": 0.0},
+        )
+        response = component.run(
+            [ChatMessage.from_user("What's the capital of France and its population? Respond in JSON.")]
+        )
+
+        assert len(response["replies"]) == 1
+        response_data = json.loads(response["replies"][0].text)
+        assert isinstance(response_data, dict)
+        assert "capital" in response_data
+        assert "paris" in response_data["capital"].lower()
+        assert "population" in response_data
+
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize("generation_kwargs", [NO_THINKING_KWARGS, THINKING_KWARGS])
+    async def test_live_run_async(self, generation_kwargs):
+        component = VLLMChatGenerator(model=MODEL, generation_kwargs=generation_kwargs)
+        response = await component.run_async(
+            [ChatMessage.from_user("What is the capital of France? Answer in one word.")]
+        )
+
+        assert len(response["replies"]) == 1
+        reply = response["replies"][0]
+        assert "paris" in reply.text.lower()
+        if generation_kwargs == THINKING_KWARGS:
+            assert reply.reasoning is not None
+            assert len(reply.reasoning.reasoning_text) > 0
+        else:
+            assert reply.reasoning is None