# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / vllm
on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/vllm/**"
- "!integrations/vllm/*.md"
- ".github/workflows/vllm.yml"
push:
branches:
- main
paths:
- "integrations/vllm/**"
- "!integrations/vllm/*.md"
- ".github/workflows/vllm.yml"
defaults:
run:
working-directory: integrations/vllm
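# Cancel any in-progress run for the same PR branch (or the same commit, for pushes) when a new run starts.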
concurrency:
group: vllm-${{ github.head_ref || github.sha }}
cancel-in-progress: true
env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
VLLM_MODEL: "Qwen/Qwen3-0.6B"
VLLM_EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
VLLM_RANKER_MODEL: "BAAI/bge-reranker-base"
VLLM_TARGET_DEVICE: "cpu"
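  # KV cache size in GiB for the vLLM CPU backend.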
VLLM_CPU_KVCACHE_SPACE: "4"
  # We test only on Ubuntu, to keep running the vLLM servers simple.
TEST_MATRIX_OS: '["ubuntu-latest"]'
# vLLM is not compatible with Python 3.14. https://github.com/vllm-project/vllm/issues/34096
TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
jobs:
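  # Pushes to main run a reduced matrix (Ubuntu + Python 3.10 only); pull requests and scheduled runs use the full matrix from the env vars above.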
compute-test-matrix:
runs-on: ubuntu-slim
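    # This job has no checkout step, so the repo-level working-directory default (integrations/vllm) would not exist; override it here.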
defaults:
run:
working-directory: .
outputs:
os: ${{ steps.set.outputs.os }}
python-version: ${{ steps.set.outputs.python-version }}
steps:
- id: set
run: |
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
needs: compute-test-matrix
permissions:
contents: write
pull-requests: write
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install Hatch
run: pip install hatch
- name: Install vLLM (CPU)
run: |
# vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems.
# CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases.
# We fetch the latest release and install the appropriate x86 CPU wheel.
# The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index,
# since the required torch+cpu builds are also not available on PyPI.
VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')"
export VLLM_VERSION
echo "Installing vLLM ${VLLM_VERSION} (CPU)"
hatch run -- uv pip install \
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
--torch-backend cpu
- name: Start vLLM chat server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
--port 8000 \
--reasoning-parser qwen3 \
--max-model-len 1024 \
--enforce-eager \
--dtype bfloat16 \
--enable-auto-tool-choice \
--tool-call-parser hermes \
--max-num-seqs 1 &
# Wait for the vLLM chat server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
echo "Waiting for vLLM chat server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM chat server to start."
exit 1
fi
echo "vLLM chat server started successfully."
- name: Start vLLM embedding server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_EMBEDDING_MODEL }} \
--port 8001 \
--enforce-eager \
--max-num-seqs 1 &
# Wait for the vLLM embedding server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8001/health > /dev/null 2>&1; do
echo "Waiting for vLLM embedding server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM embedding server to start."
exit 1
fi
echo "vLLM embedding server started successfully."
- name: Start vLLM ranker server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_RANKER_MODEL }} \
--port 8002 \
--enforce-eager \
--max-num-seqs 1 &
# Wait for the vLLM ranker server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8002/health > /dev/null 2>&1; do
echo "Waiting for vLLM ranker server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM ranker server to start."
exit 1
fi
echo "vLLM ranker server started successfully."
- name: Lint
if: matrix.python-version == '3.10' && runner.os == 'Linux'
run: hatch run fmt-check && hatch run test:types
- name: Run unit tests
run: hatch run test:unit-cov-retry
# On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch.
- name: Store unit tests coverage
id: coverage_comment
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Upload coverage comment to be posted
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: coverage-comment-vllm
path: python-coverage-comment-action-vllm.txt
- name: Run integration tests
run: hatch run test:integration-cov-append-retry
- name: Store combined coverage
if: github.event_name == 'push'
uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm-combined
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
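      # --resolution lowest-direct resolves each direct dependency to the lowest version allowed by pyproject.toml, so the tests exercise the oldest supported direct dependencies.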
- name: Run unit tests with lowest direct dependencies
if: github.event_name != 'push'
run: |
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
hatch run test:unit
- name: Nightly - run unit tests with Haystack main branch
if: github.event_name == 'schedule'
run: |
hatch env prune
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
hatch run test:unit
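  # Notify Slack only when the nightly scheduled run fails.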
notify-slack-on-failure:
needs: run
if: failure() && github.event_name == 'schedule'
runs-on: ubuntu-slim
steps:
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
with:
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}