-
Notifications
You must be signed in to change notification settings - Fork 255
173 lines (151 loc) · 6.35 KB
/
vllm.yml
File metadata and controls
173 lines (151 loc) · 6.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / vllm
on:
  # Nightly run (also gates the Haystack-main test and Slack failure notification below).
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - "integrations/vllm/**"
      - "!integrations/vllm/*.md"
      - ".github/workflows/vllm.yml"
  push:
    branches:
      - main
    paths:
      - "integrations/vllm/**"
      - "!integrations/vllm/*.md"
      - ".github/workflows/vllm.yml"
# All `run:` steps execute inside the integration's directory unless overridden per job.
defaults:
  run:
    working-directory: integrations/vllm
# Cancel superseded runs for the same branch; fall back to the SHA for non-PR events.
concurrency:
  group: vllm-${{ github.head_ref || github.sha }}
  cancel-in-progress: true
env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"
  VLLM_MODEL: "Qwen/Qwen3-0.6B"
  # we only test on Ubuntu to keep vLLM server running simple
  TEST_MATRIX_OS: '["ubuntu-latest"]'
  # numba not compatible with Python 3.14
  TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
jobs:
  # Computes the test matrix as JSON strings: on push to main the matrix is
  # trimmed to a single OS/Python combination; PR and scheduled runs use the
  # full TEST_MATRIX_* values from the workflow-level env.
  compute-test-matrix:
    runs-on: ubuntu-slim
    defaults:
      run:
        # Override the workflow-level working-directory: this job does not
        # need the integration checkout (it never checks out code at all).
        working-directory: .
    outputs:
      os: ${{ steps.set.outputs.os }}
      python-version: ${{ steps.set.outputs.python-version }}
    steps:
      - id: set
        run: |
          echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
          echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
needs: compute-test-matrix
permissions:
contents: write
pull-requests: write
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install Hatch
run: pip install hatch
- name: Install vLLM (CPU)
run: |
# vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems.
# CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases.
# We fetch the latest release and install the appropriate x86 CPU wheel.
# The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index,
# since the required torch+cpu builds are also not available on PyPI.
VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')"
export VLLM_VERSION
echo "Installing vLLM ${VLLM_VERSION} (CPU)"
hatch run -- uv pip install \
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
--torch-backend cpu
- name: Start vLLM server
env:
VLLM_TARGET_DEVICE: "cpu"
VLLM_CPU_KVCACHE_SPACE: "4"
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
--reasoning-parser qwen3 \
--max-model-len 1024 \
--enforce-eager \
--dtype bfloat16 \
--enable-auto-tool-choice \
--tool-call-parser hermes &
# Wait for the vLLM server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
echo "Waiting for vLLM server to start..."
sleep 10
((timeout-=10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM server to start."
exit 1
fi
echo "vLLM server started successfully."
- name: Lint
if: matrix.python-version == '3.10' && runner.os == 'Linux'
run: hatch run fmt-check && hatch run test:types
- name: Run unit tests
run: hatch run test:unit-cov-retry
# On PR: generates coverage comment artifact. On push to main: stores coverage baseline on data branch.
- name: Store unit tests coverage
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm
COMMENT_ARTIFACT_NAME: coverage-comment-vllm
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Run integration tests
run: hatch run test:integration-cov-append-retry
- name: Store combined coverage
if: github.event_name == 'push'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm-combined
COMMENT_ARTIFACT_NAME: coverage-comment-vllm-combined
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Run unit tests with lowest direct dependencies
if: github.event_name != 'push'
run: |
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
hatch run test:unit
- name: Nightly - run unit tests with Haystack main branch
if: github.event_name == 'schedule'
run: |
hatch env prune
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
hatch run test:unit
notify-slack-on-failure:
needs: run
if: failure() && github.event_name == 'schedule'
runs-on: ubuntu-slim
steps:
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
with:
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}