# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / vllm
on:
schedule:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/vllm/**"
- "!integrations/vllm/*.md"
- ".github/workflows/vllm.yml"
push:
branches:
- main
paths:
- "integrations/vllm/**"
- "!integrations/vllm/*.md"
- ".github/workflows/vllm.yml"
defaults:
run:
working-directory: integrations/vllm
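# Cancel any in-progress run for the same PR branch (or the same commit, for pushes) when a new run starts.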
concurrency:
group: vllm-${{ github.head_ref || github.sha }}
cancel-in-progress: true
env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
VLLM_MODEL: "Qwen/Qwen3-0.6B"
VLLM_EMBEDDING_MODEL: "sentence-transformers/all-MiniLM-L6-v2"
VLLM_RANKER_MODEL: "BAAI/bge-reranker-base"
VLLM_TARGET_DEVICE: "cpu"
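  # KV cache size in GiB for the vLLM CPU backend.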
VLLM_CPU_KVCACHE_SPACE: "4"
  # We test only on Ubuntu, to keep running the vLLM servers simple.
TEST_MATRIX_OS: '["ubuntu-latest"]'
# vLLM is not compatible with Python 3.14. https://github.com/vllm-project/vllm/issues/34096
TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
jobs:
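  # Pushes to main run a reduced matrix (Ubuntu + Python 3.10 only); pull requests and scheduled runs use the full matrix from the env vars above.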
compute-test-matrix:
runs-on: ubuntu-slim
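    # This job has no checkout step, so the repo-level working-directory default (integrations/vllm) would not exist; override it here.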
defaults:
run:
working-directory: .
outputs:
os: ${{ steps.set.outputs.os }}
python-version: ${{ steps.set.outputs.python-version }}
steps:
- id: set
run: |
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
needs: compute-test-matrix
permissions:
contents: write
pull-requests: write
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install Hatch
run: pip install hatch
- name: Install vLLM (CPU)
run: |
# vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems.
# CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases.
# We fetch the latest release and install the appropriate x86 CPU wheel.
# The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index,
# since the required torch+cpu builds are also not available on PyPI.
VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')"
export VLLM_VERSION
echo "Installing vLLM ${VLLM_VERSION} (CPU)"
hatch run -- uv pip install \
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
--torch-backend cpu
- name: Start vLLM chat server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
--port 8000 \
--reasoning-parser qwen3 \
--max-model-len 1024 \
--enforce-eager \
--dtype bfloat16 \
--enable-auto-tool-choice \
--tool-call-parser hermes \
--max-num-seqs 1 &
# Wait for the vLLM chat server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
echo "Waiting for vLLM chat server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM chat server to start."
exit 1
fi
echo "vLLM chat server started successfully."
- name: Start vLLM embedding server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_EMBEDDING_MODEL }} \
--port 8001 \
--enforce-eager \
--max-num-seqs 1 &
# Wait for the vLLM embedding server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8001/health > /dev/null 2>&1; do
echo "Waiting for vLLM embedding server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM embedding server to start."
exit 1
fi
echo "vLLM embedding server started successfully."
- name: Start vLLM ranker server
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_RANKER_MODEL }} \
--port 8002 \
--enforce-eager \
--max-num-seqs 1 &
# Wait for the vLLM ranker server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8002/health > /dev/null 2>&1; do
echo "Waiting for vLLM ranker server to start..."
sleep 10
            timeout=$((timeout - 10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM ranker server to start."
exit 1
fi
echo "vLLM ranker server started successfully."
- name: Lint
if: matrix.python-version == '3.10' && runner.os == 'Linux'
run: hatch run fmt-check && hatch run test:types
- name: Run unit tests
run: hatch run test:unit-cov-retry
# On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch.
- name: Store unit tests coverage
id: coverage_comment
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Upload coverage comment to be posted
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: coverage-comment-vllm
path: python-coverage-comment-action-vllm.txt
- name: Run integration tests
run: hatch run test:integration-cov-append-retry
- name: Store combined coverage
if: github.event_name == 'push'
uses: py-cov-action/python-coverage-comment-action@63f52f4fbbffada6e8dee8ec432de7e01df9ba79 # v3.41
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm-combined
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
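      # --resolution lowest-direct resolves each direct dependency to the lowest version allowed by pyproject.toml, so the tests exercise the oldest supported direct dependencies.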
- name: Run unit tests with lowest direct dependencies
if: github.event_name != 'push'
run: |
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
hatch run test:unit
- name: Nightly - run unit tests with Haystack main branch
if: github.event_name == 'schedule'
run: |
hatch env prune
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
hatch run test:unit
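  # Notify Slack only when the nightly scheduled run fails.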
notify-slack-on-failure:
needs: run
if: failure() && github.event_name == 'schedule'
runs-on: ubuntu-slim
steps:
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
with:
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}