-
Notifications
You must be signed in to change notification settings - Fork 255
173 lines (151 loc) · 6.35 KB
/
vllm.yml
File metadata and controls
173 lines (151 loc) · 6.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: Test / vllm
on:
  # Nightly run (also gates the Haystack-main test and Slack failure notification below).
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - "integrations/vllm/**"
      - "!integrations/vllm/*.md"
      - ".github/workflows/vllm.yml"
  push:
    branches:
      - main
    paths:
      - "integrations/vllm/**"
      - "!integrations/vllm/*.md"
      - ".github/workflows/vllm.yml"
# All `run:` steps execute inside the integration's directory unless overridden per job.
defaults:
  run:
    working-directory: integrations/vllm
# Cancel superseded runs for the same branch; fall back to the SHA for non-PR events.
concurrency:
  group: vllm-${{ github.head_ref || github.sha }}
  cancel-in-progress: true
env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"
  VLLM_MODEL: "Qwen/Qwen3-0.6B"
  # we only test on Ubuntu to keep vLLM server running simple
  TEST_MATRIX_OS: '["ubuntu-latest"]'
  # numba not compatible with Python 3.14
  TEST_MATRIX_PYTHON: '["3.10", "3.13"]'
jobs:
  # Computes the test matrix as JSON strings: on push to main the matrix is
  # trimmed to a single OS/Python combination; PR and scheduled runs use the
  # full TEST_MATRIX_* values from the workflow-level env.
  compute-test-matrix:
    runs-on: ubuntu-slim
    defaults:
      run:
        # Override the workflow-level working-directory: this job does not
        # need the integration checkout (it never checks out code at all).
        working-directory: .
    outputs:
      os: ${{ steps.set.outputs.os }}
      python-version: ${{ steps.set.outputs.python-version }}
    steps:
      - id: set
        run: |
          echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
          echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
run:
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
needs: compute-test-matrix
permissions:
contents: write
pull-requests: write
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }}
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: ${{ matrix.python-version }}
- name: Install Hatch
run: pip install hatch
- name: Install vLLM (CPU)
run: |
# vLLM on PyPI is GPU-only and requires CUDA, so it won't run on CPU-only systems.
# CPU wheels are not published to PyPI; they are only available as direct downloads from GitHub releases.
# We fetch the latest release and install the appropriate x86 CPU wheel.
# The --torch-backend cpu flag ensures uv installs PyTorch from the official CPU-only index,
# since the required torch+cpu builds are also not available on PyPI.
VLLM_VERSION="$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')"
export VLLM_VERSION
echo "Installing vLLM ${VLLM_VERSION} (CPU)"
hatch run -- uv pip install \
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl" \
--torch-backend cpu
- name: Start vLLM server
env:
VLLM_TARGET_DEVICE: "cpu"
VLLM_CPU_KVCACHE_SPACE: "4"
run: |
nohup hatch run -- vllm serve ${{ env.VLLM_MODEL }} \
--reasoning-parser qwen3 \
--max-model-len 1024 \
--enforce-eager \
--dtype bfloat16 \
--enable-auto-tool-choice \
--tool-call-parser hermes &
# Wait for the vLLM server to be ready with a timeout of 300 seconds
timeout=300
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
echo "Waiting for vLLM server to start..."
sleep 10
((timeout-=10))
done
if [ $timeout -eq 0 ]; then
echo "Timed out waiting for vLLM server to start."
exit 1
fi
echo "vLLM server started successfully."
- name: Lint
if: matrix.python-version == '3.10' && runner.os == 'Linux'
run: hatch run fmt-check && hatch run test:types
- name: Run unit tests
run: hatch run test:unit-cov-retry
# On PR: generates coverage comment artifact. On push to main: stores coverage baseline on data branch.
- name: Store unit tests coverage
if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm
COMMENT_ARTIFACT_NAME: coverage-comment-vllm
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Run integration tests
run: hatch run test:integration-cov-append-retry
- name: Store combined coverage
if: github.event_name == 'push'
uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40
with:
GITHUB_TOKEN: ${{ github.token }}
COVERAGE_PATH: integrations/vllm
SUBPROJECT_ID: vllm-combined
COMMENT_ARTIFACT_NAME: coverage-comment-vllm-combined
MINIMUM_GREEN: 90
MINIMUM_ORANGE: 60
- name: Run unit tests with lowest direct dependencies
if: github.event_name != 'push'
run: |
hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt
hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt
hatch run test:unit
- name: Nightly - run unit tests with Haystack main branch
if: github.event_name == 'schedule'
run: |
hatch env prune
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
hatch run test:unit
notify-slack-on-failure:
needs: run
if: failure() && github.event_name == 'schedule'
runs-on: ubuntu-slim
steps:
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
with:
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}