Skip to content

Commit 2d1dd67

Browse files
garrett4wade authored and root committed
feat(infra): split Docker image into sglang and vllm variants (#985)
* feat(infra): split Docker image into sglang and vllm variants

  Build separate sglang and vllm Docker images from a single parameterized Dockerfile. Each variant ships only one inference backend, eliminating dependency conflicts and reducing image size.

  Key changes:
  - Dockerfile with ARG BASE_IMAGE/VARIANT for per-variant builds
  - CI workflows: build-docker-image, test-areal, tag-release, install-test
  - pyproject.toml: sglang/vllm conflicting extras, cuda renamed to cuda-train
  - pytest markers and conditional fixtures for backend-specific tests
  - GRPO tests parametrized over both sglang and vllm inference backends
  - Docker validation tools auto-detect installed inference variant

  Refs: #985

* fix(infra): update GCP runner to v2.332.0 and resolve rebase import conflicts

  Runner 2.317.0 does not support node24 required by actions/checkout@v6, causing 'Set up job' failures on dynamically provisioned GCP instances.

  Key changes:
  - Bump RUNNER_VERSION from 2.317.0 to 2.332.0 in test-areal.yml and build-docker-image.yml
  - Remove duplicate imports in test_rollout_controller.py from rebase with PR #996
1 parent e974e79 commit 2d1dd67

29 files changed

Lines changed: 2147 additions & 1113 deletions

.github/workflows/build-docker-image.yml

Lines changed: 71 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ permissions:
1414
env:
1515
GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
1616
VALIDATOR_LABELS: gcp-docker-validator
17-
RUNNER_VERSION: '2.317.0'
17+
RUNNER_VERSION: '2.332.0'
1818
IMAGE_NAME: ghcr.io/inclusionai/areal-runtime
1919
IMAGE_TAG: test
2020

@@ -111,12 +111,12 @@ jobs:
111111
112112
throw new Error(`Timed out waiting for builder runner ${instanceName} to come online.`);
113113
114-
build-and-push-image:
114+
build-and-push-images:
115115
needs:
116116
- start-builder
117-
name: Build and push Docker image
117+
name: Build and push Docker images
118118
runs-on: [self-hosted, areal-docker-builder]
119-
timeout-minutes: 180
119+
timeout-minutes: 240 # ~90 min/image x 2 + buffer for cache misses
120120
steps:
121121
- uses: actions/checkout@v6
122122

@@ -132,35 +132,65 @@ jobs:
132132
username: inclusionai
133133
password: ${{ secrets.GHCR_TOKEN }}
134134

135-
- name: Build and push Docker image
135+
- name: Build and push sglang image
136136
uses: docker/build-push-action@v7
137137
with:
138138
context: .
139139
file: ./Dockerfile
140140
push: true
141-
tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
141+
tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-sglang
142+
build-args: |
143+
VARIANT=sglang
144+
145+
- name: Build and push vllm image
146+
uses: docker/build-push-action@v7
147+
with:
148+
context: .
149+
file: ./Dockerfile
150+
push: true
151+
tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-vllm
152+
build-args: |
153+
VARIANT=vllm
142154
143155
- name: Image details
144156
run: |
145-
echo "✅ Docker image built and pushed successfully!"
146-
echo "Image: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}"
157+
echo "✅ Docker images built and pushed successfully!"
158+
echo "Images:"
159+
echo " - ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-sglang"
160+
echo " - ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}-vllm"
147161
echo "Commit: ${{ github.sha }}"
148162
echo "Branch: ${{ github.head_ref || github.ref_name }}"
149163
150-
run-tests:
151-
name: Run tests with test image
164+
run-tests-sglang:
165+
name: Run tests with sglang image
166+
needs:
167+
- build-and-push-images
168+
uses: ./.github/workflows/test-areal.yml
169+
with:
170+
image_tag: test
171+
variant: sglang
172+
secrets: inherit
173+
174+
run-tests-vllm:
175+
name: Run tests with vllm image
152176
needs:
153-
- build-and-push-image
177+
- build-and-push-images
154178
uses: ./.github/workflows/test-areal.yml
155179
with:
156180
image_tag: test
181+
variant: vllm
157182
secrets: inherit
158183

159-
promote-image:
160-
name: Promote test image to dev
184+
promote-images:
185+
name: Promote ${{ matrix.variant }} test image to dev
161186
needs:
162-
- run-tests
187+
- run-tests-sglang
188+
- run-tests-vllm
163189
runs-on: ubuntu-latest
190+
strategy:
191+
fail-fast: false
192+
matrix:
193+
variant: [sglang, vllm]
164194
steps:
165195
- name: Log in to GitHub Container Registry
166196
uses: docker/login-action@v4
@@ -173,45 +203,53 @@ jobs:
173203
env:
174204
IMAGE_NAME: ghcr.io/inclusionai/areal-runtime
175205
run: |
176-
docker pull $IMAGE_NAME:test
177-
docker tag $IMAGE_NAME:test $IMAGE_NAME:dev
178-
docker push $IMAGE_NAME:dev
179-
echo "✅ Image promoted from :test to :dev"
206+
docker pull $IMAGE_NAME:test-${{ matrix.variant }}
207+
docker tag $IMAGE_NAME:test-${{ matrix.variant }} $IMAGE_NAME:dev-${{ matrix.variant }}
208+
docker push $IMAGE_NAME:dev-${{ matrix.variant }}
209+
echo "✅ ${{ matrix.variant }} image promoted from :test-${{ matrix.variant }} to :dev-${{ matrix.variant }}"
180210
181-
cleanup-test-image:
182-
name: Delete test image from registry
211+
cleanup-test-images:
212+
name: Delete ${{ matrix.variant }} test image from registry
183213
needs:
184-
- build-and-push-image
185-
- run-tests
186-
- promote-image
187-
if: always() && needs.build-and-push-image.result == 'success'
214+
- build-and-push-images
215+
- run-tests-sglang
216+
- run-tests-vllm
217+
- promote-images
218+
if: always() && needs.build-and-push-images.result == 'success'
188219
runs-on: ubuntu-latest
220+
strategy:
221+
fail-fast: false
222+
matrix:
223+
variant: [sglang, vllm]
189224
steps:
190225
- name: Delete test image from GHCR
191226
env:
192227
GH_TOKEN: ${{ secrets.GHCR_TOKEN }}
193228
run: |
229+
PACKAGE_NAME="areal-runtime"
230+
TAG="test-${{ matrix.variant }}"
194231
# Get the package version ID for the test tag
195232
PACKAGE_VERSION_ID=$(curl -s -H "Authorization: Bearer $GH_TOKEN" \
196-
"https://api.github.com/orgs/inclusionai/packages/container/areal-runtime/versions" \
197-
| jq -r '.[] | select(.metadata.container.tags[] == "test") | .id')
233+
"https://api.github.com/orgs/inclusionai/packages/container/$PACKAGE_NAME/versions?per_page=100" \
234+
| jq -r ".[] | select(.metadata.container.tags[] == \"$TAG\") | .id")
198235
199236
if [ -n "$PACKAGE_VERSION_ID" ] && [ "$PACKAGE_VERSION_ID" != "null" ]; then
200237
curl -X DELETE -H "Authorization: Bearer $GH_TOKEN" \
201-
"https://api.github.com/orgs/inclusionai/packages/container/areal-runtime/versions/$PACKAGE_VERSION_ID"
202-
echo "✅ Deleted test image from registry"
238+
"https://api.github.com/orgs/inclusionai/packages/container/$PACKAGE_NAME/versions/$PACKAGE_VERSION_ID"
239+
echo "✅ Deleted ${{ matrix.variant }} test image from registry"
203240
else
204-
echo "⚠️ Test image not found or already deleted"
241+
echo "⚠️ ${{ matrix.variant }} test image not found or already deleted"
205242
fi
206243
207244
stop-builder:
208245
name: Stop areal-docker-builder instance
209246
needs:
210247
- start-builder
211-
- build-and-push-image
212-
- run-tests
213-
- promote-image
214-
- cleanup-test-image
248+
- build-and-push-images
249+
- run-tests-sglang
250+
- run-tests-vllm
251+
- promote-images
252+
- cleanup-test-images
215253
if: always() && needs.start-builder.outputs.was_running != 'true'
216254
runs-on: ubuntu-latest
217255
env:

.github/workflows/install-test.yml

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,13 @@ jobs:
7777
# Test CUDA extras installation on Linux only (CUDA packages have Linux-only wheels)
7878
# Note: flash-attn is excluded because it requires CUDA compilation infrastructure
7979
# which is not available on standard GitHub runners
80+
# sglang and vllm are conflicting extras, so we test each separately
8081
runs-on: ubuntu-latest
81-
name: Install test (Linux, CUDA extras)
82+
strategy:
83+
fail-fast: false
84+
matrix:
85+
variant: [sglang, vllm]
86+
name: Install test (Linux, CUDA extras, ${{ matrix.variant }})
8287

8388
steps:
8489
- name: Checkout repository
@@ -96,17 +101,20 @@ jobs:
96101
- name: Install package with CUDA extras (excluding flash-attn)
97102
# flash-attn requires CUDA toolkit for compilation, skip it in CI
98103
# Test individual extras that have pre-built wheels
99-
run: uv sync --extra vllm --extra sglang --extra megatron --extra tms
104+
run: uv sync --extra ${{ matrix.variant }} --extra megatron --extra tms
100105

101106
- name: Verify package import with CUDA extras
102107
run: |
103108
uv run python -c "import areal; print(f'areal version: {areal.__version__}')"
104109
105110
- name: Verify CUDA-dependent packages are installed
111+
env:
112+
VARIANT: ${{ matrix.variant }}
106113
run: |
107114
uv run python -c "
108-
import importlib.util
109-
packages = ['vllm', 'sglang', 'megatron.core']
115+
import os, importlib.util
116+
variant = os.environ['VARIANT']
117+
packages = [variant, 'megatron.core']
110118
for pkg in packages:
111119
spec = importlib.util.find_spec(pkg)
112120
status = 'installed' if spec else 'not found'
@@ -120,9 +128,13 @@ jobs:
120128
# and `no space on device left` errors on busy runners
121129
if: github.event_name == 'workflow_dispatch'
122130
runs-on: ubuntu-latest
123-
name: Install test (Docker runtime image)
131+
strategy:
132+
fail-fast: false
133+
matrix:
134+
variant: [sglang, vllm]
135+
name: Install test (Docker ${{ matrix.variant }} image)
124136
container:
125-
image: ghcr.io/inclusionai/areal-runtime:v1.0.1
137+
image: ghcr.io/inclusionai/areal-runtime:dev-${{ matrix.variant }}
126138

127139
steps:
128140
- name: Checkout repository

.github/workflows/tag-release-image.yml

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ jobs:
2626
start-builder:
2727
name: Start areal-docker-builder instance
2828
runs-on: ubuntu-latest
29+
outputs:
30+
was_running: ${{ steps.start-instance.outputs.was_running }}
2931
env:
3032
INSTANCE_NAME: areal-docker-builder
3133
INSTANCE_ZONE: us-central1-f
@@ -39,6 +41,7 @@ jobs:
3941
uses: google-github-actions/setup-gcloud@v3
4042

4143
- name: Start builder instance if stopped
44+
id: start-instance
4245
run: |
4346
set -euo pipefail
4447
@@ -55,12 +58,14 @@ jobs:
5558
5659
if [ "$status" = "RUNNING" ]; then
5760
echo "Instance $INSTANCE_NAME is already running."
61+
echo "was_running=true" >> $GITHUB_OUTPUT
5862
elif [ "$status" = "TERMINATED" ] || [ "$status" = "SUSPENDED" ]; then
5963
echo "Instance $INSTANCE_NAME is $status. Starting it..."
6064
gcloud compute instances start "$INSTANCE_NAME" \
6165
--project "$GCP_PROJECT_ID" \
6266
--zone "$INSTANCE_ZONE"
6367
echo "Instance started successfully."
68+
echo "was_running=false" >> $GITHUB_OUTPUT
6469
else
6570
echo "Instance $INSTANCE_NAME has unexpected status: $status" >&2
6671
exit 1
@@ -113,9 +118,9 @@ jobs:
113118
build-and-push-release:
114119
needs:
115120
- start-builder
116-
name: Build and push release Docker image
121+
name: Build and push release Docker images
117122
runs-on: [self-hosted, areal-docker-builder]
118-
timeout-minutes: 180
123+
timeout-minutes: 300
119124
outputs:
120125
package_version: ${{ steps.get-version.outputs.version }}
121126
release_tag: ${{ steps.get-version.outputs.release_tag }}
@@ -150,25 +155,41 @@ jobs:
150155
username: inclusionai
151156
password: ${{ secrets.GHCR_TOKEN }}
152157

153-
- name: Build and push Docker image
158+
- name: Build and push sglang image
154159
uses: docker/build-push-action@v7
155160
with:
156161
context: .
157162
file: ./Dockerfile
158163
push: true
159164
tags: |
160-
${{ env.IMAGE_NAME }}:${{ steps.get-version.outputs.release_tag }}
165+
${{ env.IMAGE_NAME }}:${{ steps.get-version.outputs.release_tag }}-sglang
161166
${{ env.IMAGE_NAME }}:latest
162-
${{ env.IMAGE_NAME }}:dev
167+
${{ env.IMAGE_NAME }}:dev-sglang
168+
build-args: |
169+
VARIANT=sglang
170+
171+
- name: Build and push vllm image
172+
uses: docker/build-push-action@v7
173+
with:
174+
context: .
175+
file: ./Dockerfile
176+
push: true
177+
tags: |
178+
${{ env.IMAGE_NAME }}:${{ steps.get-version.outputs.release_tag }}-vllm
179+
${{ env.IMAGE_NAME }}:dev-vllm
180+
build-args: |
181+
VARIANT=vllm
163182
164183
- name: Image details
165184
run: |
166-
echo "Docker image built and pushed successfully!"
185+
echo "Docker images built and pushed successfully!"
167186
echo "Image: ${{ env.IMAGE_NAME }}"
168187
echo "Tags:"
169-
echo " - ${{ steps.get-version.outputs.release_tag }}"
170-
echo " - latest"
171-
echo " - dev"
188+
echo " - ${{ steps.get-version.outputs.release_tag }}-sglang"
189+
echo " - ${{ steps.get-version.outputs.release_tag }}-vllm"
190+
echo " - latest (sglang)"
191+
echo " - dev-sglang"
192+
echo " - dev-vllm"
172193
echo "Release: ${{ github.event.release.name || github.event.inputs.tag }}"
173194
echo "Commit: ${{ github.sha }}"
174195
@@ -177,7 +198,7 @@ jobs:
177198
needs:
178199
- build-and-push-release
179200
- start-builder
180-
if: always()
201+
if: always() && needs.start-builder.outputs.was_running != 'true'
181202
runs-on: ubuntu-latest
182203
env:
183204
INSTANCE_NAME: areal-docker-builder

0 commit comments

Comments
 (0)