Skip to content

Commit 2f74a42

Browse files
authored
Merge pull request #879 from docker/use-llamacpp-images
Use official llama.cpp images for Linux
2 parents fd76359 + 1ab618e commit 2f74a42

13 files changed

Lines changed: 469 additions & 127 deletions

File tree

.github/workflows/release.yml

Lines changed: 22 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,6 @@ on:
3131
description: "SGLang version"
3232
required: false
3333
type: string
34-
# This can be removed once we have llama.cpp built for MUSA and CANN.
35-
buildMusaCann:
36-
description: "Build MUSA and CANN images"
37-
required: false
38-
type: boolean
39-
default: false
4034
imagesOnly:
4135
description: "Only build and push Docker images (skip CLI releases, pinata bump, docs update, and CE packaging)"
4236
required: false
@@ -243,7 +237,6 @@ jobs:
243237
LLAMA_SERVER_VERSION: ${{ inputs.llamaServerVersion }}
244238
VLLM_VERSION: ${{ inputs.vllmVersion }}
245239
SGLANG_VERSION: ${{ inputs.sglangVersion }}
246-
BUILD_MUSA_CANN: ${{ inputs.buildMusaCann || 'false' }}
247240
steps:
248241
- name: Checkout repo
249242
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
@@ -272,15 +265,7 @@ jobs:
272265
echo "docker/model-runner:$RELEASE_TAG-rocm" >> "$GITHUB_OUTPUT"
273266
echo "docker/model-runner:latest-rocm" >> "$GITHUB_OUTPUT"
274267
echo 'EOF' >> "$GITHUB_OUTPUT"
275-
echo "musa<<EOF" >> "$GITHUB_OUTPUT"
276-
echo "docker/model-runner:$RELEASE_TAG-musa" >> "$GITHUB_OUTPUT"
277-
echo "docker/model-runner:latest-musa" >> "$GITHUB_OUTPUT"
278-
echo 'EOF' >> "$GITHUB_OUTPUT"
279-
echo "cann<<EOF" >> "$GITHUB_OUTPUT"
280-
echo "docker/model-runner:$RELEASE_TAG-cann" >> "$GITHUB_OUTPUT"
281-
echo "docker/model-runner:latest-cann" >> "$GITHUB_OUTPUT"
282-
echo 'EOF' >> "$GITHUB_OUTPUT"
283-
268+
284269
- name: Load versions
285270
shell: bash
286271
run: |
@@ -293,6 +278,21 @@ jobs:
293278
echo "VLLM_VERSION=${VLLM_VERSION:-$VERSIONS_VLLM}" >> "$GITHUB_ENV"
294279
echo "SGLANG_VERSION=${SGLANG_VERSION:-$VERSIONS_SGLANG}" >> "$GITHUB_ENV"
295280
281+
- name: Resolve llama.cpp upstream images
282+
id: llama-images
283+
shell: bash
284+
run: |
285+
CPU_IMAGE=$(bash scripts/resolve-llama-upstream-image.sh \
286+
"$LLAMA_SERVER_VERSION" cpu)
287+
CUDA_IMAGE=$(bash scripts/resolve-llama-upstream-image.sh \
288+
"$LLAMA_SERVER_VERSION" cuda)
289+
ROCM_IMAGE=$(bash scripts/resolve-llama-upstream-image.sh \
290+
"$LLAMA_SERVER_VERSION" rocm)
291+
292+
echo "cpu=$CPU_IMAGE" >> "$GITHUB_OUTPUT"
293+
echo "cuda=$CUDA_IMAGE" >> "$GITHUB_OUTPUT"
294+
echo "rocm=$ROCM_IMAGE" >> "$GITHUB_OUTPUT"
295+
296296
- name: Log in to DockerHub
297297
uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121
298298
with:
@@ -315,6 +315,8 @@ jobs:
315315
platforms: linux/amd64, linux/arm64
316316
build-args: |
317317
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
318+
"LLAMA_SERVER_VARIANT=cpu"
319+
"LLAMA_UPSTREAM_IMAGE=${{ steps.llama-images.outputs.cpu }}"
318320
"VERSION=${{ env.RELEASE_TAG }}"
319321
push: true
320322
sbom: true
@@ -330,7 +332,7 @@ jobs:
330332
build-args: |
331333
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
332334
"LLAMA_SERVER_VARIANT=cuda"
333-
"BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
335+
"LLAMA_UPSTREAM_IMAGE=${{ steps.llama-images.outputs.cuda }}"
334336
"VERSION=${{ env.RELEASE_TAG }}"
335337
push: true
336338
sbom: true
@@ -346,7 +348,7 @@ jobs:
346348
build-args: |
347349
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
348350
"LLAMA_SERVER_VARIANT=cuda"
349-
"BASE_IMAGE=nvidia/cuda:13.0.2-runtime-ubuntu24.04"
351+
"LLAMA_UPSTREAM_IMAGE=${{ steps.llama-images.outputs.cuda }}"
350352
"VLLM_VERSION=${{ env.VLLM_VERSION }}"
351353
"VERSION=${{ env.RELEASE_TAG }}"
352354
push: true
@@ -363,7 +365,7 @@ jobs:
363365
build-args: |
364366
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
365367
"LLAMA_SERVER_VARIANT=cuda"
366-
"BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
368+
"LLAMA_UPSTREAM_IMAGE=${{ steps.llama-images.outputs.cuda }}"
367369
"SGLANG_VERSION=${{ env.SGLANG_VERSION }}"
368370
"VERSION=${{ env.RELEASE_TAG }}"
369371
push: true
@@ -380,47 +382,13 @@ jobs:
380382
build-args: |
381383
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
382384
"LLAMA_SERVER_VARIANT=rocm"
383-
"BASE_IMAGE=rocm/dev-ubuntu-22.04"
385+
"LLAMA_UPSTREAM_IMAGE=${{ steps.llama-images.outputs.rocm }}"
384386
"VERSION=${{ env.RELEASE_TAG }}"
385387
push: true
386388
sbom: true
387389
provenance: mode=max
388390
tags: ${{ steps.tags.outputs.rocm }}
389391

390-
- name: Build MUSA image
391-
if: ${{ env.BUILD_MUSA_CANN == 'true' }}
392-
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f
393-
with:
394-
file: Dockerfile
395-
target: final-llamacpp
396-
platforms: linux/amd64
397-
build-args: |
398-
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
399-
"LLAMA_SERVER_VARIANT=musa"
400-
"BASE_IMAGE=mthreads/musa:rc4.3.0-runtime-ubuntu22.04-amd64"
401-
"VERSION=${{ env.RELEASE_TAG }}"
402-
push: true
403-
sbom: true
404-
provenance: mode=max
405-
tags: ${{ steps.tags.outputs.musa }}
406-
407-
- name: Build CANN image
408-
if: ${{ env.BUILD_MUSA_CANN == 'true' }}
409-
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f
410-
with:
411-
file: Dockerfile
412-
target: final-llamacpp
413-
platforms: linux/arm64, linux/amd64
414-
build-args: |
415-
"LLAMA_SERVER_VERSION=${{ env.LLAMA_SERVER_VERSION }}"
416-
"LLAMA_SERVER_VARIANT=cann"
417-
"BASE_IMAGE=ascendai/cann:8.2.rc2-910b-ubuntu22.04-py3.11"
418-
"VERSION=${{ env.RELEASE_TAG }}"
419-
push: true
420-
sbom: true
421-
provenance: mode=max
422-
tags: ${{ steps.tags.outputs.cann }}
423-
424392
# ---------------------------------------------------------------------------
425393
# Release CLI for Docker Desktop — build, sign & push CLI + Desktop module image
426394
# ---------------------------------------------------------------------------

.versions

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ VLLM_UPSTREAM_VERSION=0.19.0
44
VLLM_METAL_RELEASE=v0.2.0-20260420-142150
55
DIFFUSERS_RELEASE=v0.1.0-20260216-000000
66
SGLANG_VERSION=0.5.6
7-
LLAMA_SERVER_VERSION=latest
8-
BASE_IMAGE=ubuntu:26.04
7+
LLAMA_SERVER_VERSION=b8967

Dockerfile

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
11
# syntax=docker/dockerfile:1
22

33
ARG GO_VERSION=1.25
4-
ARG LLAMA_SERVER_VERSION=latest
4+
ARG LLAMA_SERVER_VERSION=b8967
55
ARG LLAMA_SERVER_VARIANT=cpu
6-
ARG LLAMA_BINARY_PATH=/com.docker.llama-server.native.linux.${LLAMA_SERVER_VARIANT}.${TARGETARCH}
7-
8-
# only 26.04 for cpu variant for max hardware support with vulkan
9-
# use 22.04 for gpu variants to match ROCm/CUDA base images
10-
ARG BASE_IMAGE=ubuntu:26.04
6+
ARG LLAMA_UPSTREAM_IMAGE=ghcr.io/ggml-org/llama.cpp:server-vulkan-b8967
117

128
ARG VERSION=dev
139

@@ -43,11 +39,8 @@ RUN --mount=type=cache,target=/go/pkg/mod \
4339
--mount=type=cache,target=/root/.cache/go-build \
4440
CGO_ENABLED=1 GOOS=linux go build -tags=novllm -ldflags="-s -w -X main.Version=${VERSION}" -o model-runner .
4541

46-
# --- Get llama.cpp binary ---
47-
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server
48-
49-
# --- Final image ---
50-
FROM docker.io/${BASE_IMAGE} AS llamacpp
42+
# --- Final image: directly FROM the upstream llama.cpp image ---
43+
FROM ${LLAMA_UPSTREAM_IMAGE} AS llamacpp
5144

5245
ARG LLAMA_SERVER_VARIANT
5346

@@ -57,30 +50,32 @@ RUN groupadd --system modelrunner && useradd --system --gid modelrunner -G video
5750

5851
COPY scripts/ /scripts/
5952

60-
# Install ca-certificates for HTTPS and vulkan
53+
# Install additional packages not shipped by the upstream image
54+
# (e.g. ca-certificates for HTTPS, mesa patches for aarch64 virtio-vulkan).
6155
RUN /scripts/apt-install.sh && rm -rf /scripts
6256

6357
WORKDIR /app
6458

65-
# Create directories for the socket file and llama.cpp binary, and set proper permissions
66-
RUN mkdir -p /var/run/model-runner /app/bin /models && \
59+
# Create directories for the socket file and model storage, and set proper permissions
60+
RUN mkdir -p /var/run/model-runner /models && \
6761
chown -R modelrunner:modelrunner /var/run/model-runner /app /models && \
6862
chmod -R 755 /models
6963

70-
# Copy the llama.cpp binary from the llama-server stage
71-
ARG LLAMA_BINARY_PATH
72-
COPY --from=llama-server ${LLAMA_BINARY_PATH}/ /app/.
73-
RUN chmod +x /app/bin/com.docker.llama-server
74-
7564
USER modelrunner
7665

77-
# Set the environment variable for the socket path and LLaMA server binary path
66+
# Set the environment variable for the socket path and LLamA server binary path.
67+
# LLAMA_SERVER_PATH points at the directory containing the llama-server binary
68+
# and its ggml backend plugins — keeping them together lets llama.cpp discover
69+
# backends via its default search path (relative to the binary).
7870
ENV MODEL_RUNNER_SOCK=/var/run/model-runner/model-runner.sock
7971
ENV MODEL_RUNNER_PORT=12434
80-
ENV LLAMA_SERVER_PATH=/app/bin
72+
ENV LLAMA_SERVER_PATH=/app
73+
# LD_LIBRARY_PATH is required so that backend plugins loaded via dlopen()
74+
# (e.g. libggml-cpu-*.so, libggml-vulkan.so) can resolve their transitive
75+
# dependencies on libggml-base.so and other shared libraries in /app.
76+
ENV LD_LIBRARY_PATH=/app
8177
ENV HOME=/home/modelrunner
8278
ENV MODELS_PATH=/models
83-
ENV LD_LIBRARY_PATH=/app/lib
8479

8580
# Label the image so that it's hidden on cloud engines.
8681
LABEL com.docker.desktop.service="model-runner"

Makefile

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,36 @@ include .versions
33

44
APP_NAME := model-runner
55
LLAMA_SERVER_VARIANT := cpu
6-
VLLM_BASE_IMAGE := nvidia/cuda:13.0.2-runtime-ubuntu24.04
6+
# Resolved lazily — only evaluated when a Docker target references it.
7+
LLAMA_UPSTREAM_IMAGE ?= $(shell \
8+
bash scripts/resolve-llama-upstream-image.sh \
9+
"$(LLAMA_SERVER_VERSION)" "$(LLAMA_SERVER_VARIANT)")
710
DOCKER_IMAGE := docker/model-runner:latest
811
DOCKER_IMAGE_VLLM := docker/model-runner:latest-vllm-cuda
912
DOCKER_IMAGE_SGLANG := docker/model-runner:latest-sglang
1013
DOCKER_TARGET ?= final-llamacpp
1114
PORT := 8080
1215
LLAMA_ARGS ?=
1316
E2E_TIMEOUT ?= 30m
14-
DOCKER_BUILD_ARGS := \
15-
--load \
16-
--platform linux/$(shell docker version --format '{{.Server.Arch}}') \
17+
18+
define check-llama-image
19+
$(if $(LLAMA_UPSTREAM_IMAGE),,$(error Failed to resolve llama.cpp upstream image. Check LLAMA_SERVER_VERSION and LLAMA_SERVER_VARIANT or set LLAMA_UPSTREAM_IMAGE directly.))
20+
endef
21+
22+
ifeq ($(LLAMA_SERVER_VARIANT),rocm)
23+
DOCKER_BUILD_PLATFORMS := linux/amd64
24+
else
25+
DOCKER_BUILD_PLATFORMS := linux/amd64,linux/arm64
26+
endif
27+
28+
LOCAL_DOCKER_PLATFORM ?= linux/$(shell docker version --format '{{.Server.Arch}}')
29+
30+
DOCKER_BUILD_COMMON_ARGS = \
1731
--build-arg GO_VERSION=$(GO_VERSION) \
1832
--build-arg LLAMA_SERVER_VERSION=$(LLAMA_SERVER_VERSION) \
1933
--build-arg LLAMA_SERVER_VARIANT=$(LLAMA_SERVER_VARIANT) \
34+
--build-arg LLAMA_UPSTREAM_IMAGE=$(LLAMA_UPSTREAM_IMAGE) \
2035
--build-arg SGLANG_VERSION=$(SGLANG_VERSION) \
21-
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
2236
--build-arg VLLM_VERSION='$(VLLM_VERSION)' \
2337
--target $(DOCKER_TARGET) \
2438
-t $(DOCKER_IMAGE)
@@ -102,7 +116,7 @@ e2e:
102116
test-docker-ce-installation:
103117
@echo "Testing Docker CE installation..."
104118
@echo "Note: This requires Docker to be running"
105-
BASE_IMAGE=$(BASE_IMAGE) scripts/test-docker-ce-installation.sh
119+
scripts/test-docker-ce-installation.sh
106120

107121
validate:
108122
find . -type f -name "*.sh" | grep -v "pkg/go-containerregistry\|llamacpp/native/vendor" | xargs shellcheck
@@ -154,11 +168,13 @@ validate-all:
154168

155169
# Build Docker image
156170
docker-build:
157-
docker buildx build $(DOCKER_BUILD_ARGS) .
171+
$(call check-llama-image)
172+
docker buildx build --load --platform $(LOCAL_DOCKER_PLATFORM) $(DOCKER_BUILD_COMMON_ARGS) .
158173

159174
# Build multi-platform Docker image
160175
docker-build-multiplatform:
161-
docker buildx build --platform linux/amd64,linux/arm64 $(DOCKER_BUILD_ARGS) .
176+
$(call check-llama-image)
177+
docker buildx build --platform $(DOCKER_BUILD_PLATFORMS) $(DOCKER_BUILD_COMMON_ARGS) .
162178

163179
# Run in Docker container with TCP port access and mounted model storage
164180
docker-run: docker-build
@@ -169,8 +185,7 @@ docker-build-vllm:
169185
@$(MAKE) docker-build \
170186
DOCKER_TARGET=final-vllm \
171187
DOCKER_IMAGE=$(DOCKER_IMAGE_VLLM) \
172-
LLAMA_SERVER_VARIANT=cuda \
173-
BASE_IMAGE=$(VLLM_BASE_IMAGE)
188+
LLAMA_SERVER_VARIANT=cuda
174189

175190
# Run vLLM Docker container with TCP port access and mounted model storage
176191
docker-run-vllm: docker-build-vllm
@@ -181,8 +196,7 @@ docker-build-sglang:
181196
@$(MAKE) docker-build \
182197
DOCKER_TARGET=final-sglang \
183198
DOCKER_IMAGE=$(DOCKER_IMAGE_SGLANG) \
184-
LLAMA_SERVER_VARIANT=cuda \
185-
BASE_IMAGE=$(VLLM_BASE_IMAGE)
199+
LLAMA_SERVER_VARIANT=cuda
186200

187201
# Run SGLang Docker container with TCP port access and mounted model storage
188202
docker-run-sglang: docker-build-sglang
@@ -379,6 +393,9 @@ help:
379393
@echo ""
380394
@echo "Backend configuration options:"
381395
@echo " LLAMA_ARGS - Arguments for llama.cpp (e.g., \"--verbose --jinja -ngl 999 --ctx-size 2048\")"
396+
@echo " LLAMA_SERVER_VERSION - Upstream llama.cpp version (latest or bNNNN)"
397+
@echo " LLAMA_SERVER_VARIANT - Linux backend flavor (cpu, cuda, or rocm)"
398+
@echo " LLAMA_UPSTREAM_IMAGE - Override the resolved upstream image directly"
382399
@echo " LOCAL_LLAMA - Use local llama.cpp build from llamacpp/install/bin (set to 1 to enable)"
383400
@echo ""
384401
@echo "Example usage:"

0 commit comments

Comments
 (0)