Skip to content

Commit 9a183eb

Browse files
Add HF llama.cpp DLC (#6000)
* [WIP] Add HuggingFace LlamaCpp support * [WIP] Add HuggingFace LlamaCpp support with Dockerfiles, buildspec, and serving scripts * Update Docker base name and coverage report framework for HuggingFace LlamaCpp support * Update local & sagemaker tests for llama.cpp DLC * Update dlc_developer_config.toml * Disable training container build in dlc_developer_config.toml * Refactor SageMaker integration for llama.cpp: replace Python proxy with custom llama-server build * Remove unnecessary resources file * Minimal style changes * Update Dockerfiles to address multiple CVEs * Fix path for Huggingface Llamacpp buildspec in dlc_developer_config.toml * Update py_version extraction in generate_sagemaker_pytest_cmd to handle None case * Remove transformers version from buildspec tag generation in Llamacpp configuration * Enhance image_builder to require transformers_version for HuggingFace builds and update tests to include Llamacpp in upstream types * Update Llamacpp version to b9522 & skip llamacpp scanning and oss compliance * Revert partner_developer and build_frameworks in dlc_developer_config.toml to default values * Apply black formatting * Fix image_builder.py formatting --------- Co-authored-by: Sirut Buasai <73297481+sirutBuasai@users.noreply.github.com>
1 parent 6af0724 commit 9a183eb

30 files changed

Lines changed: 1572 additions & 16 deletions

dlc_developer_config.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ deep_canary_mode = false
3636

3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
39-
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
39+
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
4040
build_frameworks = []
4141

4242

@@ -195,5 +195,8 @@ dlc-pr-huggingface-vllm-omni = ""
195195
# HuggingFace SGLang
196196
dlc-pr-huggingface-sglang = ""
197197

198+
# HuggingFace Llamacpp
199+
dlc-pr-huggingface-llamacpp = ""
200+
198201
# sglang
199202
dlc-pr-sglang = ""
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
--- a/tools/server/server.cpp
2+
+++ b/tools/server/server.cpp
3+
@@ -11,7 +11,9 @@
4+
#include "llama.h"
5+
#include "log.h"
6+
7+
+#include <algorithm>
8+
#include <atomic>
9+
+#include <cctype>
10+
#include <clocale>
11+
#include <exception>
12+
#include <signal.h>
13+
@@ -69,6 +71,81 @@
14+
}
15+
return res;
16+
};
17+
+}
18+
+
19+
+static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
20+
+ for (const auto & h : req.headers) {
21+
+ std::string key = h.first;
22+
+ std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); });
23+
+ if (key == name) {
24+
+ return h.second;
25+
+ }
26+
+ }
27+
+ return "";
28+
+}
29+
+
30+
+static std::string sagemaker_route_from_attrs(const server_http_req & req) {
31+
+ const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
32+
+ const std::string key = "route=";
33+
+ const size_t pos = attrs.find(key);
34+
+ if (pos == std::string::npos) {
35+
+ return "";
36+
+ }
37+
+ const size_t start = pos + key.size();
38+
+ const size_t end = attrs.find_first_of(",; \t\r\n", start);
39+
+ return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
40+
+}
41+
+
42+
// Cheap syntactic sanity check on a route string before it is looked up.
//
// A route is accepted only if it is non-empty, begins with '/', and contains none of:
// "..", "://", '?', '#'.
static bool sagemaker_route_syntax_ok(const std::string & route) {
    if (route.empty() || route.front() != '/') {
        return false;
    }
    if (route.find("..") != std::string::npos) {
        return false;
    }
    if (route.find("://") != std::string::npos) {
        return false;
    }
    // Neither a query string nor a fragment may be present.
    return route.find_first_of("?#") == std::string::npos;
}
47+
+
48+
+static std::string sagemaker_default_route(const server_http_req & req) {
49+
+ const json body = json::parse(req.body, nullptr, false);
50+
+ if (body.is_object()) {
51+
+ if (body.contains("messages")) {
52+
+ return "/v1/chat/completions";
53+
+ }
54+
+ if (body.contains("prompt")) {
55+
+ return "/v1/completions";
56+
+ }
57+
+ if (body.contains("input")) {
58+
+ return "/v1/embeddings";
59+
+ }
60+
+ }
61+
+ return "/v1/chat/completions";
62+
+}
63+
+
64+
+static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
65+
+ auto res = std::make_unique<server_http_res>();
66+
+ res->status = status;
67+
+ res->data = safe_json_to_str({
68+
+ { "error", {
69+
+ { "code", status },
70+
+ { "message", message },
71+
+ { "type", "invalid_request_error" },
72+
+ } },
73+
+ });
74+
+ return res;
75+
+}
76+
+
77+
+static server_http_res_ptr sagemaker_invocations(
78+
+ const server_http_req & req,
79+
+ const std::map<std::string, server_http_context::handler_t> & routes) {
80+
+ const std::string requested = sagemaker_route_from_attrs(req);
81+
+ const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
82+
+ if (!sagemaker_route_syntax_ok(route)) {
83+
+ return sagemaker_error(400, "invalid SageMaker route: " + route);
84+
+ }
85+
+ const auto it = routes.find(route);
86+
+ if (it == routes.end()) {
87+
+ return sagemaker_error(400, "unsupported SageMaker route: " + route);
88+
+ }
89+
+ server_http_req routed_req = req;
90+
+ routed_req.path = route;
91+
+ return it->second(routed_req);
92+
}
93+
94+
int main(int argc, char ** argv) {
95+
@@ -169,6 +246,38 @@
96+
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
97+
}
98+
99+
+
100+
+ const std::map<std::string, server_http_context::handler_t> sagemaker_routes = {
101+
+ {"/props", routes.post_props},
102+
+ {"/completion", routes.post_completions},
103+
+ {"/completions", routes.post_completions},
104+
+ {"/v1/completions", routes.post_completions_oai},
105+
+ {"/chat/completions", routes.post_chat_completions},
106+
+ {"/v1/chat/completions", routes.post_chat_completions},
107+
+ {"/v1/responses", routes.post_responses_oai},
108+
+ {"/responses", routes.post_responses_oai},
109+
+ {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
110+
+ {"/audio/transcriptions", routes.post_transcriptions_oai},
111+
+ {"/v1/messages", routes.post_anthropic_messages},
112+
+ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
113+
+ {"/infill", routes.post_infill},
114+
+ {"/embedding", routes.post_embeddings},
115+
+ {"/embeddings", routes.post_embeddings},
116+
+ {"/v1/embeddings", routes.post_embeddings_oai},
117+
+ {"/rerank", routes.post_rerank},
118+
+ {"/reranking", routes.post_rerank},
119+
+ {"/v1/rerank", routes.post_rerank},
120+
+ {"/v1/reranking", routes.post_rerank},
121+
+ {"/tokenize", routes.post_tokenize},
122+
+ {"/detokenize", routes.post_detokenize},
123+
+ {"/apply-template", routes.post_apply_template},
124+
+ {"/lora-adapters", routes.post_lora_adapters},
125+
+ };
126+
+
127+
+ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
128+
+ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) {
129+
+ return sagemaker_invocations(req, sagemaker_routes);
130+
+ }));
131+
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
132+
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
133+
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash
# SageMaker entrypoint for the llama.cpp container.
#
# Every environment variable named SM_LLAMACPP_<NAME> is turned into a
# llama-server command-line flag: the prefix is removed, the rest is lowercased
# and underscores become dashes. A non-empty value is passed as the flag's
# argument; an empty value yields the bare flag.
#
#   SM_LLAMACPP_CTX_SIZE=4096   ->  --ctx-size 4096
#   SM_LLAMACPP_SOME_FLAG=      ->  --some-flag
#
# --host and --port are always set by this script (see HOST / PORT below); any
# SM_LLAMACPP_HOST / SM_LLAMACPP_PORT supplied by the user is ignored.
set -euo pipefail

# Source CUDA compat for older drivers (e.g., g5 instances)
if [ -f /usr/local/bin/start_cuda_compat.sh ] \
    && command -v nvidia-smi >/dev/null 2>&1 \
    && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
# llama-server build handles those routes directly.
HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()

# IFS='=' with two variables splits on the first '=' only, so values may contain '='.
while IFS='=' read -r key value; do
    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')

    # Filter here, while the flag and its value are still paired. Filtering the
    # flattened list afterwards would drop an unrelated argument whenever
    # --host/--port was given with an empty value.
    case "$arg_name" in
        host | port)
            # Drop any user-supplied --host / --port so SageMaker can always reach the server.
            continue
            ;;
        "")
            # A variable named exactly "${PREFIX}" would otherwise produce a bare "--".
            continue
            ;;
    esac

    ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
    fi
done < <(env | grep "^${PREFIX}" || true)

ARGS+=(--host "$HOST" --port "$PORT")

# NOTE(review): every argument is echoed verbatim, so a secret passed through an
# SM_LLAMACPP_* variable will appear in the container logs.
echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2

exec /app/llama-server "${ARGS[@]}"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
# Adds the CUDA forward-compatibility libraries to LD_LIBRARY_PATH when the
# installed NVIDIA driver is older than the driver version the compat package
# was built for.
#
# This file may be sourced from a shell running with `set -euo pipefail`, so it
# must not reference unset variables or let an expected failure abort the caller.

# verlte A B
# Succeeds only when version A sorts strictly before version B under `sort -V`.
# Despite the name, equal versions make it fail.
verlte() {
    if [ "$1" = "$2" ]; then
        return 1
    fi
    [ "$1" = "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f "$COMPAT_FILE" ]; then
    # The symlink target is expected to look like libcuda.so.<driver version>;
    # everything after the second dot is taken as the version.
    # `|| true`: readlink fails when the file is not a symlink, which must not
    # abort a caller running with errexit/pipefail.
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" | cut -d'.' -f 3- || true)
    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        echo "Skipping CUDA compat setup as NVIDIA driver version could not be determined"
    elif verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        # ${VAR:+...} keeps this safe under `set -u` and avoids a trailing ':'
        # (an empty entry) when LD_LIBRARY_PATH was previously unset or empty.
        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
        echo "$LD_LIBRARY_PATH"
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi

huggingface/llamacpp/buildspec.yml

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
base_framework: &BASE_FRAMEWORK llamacpp
5+
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
6+
version: &VERSION "b9522"
7+
short_version: &SHORT_VERSION "b9522"
8+
arch_type: &ARCH_TYPE x86_64
9+
autopatch_build: "False"
10+
11+
repository_info:
12+
build_repository: &BUILD_REPOSITORY
13+
image_type: &IMAGE_TYPE inference
14+
root: huggingface/llamacpp
15+
repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
16+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
17+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
18+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
19+
20+
context:
21+
build_context: &BUILD_CONTEXT
22+
start_cuda_compat:
23+
source: build_artifacts/start_cuda_compat.sh
24+
target: start_cuda_compat.sh
25+
sagemaker_entrypoint:
26+
source: build_artifacts/sagemaker_entrypoint.sh
27+
target: sagemaker_entrypoint.sh
28+
llamacpp_sagemaker_server_patch:
29+
source: build_artifacts/llamacpp_sagemaker_server.patch
30+
target: llamacpp_sagemaker_server.patch
31+
32+
33+
images:
34+
BuildHuggingFaceLlamacppGpuCu130DockerImage:
35+
<<: *BUILD_REPOSITORY
36+
context:
37+
<<: *BUILD_CONTEXT
38+
image_size_baseline: 40000
39+
device_type: &DEVICE_TYPE gpu
40+
cuda_version: &CUDA_VERSION cu130
41+
os_version: &OS_VERSION ubuntu24.04
42+
python_version: &DOCKER_PYTHON_VERSION py3
43+
tag_python_version: &TAG_PYTHON_VERSION py312
44+
tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
45+
docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
46+
target: sagemaker
47+
build: true
48+
enable_common_stage_build: false
49+
test_configs:
50+
test_platforms:
51+
- sanity
52+
- security
53+
- sagemaker
54+
55+
BuildHuggingFaceLlamacppCpuDockerImage:
56+
<<: *BUILD_REPOSITORY
57+
context:
58+
<<: *BUILD_CONTEXT
59+
image_size_baseline: 40000
60+
device_type: &DEVICE_TYPE cpu
61+
os_version: &OS_VERSION ubuntu24.04
62+
python_version: &DOCKER_PYTHON_VERSION py3
63+
tag_python_version: &TAG_PYTHON_VERSION py312
64+
tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
65+
docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
66+
target: sagemaker
67+
build: true
68+
enable_common_stage_build: false
69+
test_configs:
70+
test_platforms:
71+
- sanity
72+
- security
73+
- sagemaker
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# llama.cpp server image for SageMaker hosting.
#
# Stages:
#   build     - clones llama.cpp at LLAMACPP_VERSION, applies the SageMaker
#               server patch and compiles the llama-server target
#   base      - runtime image holding llama-server and its shared libraries
#   sagemaker - adds the SageMaker entrypoint and package security upgrades
#
# NOTE(review): no GPU backend is enabled in the cmake configuration below, so
# this looks like the CPU variant — confirm against the buildspec.
ARG UBUNTU_VERSION=24.04
ARG LLAMACPP_VERSION=b9522

FROM ubuntu:${UBUNTU_VERSION} AS build

# Re-declared so the value of the global ARG is visible inside this stage.
ARG LLAMACPP_VERSION

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        gcc-14 \
        g++-14 \
        git \
        libgomp1 \
        libssl-dev \
        patch \
        python3 \
    && rm -rf /var/lib/apt/lists/*

ENV CC=gcc-14 \
    CXX=g++-14

WORKDIR /src/llama.cpp

RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git .

COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch

RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
RUN cmake -B build \
        -DGGML_NATIVE=OFF \
        -DGGML_BACKEND_DL=ON \
        -DGGML_CPU_ALL_VARIANTS=ON \
        -DLLAMA_BUILD_TESTS=OFF \
        . \
    && cmake --build build --config Release -j"$(nproc)" --target llama-server

# Collect every shared library produced by the build next to the server binary
# so the runtime stage can copy them in one step. -P keeps symlinks as symlinks.
RUN mkdir -p /app/lib \
    && find build -name "*.so*" -exec cp -P {} /app/lib \; \
    && cp build/bin/llama-server /app/llama-server

FROM ubuntu:${UBUNTU_VERSION} AS base

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

WORKDIR /app

# The shared libraries are copied into /app below. LD_LIBRARY_PATH is not
# defined by the base image, so it is set outright: appending an undefined
# ${LD_LIBRARY_PATH} would leave a trailing ':' (an empty entry), which the
# dynamic loader treats as the current working directory.
ENV LD_LIBRARY_PATH=/app

RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgomp1 \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete \
    && rm -rf /var/lib/apt/lists/*

COPY --from=build /app/lib/ /app/
COPY --from=build /app/llama-server /app/llama-server

FROM base AS sagemaker

COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh

# Fix several CVEs:
# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281,
# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390,
# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388,
# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389
RUN apt-get update \
    && apt-get install -y --only-upgrade \
        libssl3t64 \
        openssl \
        libtasn1-6 \
        libc6 \
        libc-bin \
        gnupg \
        gpg \
        gpgv \
    && rm -rf /var/lib/apt/lists/*

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]

0 commit comments

Comments
 (0)