Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c695fc8
[WIP] Add HuggingFace LlamaCpp support
ehcalabres Apr 6, 2026
5d5118f
[WIP] Add HuggingFace LlamaCpp support with Dockerfiles, buildspec, a…
ehcalabres Apr 24, 2026
d2cc69f
Update Docker base name and coverage report framework for HuggingFace…
ehcalabres Apr 27, 2026
1afc833
Update local & sagemaker tests for llama.cpp DLC
ehcalabres Apr 27, 2026
ecdd5a2
Update dlc_developer_config.toml
ehcalabres Apr 27, 2026
3f69b07
Disable training container build in dlc_developer_config.toml
ehcalabres Apr 27, 2026
cdcae10
Merge branch 'master' into add-hf-llamacpp-dlc
ehcalabres Apr 27, 2026
aba0ce6
Merge branch 'master' into add-hf-llamacpp-dlc
ehcalabres Apr 27, 2026
f5bcbc3
Refactor SageMaker integration for llama.cpp: replace Python proxy wi…
ehcalabres Apr 29, 2026
529537d
Remove unnecesary resources file
ehcalabres Apr 30, 2026
eeb0049
Minimal style changes
ehcalabres Apr 30, 2026
0773825
Update Dockerfiles to address multiple CVEs
ehcalabres Apr 30, 2026
a8af8de
Merge branch 'master' into add-hf-llamacpp-dlc
sirutBuasai May 7, 2026
f54cdec
Merge branch 'master' into add-hf-llamacpp-dlc
sirutBuasai May 12, 2026
2074c6d
Merge branch 'master' into add-hf-llamacpp-dlc
sirutBuasai May 12, 2026
988efb4
Fix path for Huggingface Llamacpp buildspec in dlc_developer_config.toml
ehcalabres May 12, 2026
af6ba95
Update py_version extraction in generate_sagemaker_pytest_cmd to hand…
ehcalabres May 13, 2026
473a6d6
Remove transformers version from buildspec tag generation in Llamacpp…
ehcalabres May 13, 2026
3cc4e73
Enhance image_builder to require transformers_version for HuggingFace…
ehcalabres May 19, 2026
48cc589
Update Llamacpp version to b9522 & skip llamacpp scanning and oss com…
ehcalabres Jun 5, 2026
fc3c495
Merge branch 'master' into add-hf-llamacpp-dlc
ehcalabres Jun 5, 2026
d6bcfb0
Revert partner_developer and build_frameworks in dlc_developer_config…
ehcalabres Jun 8, 2026
1661dc4
Merge branch 'master' into add-hf-llamacpp-dlc
ehcalabres Jun 8, 2026
2909d89
Apply black formatting
ehcalabres Jun 8, 2026
88ee612
Fix image_builder.py formatting
ehcalabres Jun 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ deep_canary_mode = false

[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []


Expand Down Expand Up @@ -195,5 +195,8 @@ dlc-pr-huggingface-vllm-omni = ""
# HuggingFace SGLang
dlc-pr-huggingface-sglang = ""

# HuggingFace Llamacpp
dlc-pr-huggingface-llamacpp = ""

# sglang
dlc-pr-sglang = ""
133 changes: 133 additions & 0 deletions huggingface/llamacpp/build_artifacts/llamacpp_sagemaker_server.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -11,7 +11,9 @@
#include "llama.h"
#include "log.h"

+#include <algorithm>
#include <atomic>
+#include <cctype>
#include <clocale>
#include <exception>
#include <signal.h>
@@ -69,6 +71,107 @@
}
return res;
};
+}
+
+// Returns the value of request header `name`, or "" when the header is absent.
+// Incoming header names are lower-cased before comparison, so `name` must be
+// given in lower case.
+static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
+    for (const auto & h : req.headers) {
+        std::string key = h.first;
+        std::transform(key.begin(), key.end(), key.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (key == name) {
+            return h.second;
+        }
+    }
+    return "";
+}
+
+// Extracts the requested route from X-Amzn-SageMaker-Custom-Attributes.
+// The header is treated as a list of tokens separated by ',', ';' or whitespace;
+// the value of the first `route=<path>` token is returned, or "" if there is none.
+// `route=` must begin a token, so an unrelated attribute whose name merely ends
+// in "route" (e.g. `subroute=/x`) is not mistaken for the route.
+static std::string sagemaker_route_from_attrs(const server_http_req & req) {
+    const std::string attrs  = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
+    const std::string key    = "route=";
+    const std::string delims = ",; \t\r\n";
+    for (size_t pos = attrs.find(key); pos != std::string::npos; pos = attrs.find(key, pos + 1)) {
+        const bool at_token_start = pos == 0 || delims.find(attrs[pos - 1]) != std::string::npos;
+        if (!at_token_start) {
+            continue;
+        }
+        const size_t start = pos + key.size();
+        const size_t end   = attrs.find_first_of(delims, start);
+        return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
+    }
+    return "";
+}
+
+// Cheap syntactic filter applied before the route table lookup: the route must be
+// a non-empty absolute path and must not contain "..", "://", '?' or '#'.
+static bool sagemaker_route_syntax_ok(const std::string & route) {
+    return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos &&
+           route.find("://") == std::string::npos && route.find('?') == std::string::npos &&
+           route.find('#') == std::string::npos;
+}
+
+// Picks a route when the caller did not name one, based on top-level keys of the
+// JSON body: "messages" -> chat completions, "prompt" -> completions,
+// "input" -> embeddings (checked in that order). Anything else, including a body
+// that is not a JSON object, falls back to chat completions.
+static std::string sagemaker_default_route(const server_http_req & req) {
+    // Third argument = allow_exceptions (assuming `json` is nlohmann's type): a malformed body does not throw.
+    const json body = json::parse(req.body, nullptr, false);
+    if (body.is_object()) {
+        if (body.contains("messages")) {
+            return "/v1/chat/completions";
+        }
+        if (body.contains("prompt")) {
+            return "/v1/completions";
+        }
+        if (body.contains("input")) {
+            return "/v1/embeddings";
+        }
+    }
+    return "/v1/chat/completions";
+}
+
+// Builds an error response with HTTP status `status` and a JSON body of the form
+// {"error": {"code": status, "message": message, "type": "invalid_request_error"}}.
+static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
+    auto res = std::make_unique<server_http_res>();
+    res->status = status;
+    res->data = safe_json_to_str({
+        { "error", {
+            { "code", status },
+            { "message", message },
+            { "type", "invalid_request_error" },
+        } },
+    });
+    return res;
+}
+
+// Handler for POST /invocations. Resolves the target route (explicit `route=`
+// attribute first, body-based default otherwise), rejects it with a 400 if it is
+// malformed or not present in `routes`, and otherwise invokes the matching handler
+// on a copy of the request whose path has been rewritten to that route.
+static server_http_res_ptr sagemaker_invocations(
+        const server_http_req & req,
+        const std::map<std::string, server_http_context::handler_t> & routes) {
+    const std::string requested = sagemaker_route_from_attrs(req);
+    const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
+    if (!sagemaker_route_syntax_ok(route)) {
+        return sagemaker_error(400, "invalid SageMaker route: " + route);
+    }
+    const auto it = routes.find(route);
+    if (it == routes.end()) {
+        return sagemaker_error(400, "unsupported SageMaker route: " + route);
+    }
+    server_http_req routed_req = req;
+    routed_req.path = route;
+    return it->second(routed_req);
}

int main(int argc, char ** argv) {
@@ -169,6 +272,38 @@
ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
}

+
+ const std::map<std::string, server_http_context::handler_t> sagemaker_routes = { // allowlist of routes POST /invocations may dispatch to; sagemaker_invocations answers 400 for any route not listed here
+ {"/props", routes.post_props},
+ {"/completion", routes.post_completions},
+ {"/completions", routes.post_completions},
+ {"/v1/completions", routes.post_completions_oai},
+ {"/chat/completions", routes.post_chat_completions},
+ {"/v1/chat/completions", routes.post_chat_completions},
+ {"/v1/responses", routes.post_responses_oai},
+ {"/responses", routes.post_responses_oai},
+ {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/audio/transcriptions", routes.post_transcriptions_oai},
+ {"/v1/messages", routes.post_anthropic_messages},
+ {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
+ {"/infill", routes.post_infill},
+ {"/embedding", routes.post_embeddings},
+ {"/embeddings", routes.post_embeddings},
+ {"/v1/embeddings", routes.post_embeddings_oai},
+ {"/rerank", routes.post_rerank},
+ {"/reranking", routes.post_rerank},
+ {"/v1/rerank", routes.post_rerank},
+ {"/v1/reranking", routes.post_rerank},
+ {"/tokenize", routes.post_tokenize},
+ {"/detokenize", routes.post_detokenize},
+ {"/apply-template", routes.post_apply_template},
+ {"/lora-adapters", routes.post_lora_adapters},
+ };
+
+ ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
+ ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) { // table is captured by reference — NOTE(review): relies on sagemaker_routes outliving the HTTP server; confirm against the rest of main
+ return sagemaker_invocations(req, sagemaker_routes);
+ }));
ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics));
49 changes: 49 additions & 0 deletions huggingface/llamacpp/build_artifacts/sagemaker_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
#
# SageMaker entrypoint for the llama.cpp container.
#
# Every exported SM_LLAMACPP_<NAME>=<value> variable becomes a llama-server
# argument `--<name> <value>` (name lower-cased, '_' replaced by '-'); an empty
# value produces a bare flag. The bind address and port are always set by this
# script so SageMaker can reach the server, then /app/llama-server is exec'd.
#
# Environment:
#   SM_LLAMACPP_*            llama-server options (…_HOST / …_PORT are ignored)
#   SAGEMAKER_BIND_TO_PORT   port to listen on (takes precedence)
#   LLAMACPP_SAGEMAKER_PORT  fallback port (default 8080)
#   LLAMACPP_SAGEMAKER_HOST  bind address (default 0.0.0.0)
set -euo pipefail

# Source CUDA compat for older drivers (e.g., g5 instances)
if [ -f /usr/local/bin/start_cuda_compat.sh ] \
    && command -v nvidia-smi >/dev/null 2>&1 \
    && command -v nvcc >/dev/null 2>&1; then
    source /usr/local/bin/start_cuda_compat.sh
fi

# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
# llama-server build handles those routes directly.
HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"

PREFIX="SM_LLAMACPP_"
ARG_PREFIX="--"

ARGS=()      # arguments actually passed to llama-server
LOG_ARGS=()  # the same arguments with secret-looking values masked (logging only)

# Iterate over exported variable *names* and read each value by indirection,
# rather than parsing `env` output line by line: values that contain newlines
# (chat templates, grammars, ...) would otherwise be truncated.
while IFS= read -r var_name; do
    value="${!var_name}"

    arg_name="${var_name#"${PREFIX}"}"
    arg_name="${arg_name,,}"      # lower-case
    arg_name="${arg_name//_/-}"   # FOO_BAR -> foo-bar

    # Skip a bare "SM_LLAMACPP_" (would yield a lone "--") and drop any
    # user-supplied host/port so SageMaker can always reach the server.
    # Filtering by name here means an empty-valued HOST/PORT can never cause
    # an unrelated neighbouring argument to be discarded.
    case "$arg_name" in
        ""|host|port) continue ;;
    esac

    ARGS+=("${ARG_PREFIX}${arg_name}")
    LOG_ARGS+=("${ARG_PREFIX}${arg_name}")
    if [ -n "$value" ]; then
        ARGS+=("$value")
        # Keep credentials (e.g. --api-key, --hf-token) out of the container logs.
        case "$arg_name" in
            *key*|*token*|*secret*|*password*) LOG_ARGS+=("***") ;;
            *) LOG_ARGS+=("$value") ;;
        esac
    fi
done < <(compgen -e "${PREFIX}" || true)

ARGS+=(--host "$HOST" --port "$PORT")
LOG_ARGS+=(--host "$HOST" --port "$PORT")

echo "[sagemaker] llama-server args: ${LOG_ARGS[*]}" >&2

exec /app/llama-server "${ARGS[@]}"
25 changes: 25 additions & 0 deletions huggingface/llamacpp/build_artifacts/start_cuda_compat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Prepends the CUDA forward-compatibility libraries to LD_LIBRARY_PATH when the
# installed NVIDIA driver is older than the driver version the compat package
# was built for.
#
# This file is meant to be *sourced* (the export has to reach the caller), and
# the caller may run under `set -euo pipefail`. Nothing here may therefore exit
# non-zero unguarded or expand an unset variable.

# verlte A B
# Succeeds when version A sorts strictly before version B according to
# `sort -V`. Despite the name, equal versions return failure.
verlte() {
    [ "$1" != "$2" ] && [ "$1" = "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]
}

COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
if [ -f "$COMPAT_FILE" ]; then
    # The version is taken from the symlink target: everything from the third
    # dot-separated field onwards. `|| true` keeps a non-symlink from aborting
    # a caller that has pipefail/errexit enabled.
    CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" 2>/dev/null | cut -d'.' -f 3- || true)
    echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
    if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
        # Fall back to nvidia-smi when /proc did not yield a version.
        NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
    fi
    echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
    if [ -z "$NVIDIA_DRIVER_VERSION" ] || [ -z "$CUDA_COMPAT_MAX_DRIVER_VERSION" ]; then
        # Without both versions no comparison is possible; leave the loader path alone.
        echo "Skipping CUDA compat setup as the driver or compat version could not be determined"
    elif verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
        echo "Adding CUDA compat to LD_LIBRARY_PATH"
        # ${VAR:+...} tolerates an unset LD_LIBRARY_PATH under `set -u` and
        # avoids a trailing ':' (an empty entry means "current directory").
        export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
        echo "$LD_LIBRARY_PATH"
    else
        echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
    fi
else
    echo "Skipping CUDA compat setup as package not found"
fi
73 changes: 73 additions & 0 deletions huggingface/llamacpp/buildspec.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Build specification for the HuggingFace llama.cpp inference images.
# account_id and region hold placeholder values that name the environment
# variable expected to supply them.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
prod_account_id: &PROD_ACCOUNT_ID 763104351884
region: &REGION <set-$REGION-in-environment>
base_framework: &BASE_FRAMEWORK llamacpp
# Yields "huggingface_llamacpp" (assuming the custom !join tag concatenates its items).
framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
# llama.cpp release tag. VERSION feeds the image tag; SHORT_VERSION feeds the
# docker/<version>/ directory used in each image's docker_file.
version: &VERSION "b9522"
short_version: &SHORT_VERSION "b9522"
arch_type: &ARCH_TYPE x86_64
autopatch_build: "False"

# Repository settings shared by every image entry via `<<: *BUILD_REPOSITORY`.
repository_info:
build_repository: &BUILD_REPOSITORY
image_type: &IMAGE_TYPE inference
root: huggingface/llamacpp
repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]

# Files staged for the Docker build: `source` is relative to `root`, `target` is
# the name the Dockerfile sees (e.g. Dockerfile.cpu COPYs sagemaker_entrypoint.sh
# and llamacpp_sagemaker_server.patch by these target names).
context:
build_context: &BUILD_CONTEXT
start_cuda_compat:
source: build_artifacts/start_cuda_compat.sh
target: start_cuda_compat.sh
sagemaker_entrypoint:
source: build_artifacts/sagemaker_entrypoint.sh
target: sagemaker_entrypoint.sh
llamacpp_sagemaker_server_patch:
source: build_artifacts/llamacpp_sagemaker_server.patch
target: llamacpp_sagemaker_server.patch


# One entry per image to build. Each entry merges the shared repository and
# build-context settings and then adds its own device/OS specifics.
images:
# GPU image. With the values below, tag joins to "b9522-gpu-cu130-ubuntu24.04"
# and docker_file to "docker/b9522/cu130/Dockerfile.gpu" (assuming !join concatenates).
BuildHuggingFaceLlamacppGpuCu130DockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE gpu
cuda_version: &CUDA_VERSION cu130
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
# Presumably the Dockerfile stage to build; Dockerfile.cpu defines a stage named "sagemaker".
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker

# CPU image. tag joins to "b9522-cpu-ubuntu24.04" and docker_file to
# "docker/b9522/Dockerfile.cpu" (assuming !join concatenates).
# The DEVICE_TYPE / OS_VERSION / python anchors are re-declared here; a YAML
# alias resolves to the most recent preceding anchor, so the aliases below pick
# up these values rather than the GPU entry's.
BuildHuggingFaceLlamacppCpuDockerImage:
<<: *BUILD_REPOSITORY
context:
<<: *BUILD_CONTEXT
image_size_baseline: 40000
device_type: &DEVICE_TYPE cpu
os_version: &OS_VERSION ubuntu24.04
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py312
tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
build: true
enable_common_stage_build: false
test_configs:
test_platforms:
- sanity
- security
- sagemaker
89 changes: 89 additions & 0 deletions huggingface/llamacpp/docker/b9522/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Global build arguments. ARGs declared before the first FROM are only visible
# to FROM lines; LLAMACPP_VERSION is therefore re-declared inside the build stage.
ARG UBUNTU_VERSION=24.04
ARG LLAMACPP_VERSION=b9522

# ---- build stage: compile a patched llama-server from source ----
FROM ubuntu:${UBUNTU_VERSION} AS build

ARG LLAMACPP_VERSION

# Toolchain and build-time dependencies (discarded with this stage).
RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
gcc-14 \
g++-14 \
git \
libgomp1 \
libssl-dev \
patch \
python3 \
&& rm -rf /var/lib/apt/lists/*

# Make CMake pick the GCC 14 compilers installed above.
ENV CC=gcc-14 \
CXX=g++-14

WORKDIR /src/llama.cpp

# Shallow clone of exactly the pinned llama.cpp tag.
RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git .

COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch

# Apply the SageMaker patch (adds /ping and /invocations to tools/server/server.cpp),
# then build only the llama-server target.
# NOTE(review): GGML_NATIVE=OFF presumably avoids tuning for the build host's CPU, and
# GGML_BACKEND_DL / GGML_CPU_ALL_VARIANTS presumably produce loadable per-CPU backends —
# confirm against the llama.cpp build documentation.
RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
RUN cmake -B build \
-DGGML_NATIVE=OFF \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
-DLLAMA_BUILD_TESTS=OFF \
. \
&& cmake --build build --config Release -j"$(nproc)" --target llama-server

# Collect the artifacts for the runtime stage: every shared library found under
# build/ (cp -P copies symlinks as symlinks) plus the server binary.
RUN mkdir -p /app/lib \
&& find build -name "*.so*" -exec cp -P {} /app/lib \; \
&& cp build/bin/llama-server /app/llama-server

# ---- base stage: minimal runtime image with llama-server and its shared libraries ----
FROM ubuntu:${UBUNTU_VERSION} AS base

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="1"

WORKDIR /app

# The shared libraries copied from the build stage live next to the binary in /app.
# Do not append ${LD_LIBRARY_PATH} here: it is unset in the ubuntu base image, so
# the result would be "/app:" and the trailing empty entry makes the dynamic
# loader search the current working directory as well.
ENV LD_LIBRARY_PATH=/app

# Runtime dependencies only, followed by aggressive cache cleanup to keep the layer small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        libgomp1 \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete \
    && rm -rf /var/lib/apt/lists/*

# /app/lib/* is flattened into /app so the libraries sit beside llama-server.
COPY --from=build /app/lib/ /app/
COPY --from=build /app/llama-server /app/llama-server

# ---- sagemaker stage: runtime image plus the SageMaker entrypoint ----
FROM base AS sagemaker

# Installed executable (0755); it translates SM_LLAMACPP_* variables into
# llama-server arguments and exec's /app/llama-server.
COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh

# Fix several CVEs:
# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281,
# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390,
# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388,
# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389
# --only-upgrade upgrades the listed packages only if they are already
# installed; it does not pull in packages that are absent from the image.
RUN apt-get update \
&& apt-get install -y --only-upgrade \
libssl3t64 \
openssl \
libtasn1-6 \
libc6 \
libc-bin \
gnupg \
gpg \
gpgv \
&& rm -rf /var/lib/apt/lists/*

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
Loading
Loading