aws · sirutBuasai · Jun 8, 2026 · Apr 6, 2026 · Apr 24, 2026 · Apr 27, 2026
@@ -36,7 +36,7 @@ deep_canary_mode = false
 
 [build]
 # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
-# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
+# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_vllm", "huggingface_vllm_omni", "huggingface_sglang", "huggingface_llamacpp", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
 build_frameworks = []
 
 
@@ -195,5 +195,8 @@ dlc-pr-huggingface-vllm-omni = ""
 # HuggingFace SGLang
 dlc-pr-huggingface-sglang = ""
 
+# HuggingFace llama.cpp
+dlc-pr-huggingface-llamacpp = ""
+
 # sglang
 dlc-pr-sglang = ""
@@ -0,0 +1,158 @@
+--- a/tools/server/server.cpp
++++ b/tools/server/server.cpp
+@@ -11,7 +11,9 @@
+ #include "llama.h"
+ #include "log.h"
+
++#include <algorithm>
+ #include <atomic>
++#include <cctype>
+ #include <clocale>
+ #include <exception>
+ #include <signal.h>
+@@ -69,6 +71,104 @@
+         }
+         return res;
+     };
++}
++
++// Looks up a request header by name, ignoring the case of the header's own name.
++// `name` must already be lowercase. Returns "" when no header matches.
++static std::string sagemaker_header(const server_http_req & req, const std::string & name) {
++    for (const auto & h : req.headers) {
++        std::string key = h.first;
++        std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return std::tolower(c); });
++        if (key == name) {
++            return h.second;
++        }
++    }
++    return "";
++}
++
++// Extracts the value of the "route=" entry from the
++// X-Amzn-SageMaker-Custom-Attributes header. Entries are separated by ',', ';'
++// or whitespace. The key only counts at the start of an entry, so an unrelated
++// key that merely ends in "route=" (e.g. "reroute=") is not mistaken for it.
++// Returns "" when the header or the entry is absent.
++static std::string sagemaker_route_from_attrs(const server_http_req & req) {
++    const std::string delims = ",; \t\r\n";
++    const std::string attrs = sagemaker_header(req, "x-amzn-sagemaker-custom-attributes");
++    const std::string key = "route=";
++    size_t pos = attrs.find(key);
++    // Skip occurrences that sit in the middle of a longer key.
++    while (pos != std::string::npos && pos > 0 && delims.find(attrs[pos - 1]) == std::string::npos) {
++        pos = attrs.find(key, pos + 1);
++    }
++    if (pos == std::string::npos) {
++        return "";
++    }
++    const size_t start = pos + key.size();
++    const size_t end = attrs.find_first_of(delims, start);
++    return attrs.substr(start, end == std::string::npos ? std::string::npos : end - start);
++}
++
++// Accepts only absolute, plain paths: must start with '/' and contain no "..",
++// no "://", no query string ('?') and no fragment ('#').
++static bool sagemaker_route_syntax_ok(const std::string & route) {
++    return !route.empty() && route[0] == '/' && route.find("..") == std::string::npos &&
++           route.find("://") == std::string::npos && route.find('?') == std::string::npos &&
++           route.find('#') == std::string::npos;
++}
++
++// Picks a route from the shape of the JSON body when the caller named none:
++// "messages" -> chat completions, "prompt" -> completions, "input" -> embeddings.
++// Anything else, including a body that is not a JSON object, falls back to chat completions.
++static std::string sagemaker_default_route(const server_http_req & req) {
++    // Parse without exceptions: malformed JSON yields a discarded value rather than a throw.
++    const json body = json::parse(req.body, nullptr, false);
++    if (body.is_object()) {
++        if (body.contains("messages")) {
++            return "/v1/chat/completions";
++        }
++        if (body.contains("prompt")) {
++            return "/v1/completions";
++        }
++        if (body.contains("input")) {
++            return "/v1/embeddings";
++        }
++    }
++    return "/v1/chat/completions";
++}
++
++// Builds a JSON error response with the given HTTP status and message.
++static server_http_res_ptr sagemaker_error(int status, const std::string & message) {
++    auto res = std::make_unique<server_http_res>();
++    res->status = status;
++    res->data = safe_json_to_str({
++        { "error", {
++            { "code", status },
++            { "message", message },
++            { "type", "invalid_request_error" },
++        } },
++    });
++    return res;
++}
++
++// Handles POST /invocations: resolves the target route (custom attribute first,
++// body-based default otherwise), validates it, and forwards a copy of the request
++// with its path rewritten to the matching handler in `routes`. Responds with
++// HTTP 400 when the route is malformed or not present in `routes`.
++static server_http_res_ptr sagemaker_invocations(
++        const server_http_req & req,
++        const std::map<std::string, server_http_context::handler_t> & routes) {
++    const std::string requested = sagemaker_route_from_attrs(req);
++    const std::string route = requested.empty() ? sagemaker_default_route(req) : requested;
++    if (!sagemaker_route_syntax_ok(route)) {
++        return sagemaker_error(400, "invalid SageMaker route: " + route);
++    }
++    const auto it = routes.find(route);
++    if (it == routes.end()) {
++        return sagemaker_error(400, "unsupported SageMaker route: " + route);
++    }
++    server_http_req routed_req = req;
++    routed_req.path = route;
++    return it->second(routed_req);
+ }
+
+ int main(int argc, char ** argv) {
+@@ -169,6 +269,40 @@
+         ctx_http.post("/models/unload",        ex_wrapper(models_routes->post_router_models_unload));
+     }
+
++
++    // Routes reachable through POST /invocations, keyed by the path a caller names
++    // with the "route=" custom attribute (or that is inferred from the request body).
++    const std::map<std::string, server_http_context::handler_t> sagemaker_routes = {
++        {"/props", routes.post_props},
++        {"/completion", routes.post_completions},
++        {"/completions", routes.post_completions},
++        {"/v1/completions", routes.post_completions_oai},
++        {"/chat/completions", routes.post_chat_completions},
++        {"/v1/chat/completions", routes.post_chat_completions},
++        {"/v1/responses", routes.post_responses_oai},
++        {"/responses", routes.post_responses_oai},
++        {"/v1/audio/transcriptions", routes.post_transcriptions_oai},
++        {"/audio/transcriptions", routes.post_transcriptions_oai},
++        {"/v1/messages", routes.post_anthropic_messages},
++        {"/v1/messages/count_tokens", routes.post_anthropic_count_tokens},
++        {"/infill", routes.post_infill},
++        {"/embedding", routes.post_embeddings},
++        {"/embeddings", routes.post_embeddings},
++        {"/v1/embeddings", routes.post_embeddings_oai},
++        {"/rerank", routes.post_rerank},
++        {"/reranking", routes.post_rerank},
++        {"/v1/rerank", routes.post_rerank},
++        {"/v1/reranking", routes.post_rerank},
++        {"/tokenize", routes.post_tokenize},
++        {"/detokenize", routes.post_detokenize},
++        {"/apply-template", routes.post_apply_template},
++        {"/lora-adapters", routes.post_lora_adapters},
++    };
++
++    ctx_http.get ("/ping", ex_wrapper(routes.get_health)); // SageMaker health endpoint
++    ctx_http.post("/invocations", ex_wrapper([&sagemaker_routes](const server_http_req & req) {
++        return sagemaker_invocations(req, sagemaker_routes);
++    }));
+     ctx_http.get ("/health",                   ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+     ctx_http.get ("/v1/health",                ex_wrapper(routes.get_health)); // public endpoint (no API key check)
+     ctx_http.get ("/metrics",                  ex_wrapper(routes.get_metrics));
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# SageMaker entrypoint for llama-server.
+#
+# Every SM_LLAMACPP_<NAME>=<value> environment variable becomes a
+# "--<name> <value>" argument (name lower-cased, '_' turned into '-'); an empty
+# value yields a bare "--<name>" flag. The server is then exec'd on the
+# SageMaker host and port.
+set -euo pipefail
+
+# Source CUDA compat for older drivers (e.g., g5 instances)
+if [ -f /usr/local/bin/start_cuda_compat.sh ] \
+    && command -v nvidia-smi >/dev/null 2>&1 \
+    && command -v nvcc >/dev/null 2>&1; then
+    source /usr/local/bin/start_cuda_compat.sh
+fi
+
+# SageMaker sends traffic to port 8080 on /ping and /invocations. The custom
+# llama-server build handles those routes directly.
+HOST="${LLAMACPP_SAGEMAKER_HOST:-0.0.0.0}"
+PORT="${SAGEMAKER_BIND_TO_PORT:-${LLAMACPP_SAGEMAKER_PORT:-8080}}"
+
+PREFIX="SM_LLAMACPP_"
+ARG_PREFIX="--"
+
+ARGS=()
+
+while IFS= read -r entry; do
+    # Split on the first '=' only so the value is taken verbatim.
+    key="${entry%%=*}"
+    value="${entry#*=}"
+    arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+
+    # Ignore user-supplied host/port so SageMaker can always reach the server.
+    # Filtering by name here, instead of dropping "--host <next arg>" afterwards,
+    # keeps an empty SM_LLAMACPP_HOST= from swallowing the flag that follows it.
+    case "$arg_name" in
+        host|port) continue ;;
+    esac
+
+    ARGS+=("${ARG_PREFIX}${arg_name}")
+    if [ -n "$value" ]; then
+        ARGS+=("$value")
+    fi
+done < <(env | grep "^${PREFIX}" || true)
+
+ARGS+=(--host "$HOST" --port "$PORT")
+
+# NOTE(review): every argument value is logged verbatim, including anything
+# sensitive passed through SM_LLAMACPP_* - confirm that is acceptable.
+echo "[sagemaker] llama-server args: ${ARGS[*]}" >&2
+
+exec /app/llama-server "${ARGS[@]}"
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Prepends the CUDA compat libraries to LD_LIBRARY_PATH when the installed
+# NVIDIA driver is older than the version encoded in the compat libcuda symlink.
+# Written so that it can be sourced by a script running under "set -euo pipefail".
+
+# Succeeds when version $1 sorts strictly before version $2; equal versions fail,
+# despite the "lte" in the name.
+verlte() {
+  [ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
+}
+
+COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
+if [ -f "$COMPAT_FILE" ]; then
+  # The version is everything from the third '.'-separated field of the symlink
+  # target onwards. "|| true" keeps a failing readlink from aborting a caller
+  # that runs with errexit and pipefail.
+  CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink "$COMPAT_FILE" | cut -d'.' -f 3- || true)
+  echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
+  NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+  if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
+    NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
+  fi
+  echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
+  if [ -z "$NVIDIA_DRIVER_VERSION" ] || [ -z "$CUDA_COMPAT_MAX_DRIVER_VERSION" ]; then
+    # Nothing to compare against; leave the library path untouched.
+    echo "Skipping CUDA compat setup as the driver version could not be determined"
+  elif verlte "$NVIDIA_DRIVER_VERSION" "$CUDA_COMPAT_MAX_DRIVER_VERSION"; then
+    echo "Adding CUDA compat to LD_LIBRARY_PATH"
+    # Append the previous value only when it is set and non-empty: this is safe
+    # under "set -u" and avoids a trailing ':' (an empty search-path entry).
+    export LD_LIBRARY_PATH="/usr/local/cuda/compat${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+    echo "$LD_LIBRARY_PATH"
+  else
+    echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
+  fi
+else
+  echo "Skipping CUDA compat setup as package not found"
+fi
@@ -0,0 +1,73 @@
+account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
+prod_account_id: &PROD_ACCOUNT_ID 763104351884
+region: &REGION <set-$REGION-in-environment>
+base_framework: &BASE_FRAMEWORK llamacpp
+framework: &FRAMEWORK !join [ "huggingface_", *BASE_FRAMEWORK]
+version: &VERSION "b9522"
+short_version: &SHORT_VERSION "b9522"
+arch_type: &ARCH_TYPE x86_64
+autopatch_build: "False"
+
+repository_info:
+  build_repository: &BUILD_REPOSITORY
+    image_type: &IMAGE_TYPE inference
+    root: huggingface/llamacpp
+    repository_name: &REPOSITORY_NAME !join [ "pr", "-", "huggingface", "-", *BASE_FRAMEWORK ]
+    repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
+    release_repository_name: &RELEASE_REPOSITORY_NAME !join [ "huggingface", "-", *BASE_FRAMEWORK ]
+    release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
+
+context:
+  build_context: &BUILD_CONTEXT
+    start_cuda_compat:
+      source: build_artifacts/start_cuda_compat.sh
+      target: start_cuda_compat.sh
+    sagemaker_entrypoint:
+      source: build_artifacts/sagemaker_entrypoint.sh
+      target: sagemaker_entrypoint.sh
+    llamacpp_sagemaker_server_patch:
+      source: build_artifacts/llamacpp_sagemaker_server.patch
+      target: llamacpp_sagemaker_server.patch
+
+
+images:
+  BuildHuggingFaceLlamacppGpuCu130DockerImage:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 40000
+    device_type: &DEVICE_TYPE gpu
+    cuda_version: &CUDA_VERSION cu130
+    os_version: &OS_VERSION ubuntu24.04
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *CUDA_VERSION, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - sagemaker
+
+  BuildHuggingFaceLlamacppCpuDockerImage:
+    <<: *BUILD_REPOSITORY
+    context:
+      <<: *BUILD_CONTEXT
+    image_size_baseline: 40000
+    device_type: &DEVICE_TYPE cpu
+    os_version: &OS_VERSION ubuntu24.04
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    tag: !join [ *VERSION, '-', *DEVICE_TYPE, '-', *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    target: sagemaker
+    build: true
+    enable_common_stage_build: false
+    test_configs:
+      test_platforms:
+        - sanity
+        - security
+        - sagemaker
@@ -0,0 +1,97 @@
+ARG UBUNTU_VERSION=24.04
+ARG LLAMACPP_VERSION=b9522
+
+# Build stage: compiles llama-server from the tagged llama.cpp sources with the
+# SageMaker server patch applied.
+FROM ubuntu:${UBUNTU_VERSION} AS build
+
+ARG LLAMACPP_VERSION
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        gcc-14 \
+        g++-14 \
+        git \
+        libgomp1 \
+        libssl-dev \
+        patch \
+        python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV CC=gcc-14 \
+    CXX=g++-14
+
+WORKDIR /src/llama.cpp
+
+RUN git clone --branch "${LLAMACPP_VERSION}" --depth 1 https://github.com/ggml-org/llama.cpp.git .
+
+COPY llamacpp_sagemaker_server.patch /tmp/llamacpp_sagemaker_server.patch
+
+RUN patch -p1 < /tmp/llamacpp_sagemaker_server.patch
+RUN cmake -B build \
+    -DGGML_NATIVE=OFF \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON \
+    -DLLAMA_BUILD_TESTS=OFF \
+    . \
+    && cmake --build build --config Release -j"$(nproc)" --target llama-server
+
+# Stage the shared libraries and the server binary for the runtime image.
+RUN mkdir -p /app/lib \
+    && find build -name "*.so*" -exec cp -P {} /app/lib \; \
+    && cp build/bin/llama-server /app/llama-server
+
+# Runtime stage: only the runtime dependencies plus the build artifacts.
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+WORKDIR /app
+
+# The libraries are copied into /app below. The inherited value is appended only
+# when it is non-empty, so the path never ends in ':' (an empty entry makes the
+# dynamic loader search the current working directory).
+ENV LD_LIBRARY_PATH=/app${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        libgomp1 \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=build /app/lib/ /app/
+COPY --from=build /app/llama-server /app/llama-server
+
+# SageMaker stage: adds the serving entrypoint on top of the runtime image.
+FROM base AS sagemaker
+
+COPY --chmod=0755 sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
+
+# Fix several CVEs:
+# CVE-2026-31789, CVE-2025-15467, CVE-2025-13151, CVE-2025-15281,
+# CVE-2025-69419, CVE-2025-68973, CVE-2025-69421, CVE-2026-28390,
+# CVE-2025-69420, CVE-2026-0915, CVE-2026-0861, CVE-2026-28388,
+# CVE-2026-31790, CVE-2026-28387, CVE-2026-28389
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --only-upgrade \
+        libssl3t64 \
+        openssl \
+        libtasn1-6 \
+        libc6 \
+        libc-bin \
+        gnupg \
+        gpg \
+        gpgv \
+    && rm -rf /var/lib/apt/lists/*
+
+ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]