Skip to content

Commit 2e9d007

Browse files
authored
feat: harden model-engine runtime on chainguard (#809)
* feat: harden model-engine runtime on chainguard
* fix: restore runtime kubectl assets
* fix: tighten runtime binary handling
* fix: eliminate remaining runtime binary highs
* fix: honor target architecture for runtime binaries
* fix: restore CI test compatibility
* fix: remove coreutils dependency from migration script
* fix: use shell-based readiness probes
* fix: remove endpoint builder shell dependencies
* style: format remote build helper
* fix: normalize endpoint build context paths
* fix: use writable build context temp dirs
* fix: unblock simple bundle endpoint builds
* test: cover remote build diff paths
* test: fix remote build credential assertion
* fix: address review feedback on build context handling
* fix: keep temp build contexts out of archives
* fix: avoid archiving temp build contexts
* fix: address runtime library and ignore matching reviews
* fix: restore root-only ignore glob behavior
* test: align archive ignore coverage with matcher semantics
* fix: skip rewriting build context root args
1 parent 2cccc15 commit 2e9d007

13 files changed

Lines changed: 634 additions & 149 deletions

File tree

charts/model-engine/templates/cacher_deployment.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ spec:
4848
readinessProbe:
4949
exec:
5050
command:
51-
- cat
52-
- /tmp/readyz
51+
- bash
52+
- -c
53+
- test -f /tmp/readyz
5354
command:
5455
- dumb-init
5556
- --

charts/model-engine/templates/endpoint_builder_deployment.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ spec:
4949
readinessProbe:
5050
exec:
5151
command:
52-
- cat
53-
- /tmp/readyz
52+
- bash
53+
- -c
54+
- test -f /tmp/readyz
5455
command:
5556
- dumb-init
5657
- --

model-engine/Dockerfile

Lines changed: 49 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,70 @@
1-
# syntax = docker/dockerfile:experimental
1+
# syntax = docker/dockerfile:1
22

3-
# --- Builder: compile C extensions (pycurl, etc.) and install Python packages ---
4-
FROM python:3.13-slim AS builder
3+
FROM cgr.dev/chainguard/python:latest-dev AS builder
54

5+
USER root
66
WORKDIR /workspace
7+
ARG TARGETARCH
78

8-
RUN apt-get update && apt-get install -y --no-install-recommends \
9+
RUN apk add --no-cache \
10+
bash \
11+
build-base \
12+
curl \
13+
curl-dev \
14+
dumb-init \
915
git \
10-
gcc \
11-
build-essential \
12-
libssl-dev \
13-
libcurl4-openssl-dev \
14-
&& rm -rf /var/lib/apt/lists/*
16+
go \
17+
openssl-dev \
18+
rsync
1519

16-
RUN pip install pip==24.2 setuptools
17-
RUN pip install awscli==1.34.28 --no-cache-dir
20+
RUN python -m venv /workspace/venv
21+
ENV PATH="/workspace/venv/bin:/usr/sbin:/usr/bin:/sbin:/bin"
1822

19-
WORKDIR /workspace/model-engine/
20-
COPY model-engine/requirements-test.txt requirements-test.txt
23+
WORKDIR /workspace/model-engine
2124
COPY model-engine/requirements.txt requirements.txt
2225
COPY model-engine/requirements_override.txt requirements_override.txt
23-
RUN pip install -r requirements-test.txt --no-cache-dir
24-
RUN pip install -r requirements.txt --no-cache-dir
25-
# NOTE: aioboto3==10.4.0 -> aiobotocore==2.4.2 -> urllib3<1.27, which downgrades urllib3
26-
# from 2.x back to 1.26.x. CVE-2023-43804, CVE-2023-45803, CVE-2024-37891 remain.
27-
# Fix: upgrade aioboto3 to >=15.x (separate PR — breaking API changes).
26+
RUN pip install --upgrade pip==24.2 setuptools cmake setuptools-rust
27+
RUN pip install -r requirements.txt --no-cache-dir --no-build-isolation
2828
RUN pip install -r requirements_override.txt --no-cache-dir
2929
COPY model-engine/setup.py setup.py
3030
COPY model-engine/model_engine_server model_engine_server
31+
COPY model-engine/service_configs service_configs
3132
RUN pip install -e .
3233

33-
# --- Runtime: no build tools (eliminates linux-libc-dev and python3.13 CVEs) ---
34-
FROM python:3.13-slim AS model-engine
35-
36-
WORKDIR /workspace
37-
38-
# Runtime-only system deps (vim omitted: multiple unpatched HIGH CVEs in Debian 13.4)
39-
RUN apt-get update && apt-get install -y --no-install-recommends \
40-
dumb-init \
41-
git \
42-
openssh-client \
43-
curl \
44-
procps \
45-
htop \
46-
libcurl4 \
47-
&& rm -rf /var/lib/apt/lists/*
48-
49-
# Install aws-iam-authenticator (architecture-aware)
50-
RUN ARCH=$(uname -m) && \
51-
if [ "$ARCH" = "aarch64" ]; then \
52-
curl -fLo /bin/aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_arm64; \
53-
else \
54-
curl -fLo /bin/aws-iam-authenticator https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.7.11/aws-iam-authenticator_0.7.11_linux_amd64; \
55-
fi && \
56-
chmod +x /bin/aws-iam-authenticator
34+
RUN mkdir -p /tmp/runtime-bin /tmp/runtime-libs && \
35+
cp /bin/bash /tmp/runtime-bin/bash && \
36+
cp /usr/bin/dumb-init /tmp/runtime-bin/dumb-init && \
37+
cp /usr/bin/git /tmp/runtime-bin/git && \
38+
cp -R /usr/libexec/git-core /tmp/runtime-bin/git-core && \
39+
cp /usr/lib/libpcre2-8.so.0* /tmp/runtime-libs/ && \
40+
cp /usr/lib/libcurl.so.4* /tmp/runtime-libs/ && \
41+
cp /usr/lib/libreadline.so.8* /tmp/runtime-libs/ && \
42+
cp /usr/lib/libtinfo.so.6* /tmp/runtime-libs/ && \
43+
cp /usr/lib/libz.so.1* /tmp/runtime-libs/ && \
44+
git clone --depth 1 --branch v1.35.3 https://github.com/kubernetes/kubernetes.git /tmp/k8s && \
45+
cd /tmp/k8s && \
46+
GOTOOLCHAIN=local KUBE_BUILD_PLATFORMS=linux/${TARGETARCH} make WHAT=cmd/kubectl && \
47+
cp _output/local/bin/linux/${TARGETARCH}/kubectl /tmp/runtime-bin/kubectl && \
48+
GOBIN=/tmp/runtime-bin GOOS=linux GOARCH=${TARGETARCH} go install sigs.k8s.io/aws-iam-authenticator/cmd/aws-iam-authenticator@v0.7.11
5749

58-
# Install kubectl (architecture-aware)
59-
RUN ARCH=$(uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/') && \
60-
curl -fLO "https://dl.k8s.io/release/v1.35.3/bin/linux/${ARCH}/kubectl" && \
61-
chmod +x kubectl && \
62-
mv kubectl /usr/local/bin/kubectl
50+
FROM cgr.dev/chainguard/python:latest AS model-engine
6351

64-
# Copy Python packages, entry-point scripts, and source tree from builder
65-
COPY --from=builder /usr/local/lib/python3.13/site-packages /usr/local/lib/python3.13/site-packages
66-
COPY --from=builder /usr/local/bin /usr/local/bin
67-
COPY --from=builder /workspace/model-engine /workspace/model-engine
68-
69-
RUN useradd --create-home --shell /bin/bash nonroot && \
70-
chown -R nonroot:nonroot /workspace
52+
USER root
53+
WORKDIR /workspace
7154

72-
COPY integration_tests /workspace/integration_tests
55+
COPY --from=builder --chown=nonroot:nonroot /workspace/venv /workspace/venv
56+
COPY --from=builder --chown=nonroot:nonroot /workspace/model-engine /workspace/model-engine
57+
COPY --from=builder /tmp/runtime-bin/bash /bin/bash
58+
COPY --from=builder /tmp/runtime-bin/dumb-init /usr/bin/dumb-init
59+
COPY --from=builder /tmp/runtime-bin/git /usr/bin/git
60+
COPY --from=builder /tmp/runtime-bin/git-core /usr/libexec/git-core
61+
COPY --from=builder /tmp/runtime-bin/kubectl /usr/local/bin/kubectl
62+
COPY --from=builder /tmp/runtime-bin/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator
63+
COPY --from=builder /tmp/runtime-libs/ /usr/lib/
7364

74-
WORKDIR /workspace
75-
ENV PYTHONPATH /workspace
76-
ENV WORKSPACE /workspace
65+
ENV PATH="/workspace/venv/bin:/usr/local/bin:/usr/libexec/git-core:/usr/bin:/bin"
66+
ENV PYTHONPATH=/workspace
67+
ENV WORKSPACE=/workspace
7768

7869
USER nonroot
7970
EXPOSE 5000

model-engine/model_engine_server/common/dtos/llms/vllm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Dict, List, Optional, Union
1+
from typing import Any, Dict, List, Optional, Union, cast
22

33
from model_engine_server.common.pydantic_types import BaseModel, Field
44
from model_engine_server.common.types.gen.openai import (
@@ -275,7 +275,7 @@ class VLLMSamplingParams(BaseModel):
275275
(canonical beam search algorithm).""",
276276
)
277277
stop_token_ids: Optional[List[int]] = Field(
278-
default_factory=list,
278+
default_factory=lambda: cast(List[int], []),
279279
description="""List of tokens that stop the generation when they are
280280
generated. The returned output will contain the stop tokens unless
281281
the stop tokens are special tokens.""",

model-engine/model_engine_server/core/docker/remote_build.py

Lines changed: 115 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22
import os
33
import shutil
44
import subprocess
5+
import tarfile
56
import tempfile
67
import uuid
78
from base64 import b64encode
89
from contextlib import ExitStack
910
from dataclasses import dataclass
11+
from fnmatch import fnmatchcase
1012
from pathlib import Path
1113
from string import Template
12-
from subprocess import PIPE
1314
from typing import Dict, Iterable, List, Optional, Union
1415

16+
import boto3
1517
import click
1618
import tenacity
1719
import yaml
@@ -74,49 +76,102 @@ def zip_context(
7476
s3_uri = f"s3://{S3_BUCKET}/{s3_file_name}"
7577
print(f"Uploading to s3 at: {s3_uri}")
7678
try:
77-
# Need to gimme_okta_aws_creds (you can export AWS_PROFILE='ml-admin' right after)
78-
tar_command = _build_tar_cmd(context, ignore_file, folders_to_include)
79-
print(f"Creating archive: {' '.join(tar_command)}")
80-
81-
with subprocess.Popen(
82-
tar_command,
83-
stdout=subprocess.PIPE,
84-
stderr=subprocess.DEVNULL,
85-
) as proc:
86-
assert proc.stdout is not None
87-
with storage_client.open(
88-
s3_uri,
89-
"wb",
90-
) as out_file:
91-
shutil.copyfileobj(proc.stdout, out_file)
79+
context_path = Path(context).resolve()
80+
ignore_patterns = _read_ignore_patterns(context_path, ignore_file)
81+
archive_roots = [
82+
_normalize_path_for_archive(context_path, folder)[1] for folder in folders_to_include
83+
]
84+
with tempfile.NamedTemporaryFile(suffix=".tar.gz") as archive:
85+
print(f"Creating archive: {archive.name}")
86+
with tarfile.open(archive.name, mode="w:gz") as tar:
87+
for folder, archive_root in zip(folders_to_include, archive_roots):
88+
resolved_path, _ = _normalize_path_for_archive(context_path, folder)
89+
nested_archive_roots = [
90+
root
91+
for root in archive_roots
92+
if root != archive_root and root.startswith(f"{archive_root}/")
93+
]
94+
tar.add(
95+
resolved_path,
96+
arcname=archive_root,
97+
filter=lambda tar_info, nested_archive_roots=nested_archive_roots: _filter_archive_member(
98+
tar_info, ignore_patterns, nested_archive_roots
99+
),
100+
)
101+
102+
with (
103+
open(archive.name, "rb") as archive_in,
104+
storage_client.open(
105+
s3_uri,
106+
"wb",
107+
) as out_file,
108+
):
109+
shutil.copyfileobj(archive_in, out_file)
92110
print("Done uploading!")
93111
except (ClientError, ProfileNotFound):
94112
print("Did you gimme_okta_aws_creds and then export AWS_PROFILE='ml-admin'? Try doing both")
95113
raise
96114

97115

98-
def _build_tar_cmd(
99-
context: str, ignore_file: Optional[str], folders_to_include: List[str]
100-
) -> List[str]:
101-
assert len(folders_to_include) > 0, "Need at least one folder to create a tar archive from!"
116+
def _read_ignore_patterns(context_path: Path, ignore_file: Optional[str]) -> List[str]:
117+
if ignore_file is None:
118+
return []
102119

103-
tar_command = ["tar", "-C", context]
104-
105-
if ignore_file is not None:
106-
ignore_file = os.path.join(context, ignore_file)
107-
if not os.path.isfile(ignore_file):
108-
print(
109-
f"WARNING: File {ignore_file} does not exist in calling context, not using any file as a .dockerignore"
110-
)
120+
ignore_path = context_path / ignore_file
121+
if not ignore_path.is_file():
122+
print(
123+
f"WARNING: File {ignore_path} does not exist in calling context, not using any file as a .dockerignore"
124+
)
125+
return []
126+
127+
patterns: List[str] = []
128+
for raw_line in ignore_path.read_text().splitlines():
129+
line = raw_line.strip()
130+
if not line or line.startswith("#"):
131+
continue
132+
patterns.append(line.removeprefix("./"))
133+
return patterns
134+
135+
136+
def _normalize_path_for_archive(context_path: Path, folder_to_include: str) -> tuple[Path, str]:
137+
include_path = Path(folder_to_include)
138+
resolved_path = (
139+
include_path.resolve()
140+
if include_path.is_absolute()
141+
else (context_path / include_path).resolve()
142+
)
143+
try:
144+
archive_root = str(resolved_path.relative_to(context_path))
145+
except ValueError as exc:
146+
raise ValueError(
147+
f"{folder_to_include=} is not contained within context {context_path}"
148+
) from exc
149+
return resolved_path, archive_root
150+
151+
152+
def _filter_archive_member(
153+
tar_info: tarfile.TarInfo,
154+
ignore_patterns: List[str],
155+
nested_archive_roots: Optional[List[str]] = None,
156+
) -> Optional[tarfile.TarInfo]:
157+
normalized_name = tar_info.name.removeprefix("./")
158+
nested_archive_roots = nested_archive_roots or []
159+
160+
for nested_root in nested_archive_roots:
161+
if normalized_name == nested_root or normalized_name.startswith(f"{nested_root}/"):
162+
return None
163+
164+
for pattern in ignore_patterns:
165+
normalized_pattern = pattern.rstrip("/")
166+
if "/" in normalized_pattern:
167+
pattern_matches = fnmatchcase(normalized_name, normalized_pattern)
111168
else:
112-
tar_command.append("--exclude-from")
113-
tar_command.append(ignore_file)
114-
115-
tar_command.append("-cf")
116-
tar_command.append("-")
117-
tar_command.extend(folders_to_include)
118-
119-
return tar_command
169+
pattern_matches = "/" not in normalized_name and fnmatchcase(
170+
normalized_name, normalized_pattern
171+
)
172+
if pattern_matches or normalized_name.startswith(f"{normalized_pattern}/"):
173+
return None
174+
return tar_info
120175

121176

122177
def start_build_job(
@@ -154,18 +209,18 @@ def start_build_job(
154209
f = stack.enter_context(tempfile.NamedTemporaryFile("wt", suffix=".yaml"))
155210
template_f = stack.enter_context(open(TEMPLATE_FILE, "rt"))
156211

157-
# In Circle CI we need to retrieve the AWS access key to attach to kaniko
212+
# Keep these values available for any template using explicit env creds, but do not
213+
# shell out to the AWS CLI from the endpoint-builder image.
158214
aws_access_key_id = ""
159215
aws_secret_access_key = ""
216+
aws_session_token = ""
160217
if os.getenv("CIRCLECI"):
161-
aws_access_key_id_result = subprocess.run(
162-
["aws", "configure", "get", "aws_access_key_id"], check=False, stdout=PIPE
163-
)
164-
aws_access_key_id = aws_access_key_id_result.stdout.decode().strip()
165-
aws_secret_access_key_result = subprocess.run(
166-
["aws", "configure", "get", "aws_secret_access_key"], check=False, stdout=PIPE
167-
)
168-
aws_secret_access_key = aws_secret_access_key_result.stdout.decode().strip()
218+
credentials = boto3.Session().get_credentials()
219+
if credentials is not None:
220+
frozen_credentials = credentials.get_frozen_credentials()
221+
aws_access_key_id = frozen_credentials.access_key or ""
222+
aws_secret_access_key = frozen_credentials.secret_key or ""
223+
aws_session_token = frozen_credentials.token or ""
169224
job = Template(template_f.read()).substitute(
170225
NAME=job_name,
171226
CUSTOM_TAGS=json.dumps(custom_tags_serialized),
@@ -176,6 +231,7 @@ def start_build_job(
176231
CACHE_REPO=f"{infra_config().docker_repo_prefix}/{cache_name}",
177232
AWS_ACCESS_KEY_ID=aws_access_key_id,
178233
AWS_SECRET_ACCESS_KEY=aws_secret_access_key,
234+
AWS_SESSION_TOKEN=aws_session_token,
179235
NAMESPACE=NAMESPACE,
180236
)
181237
yml = yaml.safe_load(job)
@@ -214,7 +270,13 @@ def start_build_job(
214270
pip_conf_base64 = b64encode(pip_conf_data.encode("utf-8")).decode("utf-8")
215271
data = {"data": {"codeartifact_pip_conf": pip_conf_base64}}
216272
subprocess.check_output(
217-
["kubectl", "patch", "secret", "codeartifact-pip-conf", f"-p={json.dumps(data)}"]
273+
[
274+
"kubectl",
275+
"patch",
276+
"secret",
277+
"codeartifact-pip-conf",
278+
f"-p={json.dumps(data)}",
279+
]
218280
).decode("utf-8")
219281

220282
print(f"Executing Kaniko build command:\n{container_spec}")
@@ -293,7 +355,13 @@ def build_remote(
293355
ignore_file=ignore_file,
294356
)
295357
return start_build_job(
296-
s3_file_name, dockerfile, repotags, use_cache, cache_name, build_args, custom_tags
358+
s3_file_name,
359+
dockerfile,
360+
repotags,
361+
use_cache,
362+
cache_name,
363+
build_args,
364+
custom_tags,
297365
)
298366

299367

0 commit comments

Comments
 (0)