Skip to content

Commit 56d2db4

Browse files
authored
week 22: Fix acft image vulnerabilities (#5084)
* Fix ACFT image vulnerabilities
1 parent cef5458 commit 56d2db4

11 files changed

Lines changed: 206 additions & 452 deletions

File tree

assets/training/finetune_acft_hf_nlp/environments/acpt-draft/context/Dockerfile

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu126-py310-torch280:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}
33
USER root
44

5-
# OS package security fixes not yet present in the base image
6-
# (USN-8251-1 libpng, USN-8229-1 sed, USN-8227-1 curl, USN-8233-1 nghttp2, USN-8284-1 gnutls, USN-8283-1 rsync)
5+
# USN-8284-1/USN-8283-1: upgrade Ubuntu packages to patched Jammy versions.
76
RUN apt-get update && apt-get install -y --no-install-recommends \
8-
libpng16-16 sed curl libcurl4 libcurl3-gnutls libnghttp2-14 libgnutls30 rsync \
7+
libgnutls30 rsync \
98
&& rm -rf /var/lib/apt/lists/*
109

1110
COPY requirements.txt .
@@ -27,21 +26,21 @@ RUN pip install xgrammar==0.1.32
2726
# GHSA-69w3-r845-3855 (CVE-2026-1839): arbitrary code execution in Trainer class;
2827
# patched only in transformers>=5.0.0rc3. Upgrading to latest stable 5.x.
2928
RUN pip install transformers==5.5.4
30-
# Base conda env (python3.13) overrides for vulnerable transitive deps:
31-
# - python-dotenv>=1.2.2 (GHSA-mf9w-mj56-hr94): pydantic-settings 2.14.1 still requires
32-
# only python-dotenv>=0.21.0; uvicorn[standard] requires >=0.13; fastmcp requires
33-
# >=1.1.0. No released parent forces >=1.2.2, so direct override is the only fix.
29+
# Python 3.13 conda env overrides for vulnerable preinstalled transitive deps
30+
# after checking latest parent metadata on 2026-05-25:
31+
# - python-dotenv>=1.2.2 (GHSA-mf9w-mj56-hr94): metadata probes found
32+
# pydantic-settings 2.14.1 requires only >=0.21.0, uvicorn 0.48.0 requires
33+
# >=0.13 for [standard], and anaconda-auth 0.15.0 has no version floor, so no parent upgrade forces >=1.2.2.
3434
# - urllib3>=2.7.0 (GHSA-qccp-gfcp-xxvc / CVE-2026-44431, GHSA-mf9v-mfxr-j63j /
35-
# CVE-2026-44432): transitive dep at 2.6.3 in base env.
36-
# - idna>=3.15 (GHSA-65pc-fj4g-8rjx / CVE-2026-45409): transitive dep at 3.11 in base
37-
# env; pulled by requests/urllib3/cryptography which all use loose floors (idna<4)
38-
# so no parent upgrade forces >=3.15.
39-
# - click>=8.3.3 (GHSA-47fr-3ffg-hgmw / CVE-2026-7246): transitive dep at 8.2.1 in
40-
# base env; pulled by mlflow/uvicorn/typer with loose floors (click>=8.x), so no
41-
# parent upgrade forces >=8.3.3.
42-
# - pip>=26.1 (GHSA-jp4c-xjxw-mgf9 / CVE-2026-6357): shipped at 26.0.1 in both base
43-
# (python3.13) and PTCA (python3.10) envs. Pip self-upgrade leaves a stale
44-
# conda-meta/pip-26.0.1-*.json record that the SCA scanner still flags — remove
35+
# CVE-2026-44432): requests 2.34.2 still permits urllib3<3,>=1.26, so no parent upgrade forces >=2.7.0.
36+
# - idna>=3.15 (GHSA-65pc-fj4g-8rjx / CVE-2026-45409): metadata probes found
37+
# requests 2.34.2, anyio 4.13.0, httpx 0.28.1, and yarl 1.24.2 all still
38+
# permit idna below 3.15, so no parent upgrade forces the patched version.
39+
# - click>=8.3.3 (GHSA-47fr-3ffg-hgmw / CVE-2026-7246): metadata probes found
40+
# typer 0.25.1 requires only >=8.2.1 and anaconda-cli-base 0.8.2 has no floor.
41+
# - pip>=26.1 (GHSA-jp4c-xjxw-mgf9 / CVE-2026-6357): shipped at 26.0.1 in both
42+
# Python envs. Pip self-upgrade leaves a stale
43+
# conda-meta/pip-26.0.1-*.json record that the SCA scanner still flags; remove
4544
# it explicitly after the upgrade.
4645
RUN conda run -n base python -m pip install --no-cache-dir --upgrade \
4746
'python-dotenv>=1.2.2' 'urllib3>=2.7.0' 'idna>=3.15' 'click>=8.3.3' 'pip>=26.1' \

assets/training/finetune_acft_hf_nlp/environments/acpt-rft/context/Dockerfile

Lines changed: 21 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ COPY requirements.txt .
1111
RUN pip install -r requirements.txt --no-cache-dir
1212
RUN pip install azureml-acft-common-components=={{latest-pypi-version}}
1313
RUN pip install azureml-evaluate-mlflow=={{latest-pypi-version}}
14-
RUN pip install verl==0.7.0
14+
RUN pip install verl==0.7.1
1515
RUN pip install sacrebleu==2.5.1
1616
COPY tracking /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/tracking.py
1717

1818
RUN pip install --no-cache-dir accelerate==1.10.0
19-
RUN pip install --no-cache-dir sglang==0.5.10
20-
RUN pip install --no-cache-dir sgl-kernel==0.3.16.post3
19+
RUN pip install --no-cache-dir sglang==0.5.11
20+
RUN pip install --no-cache-dir sglang-kernel==0.4.2
2121

2222
RUN pip uninstall -y mlflow
2323
RUN pip install --no-cache-dir --force-reinstall "mlflow>=3.2.0,<4.0.0"
@@ -27,119 +27,43 @@ RUN pip install --no-cache-dir starlette==0.49.1
2727
# Removing the binary forces wandb to use its Python backend (safe fallback).
2828
RUN pip install --no-cache-dir --upgrade "wandb>=0.26.0" && \
2929
find /opt/conda/envs/ptca -name 'wandb-core' -path '*/wandb/bin/*' -delete 2>/dev/null || true
30-
RUN pip install --no-cache-dir triton==3.4.0
31-
RUN pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cu126
30+
RUN pip install --no-cache-dir --upgrade torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0
3231
COPY vllm_async_server /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/workers/rollout/vllm_rollout/vllm_async_server.py
3332
COPY __init__ /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/reward_score/__init__.py
3433
COPY azure_grader /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/reward_score/azure_grader.py
3534
COPY azure_python_grader /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/reward_score/azure_python_grader.py
3635
COPY utils /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/vllm/utils.py
37-
# vllm pinned to 0.19.1 to fix:
38-
# - GHSA-6c4r-fmh3-7rh8 (librosa transitive dep removed in vllm 0.18.1+ via PR #37058;
39-
# PyPI metadata confirms 0.18.0 still lists `librosa; extra == "audio"` while 0.18.1+ do not)
40-
# - CVE-2026-7141 (fixed in 0.19.1)
41-
# - GHSA-x368-4g9h-fvv4 / VCM 5012008 (fix landed in 0.19.1)
42-
# Parent package (verl 0.7.0) constrains `vllm<=0.12.0,>=0.8.5` only via the optional [vllm]
43-
# extra, which is NOT used in this image (verl is installed without the extra); thus there is
44-
# no parent that pulls vllm — it is a direct top-level install here, and the only available
45-
# remediation path is a direct version override.
46-
#
47-
# SURGICAL FIX (CVE-2026-44223 / GHSA-83vm-p52w-f9pw / VCM 5012192):
48-
# The upstream fix (vllm-project/vllm#38610) is a single-line change in
49-
# vllm/v1/spec_decode/extract_hidden_states.py: `return sampled_token_ids` →
50-
# `return sampled_token_ids[:, :1]`. We backport this as a file overlay on the
51-
# installed vllm 0.19.1 (see `COPY extract_hidden_states ...` near the end of
52-
# this Dockerfile + the build-time verification that asserts the overlay
53-
# landed and vllm.__version__ remains 0.19.1). This pattern matches the
54-
# existing overlays for vllm_async_server / vllm_rollout / utils.
55-
#
56-
# RESIDUAL FINDINGS (accepted risk, not remediated in this build):
57-
# - GHSA-hpv8-x276-m59f / VCM 5012004 / CVE-2026-44222 (vLLM multimodal
58-
# special-token placeholder DoS via OpenAI-compatible API server) — fixed
59-
# in vllm>=0.20.0. The upstream fix (umbrella issue vllm-project/vllm#32656)
60-
# is a broad multimodal refactor across per-model files (qwen2_vl,
61-
# qwen2_5_vl, qwen2_5_omni_thinker, qwen3_vl, ernie45_vl, glm4v, keye…) that
62-
# cannot be backported as a small overlay without significant code-shift
63-
# risk. We accept this finding because the RFT image consumes vLLM only as
64-
# an in-process rollout backend for trusted training workloads — it does
65-
# not expose vLLM's OpenAI-compatible multimodal endpoint to unauthenticated
66-
# callers, so the remote DoS vector is unreachable in this deployment.
67-
# - GHSA-83vm-p52w-f9pw / VCM 5012192 — see SURGICAL FIX above (REMEDIATED).
68-
# We are NOT upgrading to vllm 0.20.x in this build. Cascade blockers (re-verified
69-
# against PyPI requires_dist and Dao-AILab/yeshsurya GitHub release assets on 2026-05-23):
70-
# 1. torch ABI pin: vllm 0.20.0/0.20.1/0.20.2/0.21.0 all require torch==2.11.0 exact
71-
# (PyPI requires_dist). Current vllm 0.19.1 resolves torch==2.10.0; sglang 0.5.10
72-
# pins torch==2.9.1. The minimum sglang line accepting torch 2.11.0 is sglang 0.5.11
73-
# (also bumps transformers==5.6.0 + new sgl-kernel/torch-memory-saver/flashinfer matrix).
74-
# 2. flash-attn wheel — THIS IS THE HARD BLOCKER. The image installs a prebuilt cp310
75-
# flash-attn wheel from yeshsurya/flash-attention; only v2.8.3-linux-1 is published
76-
# and it is built against the torch 2.10 ABI. The upstream Dao-AILab flash-attention
77-
# release feed (latest fa4-v4.0.0.beta14, 2026-05-23) publishes no cp310 wheels at
78-
# all (assets list empty), so there is no torch-2.11 / cp310 wheel available to
79-
# consume. Building flash-attn from source inside ACR exceeds the build timeout.
80-
# 3. vLLM v1-engine internal patches: the COPY'd files (vllm_async_server, vllm_rollout,
81-
# utils) import `vllm.v1.engine.async_llm.AsyncLLM`, `vllm.v1.engine.core.EngineCoreProc`,
82-
# `vllm.v1.engine.utils.CoreEngineProcManager`, `vllm.v1.executor.abstract.Executor`,
83-
# `vllm.utils.argparse_utils`, `vllm.utils.network_utils`, `vllm.config.LoRAConfig`. These
84-
# v1-engine internals frequently shift across vllm minor lines (0.19→0.20) and would
85-
# require a full re-validation of the patches.
86-
# (Previously-listed blocker #4, verl 0.7.0 + transformers>=5.6.0 ImportError on
87-
# AutoModelForVision2Seq, is RESOLVED upstream in verl 0.7.1 — the import is now
88-
# wrapped in try/except per github.com/volcengine/verl@v0.7.1/verl/utils/model.py
89-
# line 1. We can adopt verl 0.7.1 if/when blockers #1/#2 are unblocked; not bumped
90-
# here to keep this security change minimal.)
91-
# Risk acceptance: this image consumes vLLM internally for RFT training rollouts; it is
92-
# deployed in internal/trusted training workloads and does not expose a public OpenAI
93-
# endpoint for unauthenticated multimodal traffic, so the practical exposure of the DoS path
94-
# is limited. The override avoids a high-risk torch / sglang / flash-attn / DeepGEMM /
95-
# custom-vLLM-patch requalification in a single security bump. Sister env acpt-grpo runs
96-
# vllm==0.20.1 successfully BUT does NOT pin sglang/flash-attn/torch/verl and so does not
97-
# hit this cascade.
98-
# NOTE on SBOM visibility: the CVE-2026-44223 overlay (extract_hidden_states.py, see
99-
# end of file) closes the runtime vulnerability but does NOT update vllm's dist-info
100-
# METADATA Version field, so SBOM-based scanners (Qualys/VCM) will continue to report
101-
# CVE-2026-44223 and CVE-2026-44222 against vllm@0.19.1 until the cascade can be lifted.
102-
RUN pip install vllm==0.19.1
36+
# vllm is a direct top-level runtime dependency in this image. PyPI metadata checks show
37+
# verl only declares vllm under the unused [vllm] extra, so there is no parent package to
38+
# upgrade for GHSA-hpv8-x276-m59f / GHSA-83vm-p52w-f9pw.
39+
RUN pip install --no-cache-dir vllm==0.20.1
10340
# Keep xgrammar at the patched floor even when pulled transitively by vllm.
10441
RUN pip install --no-cache-dir 'xgrammar>=0.1.32'
10542
RUN pip install openai==2.14.0
10643
RUN pip install --force-reinstall --no-cache-dir --no-build-isolation git+https://github.com/deepseek-ai/DeepGEMM.git@c9f8b34dcdacc20aa746b786f983492c51072870
107-
RUN pip install https://github.com/yeshsurya/flash-attention/releases/download/v2.8.3-linux-1/flash_attn-2.8.3-cp310-cp310-linux_x86_64.whl
44+
RUN pip install --no-cache-dir 'flash-attn-4>=4.0.0b9'
10845
# Security overrides for pip-installed packages whose parent packages do not pin them safely.
10946
# cryptography==46.0.7: CVE-2026-41727; not pre-installed in ptca env, pulled by azureml-mlflow
11047
# fastmcp>=3.2.0: GHSA-rww4-4w9c-7733, GHSA-m8x7-r2rg-vh5g, GHSA-vv7q-7jx5-f767
11148
# Mako>=1.3.11: CVE-2025-46803; transitive dep of alembic, parent uses loose floor
11249
# lxml>=6.1.0: GHSA-vfmq-68hx-4jfw; transitive dep of multiple packages, parent uses loose floor
113-
# transformers>=5.0.0rc3: GHSA-69w3-r845-3855 (CVE-2026-1839); direct dep. No upper cap because
114-
# the actual installed transformers ends up at 5.8.x (pulled forward by vllm 0.19.1
115-
# requires_dist `transformers>=4.56.0` followed by later installs) — verl 0.7.0 imports
116-
# `AutoModelForVision2Seq` at module top level, but `verl.utils.model` is not imported at
117-
# build/verification time in this image, so the symbol absence in transformers 5.6+ does
118-
# not affect the build. (verl 0.7.1 wraps that import in try/except; see vllm cascade
119-
# comment above for full context.)
50+
# transformers==5.6.0: GHSA-69w3-r845-3855 (CVE-2026-1839); direct dep and exact sglang
51+
# 0.5.11 requirement. verl 0.7.1 wraps its AutoModelForVision2Seq import in try/except, so the symbol's absence in transformers 5.6+ no longer raises ImportError.
12052
# GitPython>=3.1.47: GHSA-x2qx-6953-8485, GHSA-rpm5-65cw-6hj4; transitive dep of wandb (requires
12153
# gitpython!=3.1.29,>=1.0.0 as of 0.26.1), parent uses loose floor — no wandb release forces >=3.1.47
122-
# pyOpenSSL>=26.0.0: CVE-2026-27459 (HIGH, DTLS cookie callback buffer overflow) and
123-
# CVE-2026-27448 (LOW, TLS connection bypass via unhandled callback exception). Base image
124-
# ships pyOpenSSL 25.3.0; azureml-core 1.61.0.post3 pins pyopenssl<26.0.0 and no newer
125-
# azureml-core release exists (verified 2026-05-23), so explicit override is required.
126-
# Pattern matches sister env acpt-grpo.
127-
RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' 'lxml>=6.1.0' 'transformers>=5.0.0rc3' 'GitPython>=3.1.47' 'pyOpenSSL>=26.0.0'
54+
# pyOpenSSL>=26.0.0: CVE-2026-27459 and CVE-2026-27448. azureml-core 1.61.0.post3 still
55+
# pins pyopenssl<26.0.0, so explicit override is required.
56+
RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' 'lxml>=6.1.0' 'transformers==5.6.0' 'GitPython>=3.1.47' 'pyOpenSSL>=26.0.0'
12857
# Base env (py3.13) and ptca env (py3.10) overrides for packages whose parents all declare only loose version floors.
129-
# python-dotenv>=1.2.2: GHSA-mf9w-mj56-hr94; transitive dep of pydantic-settings (requires >=0.21.0),
130-
# uvicorn (optional, requires >=0.13), and fastmcp (requires >=1.1.0). All parents use loose floors,
131-
# so no parent upgrade can force >=1.2.2. Base image ships 1.2.1 in base conda env; we patch
132-
# both base (python 3.13) and ptca (python 3.10) envs to cover all install paths.
133-
# pip>=26.1.1: GHSA-jp4c-xjxw-mgf9 / VCM 5011855 (CVE-2026-6357). Base image biweekly.202605.x
134-
# ships pip 26.0.1 in BOTH the ptca (py3.10) and base (py3.13) conda envs (per scan paths).
135-
# pip is the Python package installer itself — it is bootstrapped by the conda/python
136-
# distribution and has no parent package that pulls it in, so the only available remediation
137-
# is a direct upgrade in each conda environment. Pattern matches sister env acpt-grpo.
58+
# python-dotenv>=1.2.2: GHSA-mf9w-mj56-hr94; transitive dep of pydantic-settings, uvicorn,
59+
# and fastmcp. All parents use loose floors, so no parent upgrade can force >=1.2.2.
60+
# pip>=26.1.1: GHSA-jp4c-xjxw-mgf9 / VCM 5011855 (CVE-2026-6357). pip is bootstrapped by
61+
# conda/python and has no parent package that can be upgraded, so patch both conda envs.
13862
# NOTE: `pip install --upgrade pip` replaces the on-disk pip files but does NOT update
13963
# conda's metadata DB at conda-meta/pip-*.json. SBOM scanners that read conda-meta still
140-
# report the old pip version, so we explicitly delete the stale pip-26.0*.json files.
64+
# report the old pip version, so we explicitly delete stale pip-26.0*.json files.
14165
# urllib3>=2.7.0 (base env, py3.13): GHSA-mf9v-mfxr-j63j / VCM 5012484 and
142-
# GHSA-qccp-gfcp-xxvc / VCM 5012480. urllib3 is brought in transitively in the base env by
66+
# GHSA-qccp-gfcp-xxvc / VCM 5012480. urllib3 is brought in transitively by
14367
# requests/botocore/azureml-core/kubernetes/etc.; all of these only constrain urllib3<3
14468
# (loose), so no parent upgrade forces >=2.7.0. Direct override is the only remediation
14569
# (verified via PyPI requires_dist on 2026-05-23; matches sister env acpt-grpo).
@@ -149,7 +73,7 @@ RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' '
14973
# currently published release (verified via PyPI requires_dist on 2026-05-23 — all parents
15074
# use loose floors like `idna>=2.5` or `idna<4`). Direct override is the only remediation.
15175
# click>=8.3.3 (base env, py3.13): GHSA-47fr-3ffg-hgmw / VCM 5012984 (CVE-2026-7246, HIGH,
152-
# click.edit() command injection). click is bootstrapped into the base conda env and pulled
76+
# click.edit() command injection). click is bootstrapped into the conda env and pulled
15377
# by typer/uvicorn/black/flask/etc.; none of these pin click>=8.3.3 in published releases
15478
# (verified PyPI requires_dist 2026-05-23). Direct override is the only remediation.
15579
# (Note: requirements.txt also pins click==8.3.3 to cover the ptca env install path.)
@@ -165,17 +89,6 @@ RUN find /opt/conda/envs/ptca/lib/python3.10/site-packages/ray -type d -name 'th
16589
pip install --no-cache-dir --target "$dir" 'aiohttp>=3.13.4' 'idna>=3.15'; \
16690
done
16791
COPY vllm_rollout /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout.py
168-
# CVE-2026-44223 surgical backport: overlay the patched extract_hidden_states.py
169-
# (verbatim copy of vllm v0.19.1 with the one-line PR #38610 fix applied).
170-
# Must come AFTER every step that could pip-touch vllm; the verification RUN
171-
# below fails the build fast if the overlay did not land or vllm version drifted.
172-
COPY extract_hidden_states /opt/conda/envs/ptca/lib/python3.10/site-packages/vllm/v1/spec_decode/extract_hidden_states.py
173-
RUN find /opt/conda/envs/ptca/lib/python3.10/site-packages/vllm -name '*.pyc' -delete && \
174-
/opt/conda/envs/ptca/bin/python -c "import vllm; assert vllm.__version__ == '0.19.1', vllm.__version__; \
175-
import vllm.v1.spec_decode.extract_hidden_states as m, inspect; src = inspect.getsource(m); \
176-
assert 'sampled_token_ids[:, :1]' in src, 'CVE-2026-44223 overlay missing'; \
177-
assert 'CVE-2026-44223' in src, 'CVE-2026-44223 marker missing'; \
178-
print('vllm', vllm.__version__, 'CVE-2026-44223 overlay verified')"
17992
RUN rm -rf ~/.cache/pip /tmp/* /var/tmp/*
18093
ENV PYTHONHASHSEED=random \
18194
PYTHONDONTWRITEBYTECODE=1

assets/training/finetune_acft_hf_nlp/environments/acpt-rft/context/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ ray[default] @ https://github.com/yeshsurya/ray/releases/download/2.53.dev/ray-3
2525
tensorboard==2.20.0
2626
tensordict==0.9.1
2727
torchdata==0.11.0
28-
torchvision==0.23.0
29-
transformers>=5.0.0rc3,<5.6.0
28+
torchvision==0.26.0
29+
transformers==5.6.0
3030
uvicorn==0.35.0
3131
zmq==0.0.0
3232
filelock>=3.20.1

0 commit comments

Comments
 (0)