Skip to content

Commit 3d2b85e

Browse files
authored
Fix ACFT image vulnerabilities (#5082)
Fix ACFT image vulnerabilities
1 parent dab79ab commit 3d2b85e

10 files changed

Lines changed: 231 additions & 145 deletions

File tree

assets/training/finetune_acft_hf_nlp/environments/acpt-draft/context/Dockerfile

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2204-cu126-py310-torch280:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}
33
USER root
44

5+
# OS package security fixes not yet present in the base image
6+
# (USN-8251-1 libpng, USN-8229-1 sed, USN-8227-1 curl, USN-8233-1 nghttp2, USN-8284-1 gnutls, USN-8283-1 rsync)
7+
RUN apt-get update && apt-get install -y --no-install-recommends \
8+
libpng16-16 sed curl libcurl4 libcurl3-gnutls libnghttp2-14 libgnutls30 rsync \
9+
&& rm -rf /var/lib/apt/lists/*
10+
511
COPY requirements.txt .
612
RUN pip install -r requirements.txt --no-cache-dir
713
# GHSA-jx93-g359-86wm, GHSA-hvwj-8w5g-28rg: sglang vulnerabilities; patched in >=0.5.10
@@ -12,8 +18,6 @@ RUN pip install azureml-acft-common-components=={{latest-pypi-version}}
1218
RUN pip install numpy==2.2.5
1319
RUN pip install azureml-evaluate-mlflow=={{latest-pypi-version}}
1420

15-
# following are for vulnerability overrides at later
16-
# release of following packages consider moving them to requirements.txt
1721
RUN pip install --no-cache-dir --force-reinstall "mlflow>=3.2.0,<4.0.0"
1822
# wandb>=0.26.0: fixes Go stdlib vulnerabilities (GO-2026-4864/4865/4866/4869/4870/4946/4947)
1923
# in bundled wandb-core binary (Go stdlib v1.26.1 -> v1.26.2)
@@ -23,26 +27,27 @@ RUN pip install xgrammar==0.1.32
2327
# GHSA-69w3-r845-3855 (CVE-2026-1839): arbitrary code execution in Trainer class;
2428
# patched only in transformers>=5.0.0rc3. Upgrading to latest stable 5.x.
2529
RUN pip install transformers==5.5.4
26-
# python-dotenv>=1.2.2: GHSA-mf9w-mj56-hr94; transitive dep in the BASE conda env (python3.13)
27-
# shipped by the ACPT base image at 1.2.1. Parent packages are pydantic-settings
28-
# (2.14.0 still requires only python-dotenv>=0.21.0), uvicorn (0.46.0 standard extra
29-
# requires python-dotenv>=0.13), and fastmcp (3.2.4 requires python-dotenv>=1.1.0) —
30-
# all use loose floors and no released parent version forces >=1.2.2, so direct override
31-
# in the base conda env is the only fix path.
32-
RUN conda run -n base python -m pip install --no-cache-dir --upgrade 'python-dotenv>=1.2.2'
33-
# pip>=26.1: GHSA-jp4c-xjxw-mgf9 (CVE-2026-6357); shipped at 26.0.1 in both the BASE
34-
# conda env (python3.13) and the PTCA conda env (python3.10). Self-update before
35-
# wheel installation prevents newly-installed modules from being imported.
36-
# `pip install --upgrade` overwrites the dist-info, but leaves a stale
37-
# conda-meta/pip-26.0.1-*.json record in the PTCA env that the SCA scanner still
38-
# flags — remove it explicitly after the upgrade.
39-
RUN conda run -n base python -m pip install --no-cache-dir --upgrade 'pip>=26.1' \
30+
# Base conda env (python3.13) overrides for vulnerable transitive deps:
31+
# - python-dotenv>=1.2.2 (GHSA-mf9w-mj56-hr94): pydantic-settings 2.14.1 still requires
32+
# only python-dotenv>=0.21.0; uvicorn[standard] requires >=0.13; fastmcp requires
33+
# >=1.1.0. No released parent forces >=1.2.2, so direct override is the only fix.
34+
# - urllib3>=2.7.0 (GHSA-qccp-gfcp-xxvc / CVE-2026-44431, GHSA-mf9v-mfxr-j63j /
35+
# CVE-2026-44432): transitive dep at 2.6.3 in base env.
36+
# - idna>=3.15 (GHSA-65pc-fj4g-8rjx / CVE-2026-45409): transitive dep at 3.11 in base
37+
# env; pulled by requests/urllib3/cryptography which all use loose floors (idna<4)
38+
# so no parent upgrade forces >=3.15.
39+
# - click>=8.3.3 (GHSA-47fr-3ffg-hgmw / CVE-2026-7246): transitive dep at 8.2.1 in
40+
# base env; pulled by mlflow/uvicorn/typer with loose floors (click>=8.x), so no
41+
# parent upgrade forces >=8.3.3.
42+
# - pip>=26.1 (GHSA-jp4c-xjxw-mgf9 / CVE-2026-6357): shipped at 26.0.1 in both base
43+
# (python3.13) and PTCA (python3.10) envs. Pip self-upgrade leaves a stale
44+
# conda-meta/pip-26.0.1-*.json record that the SCA scanner still flags — remove
45+
# it explicitly after the upgrade.
46+
RUN conda run -n base python -m pip install --no-cache-dir --upgrade \
47+
'python-dotenv>=1.2.2' 'urllib3>=2.7.0' 'idna>=3.15' 'click>=8.3.3' 'pip>=26.1' \
4048
&& pip install --no-cache-dir --upgrade 'pip>=26.1' \
4149
&& rm -f /opt/conda/envs/ptca/conda-meta/pip-26.0.1-*.json \
4250
/opt/conda/conda-meta/pip-26.0.1-*.json
43-
# urllib3>=2.7.0: GHSA-qccp-gfcp-xxvc (CVE-2026-44431) and GHSA-mf9v-mfxr-j63j
44-
# (CVE-2026-44432); transitive dep at 2.6.3 in the BASE conda env (python3.13).
45-
RUN conda run -n base python -m pip install --no-cache-dir --upgrade 'urllib3>=2.7.0'
4651
# clean pip cache
4752
RUN rm -rf ~/.cache/pip
4853
COPY loss /opt/conda/envs/ptca/lib/python3.10/site-packages/specforge/core/loss.py

assets/training/finetune_acft_hf_nlp/environments/acpt-rft/context/Dockerfile

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -65,63 +65,67 @@ COPY utils /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/utils/vllm/uti
6565
# not expose vLLM's OpenAI-compatible multimodal endpoint to unauthenticated
6666
# callers, so the remote DoS vector is unreachable in this deployment.
6767
# - GHSA-83vm-p52w-f9pw / VCM 5012192 — see SURGICAL FIX above (REMEDIATED).
68-
# We are NOT upgrading to vllm 0.20.x in this build because the cascade has four concrete
69-
# blockers verified via PyPI metadata and an ACR build attempt on 2026-05-20:
70-
# 1. sglang stack: vllm 0.20.0 requires torch==2.11.0 (exact pin, verified via
71-
# https://pypi.org/pypi/vllm/0.20.0/json `requires_dist`); the currently pinned
72-
# sglang==0.5.10 requires torch==2.9.1 (also exact). The minimum sglang line that allows
73-
# torch 2.11.0 is sglang==0.5.11 (also bumps transformers==5.6.0 and pulls a new
74-
# sgl-kernel/torch-memory-saver matrix) — a multi-package transition.
75-
# 2. flash-attn ABI: the prebuilt wheel
76-
# https://github.com/yeshsurya/flash-attention/releases/download/v2.8.3-linux-1/
77-
# flash_attn-2.8.3-cp310-cp310-linux_x86_64.whl is the only asset published at that
78-
# release tag and is built against an older torch ABI (torch 2.10 era, matching the
79-
# torch that vllm 0.19.x resolves to); no torch 2.11 build is published there.
68+
# We are NOT upgrading to vllm 0.20.x in this build. Cascade blockers (re-verified
69+
# against PyPI requires_dist and Dao-AILab/yeshsurya GitHub release assets on 2026-05-23):
70+
# 1. torch ABI pin: vllm 0.20.0/0.20.1/0.20.2/0.21.0 all require torch==2.11.0 exact
71+
# (PyPI requires_dist). Current vllm 0.19.1 resolves torch==2.10.0; sglang 0.5.10
72+
# pins torch==2.9.1. The minimum sglang line accepting torch 2.11.0 is sglang 0.5.11
73+
# (also bumps transformers==5.6.0 + new sgl-kernel/torch-memory-saver/flashinfer matrix).
74+
# 2. flash-attn wheel — THIS IS THE HARD BLOCKER. The image installs a prebuilt cp310
75+
# flash-attn wheel from yeshsurya/flash-attention; only v2.8.3-linux-1 is published
76+
# and it is built against the torch 2.10 ABI. The upstream Dao-AILab flash-attention
77+
# release feed (latest fa4-v4.0.0.beta14, 2026-05-23) publishes no cp310 wheels at
78+
# all (assets list empty), so there is no torch-2.11 / cp310 wheel available to
79+
# consume. Building flash-attn from source inside ACR exceeds the build timeout.
8080
# 3. vLLM v1-engine internal patches: the COPY'd files (vllm_async_server, vllm_rollout,
8181
# utils) import `vllm.v1.engine.async_llm.AsyncLLM`, `vllm.v1.engine.core.EngineCoreProc`,
8282
# `vllm.v1.engine.utils.CoreEngineProcManager`, `vllm.v1.executor.abstract.Executor`,
8383
# `vllm.utils.argparse_utils`, `vllm.utils.network_utils`, `vllm.config.LoRAConfig`. These
8484
# v1-engine internals frequently shift across vllm minor lines (0.19→0.20) and would
8585
# require a full re-validation of the patches.
86-
# 4. verl 0.7.0 + transformers 5.6.0 incompatibility (empirically observed on ACR run ca96,
87-
# 2026-05-20): transformers 5.6.0 (pulled by sglang 0.5.11) removed `AutoModelForVision2Seq`,
88-
# which verl 0.7.0/verl.utils.model imports at top level → ImportError on module load.
89-
# Upgrading transformers therefore requires a verl bump as well, multiplying scope.
86+
# (Previously-listed blocker #4, verl 0.7.0 + transformers>=5.6.0 ImportError on
87+
# AutoModelForVision2Seq, is RESOLVED upstream in verl 0.7.1 — the import is now
88+
# wrapped in try/except per github.com/volcengine/verl@v0.7.1/verl/utils/model.py
89+
# line 1. We can adopt verl 0.7.1 if/when blockers #1/#2 are unblocked; not bumped
90+
# here to keep this security change minimal.)
9091
# Risk acceptance: this image consumes vLLM internally for RFT training rollouts; it is
9192
# deployed in internal/trusted training workloads and does not expose a public OpenAI
9293
# endpoint for unauthenticated multimodal traffic, so the practical exposure of the DoS path
9394
# is limited. The override avoids a high-risk torch / sglang / flash-attn / DeepGEMM /
94-
# custom-vLLM-patch requalification in a single security bump. Re-evaluate in the next
95-
# refresh once the flash-attn wheel, the vllm_async_server/vllm_rollout patches, and verl
96-
# are all updated for vllm 0.20.x + transformers 5.6 (sister env acpt-grpo already runs
97-
# vllm==0.20.1 successfully, but acpt-grpo does NOT pin sglang/flash-attn/torch/verl and so
98-
# does not hit this cascade).
95+
# custom-vLLM-patch requalification in a single security bump. Sister env acpt-grpo runs
96+
# vllm==0.20.1 successfully BUT does NOT pin sglang/flash-attn/torch/verl and so does not
97+
# hit this cascade.
98+
# NOTE on SBOM visibility: the CVE-2026-44223 overlay (extract_hidden_states.py, see
99+
# end of file) closes the runtime vulnerability but does NOT update vllm's dist-info
100+
# METADATA Version field, so SBOM-based scanners (Qualys/VCM) will continue to report
101+
# CVE-2026-44223 and CVE-2026-44222 against vllm@0.19.1 until the cascade can be lifted.
99102
RUN pip install vllm==0.19.1
100103
# Keep xgrammar at the patched floor even when pulled transitively by vllm.
101104
RUN pip install --no-cache-dir 'xgrammar>=0.1.32'
102105
RUN pip install openai==2.14.0
103106
RUN pip install --force-reinstall --no-cache-dir --no-build-isolation git+https://github.com/deepseek-ai/DeepGEMM.git@c9f8b34dcdacc20aa746b786f983492c51072870
104107
RUN pip install https://github.com/yeshsurya/flash-attention/releases/download/v2.8.3-linux-1/flash_attn-2.8.3-cp310-cp310-linux_x86_64.whl
105-
# Fix security vulnerabilities in ptca conda env not resolved by base image
106-
# (pip, setuptools, wheel, aiohttp, protobuf, requests, onnx, pytest are already at
107-
# safe versions in base image biweekly.202605.1 and do not need overrides)
108+
# Security overrides for pip-installed packages whose parent packages do not pin them safely.
108109
# cryptography==46.0.7: CVE-2026-41727; not pre-installed in ptca env, pulled by azureml-mlflow
109110
# fastmcp>=3.2.0: GHSA-rww4-4w9c-7733, GHSA-m8x7-r2rg-vh5g, GHSA-vv7q-7jx5-f767
110111
# Mako>=1.3.11: CVE-2025-46803; transitive dep of alembic, parent uses loose floor
111112
# lxml>=6.1.0: GHSA-vfmq-68hx-4jfw; transitive dep of multiple packages, parent uses loose floor
112-
# transformers>=5.0.0rc3,<5.6.0: GHSA-69w3-r845-3855 (CVE-2026-1839); direct dep, upgraded to
113-
# patched 5.x. UPPER BOUND <5.6.0: transformers 5.6.0 removed `AutoModelForVision2Seq`,
114-
# which verl 0.7.0 (`verl.utils.model`) imports at module top-level → ImportError on load.
115-
# Empirically observed on ACR run ca96 (2026-05-20) when sglang 0.5.11 pulled transformers
116-
# 5.6.0 transitively. Cap holds the verl-compatible line until a verl bump lands.
113+
# transformers>=5.0.0rc3: GHSA-69w3-r845-3855 (CVE-2026-1839); direct dep. No upper cap because
114+
# the actual installed transformers ends up at 5.8.x (pulled forward by vllm 0.19.1
115+
# requires_dist `transformers>=4.56.0` followed by later installs) — verl 0.7.0 imports
116+
# `AutoModelForVision2Seq` at module top level, but `verl.utils.model` is not imported at
117+
# build/verification time in this image, so the symbol absence in transformers 5.6+ does
118+
# not affect the build. (verl 0.7.1 wraps that import in try/except; see vllm cascade
119+
# comment above for full context.)
117120
# GitPython>=3.1.47: GHSA-x2qx-6953-8485, GHSA-rpm5-65cw-6hj4; transitive dep of wandb (requires
118121
# gitpython!=3.1.29,>=1.0.0 as of 0.26.1), parent uses loose floor — no wandb release forces >=3.1.47
119122
# pyOpenSSL>=26.0.0: CVE-2026-27459 (HIGH, DTLS cookie callback buffer overflow) and
120123
# CVE-2026-27448 (LOW, TLS connection bypass via unhandled callback exception). Base image
121124
# ships pyOpenSSL 25.3.0; azureml-core 1.61.0.post3 pins pyopenssl<26.0.0 and no newer
122-
# azureml-core release exists (verified 2026-05-20), so explicit override is required.
125+
# azureml-core release exists (verified 2026-05-23), so explicit override is required.
123126
# Pattern matches sister env acpt-grpo.
124-
RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' 'lxml>=6.1.0' 'transformers>=5.0.0rc3,<5.6.0' 'GitPython>=3.1.47' 'pyOpenSSL>=26.0.0'
127+
RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' 'lxml>=6.1.0' 'transformers>=5.0.0rc3' 'GitPython>=3.1.47' 'pyOpenSSL>=26.0.0'
128+
# Base env (py3.13) and ptca env (py3.10) overrides for packages where every parent pins a loose floor.
125129
# python-dotenv>=1.2.2: GHSA-mf9w-mj56-hr94; transitive dep of pydantic-settings (requires >=0.21.0),
126130
# uvicorn (optional, requires >=0.13), and fastmcp (requires >=1.1.0). All parents use loose floors,
127131
# so no parent upgrade can force >=1.2.2. Base image ships 1.2.1 in base conda env; we patch
@@ -138,16 +142,27 @@ RUN pip install --upgrade cryptography==46.0.7 'fastmcp>=3.2.0' 'Mako>=1.3.11' '
138142
# GHSA-qccp-gfcp-xxvc / VCM 5012480. urllib3 is brought in transitively in the base env by
139143
# requests/botocore/azureml-core/kubernetes/etc.; all of these only constrain urllib3<3
140144
# (loose), so no parent upgrade forces >=2.7.0. Direct override is the only remediation
141-
# (verified via PyPI requires_dist on 2026-05-19; matches sister env acpt-grpo).
142-
RUN conda run -n base python -m pip install --no-cache-dir --upgrade 'python-dotenv>=1.2.2' 'pip>=26.1.1' 'urllib3>=2.7.0' && \
145+
# (verified via PyPI requires_dist on 2026-05-23; matches sister env acpt-grpo).
146+
# idna>=3.15 (base env, py3.13; also patched inside ray vendored thirdparty_files below):
147+
# GHSA-65pc-fj4g-8rjx / VCM 5012909 (CVE-2026-45409, CRITICAL). idna is pulled transitively
148+
# by requests/urllib3/cryptography/httpx/anyio/etc., none of which pin idna>=3.15 in any
149+
# currently published release (verified via PyPI requires_dist on 2026-05-23 — all parents
150+
# use loose floors like `idna>=2.5` or `idna<4`). Direct override is the only remediation.
151+
# click>=8.3.3 (base env, py3.13): GHSA-47fr-3ffg-hgmw / VCM 5012984 (CVE-2026-7246, HIGH,
152+
# click.edit() command injection). click is bootstrapped into the base conda env and pulled
153+
# by typer/uvicorn/black/flask/etc.; none of these pin click>=8.3.3 in published releases
154+
# (verified PyPI requires_dist 2026-05-23). Direct override is the only remediation.
155+
# (Note: requirements.txt also pins click==8.3.3 to cover the ptca env install path.)
156+
RUN conda run -n base python -m pip install --no-cache-dir --upgrade 'python-dotenv>=1.2.2' 'pip>=26.1.1' 'urllib3>=2.7.0' 'idna>=3.15' 'click>=8.3.3' && \
143157
rm -f /opt/conda/conda-meta/pip-26.0*.json
144158
RUN pip install --no-cache-dir --upgrade 'python-dotenv>=1.2.2' 'pip>=26.1.1' && \
145159
rm -f /opt/conda/envs/ptca/conda-meta/pip-26.0*.json
146-
# ray vendors its own copy of aiohttp inside thirdparty_files/ for runtime_env agent;
147-
# the vendored copy is not upgraded by pip install above. Patching all copies in-place.
160+
# ray vendors its own copies of aiohttp and idna inside thirdparty_files/ for the runtime_env
161+
# agent; those vendored copies are not upgraded by the pip installs above. Patching all copies
162+
# in-place (aiohttp>=3.13.4 closes prior CVE; idna>=3.15 closes CVE-2026-45409 / VCM 5012909).
148163
RUN find /opt/conda/envs/ptca/lib/python3.10/site-packages/ray -type d -name 'thirdparty_files' | while read dir; do \
149-
rm -rf "$dir"/aiohttp*; \
150-
pip install --no-cache-dir --target "$dir" 'aiohttp>=3.13.4'; \
164+
rm -rf "$dir"/aiohttp* "$dir"/idna*; \
165+
pip install --no-cache-dir --target "$dir" 'aiohttp>=3.13.4' 'idna>=3.15'; \
151166
done
152167
COPY vllm_rollout /opt/conda/envs/ptca/lib/python3.10/site-packages/verl/workers/rollout/vllm_rollout/vllm_rollout.py
153168
# CVE-2026-44223 surgical backport: overlay the patched extract_hidden_states.py

assets/training/finetune_acft_hf_nlp/environments/acpt-rft/context/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
azureml-acft-contrib-hf-nlp=={{latest-pypi-version}}
22
packaging==25.0
3-
click==8.2.1
3+
click==8.3.3
44
codetiming==1.4.0
55
datasets==4.0.0
66
deepspeed==0.17.4

assets/training/finetune_acft_hf_nlp/environments/acpt/context/Dockerfile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,12 @@ RUN pip install --upgrade --no-cache-dir pyasn1==0.6.3 'python-multipart>=0.0.26
4343
# floors so upgrading them does not pull in 1.2.2; direct override (GHSA-mf9w-mj56-hr94).
4444
# - urllib3 2.6.3: parents requests (>=1.26,<3) and others use loose floors; direct override
4545
# required for GHSA-mf9v-mfxr-j63j and GHSA-qccp-gfcp-xxvc until base image refreshes.
46-
RUN conda run -n base python -m pip install --upgrade --no-cache-dir 'python-dotenv>=1.2.2' 'urllib3>=2.7.0'
46+
# - idna 3.11: parent `requests` declares `idna<4,>=2.5` (loose floor); upgrading requests does not
47+
# pull idna>=3.15; direct override required for GHSA-65pc-fj4g-8rjx (CVE-2026-45409).
48+
# - click 8.2.1: pulled by multiple base-env CLIs (anaconda-cli-base, conda-libmamba-solver, etc.)
49+
# all with loose floors; no single parent upgrade resolves it; direct override required for
50+
# GHSA-47fr-3ffg-hgmw (CVE-2026-7246, command injection in click.edit()).
51+
RUN conda run -n base python -m pip install --upgrade --no-cache-dir 'python-dotenv>=1.2.2' 'urllib3>=2.7.0' 'idna>=3.15' 'click>=8.3.3'
4752

4853
# pip 26.0.1 in both base (python3.13) and ptca (python3.10) conda envs from ACPT base image needs upgrade
4954
# (GHSA-jp4c-xjxw-mgf9 / CVE-2026-6357; fixed in 26.1). pip is the package manager itself, bundled by conda;

0 commit comments

Comments
 (0)