From 7e23afcc65a256204d54cd844765007114658098 Mon Sep 17 00:00:00 2001 From: Lawrence Elitzer Date: Wed, 22 Apr 2026 09:54:43 -0500 Subject: [PATCH] fix(docker): replace PyPI opencv wheel with ffmpeg-free build [security] Mirrors Unstructured-IO/unstructured#4336. After uv sync, the Dockerfile now downloads a source-built opencv-contrib-python-headless wheel (WITH_FFMPEG=OFF) from the upstream release, hash-verifies it, and substitutes it for the PyPI opencv variant installed from uv.lock. This eliminates the 14 bundled ffmpeg 5.1.x CVEs shipped in PyPI opencv wheels. Bumps service version 0.1.3 -> 0.1.4. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 6 +++++ Dockerfile | 40 +++++++++++++++++++++++++++++ prepline_general/api/__version__.py | 2 +- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 868719ba..7629fcb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.1.4 + +### Security + +- **Replace PyPI opencv wheels with ffmpeg-free builds in Docker image**: After `uv sync`, the Dockerfile now substitutes the installed PyPI opencv-python variant with a source-built `opencv-contrib-python-headless` wheel compiled with `WITH_FFMPEG=OFF`, eliminating 14 bundled ffmpeg CVEs. The contrib-headless variant is a strict superset of the cv2 API (core + contrib modules, no GUI) and can transparently replace `opencv-python`, `opencv-python-headless`, or `opencv-contrib-python`. Wheel is downloaded from the upstream `Unstructured-IO/unstructured` release and hash-verified. Mirrors [unstructured#4336](https://github.com/Unstructured-IO/unstructured/pull/4336). 
+ ## 0.1.3 ### Security diff --git a/Dockerfile b/Dockerfile index 5bd528e6..2fecd9a7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,6 +75,46 @@ RUN ${PYTHON} -c "from unstructured.nlp.tokenize import _load_spacy_model; _load ${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \ ${PYTHON} -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" +# Replace PyPI opencv wheels (which bundle vulnerable ffmpeg 5.1.x with 14 CVEs) +# with a source-built opencv-contrib-python-headless wheel compiled with +# WITH_FFMPEG=OFF + ENABLE_CONTRIB=1 + ENABLE_HEADLESS=1. +# +# The contrib-headless variant is a strict superset of the cv2 API exposed by +# opencv-python, opencv-python-headless, and opencv-contrib-python, so a +# single wheel can replace any of them. Because the wheel's metadata name +# only matches opencv-contrib-python-headless, any other variant has to be +# uninstalled first - `uv pip install --reinstall-package` would silently +# no-op for the non-matching names. We uninstall each variant individually +# with `|| true` to tolerate variants that aren't present (our lockfile +# currently only resolves opencv-python, but this stays robust if transitive +# deps change). +# +# See: https://github.com/opencv/opencv-python/issues/1212 +# +# Note: uv.lock resolves opencv packages to 4.13.0.92, but our wheel is pinned +# to 4.12.0.88 because 4.13.0.92 has no sdist on PyPI — the upstream +# Unstructured-IO/unstructured GHA workflow (build-opencv-wheels.yml) +# compiles from source and requires an sdist. Bump this when a newer version +# publishes an sdist. +ARG OPENCV_WHEEL_TAG=opencv-4.12.0.88 +ARG OPENCV_WHEEL_VERSION=4.12.0.88 +# SHA-256 hashes of the wheels published in the upstream +# Unstructured-IO/unstructured release. Update these when bumping +# OPENCV_WHEEL_VERSION. 
+ARG OPENCV_SHA256_aarch64=498fbb787dbfe7d6bc853ddad4ea1154e8fbefbfafd05aafb417f576e27850d5
+ARG OPENCV_SHA256_x86_64=50545ffc1efabf06cd70894b65a7fbca56786f560f452bf67a42c1bbd7a85961
+# Look up the pinned hash with an explicit `case` so an unsupported architecture fails with a clear error before anything is downloaded.
+RUN ARCH="$(uname -m)" && case "${ARCH}" in \
+      aarch64) EXPECTED="${OPENCV_SHA256_aarch64}" ;; \
+      x86_64) EXPECTED="${OPENCV_SHA256_x86_64}" ;; \
+      *) echo "Unsupported architecture for pinned opencv wheel: ${ARCH}" >&2; exit 1 ;; \
+    esac && \
+    WHEEL="opencv_contrib_python_headless-${OPENCV_WHEEL_VERSION}-cp312-cp312-linux_${ARCH}.whl" && \
+    wget -q -O "/tmp/${WHEEL}" "https://github.com/Unstructured-IO/unstructured/releases/download/${OPENCV_WHEEL_TAG}/${WHEEL}" && \
+    echo "${EXPECTED}  /tmp/${WHEEL}" | sha256sum -c - && \
+    for pkg in opencv-python opencv-python-headless opencv-contrib-python opencv-contrib-python-headless; do uv pip uninstall "$pkg" 2>/dev/null || true; done && \
+    uv pip install --no-deps "/tmp/${WHEEL}" && rm -f "/tmp/${WHEEL}"
+
 COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
 COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
 COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
diff --git a/prepline_general/api/__version__.py b/prepline_general/api/__version__.py
index 506147f2..ecfc1a30 100644
--- a/prepline_general/api/__version__.py
+++ b/prepline_general/api/__version__.py
@@ -1 +1 @@
-__version__ = "0.1.3" # pragma: no cover
+__version__ = "0.1.4" # pragma: no cover