Skip to content

Commit b9c9323

Browse files
luke-kucing and claude authored
build(docker): switch to chainguard wolfi-base, upgrade deps to 0.0.93 (#539)
- Replace quay.io/unstructured-io/base-images:wolfi-base-latest with cgr.dev/chainguard/wolfi-base:latest
- Inline all base image setup: system packages, user creation, libreoffice initialization, tesseract data
- Add pandoc-3.1.8-r0.apk package for document conversion
- Add initialize-libreoffice.sh script for soffice config
- Simplify Dockerfile by removing unnecessary multi-stage build
- Remove unused Python versions (3.10, 3.11, 3.13)
- Upgrade unstructured to 0.18.31 and other dependencies

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 8d6e863 commit b9c9323

9 files changed

Lines changed: 143 additions & 85 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.93
2+
* Refactored the Dockerfile to use the chainguard/wolfi-base image instead of the unstructured/base-image, aligning with the same change made recently in the unstructured repo.
3+
* Upgraded dependencies to address CVEs
4+
15
## 0.0.92
26
* Upgrade pdfminer-six to 20260107 to fix ~15-18% performance regression from eager f-string evaluation
37

Dockerfile

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# syntax=docker/dockerfile:experimental
2-
FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
2+
FROM cgr.dev/chainguard/wolfi-base:latest
33

44
# NOTE(crag): NB_USER ARG for mybinder.org compat:
55
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
@@ -10,25 +10,65 @@ ARG PIPELINE_PACKAGE
1010
ARG PYTHON_VERSION="3.12"
1111

1212
# Set up environment
13-
ENV PYTHON python${PYTHON_VERSION}
14-
ENV PIP ${PYTHON} -m pip
13+
ENV PYTHON=python${PYTHON_VERSION}
14+
ENV PIP="${PYTHON} -m pip"
15+
16+
USER root
17+
18+
COPY ./docker/packages/*.apk /tmp/packages/
19+
20+
RUN apk update && \
21+
apk add libxml2 python-3.12 python-3.12-base py3.12-pip glib \
22+
mesa-gl mesa-libgallium cmake bash libmagic wget git openjpeg \
23+
poppler poppler-utils poppler-glib libreoffice tesseract && \
24+
apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \
25+
rm -rf /tmp/packages && \
26+
git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \
27+
mkdir -p /usr/local/share/tessdata && \
28+
cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \
29+
rm -rf /tmp/tessdata && \
30+
git clone --depth 1 https://github.com/tesseract-ocr/tessconfigs /tmp/tessconfigs && \
31+
cp -r /tmp/tessconfigs/configs /usr/local/share/tessdata && \
32+
cp -r /tmp/tessconfigs/tessconfigs /usr/local/share/tessdata && \
33+
rm -rf /tmp/tessconfigs && \
34+
apk cache clean && \
35+
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
36+
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
37+
chmod +x /usr/lib/libreoffice/program/soffice.bin && \
38+
apk add --no-cache font-ubuntu fontconfig && \
39+
apk upgrade --no-cache py3.12-pip && \
40+
fc-cache -fv && \
41+
ln -sf /usr/bin/$PYTHON /usr/bin/python3 && \
42+
addgroup --gid ${NB_UID} ${NB_USER} && \
43+
adduser --disabled-password --gecos "" --uid ${NB_UID} -G ${NB_USER} ${NB_USER} && \
44+
rm -rf /usr/lib/python3.10 && \
45+
rm -rf /usr/lib/python3.11 && \
46+
rm -rf /usr/lib/python3.13 && \
47+
rm -f /usr/bin/python3.13
48+
49+
ENV USER=${NB_USER}
50+
ENV HOME=/home/${NB_USER}
51+
COPY --chown=${NB_USER} scripts/initialize-libreoffice.sh ${HOME}/initialize-libreoffice.sh
1552

16-
WORKDIR ${HOME}
1753
USER ${NB_USER}
54+
WORKDIR ${HOME}
55+
56+
# Initialize libreoffice config as non-root user (required for soffice to work properly)
57+
# See: https://github.com/Unstructured-IO/unstructured/issues/3105
58+
RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh
1859

1960
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
2061
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
62+
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
2163

22-
FROM base as python-deps
2364
COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt
24-
RUN ${PIP} install pip==${PIP_VERSION}
25-
RUN ${PIP} install --no-cache -r requirements-base.txt
65+
RUN ${PIP} install pip==${PIP_VERSION} && \
66+
${PIP} install --no-cache -r requirements-base.txt
2667

27-
FROM python-deps as model-deps
2868
RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
29-
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()"
69+
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \
70+
${PYTHON} -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
3071

31-
FROM model-deps as code
3272
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
3373
COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml
3474
COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
30 MB
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.92" # pragma: no cover
1+
__version__ = "0.0.93" # pragma: no cover

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.92
2+
version: 0.0.93

0 commit comments

Comments
 (0)