From d900bf84195b12f2cdae5889206bbc363fd8a771 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:15:55 -0700 Subject: [PATCH 01/15] Bump python to 3.12, use latest wolfi image --- .github/workflows/ci.yml | 2 +- .github/workflows/docker-publish.yml | 2 +- CHANGELOG.md | 5 ++++- README.md | 4 ++-- docker/rockylinux-9.4/Dockerfile | 12 ++++++------ prepline_general/api/app.py | 2 +- prepline_general/api/general.py | 4 ++-- preprocessing-pipeline-family.yaml | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1aed35359..35979f282 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ on: branches: [ main ] env: - PYTHON_VERSION: "3.10" + PYTHON_VERSION: "3.12" PIPELINE_FAMILY: "general" jobs: diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 5a39d6bad..6d54bbecc 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -11,7 +11,7 @@ env: PACKAGE: "unstructured-api" PIPELINE_FAMILY: "general" PIP_VERSION: "25.1.1" - PYTHON_VERSION: "3.10" + PYTHON_VERSION: "3.12" jobs: setup: diff --git a/CHANGELOG.md b/CHANGELOG.md index cd37b437c..ec9dff316 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ +## 0.0.87 +* Bump Python to 3.12 + ## 0.0.86 * Patch various CVEs -## 0.0.86 +## 0.0.85 * Patch various CVEs * Bump Python version to 3.12, some packages no longer support 3.9 diff --git a/README.md b/README.md index 3bee3ae48..285b5a3ff 100644 --- a/README.md +++ b/README.md @@ -289,12 +289,12 @@ curl -X 'POST' * Using `pyenv` to manage virtualenv's is recommended * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions. * `brew install pyenv-virtualenv` - * `pyenv install 3.10.12` + * `pyenv install 3.12` * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). * Create a virtualenv to work in and activate it, e.g. for one named `document-processing`: - `pyenv virtualenv 3.10.12 unstructured-api`
+ `pyenv virtualenv 3.12 unstructured-api`
`pyenv activate unstructured-api` See the [Unstructured Quick Start](https://github.com/Unstructured-IO/unstructured#eight_pointed_black_star-quick-start) for the many OS dependencies that are required, if the ability to process all file types is desired. diff --git a/docker/rockylinux-9.4/Dockerfile b/docker/rockylinux-9.4/Dockerfile index 064a1008c..c4922f23d 100644 --- a/docker/rockylinux-9.4/Dockerfile +++ b/docker/rockylinux-9.4/Dockerfile @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base +FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html @@ -22,18 +22,18 @@ ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" FROM base as python-deps # COPY requirements/dev.txt requirements-dev.txt COPY requirements/base.txt requirements-base.txt -RUN python3.10 -m pip install pip==${PIP_VERSION} \ +RUN python3.12 -m pip install pip==${PIP_VERSION} \ && dnf -y groupinstall "Development Tools" \ - && su -l ${NB_USER} -c 'pip3.10 install --no-cache -r requirements-base.txt' \ + && su -l ${NB_USER} -c 'pip3.12 install --no-cache -r requirements-base.txt' \ && dnf -y groupremove "Development Tools" \ && dnf clean all \ - && ln -s /home/notebook-user/.local/bin/pip3.10 /usr/local/bin/pip3.10 || true + && ln -s /home/notebook-user/.local/bin/pip3.12 /usr/local/bin/pip3.12 || true USER ${NB_USER} FROM python-deps as model-deps -RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ - python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" +RUN python3.12 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ + python3.12 -c "from unstructured.partition.model_init import initialize; initialize()" FROM model-deps as code COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index 26380bfba..5868797dd 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -13,7 +13,7 @@ app = FastAPI( title="Unstructured Pipeline API", summary="Partition documents with the Unstructured library", - version="0.0.86", + version="0.0.87", docs_url="/general/docs", openapi_url="/general/openapi.json", servers=[ diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 176197026..736c7c917 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -600,7 +600,7 @@ def return_content_type(filename: str): @router.get("/general/v0/general", include_in_schema=False) -@router.get("/general/v0.0.86/general", include_in_schema=False) +@router.get("/general/v0.0.87/general", include_in_schema=False) async def handle_invalid_get_request(): raise HTTPException( status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported." @@ -615,7 +615,7 @@ async def handle_invalid_get_request(): description="Description", operation_id="partition_parameters", ) -@router.post("/general/v0.0.86/general", include_in_schema=False) +@router.post("/general/v0.0.87/general", include_in_schema=False) def general_partition( request: Request, # cannot use annotated type here because of a bug described here: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index 039aef554..a7c2dac93 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.86 +version: 0.0.87 From 324f6c8e4147b054923b7109e8b3b647f0356700 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:25:05 -0700 Subject: [PATCH 02/15] Missed a few python version setters --- .github/workflows/bump_libraries.yaml | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bump_libraries.yaml b/.github/workflows/bump_libraries.yaml index 79c256159..76dcab07c 100644 --- a/.github/workflows/bump_libraries.yaml +++ b/.github/workflows/bump_libraries.yaml @@ -9,7 +9,7 @@ on: - 'requirements/**' env: - PYTHON_VERSION: "3.8" + PYTHON_VERSION: "3.12" jobs: bump-changelog: diff --git a/Dockerfile b/Dockerfile index 31cd453dc..48b468468 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ ARG NB_USER=notebook-user ARG NB_UID=1000 ARG PIP_VERSION ARG PIPELINE_PACKAGE -ARG PYTHON_VERSION="3.11" +ARG PYTHON_VERSION="3.12" # Set up environment ENV PYTHON python${PYTHON_VERSION} From 7ec00e0f78e3c86360b67fe875ac847dd1222e51 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:36:38 -0700 Subject: [PATCH 03/15] Fixes --- Dockerfile | 2 -- docker/rockylinux-9.4/Dockerfile | 13 ++++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 48b468468..3871f491d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,6 @@ FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html ARG NB_USER=notebook-user ARG NB_UID=1000 -ARG PIP_VERSION ARG PIPELINE_PACKAGE ARG PYTHON_VERSION="3.12" @@ -21,7 +20,6 @@ ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" FROM base as python-deps COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt -RUN ${PIP} install pip==${PIP_VERSION} RUN ${PIP} install --no-cache -r requirements-base.txt FROM python-deps as model-deps diff --git a/docker/rockylinux-9.4/Dockerfile b/docker/rockylinux-9.4/Dockerfile index c4922f23d..a64e9e32c 100644 --- a/docker/rockylinux-9.4/Dockerfile +++ b/docker/rockylinux-9.4/Dockerfile @@ -1,11 +1,10 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base +FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html ARG NB_USER=notebook-user ARG NB_UID=1000 -ARG PIP_VERSION ARG PIPELINE_PACKAGE # Set up environment @@ -22,18 +21,18 @@ ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" FROM base as python-deps # COPY requirements/dev.txt requirements-dev.txt COPY requirements/base.txt requirements-base.txt -RUN python3.12 -m pip install pip==${PIP_VERSION} \ +RUN python3.10 -m pip install pip==${PIP_VERSION} \ && dnf -y groupinstall "Development Tools" \ - && su -l ${NB_USER} -c 'pip3.12 install --no-cache -r requirements-base.txt' \ + && su -l ${NB_USER} -c 'pip3.10 install --no-cache -r requirements-base.txt' \ && dnf -y groupremove "Development Tools" \ && dnf clean all \ - && ln -s /home/notebook-user/.local/bin/pip3.12 /usr/local/bin/pip3.12 || true + && ln -s /home/notebook-user/.local/bin/pip3.10 /usr/local/bin/pip3.10 || true USER ${NB_USER} FROM python-deps as model-deps -RUN python3.12 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ - python3.12 -c "from unstructured.partition.model_init import initialize; initialize()" +RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ + python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" FROM model-deps as code COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md From 4e9d6d04bbd6e5a617c5798d677e5ffeec8077d2 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:37:19 -0700 Subject: [PATCH 04/15] Fix --- docker/rockylinux-9.4/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/rockylinux-9.4/Dockerfile b/docker/rockylinux-9.4/Dockerfile index a64e9e32c..064a1008c 100644 --- a/docker/rockylinux-9.4/Dockerfile +++ b/docker/rockylinux-9.4/Dockerfile @@ -5,6 +5,7 @@ FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b9 # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html ARG NB_USER=notebook-user ARG NB_UID=1000 +ARG PIP_VERSION ARG PIPELINE_PACKAGE # Set up environment From b1be8a223f1880647ac983545c7225f9e19bf4d3 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:46:58 -0700 Subject: [PATCH 05/15] Install pip 3.12 --- Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile b/Dockerfile index 3871f491d..c6e56ae11 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,10 @@ ENV PYTHON python${PYTHON_VERSION} ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} + +USER root +RUN apk add --no-cache {PYTHON} py${PYTHON_VERSION/./}-pip + USER ${NB_USER} ENV PYTHONPATH="${PYTHONPATH}:${HOME}" From 116c66608275ab8fa43fb55be47deed8953bd849 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:59:09 -0700 Subject: [PATCH 06/15] Fixes --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c6e56ae11..933c933ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} USER root -RUN apk add --no-cache {PYTHON} py${PYTHON_VERSION/./}-pip +RUN apk add --no-cache python3 python3-pip USER ${NB_USER} From 237eb86bbc93ce06b9e60a81bf19d847423e9bb5 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 19:12:39 -0700 Subject: [PATCH 07/15] Use wolfi-style python package names --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 933c933ea..391f75866 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} USER root -RUN apk add --no-cache python3 python3-pip +RUN apk add --no-cache python3.12 py3.12-pip USER ${NB_USER} From 71f7c379056a0ab349065e4ffacace00400acfe9 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 19:27:30 -0700 Subject: [PATCH 08/15] Try chainguard image --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 391f75866..94420460d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base +# FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base +FROM cgr.dev/chainguard/wolfi-base as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html From 436be34558c579b32bd719bbaf9ad7fd554cde29 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 19:36:00 -0700 Subject: [PATCH 09/15] Correct package naming --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 94420460d..933c933ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,5 @@ # syntax=docker/dockerfile:experimental -# FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base -FROM cgr.dev/chainguard/wolfi-base as base +FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html @@ -16,7 +15,7 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} USER root -RUN apk add --no-cache python3.12 py3.12-pip +RUN apk add --no-cache python3 python3-pip USER ${NB_USER} From ae1ae0f68bcbedcb1dbf13302f8220daf3972d9d Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 19:46:48 -0700 Subject: [PATCH 10/15] Fix --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 933c933ea..98b841fca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} USER root -RUN apk add --no-cache python3 python3-pip +RUN apk add --no-cache python3 USER ${NB_USER} From 9d637f5226de4ac82069b331435728d5d81c302d Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 20:04:16 -0700 Subject: [PATCH 11/15] Extracted names directly from the APKINDEX --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 98b841fca..9f1b3f2a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} USER root -RUN apk add --no-cache python3 +RUN apk add --no-cache python-3.12 py3.12-pip USER ${NB_USER} From 9d38ed2ec298563b9c499d8d408469ca97822141 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 20:13:13 -0700 Subject: [PATCH 12/15] Use a hardened python 3.12 image --- Dockerfile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9f1b3f2a8..15f3533b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ # syntax=docker/dockerfile:experimental -FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base +# FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base +FROM cgr.dev/chainguard/python:3.12 as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html @@ -14,9 +15,6 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} -USER root -RUN apk add --no-cache python-3.12 py3.12-pip - USER ${NB_USER} ENV PYTHONPATH="${PYTHONPATH}:${HOME}" From 52ba8edba5ecc42d2de2b523e0ddc9096adeedee Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 20:17:34 -0700 Subject: [PATCH 13/15] Tags are only available for paid customers --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 15f3533b4..6ea14cdd5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:experimental # FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base -FROM cgr.dev/chainguard/python:3.12 as base +FROM cgr.dev/chainguard/python as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html From 1dc62a1099d5061efa3696240520927ea6cce587 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 20:26:04 -0700 Subject: [PATCH 14/15] Make notebook user first --- Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6ea14cdd5..962c69ab6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,16 @@ # syntax=docker/dockerfile:experimental -# FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base FROM cgr.dev/chainguard/python as base # NOTE(crag): NB_USER ARG for mybinder.org compat: # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html ARG NB_USER=notebook-user ARG NB_UID=1000 + +# Create the user +USER root +RUN adduser -D -u ${NB_UID} ${NB_USER} +ENV HOME=/home/${NB_USER} + ARG PIPELINE_PACKAGE ARG PYTHON_VERSION="3.12" From b8eb564380c68f100e259d6c5ce66b1b5c220177 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 20:33:08 -0700 Subject: [PATCH 15/15] Run everything as root --- Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 962c69ab6..7f5df05e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,8 +20,6 @@ ENV PIP ${PYTHON} -m pip WORKDIR ${HOME} -USER ${NB_USER} - ENV PYTHONPATH="${PYTHONPATH}:${HOME}" ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"