Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: 2
updates:
- package-ecosystem: "pip"
directory: "/requirements"
directory: "/"
Comment thread
cursor[bot] marked this conversation as resolved.
schedule:
interval: "daily"
# Only use this to bump our libraries
Expand Down
15 changes: 7 additions & 8 deletions .github/workflows/bump_libraries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ on:
- opened
- reopened
paths:
- 'requirements/**'
- 'uv.lock'
- 'pyproject.toml'

env:
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.12"

jobs:
bump-changelog:
Expand All @@ -19,19 +20,18 @@ jobs:
contents: write
steps:
- uses: actions/checkout@v5
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v2
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
- name: Create release version
run: |
pip install pip-tools
make pip-compile
uv lock --upgrade
package=${{ steps.metadata.outputs.dependency-names }}
# Strip any [extras] from name
package=${package%\[*}
Expand All @@ -41,4 +41,3 @@ jobs:
- uses: stefanzweifel/git-auto-commit-action@v6
with:
commit_message: "Bump libraries and release"

88 changes: 24 additions & 64 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,42 +11,21 @@ env:
PIPELINE_FAMILY: "general"

jobs:
setup:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/cache@v5
id: virtualenv-cache
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci

lint:
runs-on: ubuntu-latest
needs: setup
steps:
- uses: actions/checkout@v5
- uses: actions/cache@v5
id: virtualenv-cache
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
enable-cache: true
cache-dependency-glob: "uv.lock"
- name: Set up Python ${{ env.PYTHON_VERSION }}
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra test --frozen
- name: Lint
run: |
source .venv/bin/activate
make check
run: make check

shellcheck:
runs-on: ubuntu-latest
Expand All @@ -57,26 +36,20 @@ jobs:

test:
runs-on: ubuntu-latest
needs: [setup, lint]
needs: lint
steps:
- uses: actions/checkout@v5
- uses: actions/cache@v5
id: virtualenv-cache
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}

enable-cache: true
cache-dependency-glob: "uv.lock"
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Run core tests
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies and run core tests
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice
make install-test
uv sync --extra test --frozen
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
Expand All @@ -101,23 +74,18 @@ jobs:
uses: dangoslen/changelog-enforcer@v3

# TODO - figure out best practice for caching docker images
# (Using the virtualenv to get pytest)
test_dockerfile:
runs-on: ubuntu-latest
needs: [setup, lint]
needs: lint
steps:
- uses: actions/checkout@v5
- uses: actions/cache@v5
id: virtualenv-cache
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
python-version: ${{ env.PYTHON_VERSION }}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Smoke test fails: bare pytest call after venv activation removed

High Severity

The old CI workflows ran `source .venv/bin/activate` before `make docker-test`, putting `pytest` on `PATH`. This PR removes that activation and uses `uv sync` instead, but `docker-smoke-test.sh` (line 83) still calls bare `pytest` — not `uv run pytest`. Since `uv sync` creates a `.venv` without activating it, and `setup-uv` only adds `uv` to `PATH`, `pytest` won't be found. Every other `pytest` invocation in the Makefile was correctly updated to use `uv run`, but this script was missed.

Additional Locations (1)

Fix in Cursor Fix in Web

path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
enable-cache: true
cache-dependency-glob: "uv.lock"
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Free up disk space
run: |
# Clear some space (https://github.com/actions/runner-images/issues/2840)
Expand All @@ -140,14 +108,6 @@ jobs:
df -h
- name: Test Dockerfile
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-test
uv sync --extra test --frozen
make docker-build
make docker-test
# - name: Scan image
# uses: anchore/scan-action@v3
# with:
# image: "pipeline-family-${{ env.PIPELINE_FAMILY }}-dev"
# # NOTE(robinson) - revert this to medium when we bump libreoffice
# severity-cutoff: critical
43 changes: 11 additions & 32 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,9 @@ env:
DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured-api
PACKAGE: "unstructured-api"
PIPELINE_FAMILY: "general"
PIP_VERSION: "25.1.1"
PYTHON_VERSION: "3.12"

jobs:
setup:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/cache@v5
id: virtualenv-cache
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Setup virtual environment (no cache hit)
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
run: |
python${{ env.PYTHON_VERSION }} -m venv .venv
source .venv/bin/activate
make install-ci
set-short-sha:
runs-on: ubuntu-latest
outputs:
Expand All @@ -49,7 +28,7 @@ jobs:
# NOTE(luke): temporarily disable arm64 since it's failing the smoke test
arch: ["amd64"]
runs-on: ubuntu-latest
needs: [setup, set-short-sha]
needs: set-short-sha
env:
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
DOCKER_PLATFORM: linux/${{ matrix.arch }}
Expand Down Expand Up @@ -90,25 +69,25 @@ jobs:
run: |
DOCKER_BUILDKIT=1 docker buildx build --load -f Dockerfile \
--platform=$DOCKER_PLATFORM \
--build-arg PIP_VERSION=$PIP_VERSION \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg PIPELINE_PACKAGE=${{ env.PIPELINE_FAMILY }} \
--provenance=false \
--progress plain \
--cache-from $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }} \
-t $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA .
- name: Set virtualenv cache
uses: actions/cache@v5
id: virtualenv-cache
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
enable-cache: true
cache-dependency-glob: "uv.lock"
- name: Set up Python ${{ env.PYTHON_VERSION }}
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install test dependencies
run: uv sync --extra test --frozen
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Test image
run: |
source .venv/bin/activate
export DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA"
if [ "$DOCKER_PLATFORM" == "linux/arm64" ]; then
SKIP_INFERENCE_TESTS=true make docker-test
Expand All @@ -121,7 +100,7 @@ jobs:
docker push $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA
publish-images:
runs-on: ubuntu-latest
needs: [setup, set-short-sha, build-images]
needs: [set-short-sha, build-images]
env:
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
steps:
Expand Down Expand Up @@ -156,7 +135,7 @@ jobs:
#docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64
docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64
docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA
VERSION=$(grep -m1 version preprocessing-pipeline-family.yaml | cut -d ' ' -f2)
VERSION=$(grep -oP '(?<=__version__ = ")[^"]+' prepline_general/api/__version__.py)
#docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64
docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64
docker manifest push ${DOCKER_REPOSITORY}:$VERSION
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ venv.bak/
# mkdocs documentation
/site

# ruff
.ruff_cache/

# mypy
.mypy_cache/
.dmypy.json
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.1.0
* Migrate to native uv for package management, replacing pip and pip-compile
* Replace black and flake8 with ruff for linting and formatting
* Remove all version pins from dependencies, use uv.lock for reproducibility
* Update Dockerfile, CI workflows, and Makefile to use uv throughout
* Add type stubs (types-requests) as explicit test dependencies
* Fix flaky Korean OCR test assertions for tesseract compatibility

## 0.0.93
* Refactored the Dockerfile to use the chainguard/wolfi-base image instead of the unstructured/base-image. This is to align with the recent change in the unstructured repo where the same change was made.
* Upgraded dependencies to address CVEs
Expand Down
25 changes: 16 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,20 @@ FROM cgr.dev/chainguard/wolfi-base:latest
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
ARG NB_USER=notebook-user
ARG NB_UID=1000
ARG PIP_VERSION
ARG PIPELINE_PACKAGE
ARG PYTHON_VERSION="3.12"

# Set up environment
ENV PYTHON=python${PYTHON_VERSION}
ENV PIP="${PYTHON} -m pip"

USER root
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
Comment thread
cursor[bot] marked this conversation as resolved.
Outdated

COPY ./docker/packages/*.apk /tmp/packages/
USER root

RUN apk update && \
apk add libxml2 python-3.12 python-3.12-base py3.12-pip glib \
mesa-gl mesa-libgallium cmake bash libmagic wget git openjpeg \
poppler poppler-utils poppler-glib libreoffice tesseract && \
apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \
rm -rf /tmp/packages && \
git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \
mkdir -p /usr/local/share/tessdata && \
cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \
Expand Down Expand Up @@ -60,10 +56,21 @@ RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV UV_COMPILE_BYTECODE=1
ENV UV_LINK_MODE=copy
ENV UV_PROJECT_ENVIRONMENT="${HOME}/.local"

COPY --chown=${NB_USER}:${NB_USER} pyproject.toml pyproject.toml
COPY --chown=${NB_USER}:${NB_USER} uv.lock uv.lock
RUN uv sync --no-dev --no-install-project --frozen

COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt
RUN ${PIP} install pip==${PIP_VERSION} && \
${PIP} install --no-cache -r requirements-base.txt
ARG PANDOC_VERSION="3.9"
RUN ARCH=$(uname -m) && \
if [ "$ARCH" = "x86_64" ]; then PANDOC_ARCH="amd64"; else PANDOC_ARCH="arm64"; fi && \
wget -q "https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/pandoc-${PANDOC_VERSION}-linux-${PANDOC_ARCH}.tar.gz" -O /tmp/pandoc.tar.gz && \
tar -xzf /tmp/pandoc.tar.gz -C /tmp && \
cp /tmp/pandoc-${PANDOC_VERSION}/bin/pandoc /home/${USER}/.local/bin/ && \
rm -rf /tmp/pandoc*

RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \
Expand Down
Loading