Skip to content

Commit 0b974b5

Browse files
committed
Migrate to uv
1 parent b9c9323 commit 0b974b5

27 files changed

Lines changed: 3464 additions & 1165 deletions

.github/dependabot.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
version: 2
22
updates:
33
- package-ecosystem: "pip"
4-
directory: "/requirements"
4+
directory: "/"
55
schedule:
66
interval: "daily"
77
# Only use this to bump our libraries

.github/workflows/bump_libraries.yaml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@ on:
66
- opened
77
- reopened
88
paths:
9-
- 'requirements/**'
9+
- 'uv.lock'
10+
- 'pyproject.toml'
1011

1112
env:
12-
PYTHON_VERSION: "3.8"
13+
PYTHON_VERSION: "3.12"
1314

1415
jobs:
1516
bump-changelog:
@@ -19,19 +20,18 @@ jobs:
1920
contents: write
2021
steps:
2122
- uses: actions/checkout@v5
23+
- name: Install uv
24+
uses: astral-sh/setup-uv@v5
2225
- name: Set up Python ${{ env.PYTHON_VERSION }}
23-
uses: actions/setup-python@v6
24-
with:
25-
python-version: ${{ env.PYTHON_VERSION }}
26+
run: uv python install ${{ env.PYTHON_VERSION }}
2627
- name: Dependabot metadata
2728
id: metadata
2829
uses: dependabot/fetch-metadata@v2
2930
with:
3031
github-token: "${{ secrets.GITHUB_TOKEN }}"
3132
- name: Create release version
3233
run: |
33-
pip install pip-tools
34-
make pip-compile
34+
uv lock --upgrade
3535
package=${{ steps.metadata.outputs.dependency-names }}
3636
# Strip any [extras] from name
3737
package=${package%\[*}
@@ -41,4 +41,3 @@ jobs:
4141
- uses: stefanzweifel/git-auto-commit-action@v6
4242
with:
4343
commit_message: "Bump libraries and release"
44-

.github/workflows/ci.yml

Lines changed: 24 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -11,42 +11,21 @@ env:
1111
PIPELINE_FAMILY: "general"
1212

1313
jobs:
14-
setup:
15-
runs-on: ubuntu-latest
16-
steps:
17-
- uses: actions/checkout@v5
18-
- uses: actions/cache@v5
19-
id: virtualenv-cache
20-
with:
21-
path: |
22-
.venv
23-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
24-
- name: Set up Python ${{ env.PYTHON_VERSION }}
25-
uses: actions/setup-python@v6
26-
with:
27-
python-version: ${{ env.PYTHON_VERSION }}
28-
- name: Setup virtual environment (no cache hit)
29-
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
30-
run: |
31-
python${{ env.PYTHON_VERSION }} -m venv .venv
32-
source .venv/bin/activate
33-
make install-ci
34-
3514
lint:
3615
runs-on: ubuntu-latest
37-
needs: setup
3816
steps:
3917
- uses: actions/checkout@v5
40-
- uses: actions/cache@v5
41-
id: virtualenv-cache
18+
- name: Install uv
19+
uses: astral-sh/setup-uv@v5
4220
with:
43-
path: |
44-
.venv
45-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
21+
enable-cache: true
22+
cache-dependency-glob: "uv.lock"
23+
- name: Set up Python ${{ env.PYTHON_VERSION }}
24+
run: uv python install ${{ env.PYTHON_VERSION }}
25+
- name: Install dependencies
26+
run: uv sync --extra test --frozen
4627
- name: Lint
47-
run: |
48-
source .venv/bin/activate
49-
make check
28+
run: make check
5029

5130
shellcheck:
5231
runs-on: ubuntu-latest
@@ -57,26 +36,20 @@ jobs:
5736

5837
test:
5938
runs-on: ubuntu-latest
60-
needs: [setup, lint]
39+
needs: lint
6140
steps:
6241
- uses: actions/checkout@v5
63-
- uses: actions/cache@v5
64-
id: virtualenv-cache
42+
- name: Install uv
43+
uses: astral-sh/setup-uv@v5
6544
with:
66-
path: |
67-
.venv
68-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
69-
45+
enable-cache: true
46+
cache-dependency-glob: "uv.lock"
7047
- name: Set up Python ${{ env.PYTHON_VERSION }}
71-
uses: actions/setup-python@v6
72-
with:
73-
python-version: ${{ env.PYTHON_VERSION }}
74-
- name: Run core tests
48+
run: uv python install ${{ env.PYTHON_VERSION }}
49+
- name: Install dependencies and run core tests
7550
run: |
76-
python${{ env.PYTHON_VERSION }} -m venv .venv
77-
source .venv/bin/activate
7851
sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice
79-
make install-test
52+
uv sync --extra test --frozen
8053
make install-pandoc
8154
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
8255
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
@@ -101,23 +74,18 @@ jobs:
10174
uses: dangoslen/changelog-enforcer@v3
10275

10376
# TODO - figure out best practice for caching docker images
104-
# (Using the virtualenv to get pytest)
10577
test_dockerfile:
10678
runs-on: ubuntu-latest
107-
needs: [setup, lint]
79+
needs: lint
10880
steps:
10981
- uses: actions/checkout@v5
110-
- uses: actions/cache@v5
111-
id: virtualenv-cache
82+
- name: Install uv
83+
uses: astral-sh/setup-uv@v5
11284
with:
113-
python-version: ${{ env.PYTHON_VERSION }}
114-
path: |
115-
.venv
116-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
85+
enable-cache: true
86+
cache-dependency-glob: "uv.lock"
11787
- name: Set up Python ${{ env.PYTHON_VERSION }}
118-
uses: actions/setup-python@v6
119-
with:
120-
python-version: ${{ env.PYTHON_VERSION }}
88+
run: uv python install ${{ env.PYTHON_VERSION }}
12189
- name: Free up disk space
12290
run: |
12391
# Clear some space (https://github.com/actions/runner-images/issues/2840)
@@ -140,14 +108,6 @@ jobs:
140108
df -h
141109
- name: Test Dockerfile
142110
run: |
143-
python${{ env.PYTHON_VERSION }} -m venv .venv
144-
source .venv/bin/activate
145-
make install-test
111+
uv sync --extra test --frozen
146112
make docker-build
147113
make docker-test
148-
# - name: Scan image
149-
# uses: anchore/scan-action@v3
150-
# with:
151-
# image: "pipeline-family-${{ env.PIPELINE_FAMILY }}-dev"
152-
# # NOTE(robinson) - revert this to medium when we bump libreoffice
153-
# severity-cutoff: critical

.github/workflows/docker-publish.yml

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -10,30 +10,9 @@ env:
1010
DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured-api
1111
PACKAGE: "unstructured-api"
1212
PIPELINE_FAMILY: "general"
13-
PIP_VERSION: "25.1.1"
1413
PYTHON_VERSION: "3.12"
1514

1615
jobs:
17-
setup:
18-
runs-on: ubuntu-latest
19-
steps:
20-
- uses: actions/checkout@v5
21-
- uses: actions/cache@v5
22-
id: virtualenv-cache
23-
with:
24-
path: |
25-
.venv
26-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
27-
- name: Set up Python ${{ env.PYTHON_VERSION }}
28-
uses: actions/setup-python@v6
29-
with:
30-
python-version: ${{ env.PYTHON_VERSION }}
31-
- name: Setup virtual environment (no cache hit)
32-
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
33-
run: |
34-
python${{ env.PYTHON_VERSION }} -m venv .venv
35-
source .venv/bin/activate
36-
make install-ci
3716
set-short-sha:
3817
runs-on: ubuntu-latest
3918
outputs:
@@ -49,7 +28,7 @@ jobs:
4928
# NOTE(luke): temporarily disable arm64 since it's failing the smoke test
5029
arch: ["amd64"]
5130
runs-on: ubuntu-latest
52-
needs: [setup, set-short-sha]
31+
needs: set-short-sha
5332
env:
5433
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
5534
DOCKER_PLATFORM: linux/${{ matrix.arch }}
@@ -90,25 +69,25 @@ jobs:
9069
run: |
9170
DOCKER_BUILDKIT=1 docker buildx build --load -f Dockerfile \
9271
--platform=$DOCKER_PLATFORM \
93-
--build-arg PIP_VERSION=$PIP_VERSION \
9472
--build-arg BUILDKIT_INLINE_CACHE=1 \
9573
--build-arg PIPELINE_PACKAGE=${{ env.PIPELINE_FAMILY }} \
9674
--provenance=false \
9775
--progress plain \
9876
--cache-from $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }} \
9977
-t $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA .
100-
- name: Set virtualenv cache
101-
uses: actions/cache@v5
102-
id: virtualenv-cache
78+
- name: Install uv
79+
uses: astral-sh/setup-uv@v5
10380
with:
104-
path: |
105-
.venv
106-
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
81+
enable-cache: true
82+
cache-dependency-glob: "uv.lock"
83+
- name: Set up Python ${{ env.PYTHON_VERSION }}
84+
run: uv python install ${{ env.PYTHON_VERSION }}
85+
- name: Install test dependencies
86+
run: uv sync --extra test --frozen
10787
- name: Set up QEMU
10888
uses: docker/setup-qemu-action@v3
10989
- name: Test image
11090
run: |
111-
source .venv/bin/activate
11291
export DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA"
11392
if [ "$DOCKER_PLATFORM" == "linux/arm64" ]; then
11493
SKIP_INFERENCE_TESTS=true make docker-test
@@ -121,7 +100,7 @@ jobs:
121100
docker push $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA
122101
publish-images:
123102
runs-on: ubuntu-latest
124-
needs: [setup, set-short-sha, build-images]
103+
needs: [set-short-sha, build-images]
125104
env:
126105
SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }}
127106
steps:
@@ -156,7 +135,7 @@ jobs:
156135
#docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64
157136
docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64
158137
docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA
159-
VERSION=$(grep -m1 version preprocessing-pipeline-family.yaml | cut -d ' ' -f2)
138+
VERSION=$(grep -oP '(?<=__version__ = ")[^"]+' prepline_general/api/__version__.py)
160139
#docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64
161140
docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64
162141
docker manifest push ${DOCKER_REPOSITORY}:$VERSION

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,9 @@ venv.bak/
120120
# mkdocs documentation
121121
/site
122122

123+
# ruff
124+
.ruff_cache/
125+
123126
# mypy
124127
.mypy_cache/
125128
.dmypy.json

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
## 0.1.0
2+
* Migrate to native uv for package management, replacing pip and pip-compile
3+
* Replace black and flake8 with ruff for linting and formatting
4+
* Remove all version pins from dependencies, use uv.lock for reproducibility
5+
* Update Dockerfile, CI workflows, and Makefile to use uv throughout
6+
* Add type stubs (types-requests) as explicit test dependencies
7+
* Fix flaky Korean OCR test assertions for tesseract compatibility
8+
19
## 0.0.93
210
* Refactored the Dockerfile to use the chainguard/wolfi-base image instead of the unstructured/base-image. This is to align with the recent change in the unstructured repo where the same change was made.
311
* upgraded dependencies to address CVEs

Dockerfile

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,20 @@ FROM cgr.dev/chainguard/wolfi-base:latest
55
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
66
ARG NB_USER=notebook-user
77
ARG NB_UID=1000
8-
ARG PIP_VERSION
98
ARG PIPELINE_PACKAGE
109
ARG PYTHON_VERSION="3.12"
1110

1211
# Set up environment
1312
ENV PYTHON=python${PYTHON_VERSION}
14-
ENV PIP="${PYTHON} -m pip"
1513

16-
USER root
14+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
1715

18-
COPY ./docker/packages/*.apk /tmp/packages/
16+
USER root
1917

2018
RUN apk update && \
2119
apk add libxml2 python-3.12 python-3.12-base py3.12-pip glib \
2220
mesa-gl mesa-libgallium cmake bash libmagic wget git openjpeg \
2321
poppler poppler-utils poppler-glib libreoffice tesseract && \
24-
apk add --allow-untrusted /tmp/packages/pandoc-3.1.8-r0.apk && \
25-
rm -rf /tmp/packages && \
2622
git clone --depth 1 https://github.com/tesseract-ocr/tessdata.git /tmp/tessdata && \
2723
mkdir -p /usr/local/share/tessdata && \
2824
cp /tmp/tessdata/*.traineddata /usr/local/share/tessdata && \
@@ -60,10 +56,21 @@ RUN ./initialize-libreoffice.sh && rm initialize-libreoffice.sh
6056
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
6157
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
6258
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
59+
ENV UV_COMPILE_BYTECODE=1
60+
ENV UV_LINK_MODE=copy
61+
ENV UV_PROJECT_ENVIRONMENT="${HOME}/.local"
62+
63+
COPY --chown=${NB_USER}:${NB_USER} pyproject.toml pyproject.toml
64+
COPY --chown=${NB_USER}:${NB_USER} uv.lock uv.lock
65+
RUN uv sync --no-dev --no-install-project --frozen
6366

64-
COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt
65-
RUN ${PIP} install pip==${PIP_VERSION} && \
66-
${PIP} install --no-cache -r requirements-base.txt
67+
ARG PANDOC_VERSION="3.9"
68+
RUN ARCH=$(uname -m) && \
69+
if [ "$ARCH" = "x86_64" ]; then PANDOC_ARCH="amd64"; else PANDOC_ARCH="arm64"; fi && \
70+
wget -q "https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/pandoc-${PANDOC_VERSION}-linux-${PANDOC_ARCH}.tar.gz" -O /tmp/pandoc.tar.gz && \
71+
tar -xzf /tmp/pandoc.tar.gz -C /tmp && \
72+
cp /tmp/pandoc-${PANDOC_VERSION}/bin/pandoc /home/${USER}/.local/bin/ && \
73+
rm -rf /tmp/pandoc*
6774

6875
RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
6976
${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" && \

0 commit comments

Comments
 (0)