Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/base-cache/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ runs:
- name: Install dependencies
shell: bash
run: |
uv sync --frozen --all-extras --all-groups
uv sync --locked --all-extras --all-groups
make install-nltk-models
10 changes: 5 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ jobs:
env:
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
run: |
uv sync --frozen --group test
uv sync --locked --group test
make install-nltk-models
make test-no-extras CI=true

Expand Down Expand Up @@ -162,7 +162,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install extra dependencies
run: |
uv sync --frozen ${{ matrix.uv-extras }} --group test
uv sync --locked ${{ matrix.uv-extras }} --group test
make install-nltk-models
- name: Install system dependencies
run: |
Expand Down Expand Up @@ -250,7 +250,7 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
uv run ./test_unstructured_ingest/test-ingest-src.sh
uv run --no-sync ./test_unstructured_ingest/test-ingest-src.sh

test_json_to_html:
strategy:
Expand All @@ -268,7 +268,7 @@ jobs:
OVERWRITE_FIXTURES: "false"
run: |
sudo apt-get install diffstat
uv run ./test_unstructured_ingest/check-diff-expected-output-html.sh
uv run --no-sync ./test_unstructured_ingest/check-diff-expected-output-html.sh

test_json_to_markdown:
strategy:
Expand All @@ -286,7 +286,7 @@ jobs:
OVERWRITE_FIXTURES: "false"
run: |
sudo apt-get install diffstat
uv run ./test_unstructured_ingest/check-diff-expected-output-markdown.sh
uv run --no-sync ./test_unstructured_ingest/check-diff-expected-output-markdown.sh

changelog:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ jobs:
sudo apt-get install -y tesseract-ocr-kor
sudo apt-get install diffstat
tesseract --version
uv run ./test_unstructured_ingest/test-ingest-src.sh
uv run --no-sync ./test_unstructured_ingest/test-ingest-src.sh
- name: Update HTML fixtures
run: make html-fixtures-update
- name: Update markdown fixtures
Expand Down
81 changes: 81 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Release workflow: when a GitHub release is published, validate that the tag
# matches the package version, build the distribution with uv, publish it to
# PyPI, and then make a best-effort upload of the same artifacts to Azure
# Artifacts.
name: Pypi Release

on:
  release:
    types:
      - published

permissions:
  contents: read
  id-token: write  # Required for PyPI trusted publishing / attestations

# Serialize releases; never cancel a publish that is already running.
concurrency:
  group: release
  cancel-in-progress: false

env:
  PYTHON_VERSION: "3.12"

jobs:
  release:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          # No later step performs git operations, so do not leave the
          # workflow token in .git/config while building and publishing.
          persist-credentials: false

      - name: Install uv
        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Set up Python
        run: uv python install

      # Only the release tooling (twine) is installed; the project itself is
      # not installed into the environment.
      - name: Install dependencies
        run: uv sync --locked --only-group release --no-install-project

      # Accept both "X.Y.Z" and "vX.Y.Z" tag spellings. The tag is passed via
      # env rather than interpolated into the script text.
      - name: Validate version matches release tag
        env:
          TAG: ${{ github.event.release.tag_name }}
        run: |
          PKG_VERSION=$(uv run --no-sync python -c "from unstructured.__version__ import __version__; print(__version__)")
          if [[ "$TAG" != "$PKG_VERSION" && "$TAG" != "v$PKG_VERSION" ]]; then
            echo "Tag '$TAG' does not match package version '$PKG_VERSION'"
            exit 1
          fi

      # The step id is referenced by the Azure steps' `if:` conditions below.
      - name: Build artifact
        id: build
        run: uv build

      - name: Publish package
        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1

      # Best-effort: attempt Azure upload even if PyPI fails, but only if build succeeded.
      - name: Create .pypirc for Azure Artifacts
        if: always() && steps.build.outcome == 'success'
        env:
          # Secrets are handed to the script as environment variables instead
          # of being spliced into the script text, so characters such as `$`
          # or backticks in a value are never re-interpreted by the shell.
          AZURE_ARTIFACTS_FEED: ${{ secrets.AZURE_ARTIFACTS_FEED }}
          AZURE_ARTIFACTS_USERNAME: ${{ secrets.AZURE_ARTIFACTS_USERNAME }}
          AZURE_ARTIFACTS_PAT: ${{ secrets.AZURE_ARTIFACTS_PAT }}
        run: |
          # The file contains a PAT: create it readable by the owner only.
          umask 077
          # NOTE(review): the same secret fills both the organization and the
          # feed segment of the URL - confirm this is intended.
          cat <<EOF > ~/.pypirc
          [distutils]
          index-servers =
            azure

          [azure]
          repository: https://pkgs.dev.azure.com/${AZURE_ARTIFACTS_FEED}/_packaging/${AZURE_ARTIFACTS_FEED}/pypi/upload/
          username: ${AZURE_ARTIFACTS_USERNAME}
          password: ${AZURE_ARTIFACTS_PAT}
          EOF

      - name: Publish package to Azure Artifacts
        if: always() && steps.build.outcome == 'success'
        run: |
          # Capture the exit code instead of failing the step: this upload is
          # deliberately non-critical.
          EXIT_CODE=0
          uv run --no-sync twine upload --non-interactive -r azure dist/* || EXIT_CODE=$?
          if [[ $EXIT_CODE -eq 0 ]]; then
            echo "Successfully published to Azure Artifacts"
          else
            # Surface the swallowed failure as an annotation on the run summary.
            echo "::warning::Azure Artifacts upload failed (exit code: $EXIT_CODE)"
            if [[ $EXIT_CODE -eq 1 ]]; then
              echo "This may be due to version conflicts or connectivity issues"
            fi
            echo "Azure Artifacts upload is non-critical - skipping failure"
          fi
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.20.2

### Enhancements
- Add automated PyPI publishing: new `release.yml` GitHub Actions workflow triggers on GitHub release, builds the package with `uv build`, publishes to PyPI via `pypa/gh-action-pypi-publish`, and uploads to Azure Artifacts via `twine`
- Replace `uv sync --frozen` with `uv sync --locked` across all CI workflows, Dockerfile, and Makefile to fail fast on stale lockfiles
- Add `--no-sync` to `uv run` commands that follow a prior `uv sync` step to prevent implicit re-syncing

## 0.20.1

### Fixes
Expand Down
8 changes: 4 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,11 @@ ENV UV_COMPILE_BYTECODE=1
ENV UV_PYTHON_DOWNLOADS=never

# Install Python dependencies via uv and download required NLTK packages
RUN uv sync --frozen --all-extras --no-group dev --no-group lint --no-group test && \
RUN uv sync --locked --all-extras --no-group dev --no-group lint --no-group test --no-group release && \
mkdir -p ${NLTK_DATA} && \
uv run $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
uv run $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
uv run $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
uv run --no-sync $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \
uv run --no-sync $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
uv run --no-sync $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

ENV PATH="/app/.venv/bin:${PATH}"
ENV HF_HUB_OFFLINE=1
Expand Down
46 changes: 23 additions & 23 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ help: Makefile
## install: install all dependencies via uv
.PHONY: install
install:
@uv sync --frozen --all-extras --all-groups
@uv sync --locked --all-extras --all-groups
@$(MAKE) install-nltk-models

## lock: update and lock all dependencies
Expand All @@ -23,7 +23,7 @@ lock:

.PHONY: install-nltk-models
install-nltk-models:
uv run --frozen python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"
uv run --no-sync python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()"


#################
Expand All @@ -38,62 +38,62 @@ export UNSTRUCTURED_INCLUDE_DEBUG_METADATA ?= false
test:
CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
uv run --frozen --no-sync pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40
uv run --no-sync pytest -n auto test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing --durations=40

.PHONY: test-no-extras
test-no-extras:
CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
uv run --frozen --no-sync pytest -n auto \
uv run --no-sync pytest -n auto \
test_${PACKAGE_NAME}/partition/test_text.py \
test_${PACKAGE_NAME}/partition/test_email.py \
test_${PACKAGE_NAME}/partition/html/test_partition.py \
test_${PACKAGE_NAME}/partition/test_xml.py

.PHONY: test-extra-csv
test-extra-csv:
CI=$(CI) uv run --frozen --no-sync pytest -n auto \
CI=$(CI) uv run --no-sync pytest -n auto \
test_unstructured/partition/test_csv.py \
test_unstructured/partition/test_tsv.py

.PHONY: test-extra-docx
test-extra-docx:
CI=$(CI) uv run --frozen --no-sync pytest -n auto \
CI=$(CI) uv run --no-sync pytest -n auto \
test_unstructured/partition/test_doc.py \
test_unstructured/partition/test_docx.py

.PHONY: test-extra-epub
test-extra-epub:
CI=$(CI) uv run --frozen --no-sync pytest -n auto test_unstructured/partition/test_epub.py
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_epub.py

.PHONY: test-extra-markdown
test-extra-markdown:
CI=$(CI) uv run --frozen --no-sync pytest -n auto test_unstructured/partition/test_md.py
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_md.py

.PHONY: test-extra-odt
test-extra-odt:
CI=$(CI) uv run --frozen --no-sync pytest -n auto test_unstructured/partition/test_odt.py
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_odt.py

.PHONY: test-extra-pdf-image
test-extra-pdf-image:
CI=$(CI) uv run --frozen --no-sync pytest -n auto test_unstructured/partition/pdf_image
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/pdf_image

.PHONY: test-extra-pptx
test-extra-pptx:
CI=$(CI) uv run --frozen --no-sync pytest -n auto \
CI=$(CI) uv run --no-sync pytest -n auto \
test_unstructured/partition/test_ppt.py \
test_unstructured/partition/test_pptx.py

.PHONY: test-extra-pypandoc
test-extra-pypandoc:
CI=$(CI) uv run --frozen --no-sync pytest -n auto \
CI=$(CI) uv run --no-sync pytest -n auto \
test_unstructured/partition/test_org.py \
test_unstructured/partition/test_rst.py \
test_unstructured/partition/test_rtf.py

.PHONY: test-extra-xlsx
test-extra-xlsx:
CI=$(CI) uv run --frozen --no-sync pytest -n auto test_unstructured/partition/test_xlsx.py
CI=$(CI) uv run --no-sync pytest -n auto test_unstructured/partition/test_xlsx.py

## check: runs all linters and checks
.PHONY: check
Expand All @@ -102,8 +102,8 @@ check: check-ruff check-version
## check-ruff: runs ruff linter and formatter check
.PHONY: check-ruff
check-ruff:
uv run --frozen --no-sync ruff check .
uv run --frozen --no-sync ruff format --check .
uv run --no-sync ruff check .
uv run --no-sync ruff format --check .

.PHONY: check-licenses
check-licenses:
Expand All @@ -119,8 +119,8 @@ check-version:
## tidy: auto-format and fix lint issues
.PHONY: tidy
tidy:
uv run --frozen --no-sync ruff format .
uv run --frozen --no-sync ruff check --fix-only --show-fixes .
uv run --no-sync ruff format .
uv run --no-sync ruff check --fix-only --show-fixes .

.PHONY: tidy-shell
tidy-shell:
Expand All @@ -135,7 +135,7 @@ version-sync:
## check-coverage: check test coverage meets threshold
.PHONY: check-coverage
check-coverage:
uv run --frozen --no-sync coverage report --fail-under=90
uv run --no-sync coverage report --fail-under=90

##########
# Docker #
Expand Down Expand Up @@ -166,10 +166,10 @@ docker-test:
-v ${CURRENT_DIR}/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \
$(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \
$(DOCKER_IMAGE) \
bash -c "uv sync --frozen --all-extras --group test --no-install-project && \
bash -c "uv sync --locked --all-extras --group test --no-install-project && \
CI=$(CI) \
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
uv run pytest -n auto $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
uv run --no-sync pytest -n auto $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"

.PHONY: docker-smoke-test
docker-smoke-test:
Expand All @@ -187,7 +187,7 @@ docker-jupyter-notebook:

.PHONY: run-jupyter
run-jupyter:
uv run jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
uv run --no-sync jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''


###########
Expand All @@ -197,9 +197,9 @@ run-jupyter:
.PHONY: html-fixtures-update
html-fixtures-update:
rm -r test_unstructured_ingest/expected-structured-output-html && \
uv run test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html
uv run --no-sync test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html

.PHONY: markdown-fixtures-update
markdown-fixtures-update:
rm -r test_unstructured_ingest/expected-structured-output-markdown && \
uv run test_unstructured_ingest/structured-json-to-markdown.sh test_unstructured_ingest/expected-structured-output-markdown
uv run --no-sync test_unstructured_ingest/structured-json-to-markdown.sh test_unstructured_ingest/expected-structured-output-markdown
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ Then install all dependencies (base, extras, dev, test, and lint groups):
make install
```

This runs `uv sync --frozen --all-extras --all-groups`, which creates a virtual environment
This runs `uv sync --locked --all-extras --all-groups`, which creates a virtual environment
and installs everything in one step. No need to manually create or activate a virtualenv.

To install only specific document-type extras:
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ dev = [
lint = [
"ruff>=0.15.0, <1.0.0",
]
release = [
"twine>=6.0.0, <7.0.0",
]

[tool.uv]
required-environments = [
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.20.1" # pragma: no cover
__version__ = "0.20.2" # pragma: no cover
Loading
Loading