Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
name: CI

env:
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

on:
push:
branches: [ main ]
Expand All @@ -19,7 +22,7 @@ jobs:
python-version: ["3.11", "3.12", "3.13"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand All @@ -31,7 +34,7 @@ jobs:
runs-on: ubuntu-latest
needs: [setup]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand All @@ -45,7 +48,7 @@ jobs:
runs-on: ubuntu-latest
needs: [setup, changelog]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand All @@ -55,14 +58,14 @@ jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- name: ShellCheck
uses: ludeeus/action-shellcheck@master

shfmt:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- name: setup shfmt
uses: mfinelli/setup-shfmt@v3
- name: Run shfmt
Expand All @@ -76,7 +79,7 @@ jobs:
runs-on: opensource-linux-8core
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand Down Expand Up @@ -106,7 +109,7 @@ jobs:
runs-on: ubuntu-latest
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
Expand Down Expand Up @@ -147,7 +150,7 @@ jobs:
runs-on: opensource-linux-8core
needs: [setup, lint, test_unit_no_extras]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
Expand Down Expand Up @@ -182,7 +185,7 @@ jobs:
needs: [setup, lint]
steps:
# actions/checkout MUST come before auth
- uses: 'actions/checkout@v4'
- uses: 'actions/checkout@v5'
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand Down Expand Up @@ -254,7 +257,7 @@ jobs:
runs-on: ubuntu-latest
needs: [setup, lint]
steps:
- uses: 'actions/checkout@v4'
- uses: 'actions/checkout@v5'
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand All @@ -272,7 +275,7 @@ jobs:
runs-on: ubuntu-latest
needs: [setup, lint]
steps:
- uses: 'actions/checkout@v4'
- uses: 'actions/checkout@v5'
- uses: ./.github/actions/base-cache
with:
python-version: ${{ matrix.python-version }}
Expand All @@ -287,9 +290,9 @@ jobs:
runs-on: ubuntu-latest
steps:
# need to checkout otherwise paths-filter will fail on merge-queue trigger
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- if: github.ref != 'refs/heads/main'
uses: dorny/paths-filter@v3
uses: dorny/paths-filter@v4
id: changes
with:
filters: |
Expand All @@ -305,7 +308,7 @@ jobs:
runs-on: opensource-linux-8core
needs: [ setup, lint ]
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- name: Test Dockerfile
run: |
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/claude.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
id-token: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@v5
with:
fetch-depth: 1

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeflash.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
if: ${{ github.actor != 'codeflash-ai[bot]' }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
with:
fetch-depth: 0
- name: 🐍 Set up Python 3.12
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
with:
driver: docker
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v5
- name: Login to Quay.io
uses: docker/login-action@v3
with:
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
steps:
- uses: docker/setup-buildx-action@v3
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v5
- name: Login to Quay.io
uses: docker/login-action@v3
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
setup:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
Expand All @@ -24,7 +24,7 @@ jobs:
needs: [setup]
steps:
# actions/checkout MUST come before auth
- uses: "actions/checkout@v4"
- uses: "actions/checkout@v5"
- uses: ./.github/actions/base-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/partition-benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
setup:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v5
- uses: ./.github/actions/base-cache
with:
python-version: ${{ env.PYTHON_VERSION }}
Expand All @@ -39,7 +39,7 @@ jobs:

steps:

- uses: actions/checkout@v4
- uses: actions/checkout@v5

- uses: ./.github/actions/base-cache
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release-version-alert.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout codes
uses: actions/checkout@v4
uses: actions/checkout@v5
- name: Get PR information
id: pr-info
run: |
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.22.17

### Enhancements
- **Prepare PDF rendering for process-isolated PDFium execution**: Resolve the PDF renderer at call time instead of binding it once at import time, so downstream integrations can safely monkey-patch `convert_pdf_to_image()` to a process-isolated implementation without stale aliases bypassing the patch.
- **Chunk PDF rendering during OCR and image extraction**: `process_file_with_ocr()` now renders multi-page PDFs in configurable page ranges (`PDFIUM_CHUNK_SIZE`, default `8`) instead of one full-document render, and `save_elements()` renders only the page ranges actually needed for extracted images/tables instead of rasterizing the entire document.
- **Harden `PDFIUM_CHUNK_SIZE` configuration**: Invalid `PDFIUM_CHUNK_SIZE` values now fall back safely to the default with a warning instead of raising a request-path `ValueError`.

## 0.22.16

### Enhancements
Expand Down
15 changes: 8 additions & 7 deletions scripts/check-licenses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,14 @@ Python-2.0"
# upstream source repository.
IGNORED_PACKAGES=(
# Metadata missing -- verified permissive on GitHub
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
iopath # MIT (facebookresearch/iopath)
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
sentencepiece # Apache-2.0 (google/sentencepiece)
voyageai # MIT (voyage-ai/voyageai-python)
arro3-core # MIT / Apache-2.0 (geoarrow/geoarrow-rs)
chroma-hnswlib # Apache-2.0 (chroma-core/hnswlib)
google-crc32c # Apache-2.0 (googleapis/python-crc32c)
iopath # MIT (facebookresearch/iopath)
matplotlib-inline # BSD-3-Clause (ipython/matplotlib-inline)
pypdfium2 # BSD-3-Clause (PDFium/PDFium)
sentencepiece # Apache-2.0 (google/sentencepiece)
voyageai # MIT (voyage-ai/voyageai-python)

# Permissive but non-standard classifier
lmdb # OpenLDAP Public License (BSD-style, jnwatson/py-lmdb)
Expand Down
118 changes: 118 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
from collections import namedtuple
from typing import Optional
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -68,6 +69,27 @@ def test_process_file_with_ocr_invalid_filename(is_image):
)


def test_process_data_with_ocr_restores_file_position(mocker):
    """The caller's stream offset must be unchanged after `process_data_with_ocr()` returns."""
    initial_offset = 4
    expected_layout = MagicMock(DocumentLayout)

    # Start from a non-zero offset so that a plain rewind-to-zero would be detected.
    stream = io.BytesIO(b"pdf-bytes")
    stream.seek(initial_offset)

    # Stub the file-based entry point; the test expects its return value to be handed back.
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
        return_value=expected_layout,
    )

    actual_layout = ocr.process_data_with_ocr(
        data=stream,
        is_image=False,
        out_layout=DocumentLayout(),
        extracted_layout=[],
    )

    assert actual_layout is expected_layout
    assert stream.tell() == initial_offset


def test_supplement_page_layout_with_ocr_invalid_ocr():
with pytest.raises(ValueError):
_ = ocr.supplement_page_layout_with_ocr(
Expand Down Expand Up @@ -673,3 +695,99 @@ def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}


def test_process_file_with_ocr_chunks_pdf_pages(monkeypatch, mocker):
    """A 10-page layout with `PDFIUM_CHUNK_SIZE=4` is rendered as page ranges 1-4, 5-8, 9-10."""
    chunk_ranges = [(1, 4), (5, 8), (9, 10)]
    # Fake rendered image paths, keyed by the inclusive (first_page, last_page) range.
    paths_by_range = {
        (first, last): [f"/tmp/page_{n}.png" for n in range(first, last + 1)]
        for first, last in chunk_ranges
    }
    requested_ranges = []

    layout = MagicMock(DocumentLayout)
    layout.pages = [MagicMock(PageLayout) for _ in range(10)]

    def _record_and_render(*args, **kwargs):
        # Record the requested range, then return the canned paths for exactly that range.
        page_range = (kwargs["first_page"], kwargs["last_page"])
        requested_ranges.append(page_range)
        return paths_by_range[page_range]

    def _passthrough(page_layout, image, **kwargs):
        # Stand-in for the per-page OCR step: return the page layout untouched.
        return page_layout

    monkeypatch.setenv("PDFIUM_CHUNK_SIZE", "4")
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        side_effect=_record_and_render,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.PILImage.open",
        return_value=Image.new("RGB", (16, 16)),
    )
    supplement_mock = mocker.patch(
        "unstructured.partition.pdf_image.ocr.supplement_page_layout_with_ocr",
        side_effect=_passthrough,
    )

    merged = ocr.process_file_with_ocr(
        filename="dummy.pdf",
        out_layout=layout,
        extracted_layout=[],
        is_image=False,
    )

    assert merged.pages == layout.pages
    assert requested_ranges == chunk_ranges
    # One OCR supplement call per page, regardless of how the pages were chunked.
    assert supplement_mock.call_count == 10


def test_process_file_with_ocr_invalid_chunk_size_falls_back(monkeypatch, mocker):
    """A non-numeric `PDFIUM_CHUNK_SIZE` yields ranges 1-8 and 9-10 plus exactly one warning."""
    requested_ranges = []

    layout = MagicMock(DocumentLayout)
    layout.pages = [MagicMock(PageLayout) for _ in range(10)]

    def _record_and_render(*args, **kwargs):
        # Record the requested range and fabricate one image path per page in it.
        first, last = kwargs["first_page"], kwargs["last_page"]
        requested_ranges.append((first, last))
        return [f"/tmp/page_{n}.png" for n in range(first, last + 1)]

    def _passthrough(page_layout, image, **kwargs):
        # Stand-in for the per-page OCR step: return the page layout untouched.
        return page_layout

    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        side_effect=_record_and_render,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.PILImage.open",
        return_value=Image.new("RGB", (16, 16)),
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.supplement_page_layout_with_ocr",
        side_effect=_passthrough,
    )
    warning_spy = mocker.patch("unstructured.partition.pdf_image.pdf_image_utils.logger.warning")
    # "auto" cannot be parsed as an integer chunk size.
    monkeypatch.setenv("PDFIUM_CHUNK_SIZE", "auto")

    ocr.process_file_with_ocr(
        filename="dummy.pdf",
        out_layout=layout,
        extracted_layout=[],
        is_image=False,
    )

    # Ten pages split as 8 + 2, i.e. the chunking expected when the setting is unusable.
    assert requested_ranges == [(1, 8), (9, 10)]
    warning_spy.assert_called_once()


def test_process_file_with_ocr_raises_when_layout_is_empty_but_pdf_renders(mocker):
    """A rendered page with no pages in the layout must raise `ValueError` ("empty layout")."""
    empty_layout = MagicMock(DocumentLayout)
    empty_layout.pages = []

    # The renderer reports one page image and the filesystem check says it exists.
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        return_value=["/tmp/page_1.png"],
    )

    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": empty_layout,
        "extracted_layout": [],
        "is_image": False,
    }
    with pytest.raises(ValueError, match="empty layout"):
        ocr.process_file_with_ocr(**call_kwargs)
Loading
Loading