Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.22.17

### Enhancements
- **Prepare PDF rendering for process-isolated PDFium execution**: Resolve the PDF renderer at call time instead of binding it once at import time, so downstream integrations can safely monkey-patch `convert_pdf_to_image()` to a process-isolated implementation without stale aliases bypassing the patch.
- **Chunk PDF rendering during OCR and image extraction**: `process_file_with_ocr()` now renders multi-page PDFs in configurable page ranges (`PDFIUM_CHUNK_SIZE`, default `8`) instead of one full-document render, and `save_elements()` renders only the page ranges actually needed for extracted images/tables instead of rasterizing the entire document.
- **Harden `PDFIUM_CHUNK_SIZE` configuration**: Invalid `PDFIUM_CHUNK_SIZE` values now fall back safely to the default with a warning instead of raising a `ValueError` on the request path.

## 0.22.16

### Enhancements
Expand Down
185 changes: 185 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
from collections import namedtuple
from typing import Optional
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -68,6 +69,27 @@ def test_process_file_with_ocr_invalid_filename(is_image):
)


def test_process_data_with_ocr_restores_file_position(mocker):
    """The stream handed to process_data_with_ocr keeps its original offset afterwards."""
    initial_offset = 4
    stream = io.BytesIO(b"pdf-bytes")
    stream.seek(initial_offset)
    expected_layout = MagicMock(DocumentLayout)

    # Stub the file-based entry point so only the stream handling is exercised.
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
        return_value=expected_layout,
    )

    call_kwargs = {
        "data": stream,
        "is_image": False,
        "out_layout": DocumentLayout(),
        "extracted_layout": [],
    }
    actual_layout = ocr.process_data_with_ocr(**call_kwargs)

    assert actual_layout is expected_layout
    assert stream.tell() == initial_offset


def test_supplement_page_layout_with_ocr_invalid_ocr():
with pytest.raises(ValueError):
_ = ocr.supplement_page_layout_with_ocr(
Expand Down Expand Up @@ -673,3 +695,166 @@ def test_pass_down_agents(mock_ocr_get_instance, mocker, mock_page):
"language": "eng",
"ocr_agent_module": OCR_AGENT_TESSERACT,
}


def test_process_file_with_ocr_chunks_pdf_pages(monkeypatch, mocker):
    """Check the page ranges requested for a 10-page layout when PDFIUM_CHUNK_SIZE is 4."""
    page_total = 10
    # Canned render output, keyed by the (first_page, last_page) range requested.
    rendered_by_range = {
        (first, last): [f"/tmp/page_{i}.png" for i in range(first, last + 1)]
        for first, last in ((1, 4), (5, 8), (9, 10))
    }
    requested_ranges = []

    layout = MagicMock(DocumentLayout)
    layout.pages = [MagicMock(PageLayout) for _ in range(page_total)]

    def _record_and_render(*args, **kwargs):
        page_range = (kwargs["first_page"], kwargs["last_page"])
        requested_ranges.append(page_range)
        # Any range without canned output renders nothing.
        return rendered_by_range.get(page_range, [])

    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        side_effect=_record_and_render,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.PILImage.open",
        return_value=Image.new("RGB", (16, 16)),
    )
    supplement_mock = mocker.patch(
        "unstructured.partition.pdf_image.ocr.supplement_page_layout_with_ocr",
        side_effect=lambda page_layout, image, **kwargs: page_layout,
    )
    monkeypatch.setenv("PDFIUM_CHUNK_SIZE", "4")
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)

    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": layout,
        "extracted_layout": [],
        "is_image": False,
    }
    processed = ocr.process_file_with_ocr(**call_kwargs)

    assert processed.pages == layout.pages
    # NOTE(review): the trailing (11, 11) request is presumably a probe for pages beyond the
    # layout -- confirm against process_file_with_ocr.
    assert requested_ranges == [(1, 4), (5, 8), (9, 10), (11, 11)]
    assert supplement_mock.call_count == page_total


def test_process_file_with_ocr_invalid_chunk_size_falls_back(monkeypatch, mocker):
    """A non-numeric PDFIUM_CHUNK_SIZE gives 8-page render ranges and exactly one warning."""
    page_total = 10
    layout = MagicMock(DocumentLayout)
    layout.pages = [MagicMock(PageLayout) for _ in range(page_total)]

    requested_ranges = []

    def _record_and_render(*args, **kwargs):
        first, last = kwargs["first_page"], kwargs["last_page"]
        requested_ranges.append((first, last))
        # Requests starting past the last layout page render nothing.
        if first <= page_total:
            return [f"/tmp/page_{i}.png" for i in range(first, last + 1)]
        return []

    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        side_effect=_record_and_render,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.PILImage.open",
        return_value=Image.new("RGB", (16, 16)),
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.supplement_page_layout_with_ocr",
        side_effect=lambda page_layout, image, **kwargs: page_layout,
    )
    warning_mock = mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.logger.warning"
    )
    monkeypatch.setenv("PDFIUM_CHUNK_SIZE", "auto")
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)

    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": layout,
        "extracted_layout": [],
        "is_image": False,
    }
    ocr.process_file_with_ocr(**call_kwargs)

    assert requested_ranges == [(1, 8), (9, 10), (11, 11)]
    warning_mock.assert_called_once()


def test_process_file_with_ocr_raises_when_layout_is_empty_but_pdf_renders(mocker):
    """An empty layout combined with a renderable first page raises ValueError."""
    empty_layout = MagicMock(DocumentLayout)
    empty_layout.pages = []

    render_mock = mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        return_value=["/tmp/page_1.png"],
    )
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)

    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": empty_layout,
        "extracted_layout": [],
        "is_image": False,
    }
    with pytest.raises(ValueError, match="empty layout"):
        ocr.process_file_with_ocr(**call_kwargs)

    # Exactly one render request is made, and it covers page 1 only.
    render_mock.assert_called_once()
    requested = render_mock.call_args.kwargs
    assert requested["first_page"] == 1
    assert requested["last_page"] == 1


def test_process_file_with_ocr_raises_when_chunk_render_count_mismatch(mocker):
    """Rendering one image for a two-page range raises ValueError with the count message."""
    two_page_layout = MagicMock(DocumentLayout)
    two_page_layout.pages = [MagicMock(PageLayout) for _ in range(2)]

    # The renderer always hands back a single page, one fewer than the layout holds.
    render_mock = mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        return_value=["/tmp/page_1.png"],
    )
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)

    expected_message = "Expected 2 rendered page\\(s\\) for range 1-2, got 1\\."
    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": two_page_layout,
        "extracted_layout": [],
        "is_image": False,
    }
    with pytest.raises(ValueError, match=expected_message):
        ocr.process_file_with_ocr(**call_kwargs)

    render_mock.assert_called_once()


def test_process_file_with_ocr_raises_on_pdf_layout_page_count_mismatch(mocker):
    """A PDF that renders a third page for a two-page layout raises ValueError."""
    two_page_layout = MagicMock(DocumentLayout)
    two_page_layout.pages = [MagicMock(PageLayout) for _ in range(2)]
    requested_ranges = []

    # Canned render output: the layout's two pages plus one page the layout does not know.
    rendered_by_range = {
        (1, 2): ("/tmp/page_1.png", "/tmp/page_2.png"),
        (3, 3): ("/tmp/page_3.png",),
    }

    def _record_and_render(*args, **kwargs):
        page_range = (kwargs["first_page"], kwargs["last_page"])
        requested_ranges.append(page_range)
        # Build a fresh list on every call, as a real render would.
        return list(rendered_by_range.get(page_range, ()))

    mocker.patch(
        "unstructured.partition.pdf_image.ocr.convert_pdf_to_image",
        side_effect=_record_and_render,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.PILImage.open",
        return_value=Image.new("RGB", (16, 16)),
    )
    mocker.patch(
        "unstructured.partition.pdf_image.ocr.supplement_page_layout_with_ocr",
        side_effect=lambda page_layout, image, **kwargs: page_layout,
    )
    mocker.patch("unstructured.partition.pdf_image.ocr.os.path.isfile", return_value=True)

    call_kwargs = {
        "filename": "dummy.pdf",
        "out_layout": two_page_layout,
        "extracted_layout": [],
        "is_image": False,
    }
    with pytest.raises(ValueError, match="page-count mismatch"):
        ocr.process_file_with_ocr(**call_kwargs)

    assert requested_ranges == [(1, 2), (3, 3)]
64 changes: 64 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
Text,
Title,
)
from unstructured.documents.elements import (
Image as ImageElement,
)
from unstructured.errors import PageCountExceededError
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf_image import ocr, pdfminer_processing
Expand Down Expand Up @@ -211,6 +214,67 @@ def test_partition_pdf_local_raises_with_no_filename():
pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)


def test_partition_pdf_local_rewinds_file_for_follow_on_hi_res_stages(mocker: MockFixture):
    """save_elements and run_form_extraction both receive the file positioned at offset 0."""
    stream = io.BytesIO(b"pdf-bytes")
    layout_stub = mocker.Mock(spec=DocumentLayout)
    layout_stub.pages = [mocker.Mock(spec=PageLayout)]
    image_element = ImageElement(
        text="Image Text 1",
        coordinates=((10, 10), (10, 20), (20, 20), (20, 10)),
        coordinate_system=PixelSpace(width=100, height=100),
        metadata=ElementMetadata(page_number=1),
    )

    # Each of these stages is replaced by a stub that returns the same layout object.
    for layout_stage in (
        "unstructured_inference.inference.layout.process_data_with_model",
        "unstructured.partition.pdf_image.pdfminer_processing.merge_inferred_with_extracted_layout",
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ):
        mocker.patch(layout_stage, return_value=layout_stub)
    mocker.patch(
        "unstructured.partition.pdf_image.pdfminer_processing.clean_pdfminer_inner_elements",
        side_effect=lambda document_layout: document_layout,
    )
    mocker.patch(
        "unstructured.partition.pdf.document_to_element_list",
        return_value=[image_element],
    )

    # (stage name, file offset seen by that stage), in call order.
    observed_offsets = []

    def _record_save_elements(*args, **kwargs):
        offset = kwargs["file"].tell()
        observed_offsets.append(("save_elements", offset))

    def _record_form_extraction(*args, **kwargs):
        offset = kwargs["file"].tell()
        observed_offsets.append(("run_form_extraction", offset))
        return []

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.save_elements",
        side_effect=_record_save_elements,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.form_extraction.run_form_extraction",
        side_effect=_record_form_extraction,
    )

    pdf._partition_pdf_or_image_local(
        filename="",
        file=stream,
        is_image=False,
        extract_image_block_types=[ElementType.IMAGE],
        extract_forms=True,
    )

    expected_offsets = [("save_elements", 0), ("run_form_extraction", 0)]
    assert observed_offsets == expected_offsets


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
Expand Down
Loading
Loading