Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.22.24

### Fixes

- **Reject oversized hi-res PDF renders before bitmap allocation**: Hi-res PDF partitioning now passes the configured per-page pixel limit (`PDF_RENDER_MAX_PIXELS_PER_PAGE`) to `unstructured-inference`, so oversized pages are rejected before they are rendered and the failure is raised as an `UnprocessableEntityError` (unprocessable document error).

## 0.22.23

### Fixes
Expand Down
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,8 @@ image = [
"pi-heif>=1.2.0, <2.0.0",
"pikepdf>=10.3.0, <11.0.0",
"pypdf>=6.6.2, <7.0.0",
"unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
"unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-inference>=1.6.10, <2.0.0; platform_system != 'Windows'",
"unstructured-inference>=1.6.10, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
"unstructured-pytesseract>=0.3.15, <1.0.0",
]
md = [
Expand Down
57 changes: 55 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pdf2image.exceptions import PDFPageCountError
from PIL import Image
from pytest_mock import MockFixture
from unstructured_inference.inference import layout
from unstructured_inference.inference import layout, pdf_image
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement
Expand All @@ -36,7 +36,7 @@
Text,
Title,
)
from unstructured.errors import PageCountExceededError
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf_image import ocr, pdfminer_processing
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
Expand Down Expand Up @@ -300,6 +300,59 @@ def test_partition_pdf_passes_configured_dpi_to_inference(
assert mock_process.call_args[1]["pdf_image_dpi"] == 350


def test_partition_pdf_passes_render_max_pixels_to_inference(monkeypatch):
    """Hi-res PDF partitioning forwards the per-page render pixel cap to layout inference.

    The call is made once with `filename` and once with an open `file`, so both patched
    inference entry points (`process_file_with_model` and `process_data_with_model`) are
    checked for the `pdf_render_max_pixels_per_page` keyword argument.
    """
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    expected_pixel_cap = 1_000_000_000
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    # Path-based input.
    file_model_patch = mock.patch.object(
        layout, "process_file_with_model", return_value=MockDocumentLayout()
    )
    file_ocr_patch = mock.patch.object(
        ocr, "process_file_with_ocr", return_value=MockDocumentLayout()
    )
    with file_model_patch as model_spy, file_ocr_patch:
        pdf.partition_pdf(filename=pdf_path, strategy=PartitionStrategy.HI_RES)

    _, forwarded_kwargs = model_spy.call_args
    assert forwarded_kwargs["pdf_render_max_pixels_per_page"] == expected_pixel_cap

    # File-object input.
    data_model_patch = mock.patch.object(
        layout, "process_data_with_model", return_value=MockDocumentLayout()
    )
    data_ocr_patch = mock.patch.object(
        ocr, "process_data_with_ocr", return_value=MockDocumentLayout()
    )
    with open(pdf_path, "rb") as pdf_file, data_model_patch as model_spy, data_ocr_patch:
        pdf.partition_pdf(file=pdf_file, strategy=PartitionStrategy.HI_RES)

    _, forwarded_kwargs = model_spy.call_args
    assert forwarded_kwargs["pdf_render_max_pixels_per_page"] == expected_pixel_cap


def test_partition_pdf_render_too_large_error_is_unprocessable(monkeypatch):
    """An oversized-render failure from inference is re-raised as `UnprocessableEntityError`."""
    pdf_path = example_doc_path("pdf/layout-parser-paper-fast.pdf")
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    # The inference layer is patched to fail the way it does for a page over the pixel cap.
    render_error = pdf_image.PdfRenderTooLargeError(
        "PDF page would render to too many pixels for safe processing: "
        "page=1, pixels=1000000001, maximum=1000000000.",
    )

    with (
        mock.patch.object(layout, "process_file_with_model", side_effect=render_error),
        pytest.raises(UnprocessableEntityError, match="too many pixels"),
    ):
        pdf.partition_pdf(filename=pdf_path, strategy=PartitionStrategy.HI_RES)


@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
def test_partition_pdf_with_model_name(
monkeypatch,
Expand Down
25 changes: 25 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
import numpy as np
import pytest
from PIL import Image as PILImg
from unstructured_inference.inference import pdf_image

from test_unstructured.unit_utils import example_doc_path
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
from unstructured.errors import UnprocessableEntityError
from unstructured.partition.pdf_image import pdf_image_utils


Expand Down Expand Up @@ -62,6 +64,29 @@ def test_convert_pdf_to_image(file_mode, path_only):
assert isinstance(images[0], PILImg.Image)


def test_convert_pdf_to_image_raises_unprocessable_when_render_too_large():
    """`convert_pdf_to_image` maps `PdfRenderTooLargeError` to `UnprocessableEntityError`."""
    oversized_render = pdf_image.PdfRenderTooLargeError("too many pixels")

    with (
        patch.object(pdf_image_utils, "render_pdf_to_image", side_effect=oversized_render),
        pytest.raises(UnprocessableEntityError, match="too many pixels"),
    ):
        # The renderer is patched, so "example.pdf" is never actually opened by it.
        pdf_image_utils.convert_pdf_to_image(filename="example.pdf")


def test_convert_pdf_to_images_raises_unprocessable_when_render_too_large():
    """`convert_pdf_to_images` maps `PdfRenderTooLargeError` to `UnprocessableEntityError`."""
    oversized_render = pdf_image.PdfRenderTooLargeError("too many pixels")

    # Report a one-page document so a single render call is attempted.
    fake_pdfinfo = patch.object(
        pdf_image_utils.pdf2image, "pdfinfo_from_path", return_value={"Pages": 1}
    )
    failing_render = patch.object(
        pdf_image_utils, "render_pdf_to_image", side_effect=oversized_render
    )

    with (
        fake_pdfinfo,
        failing_render,
        pytest.raises(UnprocessableEntityError, match="too many pixels"),
    ):
        # list() drains the returned iterable (presumably lazy) so the render call runs.
        list(pdf_image_utils.convert_pdf_to_images(filename="example.pdf"))


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("path_only", [True, False])
def test_convert_pdf_to_image_twice(file_mode, path_only):
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.23" # pragma: no cover
__version__ = "0.22.24" # pragma: no cover
45 changes: 26 additions & 19 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
Text,
Title,
)
from unstructured.errors import PageCountExceededError
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
from unstructured.file_utils.model import FileType
from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
Expand Down Expand Up @@ -781,6 +781,7 @@ def _partition_pdf_or_image_local(
process_data_with_model,
process_file_with_model,
)
from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError

from unstructured.partition.pdf_image.analysis.layout_dump import (
ExtractedLayoutDumper,
Expand All @@ -802,30 +803,42 @@ def _partition_pdf_or_image_local(
process_file_with_pdfminer,
)

hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = env_config.PDF_RENDER_DPI
model_render_kwargs = (
{"pdf_render_max_pixels_per_page": env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE}
if not is_image
else {}
)

if not is_image:
check_pdf_hi_res_max_pages_exceeded(
filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages
)

hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = env_config.PDF_RENDER_DPI

od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
ocr_layout_dumper: Optional[OCRLayoutDumper] = None
final_layout_dumper: Optional[FinalLayoutDumper] = None

skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP

def _run_layout_inference(processor, source):
    """Call a layout-inference entry point with the shared hi-res arguments.

    Args:
        processor: the inference callable to invoke (called here with either
            `process_file_with_model` or `process_data_with_model`).
        source: the first positional argument for `processor` (a filename or a file object).

    Returns:
        Whatever `processor` returns.

    Raises:
        UnprocessableEntityError: when `processor` raises `PdfRenderTooLargeError`; the
            original exception is chained as the cause and its message is reused.
    """
    # `model_render_kwargs` is empty for images, so the pixel cap is only sent for PDFs.
    inference_kwargs = {
        "is_image": is_image,
        "model_name": hi_res_model_name,
        "pdf_image_dpi": pdf_image_dpi,
        "password": password,
        **model_render_kwargs,
    }
    try:
        return processor(source, **inference_kwargs)
    except PdfRenderTooLargeError as exc:
        raise UnprocessableEntityError(str(exc)) from exc

if file is None:
inferred_document_layout = process_file_with_model(
filename,
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)
inferred_document_layout = _run_layout_inference(process_file_with_model, filename)

pdfminer_config = _enable_detect_vertical_if_rotated(
inferred_document_layout,
Expand Down Expand Up @@ -883,13 +896,7 @@ def _partition_pdf_or_image_local(
table_ocr_agent=table_ocr_agent,
)
else:
inferred_document_layout = process_data_with_model(
file,
is_image=is_image,
model_name=hi_res_model_name,
pdf_image_dpi=pdf_image_dpi,
password=password,
)
inferred_document_layout = _run_layout_inference(process_data_with_model, file)

if hasattr(file, "seek"):
file.seek(0)
Expand Down
42 changes: 26 additions & 16 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
import pdf2image
from PIL import Image
from unstructured_inference.inference.layout import convert_pdf_to_image as render_pdf_to_image
from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError

from unstructured.documents.elements import ElementType
from unstructured.errors import UnprocessableEntityError
from unstructured.logger import logger
from unstructured.partition.common.common import convert_to_bytes, exactly_one
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -66,14 +68,18 @@ def convert_pdf_to_image(
if dpi is None:
dpi = env_config.PDF_RENDER_DPI

return render_pdf_to_image(
filename=filename,
file=file,
dpi=dpi,
output_folder=output_folder,
path_only=path_only,
password=password,
)
try:
return render_pdf_to_image(
filename=filename,
file=file,
dpi=dpi,
output_folder=output_folder,
path_only=path_only,
password=password,
pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
)
except PdfRenderTooLargeError as exc:
raise UnprocessableEntityError(str(exc)) from exc


def pad_element_bboxes(
Expand Down Expand Up @@ -405,14 +411,18 @@ def convert_pdf_to_images(
total_pages = info["Pages"]
for start_page in range(1, total_pages + 1, chunk_size):
end_page = min(start_page + chunk_size - 1, total_pages)
chunk_images = render_pdf_to_image(
filename=filename if f_bytes is None else None,
file=f_bytes,
dpi=env_config.PDF_RENDER_DPI,
first_page=start_page,
last_page=end_page,
password=password,
)
try:
chunk_images = render_pdf_to_image(
filename=filename if f_bytes is None else None,
file=f_bytes,
dpi=env_config.PDF_RENDER_DPI,
first_page=start_page,
last_page=end_page,
password=password,
pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
)
except PdfRenderTooLargeError as exc:
raise UnprocessableEntityError(str(exc)) from exc
chunk_images = cast(List[Image.Image], chunk_images)

for image in chunk_images:
Expand Down
5 changes: 5 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,5 +315,10 @@ def PDF_RENDER_DPI(self) -> int:
"""The DPI to use for rendering PDF pages"""
return self._get_int("PDF_RENDER_DPI", 350)

@property
def PDF_RENDER_MAX_PIXELS_PER_PAGE(self) -> int:
    """Maximum rendered pixels allowed for a single PDF page.

    Looked up through `_get_int` under the name `PDF_RENDER_MAX_PIXELS_PER_PAGE`, with
    1_000_000_000 as the fallback value.
    """
    setting_name = "PDF_RENDER_MAX_PIXELS_PER_PAGE"
    fallback_pixels = 1_000_000_000
    return self._get_int(setting_name, fallback_pixels)


env_config = ENVConfig()
Loading
Loading