Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.22.24

### Fixes

- **Reject oversized hi-res PDF renders before rasterization**: Hi-res PDF partitioning now checks estimated per-page rendered pixels before rendering and returns an unprocessable document error when a page exceeds the configured safe limit.

## 0.22.23

### Fixes
Expand Down
60 changes: 59 additions & 1 deletion test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
Text,
Title,
)
from unstructured.errors import PageCountExceededError
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf_image import ocr, pdfminer_processing
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
Expand Down Expand Up @@ -1581,6 +1581,64 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
)


def test_check_pdf_render_max_pixels_exceeded_raises_for_oversized_page():
    """A page that renders just past the pixel limit is rejected.

    720 x 720 pt at 100 DPI is 1000 x 1000 px (1,000,000 pixels), one more than
    the 999,999-pixel limit passed below.
    """
    oversized_page = mock.Mock(cropbox=mock.Mock(width=720, height=720))
    fake_reader = mock.Mock(pages=[oversized_page])

    reader_patch = mock.patch.object(pdf, "PdfReader", return_value=fake_reader)
    expect_rejection = pytest.raises(UnprocessableEntityError, match="too many pixels")

    with reader_patch, expect_rejection:
        pdf.check_pdf_render_max_pixels_exceeded(
            filename="oversized.pdf",
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=999_999,
        )


def test_check_pdf_render_max_pixels_exceeded_allows_page_under_limit():
    """A page that renders below the pixel limit passes without raising.

    72 x 72 pt at 100 DPI is 100 x 100 px (10,000 pixels), under the
    20,000-pixel limit passed below.
    """
    small_page = mock.Mock(cropbox=mock.Mock(width=72, height=72))
    fake_reader = mock.Mock(pages=[small_page])

    reader_patch = mock.patch.object(pdf, "PdfReader", return_value=fake_reader)
    with reader_patch:
        # Completing the call without an exception is the assertion.
        pdf.check_pdf_render_max_pixels_exceeded(
            filename="normal.pdf",
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=20_000,
        )


def test_check_pdf_render_max_pixels_exceeded_can_be_disabled():
    """A limit of 0 turns the check off before the PDF is ever opened."""
    check_kwargs = {
        "filename": "oversized.pdf",
        "pdf_image_dpi": 100,
        "pdf_render_max_pixels_per_page": 0,
    }

    with mock.patch.object(pdf, "PdfReader") as reader_cls:
        pdf.check_pdf_render_max_pixels_exceeded(**check_kwargs)

    # The reader class must never have been instantiated.
    reader_cls.assert_not_called()


def test_check_pdf_render_max_pixels_exceeded_restores_file_cursor_position():
    """The check leaves a caller-supplied stream at the offset it started from."""
    start_offset = 4
    stream = io.BytesIO(b"%PDF-1.7 fake")
    stream.seek(start_offset)

    small_page = mock.Mock(cropbox=mock.Mock(width=72, height=72))
    fake_reader = mock.Mock(pages=[small_page])

    reader_patch = mock.patch.object(pdf, "PdfReader", return_value=fake_reader)
    with reader_patch:
        pdf.check_pdf_render_max_pixels_exceeded(
            file=stream,
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=20_000,
        )

    assert stream.tell() == start_offset


def test_is_pdf_too_complex_skips_small_file_size():
assert not pdf.is_pdf_too_complex(file=b"tiny", min_file_size_bytes=10)

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.22.23" # pragma: no cover
__version__ = "0.22.24" # pragma: no cover
73 changes: 68 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import contextlib
import copy
import io
import math
import os
import re
import warnings
Expand Down Expand Up @@ -38,7 +39,7 @@
Text,
Title,
)
from unstructured.errors import PageCountExceededError
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
from unstructured.file_utils.model import FileType
from unstructured.logger import logger, trace_logger
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
Expand Down Expand Up @@ -105,6 +106,7 @@
)
DEFAULT_MIN_FILE_SIZE_BYTES = 1 * 1024 * 1024 # 1 MB
DEFAULT_MIN_RAW_STREAM_BYTES = 100_000 # 100 KB
PDF_POINTS_PER_INCH = 72

# increase the max pixels so high dpi values like 300 can still be under the PIL limit
PILImage.MAX_IMAGE_PIXELS = 5e8
Expand Down Expand Up @@ -593,6 +595,61 @@ def check_pdf_hi_res_max_pages_exceeded(
)


def check_pdf_render_max_pixels_exceeded(
filename: str = "",
file: Optional[bytes | IO[bytes]] = None,
pdf_image_dpi: Optional[int] = None,
pdf_render_max_pixels_per_page: Optional[int] = None,
password: Optional[str] = None,
) -> None:
"""Checks whether any PDF page would render beyond the configured pixel limit."""
pdf_image_dpi = pdf_image_dpi or env_config.PDF_RENDER_DPI
if pdf_render_max_pixels_per_page is None:
pdf_render_max_pixels_per_page = env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE

if not pdf_render_max_pixels_per_page:
return

original_pos: Optional[int] = None

try:
if file is not None:
if isinstance(file, bytes):
reader = PdfReader(io.BytesIO(file))
else:
original_pos = file.tell()
file.seek(0)
reader = PdfReader(file)
elif filename:
reader = PdfReader(filename)
else:
raise ValueError("Either 'file' or 'filename' must be provided.")

if password:
reader.decrypt(password)

scale = pdf_image_dpi / PDF_POINTS_PER_INCH

for page_number, page in enumerate(reader.pages, start=1):
page_box = page.cropbox or page.mediabox
rendered_width = math.ceil(abs(float(page_box.width)) * scale)
rendered_height = math.ceil(abs(float(page_box.height)) * scale)
rendered_pixels = rendered_width * rendered_height

if rendered_pixels > pdf_render_max_pixels_per_page:
raise UnprocessableEntityError(
"PDF page would render to too many pixels for safe processing: "
f"page={page_number}, pixels={rendered_pixels}, "
f"maximum={pdf_render_max_pixels_per_page}, dpi={pdf_image_dpi}. "
"Try splitting the PDF, reducing the page dimensions, or using a lower "
"render DPI."
)

finally:
if file is not None and not isinstance(file, bytes) and original_pos is not None:
file.seek(original_pos)


def is_pdf_too_complex(
filename: str = "",
file: Optional[Union[bytes, IO[bytes]]] = None,
Expand Down Expand Up @@ -802,14 +859,20 @@ def _partition_pdf_or_image_local(
process_file_with_pdfminer,
)

hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = env_config.PDF_RENDER_DPI

if not is_image:
check_pdf_hi_res_max_pages_exceeded(
filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages
)

hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
if pdf_image_dpi is None:
pdf_image_dpi = env_config.PDF_RENDER_DPI
check_pdf_render_max_pixels_exceeded(
filename=filename,
file=file,
pdf_image_dpi=pdf_image_dpi,
password=password,
)

od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
Expand Down
5 changes: 5 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,5 +315,10 @@ def PDF_RENDER_DPI(self) -> int:
"""The DPI to use for rendering PDF pages"""
return self._get_int("PDF_RENDER_DPI", 350)

@property
def PDF_RENDER_MAX_PIXELS_PER_PAGE(self) -> int:
    """Upper bound on the estimated rendered pixel count of one PDF page.

    Looked up through ``self._get_int`` under the key
    ``PDF_RENDER_MAX_PIXELS_PER_PAGE``, defaulting to 1,000,000,000.
    """
    # NOTE(review): 1e9 is twice the PILImage.MAX_IMAGE_PIXELS value (5e8) set in
    # partition/pdf.py, which looks like Pillow's hard-error threshold -- confirm
    # before changing either number.
    default_pixel_cap = 1_000_000_000
    return self._get_int("PDF_RENDER_MAX_PIXELS_PER_PAGE", default_pixel_cap)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Default pixel limit exceeds PIL safety limit

Medium Severity

The default PDF_RENDER_MAX_PIXELS_PER_PAGE of 1,000,000,000 (1 billion) is twice as high as PILImage.MAX_IMAGE_PIXELS which is set to 5e8 (500 million) at module level. This means with default settings, any page rendering between 500M and 1B pixels passes the preflight check but still triggers a PIL DecompressionBombError during rasterization — after the resource-intensive poppler rendering step has already allocated memory. The preflight check's purpose is to reject oversized pages before rasterization, but the default value makes it ineffective for the exact range PIL would catch.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 37f65d2. Configure here.



env_config = ENVConfig()
Loading