def test_check_pdf_render_max_pixels_exceeded_raises_for_oversized_page():
    """A 720x720 pt page at 100 dpi is 1000x1000 px, one pixel over a 999_999 limit."""
    oversized_page = mock.Mock(cropbox=mock.Mock(width=720, height=720))
    fake_reader = mock.Mock(pages=[oversized_page])

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader), pytest.raises(
        UnprocessableEntityError, match="too many pixels"
    ):
        pdf.check_pdf_render_max_pixels_exceeded(
            filename="oversized.pdf",
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=999_999,
        )


def test_check_pdf_render_max_pixels_exceeded_allows_page_under_limit():
    """A 72x72 pt page at 100 dpi is 100x100 px, well inside a 20_000 pixel limit."""
    small_page = mock.Mock(cropbox=mock.Mock(width=72, height=72))
    fake_reader = mock.Mock(pages=[small_page])

    # Passing means the call returns without raising.
    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        pdf.check_pdf_render_max_pixels_exceeded(
            filename="normal.pdf",
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=20_000,
        )


def test_check_pdf_render_max_pixels_exceeded_can_be_disabled():
    """A limit of 0 turns the check off before the PDF is ever opened."""
    with mock.patch.object(pdf, "PdfReader") as reader_cls:
        pdf.check_pdf_render_max_pixels_exceeded(
            filename="oversized.pdf",
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=0,
        )

    reader_cls.assert_not_called()


def test_check_pdf_render_max_pixels_exceeded_restores_file_cursor_position():
    """The caller's stream offset must be unchanged after the check runs."""
    start_offset = 4
    stream = io.BytesIO(b"%PDF-1.7 fake")
    stream.seek(start_offset)
    small_page = mock.Mock(cropbox=mock.Mock(width=72, height=72))
    fake_reader = mock.Mock(pages=[small_page])

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        pdf.check_pdf_render_max_pixels_exceeded(
            file=stream,
            pdf_image_dpi=100,
            pdf_render_max_pixels_per_page=20_000,
        )

    assert stream.tell() == start_offset
def check_pdf_render_max_pixels_exceeded(
    filename: str = "",
    file: Optional[Union[bytes, IO[bytes]]] = None,
    pdf_image_dpi: Optional[int] = None,
    pdf_render_max_pixels_per_page: Optional[int] = None,
    password: Optional[str] = None,
) -> None:
    """Raise if any PDF page would rasterize to more pixels than the configured limit.

    For every page, the page box (the crop box, or the media box when the crop box is
    falsy) is converted from PDF points to pixels at `pdf_image_dpi`, rounded up on each
    axis, and the resulting pixel count is compared against the limit. Only page metadata
    is inspected; nothing is rendered.

    Args:
        filename: Path of the PDF to inspect. Used only when `file` is None.
        file: PDF content as raw bytes or a binary file-like object. When a file-like
            object is given it is rewound for reading and its original cursor position
            is restored before returning or raising.
        pdf_image_dpi: Render resolution. Falls back to `env_config.PDF_RENDER_DPI` when
            None or 0.
        pdf_render_max_pixels_per_page: Maximum allowed pixels for one rendered page.
            Falls back to `env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE` when None. A value
            of zero or below disables the check entirely.
        password: Optional password passed to the reader's `decrypt` method.

    Raises:
        ValueError: If neither `file` nor `filename` is provided.
        UnprocessableEntityError: If a page exceeds the pixel limit, or reports a
            non-finite width or height.
    """
    pdf_image_dpi = pdf_image_dpi or env_config.PDF_RENDER_DPI
    if pdf_render_max_pixels_per_page is None:
        pdf_render_max_pixels_per_page = env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE

    # Zero disables the check. A negative limit is treated the same way: taken literally
    # it would reject every page, which is never a useful configuration.
    if pdf_render_max_pixels_per_page <= 0:
        return

    original_pos: Optional[int] = None

    try:
        if file is not None:
            if isinstance(file, bytes):
                reader = PdfReader(io.BytesIO(file))
            else:
                # Remember where the caller left the stream so it can be restored below.
                original_pos = file.tell()
                file.seek(0)
                reader = PdfReader(file)
        elif filename:
            reader = PdfReader(filename)
        else:
            raise ValueError("Either 'file' or 'filename' must be provided.")

        if password:
            reader.decrypt(password)

        # Page boxes are expressed in points; scale converts points to pixels.
        scale = pdf_image_dpi / PDF_POINTS_PER_INCH

        for page_number, page in enumerate(reader.pages, start=1):
            page_box = page.cropbox or page.mediabox
            # abs() guards against boxes whose corners are stored in reverse order.
            width_points = abs(float(page_box.width))
            height_points = abs(float(page_box.height))

            # math.ceil raises OverflowError on inf and ValueError on nan. Report such a
            # page as an unprocessable document instead of leaking a low-level error.
            if not (math.isfinite(width_points) and math.isfinite(height_points)):
                raise UnprocessableEntityError(
                    "PDF page has non-finite dimensions and cannot be rendered safely: "
                    f"page={page_number}."
                )

            rendered_width = math.ceil(width_points * scale)
            rendered_height = math.ceil(height_points * scale)
            rendered_pixels = rendered_width * rendered_height

            if rendered_pixels > pdf_render_max_pixels_per_page:
                raise UnprocessableEntityError(
                    "PDF page would render to too many pixels for safe processing: "
                    f"page={page_number}, pixels={rendered_pixels}, "
                    f"maximum={pdf_render_max_pixels_per_page}, dpi={pdf_image_dpi}. "
                    "Try splitting the PDF, reducing the page dimensions, or using a lower "
                    "render DPI."
                )

    finally:
        # Only a caller-supplied stream has a position to restore; bytes input is wrapped
        # in a private BytesIO and a filename opens nothing the caller can observe.
        if file is not None and not isinstance(file, bytes) and original_pos is not None:
            file.seek(original_pos)