Reject oversized PDF renders before bitmap allocation (#4345)

CyMule · web-flow · commit b909cf4f2953 · 2026-04-26T21:55:43.000Z
## Summary

- pass the configured PDF render pixel limit into `unstructured-inference`
- map oversized PDF render failures to `UnprocessableEntityError`
- add coverage for filename/file-object paths and render-limit error handling

---

> [!NOTE]
> **Medium Risk**
> Changes hi-res PDF inference/rendering behavior by enforcing a per-page pixel cap and remapping upstream render failures to `UnprocessableEntityError`, which could alter error surfaces for some PDFs. The dependency bump to `unstructured-inference` may also change model/render behavior across platforms.
>
> **Overview**
> **Prevents OOM/unsafe rendering for hi-res PDFs** by passing a new `PDF_RENDER_MAX_PIXELS_PER_PAGE` limit through to `unstructured-inference` during hi-res partitioning and PDF-to-image conversion.
>
> When `unstructured-inference` raises `PdfRenderTooLargeError`, the code now converts it into an `UnprocessableEntityError` so oversized pages fail fast with a consistent “unprocessable document” error instead of attempting bitmap allocation.
>
> Bumps the `unstructured-inference` dependency (and lockfile resolutions) and adds tests covering both filename/file-object paths and the new oversized-render error handling.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 8bae7e0. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.22.24
+
+### Fixes
+
+- **Reject oversized hi-res PDF renders before bitmap allocation**: Hi-res PDF partitioning now passes the configured per-page pixel limit (`PDF_RENDER_MAX_PIXELS_PER_PAGE`) to `unstructured-inference`, so oversized pages are rejected before rendering and surfaced as unprocessable document errors (`UnprocessableEntityError`).
+
 ## 0.22.23
 
 ### Fixes
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,9 +68,8 @@ image = [
     "pi-heif>=1.2.0, <2.0.0",
     "pikepdf>=10.3.0, <11.0.0",
     "pypdf>=6.6.2, <7.0.0",
-    "unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
-    "unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
-    "unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
+    "unstructured-inference>=1.6.10, <2.0.0; platform_system != 'Windows'",
+    "unstructured-inference>=1.6.10, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
     "unstructured-pytesseract>=0.3.15, <1.0.0",
 ]
 md = [
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -16,7 +16,7 @@
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image
 from pytest_mock import MockFixture
-from unstructured_inference.inference import layout
+from unstructured_inference.inference import layout, pdf_image
 from unstructured_inference.inference.elements import Rectangle
 from unstructured_inference.inference.layout import DocumentLayout, PageLayout
 from unstructured_inference.inference.layoutelement import LayoutElement
@@ -36,7 +36,7 @@
     Text,
     Title,
 )
-from unstructured.errors import PageCountExceededError
+from unstructured.errors import PageCountExceededError, UnprocessableEntityError
 from unstructured.partition import pdf, strategies
 from unstructured.partition.pdf_image import ocr, pdfminer_processing
 from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
@@ -300,6 +300,59 @@ def test_partition_pdf_passes_configured_dpi_to_inference(
         assert mock_process.call_args[1]["pdf_image_dpi"] == 350
 
 
+def test_partition_pdf_passes_render_max_pixels_to_inference(monkeypatch):
+    filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
+
+    with (
+        mock.patch.object(
+            layout,
+            "process_file_with_model",
+            return_value=MockDocumentLayout(),
+        ) as mock_process,
+        mock.patch.object(
+            ocr,
+            "process_file_with_ocr",
+            return_value=MockDocumentLayout(),
+        ),
+    ):
+        pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
+
+    assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000
+
+    with (
+        open(filename, "rb") as file,
+        mock.patch.object(
+            layout,
+            "process_data_with_model",
+            return_value=MockDocumentLayout(),
+        ) as mock_process,
+        mock.patch.object(
+            ocr,
+            "process_data_with_ocr",
+            return_value=MockDocumentLayout(),
+        ),
+    ):
+        pdf.partition_pdf(file=file, strategy=PartitionStrategy.HI_RES)
+
+    assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000
+
+
+def test_partition_pdf_render_too_large_error_is_unprocessable(monkeypatch):
+    filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
+    with mock.patch.object(
+        layout,
+        "process_file_with_model",
+        side_effect=pdf_image.PdfRenderTooLargeError(
+            "PDF page would render to too many pixels for safe processing: "
+            "page=1, pixels=1000000001, maximum=1000000000.",
+        ),
+    ):
+        with pytest.raises(UnprocessableEntityError, match="too many pixels"):
+            pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
+
+
 @pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
 def test_partition_pdf_with_model_name(
     monkeypatch,
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -7,10 +7,12 @@
 import numpy as np
 import pytest
 from PIL import Image as PILImg
+from unstructured_inference.inference import pdf_image
 
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
+from unstructured.errors import UnprocessableEntityError
 from unstructured.partition.pdf_image import pdf_image_utils
 
 
@@ -62,6 +64,29 @@ def test_convert_pdf_to_image(file_mode, path_only):
             assert isinstance(images[0], PILImg.Image)
 
 
+def test_convert_pdf_to_image_raises_unprocessable_when_render_too_large():
+    with patch.object(
+        pdf_image_utils,
+        "render_pdf_to_image",
+        side_effect=pdf_image.PdfRenderTooLargeError("too many pixels"),
+    ):
+        with pytest.raises(UnprocessableEntityError, match="too many pixels"):
+            pdf_image_utils.convert_pdf_to_image(filename="example.pdf")
+
+
+def test_convert_pdf_to_images_raises_unprocessable_when_render_too_large():
+    with (
+        patch.object(pdf_image_utils.pdf2image, "pdfinfo_from_path", return_value={"Pages": 1}),
+        patch.object(
+            pdf_image_utils,
+            "render_pdf_to_image",
+            side_effect=pdf_image.PdfRenderTooLargeError("too many pixels"),
+        ),
+    ):
+        with pytest.raises(UnprocessableEntityError, match="too many pixels"):
+            list(pdf_image_utils.convert_pdf_to_images(filename="example.pdf"))
+
+
 @pytest.mark.parametrize("file_mode", ["filename", "rb"])
 @pytest.mark.parametrize("path_only", [True, False])
 def test_convert_pdf_to_image_twice(file_mode, path_only):
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.23"  # pragma: no cover
+__version__ = "0.22.24"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -38,7 +38,7 @@
     Text,
     Title,
 )
-from unstructured.errors import PageCountExceededError
+from unstructured.errors import PageCountExceededError, UnprocessableEntityError
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger, trace_logger
 from unstructured.nlp.patterns import PARAGRAPH_PATTERN
@@ -781,6 +781,7 @@ def _partition_pdf_or_image_local(
         process_data_with_model,
         process_file_with_model,
     )
+    from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError
 
     from unstructured.partition.pdf_image.analysis.layout_dump import (
         ExtractedLayoutDumper,
@@ -802,30 +803,42 @@ def _partition_pdf_or_image_local(
         process_file_with_pdfminer,
     )
 
+    hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
+    if pdf_image_dpi is None:
+        pdf_image_dpi = env_config.PDF_RENDER_DPI
+    model_render_kwargs = (
+        {"pdf_render_max_pixels_per_page": env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE}
+        if not is_image
+        else {}
+    )
+
     if not is_image:
         check_pdf_hi_res_max_pages_exceeded(
             filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages
         )
 
-    hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
-    if pdf_image_dpi is None:
-        pdf_image_dpi = env_config.PDF_RENDER_DPI
-
     od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
     extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
     ocr_layout_dumper: Optional[OCRLayoutDumper] = None
     final_layout_dumper: Optional[FinalLayoutDumper] = None
 
     skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP
 
+    def _run_layout_inference(processor, source):
+        try:
+            return processor(
+                source,
+                is_image=is_image,
+                model_name=hi_res_model_name,
+                pdf_image_dpi=pdf_image_dpi,
+                password=password,
+                **model_render_kwargs,
+            )
+        except PdfRenderTooLargeError as exc:
+            raise UnprocessableEntityError(str(exc)) from exc
+
     if file is None:
-        inferred_document_layout = process_file_with_model(
-            filename,
-            is_image=is_image,
-            model_name=hi_res_model_name,
-            pdf_image_dpi=pdf_image_dpi,
-            password=password,
-        )
+        inferred_document_layout = _run_layout_inference(process_file_with_model, filename)
 
         pdfminer_config = _enable_detect_vertical_if_rotated(
             inferred_document_layout,
@@ -883,13 +896,7 @@ def _partition_pdf_or_image_local(
             table_ocr_agent=table_ocr_agent,
         )
     else:
-        inferred_document_layout = process_data_with_model(
-            file,
-            is_image=is_image,
-            model_name=hi_res_model_name,
-            pdf_image_dpi=pdf_image_dpi,
-            password=password,
-        )
+        inferred_document_layout = _run_layout_inference(process_data_with_model, file)
 
         if hasattr(file, "seek"):
             file.seek(0)
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -15,8 +15,10 @@
 import pdf2image
 from PIL import Image
 from unstructured_inference.inference.layout import convert_pdf_to_image as render_pdf_to_image
+from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError
 
 from unstructured.documents.elements import ElementType
+from unstructured.errors import UnprocessableEntityError
 from unstructured.logger import logger
 from unstructured.partition.common.common import convert_to_bytes, exactly_one
 from unstructured.partition.utils.config import env_config
@@ -66,14 +68,18 @@ def convert_pdf_to_image(
     if dpi is None:
         dpi = env_config.PDF_RENDER_DPI
 
-    return render_pdf_to_image(
-        filename=filename,
-        file=file,
-        dpi=dpi,
-        output_folder=output_folder,
-        path_only=path_only,
-        password=password,
-    )
+    try:
+        return render_pdf_to_image(
+            filename=filename,
+            file=file,
+            dpi=dpi,
+            output_folder=output_folder,
+            path_only=path_only,
+            password=password,
+            pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
+        )
+    except PdfRenderTooLargeError as exc:
+        raise UnprocessableEntityError(str(exc)) from exc
 
 
 def pad_element_bboxes(
@@ -405,14 +411,18 @@ def convert_pdf_to_images(
     total_pages = info["Pages"]
     for start_page in range(1, total_pages + 1, chunk_size):
         end_page = min(start_page + chunk_size - 1, total_pages)
-        chunk_images = render_pdf_to_image(
-            filename=filename if f_bytes is None else None,
-            file=f_bytes,
-            dpi=env_config.PDF_RENDER_DPI,
-            first_page=start_page,
-            last_page=end_page,
-            password=password,
-        )
+        try:
+            chunk_images = render_pdf_to_image(
+                filename=filename if f_bytes is None else None,
+                file=f_bytes,
+                dpi=env_config.PDF_RENDER_DPI,
+                first_page=start_page,
+                last_page=end_page,
+                password=password,
+                pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
+            )
+        except PdfRenderTooLargeError as exc:
+            raise UnprocessableEntityError(str(exc)) from exc
         chunk_images = cast(List[Image.Image], chunk_images)
 
         for image in chunk_images:
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -315,5 +315,10 @@ def PDF_RENDER_DPI(self) -> int:
         """The DPI to use for rendering PDF pages"""
         return self._get_int("PDF_RENDER_DPI", 350)
 
+    @property
+    def PDF_RENDER_MAX_PIXELS_PER_PAGE(self) -> int:
+        """Maximum rendered pixels allowed for a single PDF page"""
+        return self._get_int("PDF_RENDER_MAX_PIXELS_PER_PAGE", 1_000_000_000)
+
 
 env_config = ENVConfig()
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.22.23" # pragma: no cover`
	`1`	`+__version__ = "0.22.24" # pragma: no cover`