Skip to content

Commit b909cf4

Browse files
authored
Reject oversized PDF renders before bitmap allocation (#4345)
## Summary

- pass the configured PDF render pixel limit into `unstructured-inference`
- map oversized PDF render failures to `UnprocessableEntityError`
- add coverage for filename/file-object paths and render-limit error handling

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Changes hi-res PDF inference/rendering behavior by enforcing a per-page pixel cap and remapping upstream render failures to `UnprocessableEntityError`, which could alter error surfaces for some PDFs. Dependency bump to `unstructured-inference` may also change model/render behavior across platforms.
>
> **Overview**
> **Prevents OOM/unsafe rendering for hi-res PDFs** by passing a new `PDF_RENDER_MAX_PIXELS_PER_PAGE` limit through to `unstructured-inference` during hi-res partitioning and PDF-to-image conversion.
>
> When `unstructured-inference` raises `PdfRenderTooLargeError`, the code now converts it into an `UnprocessableEntityError` so oversized pages fail fast with a consistent “unprocessable document” error instead of attempting bitmap allocation.
>
> Bumps the `unstructured-inference` dependency (and lockfile resolutions) and adds tests covering both filename/file-object paths and the new oversized-render error handling.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 8bae7e0. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>

<!-- /CURSOR_SUMMARY -->
1 parent 879e126 commit b909cf4

9 files changed

Lines changed: 236 additions & 144 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.22.24
2+
3+
### Fixes
4+
5+
- **Reject oversized hi-res PDF renders before bitmap allocation**: Hi-res PDF partitioning now passes the configured per-page pixel limit to `unstructured-inference`, so oversized pages are rejected before any bitmap is allocated and are reported as `UnprocessableEntityError` (unprocessable document) errors.
6+
17
## 0.22.23
28

39
### Fixes

pyproject.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,8 @@ image = [
6868
"pi-heif>=1.2.0, <2.0.0",
6969
"pikepdf>=10.3.0, <11.0.0",
7070
"pypdf>=6.6.2, <7.0.0",
71-
"unstructured-inference>=1.6.6, <2.0.0; platform_system != 'Windows' and python_version >= '3.12'",
72-
"unstructured-inference>=1.2.0, <2.0.0; platform_system != 'Windows' and python_version < '3.12'",
73-
"unstructured-inference>=1.6.6, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
71+
"unstructured-inference>=1.6.10, <2.0.0; platform_system != 'Windows'",
72+
"unstructured-inference>=1.6.10, <2.0.0; platform_system == 'Windows' and python_version >= '3.12' and python_version < '3.13'",
7473
"unstructured-pytesseract>=0.3.15, <1.0.0",
7574
]
7675
md = [

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pdf2image.exceptions import PDFPageCountError
1717
from PIL import Image
1818
from pytest_mock import MockFixture
19-
from unstructured_inference.inference import layout
19+
from unstructured_inference.inference import layout, pdf_image
2020
from unstructured_inference.inference.elements import Rectangle
2121
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
2222
from unstructured_inference.inference.layoutelement import LayoutElement
@@ -36,7 +36,7 @@
3636
Text,
3737
Title,
3838
)
39-
from unstructured.errors import PageCountExceededError
39+
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
4040
from unstructured.partition import pdf, strategies
4141
from unstructured.partition.pdf_image import ocr, pdfminer_processing
4242
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
@@ -300,6 +300,59 @@ def test_partition_pdf_passes_configured_dpi_to_inference(
300300
assert mock_process.call_args[1]["pdf_image_dpi"] == 350
301301

302302

303+
def test_partition_pdf_passes_render_max_pixels_to_inference(monkeypatch):
304+
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
305+
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
306+
307+
with (
308+
mock.patch.object(
309+
layout,
310+
"process_file_with_model",
311+
return_value=MockDocumentLayout(),
312+
) as mock_process,
313+
mock.patch.object(
314+
ocr,
315+
"process_file_with_ocr",
316+
return_value=MockDocumentLayout(),
317+
),
318+
):
319+
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
320+
321+
assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000
322+
323+
with (
324+
open(filename, "rb") as file,
325+
mock.patch.object(
326+
layout,
327+
"process_data_with_model",
328+
return_value=MockDocumentLayout(),
329+
) as mock_process,
330+
mock.patch.object(
331+
ocr,
332+
"process_data_with_ocr",
333+
return_value=MockDocumentLayout(),
334+
),
335+
):
336+
pdf.partition_pdf(file=file, strategy=PartitionStrategy.HI_RES)
337+
338+
assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000
339+
340+
341+
def test_partition_pdf_render_too_large_error_is_unprocessable(monkeypatch):
342+
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
343+
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
344+
with mock.patch.object(
345+
layout,
346+
"process_file_with_model",
347+
side_effect=pdf_image.PdfRenderTooLargeError(
348+
"PDF page would render to too many pixels for safe processing: "
349+
"page=1, pixels=1000000001, maximum=1000000000.",
350+
),
351+
):
352+
with pytest.raises(UnprocessableEntityError, match="too many pixels"):
353+
pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
354+
355+
303356
@pytest.mark.parametrize("model_name", ["checkbox", "yolox"])
304357
def test_partition_pdf_with_model_name(
305358
monkeypatch,

test_unstructured/partition/pdf_image/test_pdf_image_utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77
import numpy as np
88
import pytest
99
from PIL import Image as PILImg
10+
from unstructured_inference.inference import pdf_image
1011

1112
from test_unstructured.unit_utils import example_doc_path
1213
from unstructured.documents.coordinates import PixelSpace
1314
from unstructured.documents.elements import ElementMetadata, ElementType, Image, Table
15+
from unstructured.errors import UnprocessableEntityError
1416
from unstructured.partition.pdf_image import pdf_image_utils
1517

1618

@@ -62,6 +64,29 @@ def test_convert_pdf_to_image(file_mode, path_only):
6264
assert isinstance(images[0], PILImg.Image)
6365

6466

67+
def test_convert_pdf_to_image_raises_unprocessable_when_render_too_large():
68+
with patch.object(
69+
pdf_image_utils,
70+
"render_pdf_to_image",
71+
side_effect=pdf_image.PdfRenderTooLargeError("too many pixels"),
72+
):
73+
with pytest.raises(UnprocessableEntityError, match="too many pixels"):
74+
pdf_image_utils.convert_pdf_to_image(filename="example.pdf")
75+
76+
77+
def test_convert_pdf_to_images_raises_unprocessable_when_render_too_large():
78+
with (
79+
patch.object(pdf_image_utils.pdf2image, "pdfinfo_from_path", return_value={"Pages": 1}),
80+
patch.object(
81+
pdf_image_utils,
82+
"render_pdf_to_image",
83+
side_effect=pdf_image.PdfRenderTooLargeError("too many pixels"),
84+
),
85+
):
86+
with pytest.raises(UnprocessableEntityError, match="too many pixels"):
87+
list(pdf_image_utils.convert_pdf_to_images(filename="example.pdf"))
88+
89+
6590
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
6691
@pytest.mark.parametrize("path_only", [True, False])
6792
def test_convert_pdf_to_image_twice(file_mode, path_only):

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.22.23" # pragma: no cover
1+
__version__ = "0.22.24" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
Text,
3939
Title,
4040
)
41-
from unstructured.errors import PageCountExceededError
41+
from unstructured.errors import PageCountExceededError, UnprocessableEntityError
4242
from unstructured.file_utils.model import FileType
4343
from unstructured.logger import logger, trace_logger
4444
from unstructured.nlp.patterns import PARAGRAPH_PATTERN
@@ -781,6 +781,7 @@ def _partition_pdf_or_image_local(
781781
process_data_with_model,
782782
process_file_with_model,
783783
)
784+
from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError
784785

785786
from unstructured.partition.pdf_image.analysis.layout_dump import (
786787
ExtractedLayoutDumper,
@@ -802,30 +803,42 @@ def _partition_pdf_or_image_local(
802803
process_file_with_pdfminer,
803804
)
804805

806+
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
807+
if pdf_image_dpi is None:
808+
pdf_image_dpi = env_config.PDF_RENDER_DPI
809+
model_render_kwargs = (
810+
{"pdf_render_max_pixels_per_page": env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE}
811+
if not is_image
812+
else {}
813+
)
814+
805815
if not is_image:
806816
check_pdf_hi_res_max_pages_exceeded(
807817
filename=filename, file=file, pdf_hi_res_max_pages=pdf_hi_res_max_pages
808818
)
809819

810-
hi_res_model_name = hi_res_model_name or model_name or default_hi_res_model()
811-
if pdf_image_dpi is None:
812-
pdf_image_dpi = env_config.PDF_RENDER_DPI
813-
814820
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
815821
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
816822
ocr_layout_dumper: Optional[OCRLayoutDumper] = None
817823
final_layout_dumper: Optional[FinalLayoutDumper] = None
818824

819825
skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP
820826

827+
def _run_layout_inference(processor, source):
828+
try:
829+
return processor(
830+
source,
831+
is_image=is_image,
832+
model_name=hi_res_model_name,
833+
pdf_image_dpi=pdf_image_dpi,
834+
password=password,
835+
**model_render_kwargs,
836+
)
837+
except PdfRenderTooLargeError as exc:
838+
raise UnprocessableEntityError(str(exc)) from exc
839+
821840
if file is None:
822-
inferred_document_layout = process_file_with_model(
823-
filename,
824-
is_image=is_image,
825-
model_name=hi_res_model_name,
826-
pdf_image_dpi=pdf_image_dpi,
827-
password=password,
828-
)
841+
inferred_document_layout = _run_layout_inference(process_file_with_model, filename)
829842

830843
pdfminer_config = _enable_detect_vertical_if_rotated(
831844
inferred_document_layout,
@@ -883,13 +896,7 @@ def _partition_pdf_or_image_local(
883896
table_ocr_agent=table_ocr_agent,
884897
)
885898
else:
886-
inferred_document_layout = process_data_with_model(
887-
file,
888-
is_image=is_image,
889-
model_name=hi_res_model_name,
890-
pdf_image_dpi=pdf_image_dpi,
891-
password=password,
892-
)
899+
inferred_document_layout = _run_layout_inference(process_data_with_model, file)
893900

894901
if hasattr(file, "seek"):
895902
file.seek(0)

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
import pdf2image
1616
from PIL import Image
1717
from unstructured_inference.inference.layout import convert_pdf_to_image as render_pdf_to_image
18+
from unstructured_inference.inference.pdf_image import PdfRenderTooLargeError
1819

1920
from unstructured.documents.elements import ElementType
21+
from unstructured.errors import UnprocessableEntityError
2022
from unstructured.logger import logger
2123
from unstructured.partition.common.common import convert_to_bytes, exactly_one
2224
from unstructured.partition.utils.config import env_config
@@ -66,14 +68,18 @@ def convert_pdf_to_image(
6668
if dpi is None:
6769
dpi = env_config.PDF_RENDER_DPI
6870

69-
return render_pdf_to_image(
70-
filename=filename,
71-
file=file,
72-
dpi=dpi,
73-
output_folder=output_folder,
74-
path_only=path_only,
75-
password=password,
76-
)
71+
try:
72+
return render_pdf_to_image(
73+
filename=filename,
74+
file=file,
75+
dpi=dpi,
76+
output_folder=output_folder,
77+
path_only=path_only,
78+
password=password,
79+
pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
80+
)
81+
except PdfRenderTooLargeError as exc:
82+
raise UnprocessableEntityError(str(exc)) from exc
7783

7884

7985
def pad_element_bboxes(
@@ -405,14 +411,18 @@ def convert_pdf_to_images(
405411
total_pages = info["Pages"]
406412
for start_page in range(1, total_pages + 1, chunk_size):
407413
end_page = min(start_page + chunk_size - 1, total_pages)
408-
chunk_images = render_pdf_to_image(
409-
filename=filename if f_bytes is None else None,
410-
file=f_bytes,
411-
dpi=env_config.PDF_RENDER_DPI,
412-
first_page=start_page,
413-
last_page=end_page,
414-
password=password,
415-
)
414+
try:
415+
chunk_images = render_pdf_to_image(
416+
filename=filename if f_bytes is None else None,
417+
file=f_bytes,
418+
dpi=env_config.PDF_RENDER_DPI,
419+
first_page=start_page,
420+
last_page=end_page,
421+
password=password,
422+
pdf_render_max_pixels_per_page=env_config.PDF_RENDER_MAX_PIXELS_PER_PAGE,
423+
)
424+
except PdfRenderTooLargeError as exc:
425+
raise UnprocessableEntityError(str(exc)) from exc
416426
chunk_images = cast(List[Image.Image], chunk_images)
417427

418428
for image in chunk_images:

unstructured/partition/utils/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,5 +315,10 @@ def PDF_RENDER_DPI(self) -> int:
315315
"""The DPI to use for rendering PDF pages"""
316316
return self._get_int("PDF_RENDER_DPI", 350)
317317

318+
@property
319+
def PDF_RENDER_MAX_PIXELS_PER_PAGE(self) -> int:
320+
"""Maximum rendered pixels allowed for a single PDF page"""
321+
return self._get_int("PDF_RENDER_MAX_PIXELS_PER_PAGE", 1_000_000_000)
322+
318323

319324
env_config = ENVConfig()

0 commit comments

Comments
 (0)