|
16 | 16 | from pdf2image.exceptions import PDFPageCountError |
17 | 17 | from PIL import Image |
18 | 18 | from pytest_mock import MockFixture |
19 | | -from unstructured_inference.inference import layout |
| 19 | +from unstructured_inference.inference import layout, pdf_image |
20 | 20 | from unstructured_inference.inference.elements import Rectangle |
21 | 21 | from unstructured_inference.inference.layout import DocumentLayout, PageLayout |
22 | 22 | from unstructured_inference.inference.layoutelement import LayoutElement |
|
36 | 36 | Text, |
37 | 37 | Title, |
38 | 38 | ) |
39 | | -from unstructured.errors import PageCountExceededError |
| 39 | +from unstructured.errors import PageCountExceededError, UnprocessableEntityError |
40 | 40 | from unstructured.partition import pdf, strategies |
41 | 41 | from unstructured.partition.pdf_image import ocr, pdfminer_processing |
42 | 42 | from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots |
@@ -300,6 +300,59 @@ def test_partition_pdf_passes_configured_dpi_to_inference( |
300 | 300 | assert mock_process.call_args[1]["pdf_image_dpi"] == 350 |
301 | 301 |
|
302 | 302 |
|
| 303 | +def test_partition_pdf_passes_render_max_pixels_to_inference(monkeypatch): |
| 304 | + filename = example_doc_path("pdf/layout-parser-paper-fast.pdf") |
| 305 | + monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) |
| 306 | + |
| 307 | + with ( |
| 308 | + mock.patch.object( |
| 309 | + layout, |
| 310 | + "process_file_with_model", |
| 311 | + return_value=MockDocumentLayout(), |
| 312 | + ) as mock_process, |
| 313 | + mock.patch.object( |
| 314 | + ocr, |
| 315 | + "process_file_with_ocr", |
| 316 | + return_value=MockDocumentLayout(), |
| 317 | + ), |
| 318 | + ): |
| 319 | + pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES) |
| 320 | + |
| 321 | + assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000 |
| 322 | + |
| 323 | + with ( |
| 324 | + open(filename, "rb") as file, |
| 325 | + mock.patch.object( |
| 326 | + layout, |
| 327 | + "process_data_with_model", |
| 328 | + return_value=MockDocumentLayout(), |
| 329 | + ) as mock_process, |
| 330 | + mock.patch.object( |
| 331 | + ocr, |
| 332 | + "process_data_with_ocr", |
| 333 | + return_value=MockDocumentLayout(), |
| 334 | + ), |
| 335 | + ): |
| 336 | + pdf.partition_pdf(file=file, strategy=PartitionStrategy.HI_RES) |
| 337 | + |
| 338 | + assert mock_process.call_args[1]["pdf_render_max_pixels_per_page"] == 1_000_000_000 |
| 339 | + |
| 340 | + |
| 341 | +def test_partition_pdf_render_too_large_error_is_unprocessable(monkeypatch): |
| 342 | + filename = example_doc_path("pdf/layout-parser-paper-fast.pdf") |
| 343 | + monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) |
| 344 | + with mock.patch.object( |
| 345 | + layout, |
| 346 | + "process_file_with_model", |
| 347 | + side_effect=pdf_image.PdfRenderTooLargeError( |
| 348 | + "PDF page would render to too many pixels for safe processing: " |
| 349 | + "page=1, pixels=1000000001, maximum=1000000000.", |
| 350 | + ), |
| 351 | + ): |
| 352 | + with pytest.raises(UnprocessableEntityError, match="too many pixels"): |
| 353 | + pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES) |
| 354 | + |
| 355 | + |
303 | 356 | @pytest.mark.parametrize("model_name", ["checkbox", "yolox"]) |
304 | 357 | def test_partition_pdf_with_model_name( |
305 | 358 | monkeypatch, |
|
0 commit comments