Skip to content

Commit befefda

Browse files
committed
pr comments
1 parent e50241b commit befefda

2 files changed

Lines changed: 77 additions & 5 deletions

File tree

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import base64
4+
import io
45
import logging
56
import math
67
import os
@@ -1542,6 +1543,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
15421543
)
15431544

15441545

1546+
def test_is_pdf_too_complex_skips_small_file_size():
    """A 4-byte payload checked against a 10-byte minimum is not reported as too complex."""
    flagged = pdf.is_pdf_too_complex(file=b"tiny", min_file_size_bytes=10)

    assert not flagged
1548+
1549+
1550+
def test_is_pdf_too_complex_detects_vector_heavy_page():
    """A single page whose stream holds 120 `m` tokens and 2 `Tj` tokens is flagged."""
    # Space-separated operator tokens: 120 x "m" followed by 2 x "Tj".
    operator_tokens = [b"m"] * 120 + [b"Tj"] * 2

    class MockStream:
        # Minimal stand-in for a page content stream; only `get_data` is provided.
        def get_data(self):
            return b" ".join(operator_tokens)

    fake_reader = mock.Mock()
    fake_reader.pages = [{"/Contents": MockStream()}]

    # Size floors are set to 1 so neither the 20-byte file nor the stream is skipped.
    thresholds = {
        "max_graphics_ops": 100,
        "min_graphics_to_text_ratio": 20.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 1,
    }

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        flagged = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)
        assert flagged
1566+
1567+
1568+
def test_is_pdf_too_complex_skips_pages_without_contents():
    """A page whose `/Contents` entry is `None` does not cause the PDF to be flagged."""
    empty_page = {"/Contents": None}
    fake_reader = mock.Mock()
    fake_reader.pages = [empty_page]

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        # Both size floors are 1 so the 20-byte input is not skipped on size alone.
        flagged = pdf.is_pdf_too_complex(
            file=b"x" * 20,
            min_file_size_bytes=1,
            min_raw_stream_bytes=1,
        )
        assert not flagged
1578+
1579+
1580+
def test_is_pdf_too_complex_skips_small_content_streams():
    """A 4-byte content stream is not flagged when `min_raw_stream_bytes` is 20."""

    class MockStream:
        # Minimal stand-in for a page content stream; only `get_data` is provided.
        def get_data(self):
            return b"m Tj"

    fake_reader = mock.Mock()
    fake_reader.pages = [{"/Contents": MockStream()}]

    # Operator thresholds are set as low as 1; only the 20-byte stream floor
    # exceeds the 4-byte stream returned above.
    thresholds = {
        "max_graphics_ops": 1,
        "min_graphics_to_text_ratio": 1.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 20,
    }

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        flagged = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)
        assert not flagged
1596+
1597+
1598+
def test_is_pdf_too_complex_restores_file_cursor_position():
    """The file object's cursor is back at its starting offset after the check."""
    start_offset = 7
    stream = io.BytesIO(b"x" * 20)
    stream.seek(start_offset)

    # A reader with no pages.
    fake_reader = mock.Mock()
    fake_reader.pages = []

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        flagged = pdf.is_pdf_too_complex(
            file=stream,
            min_file_size_bytes=1,
            min_raw_stream_bytes=1,
        )
        assert not flagged

    assert stream.tell() == start_offset
1613+
1614+
15451615
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
15461616
# TODO (yao): investigate why we need this test. The LayoutElement definition suggests bbox
15471617
# can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.

unstructured/partition/pdf.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import re
88
import warnings
99
from pathlib import Path
10-
from typing import IO, TYPE_CHECKING, Any, Optional, cast
10+
from typing import IO, TYPE_CHECKING, Any, Optional, Union, cast
1111

1212
import numpy as np
1313
import wrapt
@@ -103,6 +103,8 @@
103103
rb"(?:^|(?<=\s))" rb"(?:Tj|TJ|'|\"|Tf|Td|TD|Tm|T\*|BT|ET)" rb"(?=\s|$)",
104104
re.MULTILINE,
105105
)
106+
DEFAULT_MIN_FILE_SIZE_BYTES = 1 * 1024 * 1024 # 1 MB
107+
DEFAULT_MIN_RAW_STREAM_BYTES = 100_000 # 100 KB
106108

107109
# increase the max pixels so high dpi values like 300 can still be under the PIL limit
108110
PILImage.MAX_IMAGE_PIXELS = 5e8
@@ -593,11 +595,11 @@ def check_pdf_hi_res_max_pages_exceeded(
593595

594596
def is_pdf_too_complex(
595597
filename: str = "",
596-
file: Optional[bytes | IO[bytes]] = None,
598+
file: Optional[Union[bytes, IO[bytes]]] = None,
597599
max_graphics_ops: int = 10_000,
598600
min_graphics_to_text_ratio: float = 20.0,
599-
min_file_size_bytes: int = int(1 * 1024 * 1024), # 1 MB
600-
min_raw_stream_bytes: int = 100_000,
601+
min_file_size_bytes: int = DEFAULT_MIN_FILE_SIZE_BYTES,
602+
min_raw_stream_bytes: int = DEFAULT_MIN_RAW_STREAM_BYTES,
601603
) -> bool:
602604
"""Check if a PDF is likely a complex vector drawing (e.g., CAD/engineering docs)
603605
that would be extremely slow or produce garbage results with PDFMiner text extraction.
@@ -624,7 +626,7 @@ def is_pdf_too_complex(
624626
Minimum ratio of graphics ops to text ops required (in conjunction with
625627
`max_graphics_ops`) to flag a page as too complex.
626628
min_file_size_bytes
627-
Skip the complexity check entirely for files smaller than this (default 2 MB).
629+
Skip the complexity check entirely for files smaller than this (default 1 MB).
628630
min_raw_stream_bytes
629631
Skip operator counting for pages whose decoded content stream is smaller than
630632
this (default 100 KB). Small streams can't have enough operators to trigger

0 commit comments

Comments
 (0)