Skip to content

Commit 7a6ccc6

Browse files
committed
add a check for too complex
1 parent 031b0cf commit 7a6ccc6

1 file changed

Lines changed: 208 additions & 31 deletions

File tree

unstructured/partition/pdf.py

Lines changed: 208 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pi_heif import register_heif_opener
1717
from PIL import Image as PILImage
1818
from pypdf import PdfReader
19+
from pypdf.generic import ArrayObject, IndirectObject
1920

2021
from unstructured.chunking import add_chunking_strategy
2122
from unstructured.cleaners.core import (
@@ -53,7 +54,10 @@
5354
check_language_args,
5455
prepare_languages_for_tesseract,
5556
)
56-
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
57+
from unstructured.partition.common.metadata import (
58+
apply_metadata,
59+
get_last_modified_date,
60+
)
5761
from unstructured.partition.pdf_image.pdfminer_processing import (
5862
check_annotations_within_element,
5963
get_uris,
@@ -66,7 +70,10 @@
6670
open_pdfminer_pages_generator,
6771
rect_to_bbox,
6872
)
69-
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
73+
from unstructured.partition.strategies import (
74+
determine_pdf_or_image_strategy,
75+
validate_strategy,
76+
)
7077
from unstructured.partition.text import element_from_text
7178
from unstructured.partition.utils.config import env_config
7279
from unstructured.partition.utils.constants import (
@@ -77,7 +84,10 @@
7784
OCRMode,
7885
PartitionStrategy,
7986
)
80-
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
87+
from unstructured.partition.utils.sorting import (
88+
coord_has_valid_points,
89+
sort_page_elements,
90+
)
8191
from unstructured.patches.pdfminer import patch_psparser
8292
from unstructured.utils import first, requires_dependencies
8393

@@ -282,25 +292,34 @@ def partition_pdf_or_image(
282292
line_overlap=pdfminer_line_overlap,
283293
word_margin=pdfminer_word_margin,
284294
)
285-
extracted_elements = []
295+
296+
extracted_elements: list[list[Element]] = []
286297
pdf_text_extractable = False
298+
287299
if not is_image:
288300
try:
289-
extracted_elements = extractable_elements(
290-
filename=filename,
291-
file=spooled_to_bytes_io_if_needed(file),
292-
languages=languages,
293-
metadata_last_modified=metadata_last_modified or last_modified,
294-
starting_page_number=starting_page_number,
295-
password=password,
296-
pdfminer_config=pdfminer_config,
297-
**kwargs,
298-
)
299-
pdf_text_extractable = any(
300-
isinstance(el, Text) and el.text.strip()
301-
for page_elements in extracted_elements
302-
for el in page_elements
303-
)
301+
if is_pdf_too_complex(filename=filename, file=file):
302+
logger.info(
303+
"PDF is too complex for text extraction based on heuristic checks. "
304+
"Falling back to hi_res strategy without text extraction."
305+
)
306+
307+
else:
308+
extracted_elements = extractable_elements(
309+
filename=filename,
310+
file=spooled_to_bytes_io_if_needed(file),
311+
languages=languages,
312+
metadata_last_modified=metadata_last_modified or last_modified,
313+
starting_page_number=starting_page_number,
314+
password=password,
315+
pdfminer_config=pdfminer_config,
316+
**kwargs,
317+
)
318+
pdf_text_extractable = any(
319+
isinstance(el, Text) and el.text.strip()
320+
for page_elements in extracted_elements
321+
for el in page_elements
322+
)
304323
except Exception as e:
305324
logger.debug(e)
306325
logger.info("PDF text extraction failed, skip text extraction...")
@@ -318,15 +337,15 @@ def partition_pdf_or_image(
318337
file.seek(0)
319338

320339
if languages is None:
321-
print("Warning: No languages specified, defaulting to English.")
340+
logger.warning("No languages specified, defaulting to English.")
322341
languages = ["eng"]
323342
ocr_languages = prepare_languages_for_tesseract(languages)
324343

325344
if strategy == PartitionStrategy.HI_RES:
326345
# NOTE(robinson): Catches a UserWarning that occurs when detection is called
327346
with warnings.catch_warnings():
328347
warnings.simplefilter("ignore")
329-
elements = _partition_pdf_or_image_local(
348+
return _partition_pdf_or_image_local(
330349
filename=filename,
331350
file=spooled_to_bytes_io_if_needed(file),
332351
is_image=is_image,
@@ -353,17 +372,14 @@ def partition_pdf_or_image(
353372
# NOTE(crag): do not call _process_uncategorized_text_elements here, because
354373
# extracted elements (which are text blocks outside of OD-determined blocks)
355374
# are likely not Titles and should not be identified as such.
356-
return elements
357375

358376
elif strategy == PartitionStrategy.FAST:
359-
out_elements = _partition_pdf_with_pdfparser(
377+
return _partition_pdf_with_pdfparser(
360378
extracted_elements=extracted_elements,
361379
include_page_breaks=include_page_breaks,
362380
**kwargs,
363381
)
364382

365-
return out_elements
366-
367383
elif strategy == PartitionStrategy.OCR_ONLY:
368384
# NOTE(robinson): Catches file conversion warnings when running with PDFs
369385
with warnings.catch_warnings():
@@ -379,9 +395,9 @@ def partition_pdf_or_image(
379395
password=password,
380396
**kwargs,
381397
)
382-
out_elements = _process_uncategorized_text_elements(elements)
398+
return _process_uncategorized_text_elements(elements)
383399

384-
return out_elements
400+
raise ValueError(f"Unsupported partitioning strategy: {strategy}")
385401

386402

387403
def extractable_elements(
@@ -575,6 +591,158 @@ def check_pdf_hi_res_max_pages_exceeded(
575591
)
576592

577593

594+
def is_pdf_too_complex(
595+
filename: str = "",
596+
file: Optional[bytes | IO[bytes]] = None,
597+
max_graphics_ops: int = 10_000,
598+
min_graphics_to_text_ratio: float = 20.0,
599+
min_file_size_bytes: int = 1 * 1024 * 1024, # 1 MB
600+
min_raw_stream_bytes: int = 100_000,
601+
) -> bool:
602+
"""Check if a PDF is likely a complex vector drawing (e.g., CAD/engineering docs)
603+
that would be extremely slow or produce garbage results with PDFMiner text extraction.
604+
605+
Try to minimize overhead with early exits:
606+
1. Avoid overhead by skipping files smaller than min_file_size_bytes.
607+
2. For each page, decode the raw content stream bytes. Skip pages where the
608+
decoded stream is smaller than min_raw_stream_bytes.
609+
3. For large streams, regex to count graphics without parsing the stream.
610+
611+
A page is flagged as too complex when it has a high number of graphics operators
612+
AND a high ratio of graphics-to-text operators.
613+
614+
Parameters
615+
----------
616+
filename
617+
Path to a PDF file.
618+
file
619+
A file-like object or bytes.
620+
max_graphics_ops
621+
If any page exceeds this many graphics operators AND the graphics-to-text ratio
622+
exceeds `min_graphics_to_text_ratio`, the PDF is considered too complex.
623+
min_graphics_to_text_ratio
624+
Minimum ratio of graphics ops to text ops required (in conjunction with
625+
`max_graphics_ops`) to flag a page as too complex.
626+
min_file_size_bytes
627+
Skip the complexity check entirely for files smaller than this (default 2 MB).
628+
min_raw_stream_bytes
629+
Skip operator counting for pages whose decoded content stream is smaller than
630+
this (default 100 KB). Small streams can't have enough operators to trigger
631+
the threshold.
632+
"""
633+
634+
# Regex patterns for counting graphics and text operators in PDF content streams.
635+
GRAPHICS_OPS_PATTERN = re.compile(
636+
rb"(?:^|(?<=\s))"
637+
rb"(?:m|l|c|v|y|h|re|S|s|f|F|f\*|B|B\*|b|b\*|n|W|W\*|cm|q|Q|Do|"
638+
rb"g|G|rg|RG|k|K|cs|CS|w|J|j|M|d|i|gs)"
639+
rb"(?=\s|$)",
640+
re.MULTILINE,
641+
)
642+
TEXT_OPS_PATTERN = re.compile(
643+
rb"(?:^|(?<=\s))" rb"(?:Tj|TJ|'|\"|Tf|Td|TD|Tm|T\*|BT|ET)" rb"(?=\s|$)",
644+
re.MULTILINE,
645+
)
646+
647+
original_pos: Optional[int] = None
648+
649+
try:
650+
# Preserve file cursor position for file-like inputs
651+
if file is not None and not isinstance(file, bytes) and hasattr(file, "tell"):
652+
original_pos = file.tell()
653+
654+
# Skip for small files
655+
if file is not None:
656+
if isinstance(file, bytes):
657+
file_size = len(file)
658+
else:
659+
file.seek(0, 2)
660+
file_size = file.tell()
661+
file.seek(original_pos or 0)
662+
elif filename:
663+
file_size = os.path.getsize(filename)
664+
else:
665+
return False
666+
667+
if file_size < min_file_size_bytes:
668+
return False
669+
670+
# Build reader
671+
if file is not None:
672+
if isinstance(file, bytes):
673+
reader = PdfReader(io.BytesIO(file))
674+
else:
675+
file.seek(0)
676+
reader = PdfReader(file)
677+
else:
678+
reader = PdfReader(filename)
679+
680+
if not reader.pages:
681+
return False
682+
683+
for page_index, page in enumerate(reader.pages):
684+
contents = page.get("/Contents")
685+
if contents is None:
686+
continue
687+
688+
# Decode raw stream bytes (cheap relative to full ContentStream parsing)
689+
raw_data = b""
690+
try:
691+
if isinstance(contents, ArrayObject):
692+
for item in contents:
693+
obj = item.get_object() if isinstance(item, IndirectObject) else item
694+
if hasattr(obj, "get_data"):
695+
raw_data += obj.get_data()
696+
else:
697+
obj = (
698+
contents.get_object() if isinstance(contents, IndirectObject) else contents
699+
)
700+
if hasattr(obj, "get_data"):
701+
raw_data = obj.get_data()
702+
except Exception:
703+
continue
704+
705+
# Skip pages with small content streams
706+
if len(raw_data) < min_raw_stream_bytes:
707+
continue
708+
709+
# Regex count graphics and text operators without fully parsing the stream
710+
num_graphics_ops = len(GRAPHICS_OPS_PATTERN.findall(raw_data))
711+
712+
# Early exit: if graphics ops don't even reach threshold, skip text counting
713+
if num_graphics_ops <= max_graphics_ops:
714+
continue
715+
716+
num_text_ops = len(TEXT_OPS_PATTERN.findall(raw_data))
717+
ratio = num_graphics_ops / max(num_text_ops, 1)
718+
719+
if ratio > min_graphics_to_text_ratio:
720+
logger.info(
721+
f"Page {page_index + 1} has {num_graphics_ops} graphics ops, "
722+
f"{num_text_ops} text ops (ratio: {ratio:.1f}). "
723+
f"Exceeds thresholds (ops: {max_graphics_ops}, "
724+
f"ratio: {min_graphics_to_text_ratio}). "
725+
"Flagging PDF as too complex for text extraction."
726+
)
727+
return True
728+
729+
except Exception as e:
730+
logger.debug(f"is_pdf_too_complex check failed: {e}")
731+
return False
732+
733+
finally:
734+
# Restore original cursor position for file-like inputs
735+
if (
736+
file is not None
737+
and not isinstance(file, bytes)
738+
and hasattr(file, "seek")
739+
and original_pos is not None
740+
):
741+
file.seek(original_pos)
742+
743+
return False
744+
745+
578746
@requires_dependencies("unstructured_inference")
579747
def _partition_pdf_or_image_local(
580748
filename: str = "",
@@ -621,7 +789,10 @@ def _partition_pdf_or_image_local(
621789
)
622790
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
623791
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
624-
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
792+
from unstructured.partition.pdf_image.ocr import (
793+
process_data_with_ocr,
794+
process_file_with_ocr,
795+
)
625796
from unstructured.partition.pdf_image.pdf_image_utils import (
626797
check_element_types_to_extract,
627798
save_elements,
@@ -722,7 +893,10 @@ def _partition_pdf_or_image_local(
722893

723894
extracted_layout, layouts_links = (
724895
process_data_with_pdfminer(
725-
file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
896+
file=file,
897+
dpi=pdf_image_dpi,
898+
password=password,
899+
pdfminer_config=pdfminer_config,
726900
)
727901
if pdf_text_extractable
728902
else ([], [])
@@ -943,7 +1117,8 @@ def _partition_pdf_or_image_with_ocr(
9431117
elements.extend(page_elements)
9441118
else:
9451119
for page_number, image in enumerate(
946-
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
1120+
convert_pdf_to_images(filename, file, password=password),
1121+
start=starting_page_number,
9471122
):
9481123
page_elements = _partition_pdf_or_image_with_ocr_from_image(
9491124
image=image,
@@ -1190,7 +1365,9 @@ def document_to_element_list(
11901365
**kwargs: Any,
11911366
) -> list[Element]:
11921367
"""Converts a DocumentLayout object to a list of unstructured elements."""
1193-
from unstructured.partition.pdf_image.pdfminer_processing import get_links_in_element
1368+
from unstructured.partition.pdf_image.pdfminer_processing import (
1369+
get_links_in_element,
1370+
)
11941371

11951372
elements: list[Element] = []
11961373

0 commit comments

Comments
 (0)