1616from pi_heif import register_heif_opener
1717from PIL import Image as PILImage
1818from pypdf import PdfReader
19+ from pypdf .generic import ArrayObject , IndirectObject
1920
2021from unstructured .chunking import add_chunking_strategy
2122from unstructured .cleaners .core import (
5354 check_language_args ,
5455 prepare_languages_for_tesseract ,
5556)
56- from unstructured .partition .common .metadata import apply_metadata , get_last_modified_date
57+ from unstructured .partition .common .metadata import (
58+ apply_metadata ,
59+ get_last_modified_date ,
60+ )
5761from unstructured .partition .pdf_image .pdfminer_processing import (
5862 check_annotations_within_element ,
5963 get_uris ,
6670 open_pdfminer_pages_generator ,
6771 rect_to_bbox ,
6872)
69- from unstructured .partition .strategies import determine_pdf_or_image_strategy , validate_strategy
73+ from unstructured .partition .strategies import (
74+ determine_pdf_or_image_strategy ,
75+ validate_strategy ,
76+ )
7077from unstructured .partition .text import element_from_text
7178from unstructured .partition .utils .config import env_config
7279from unstructured .partition .utils .constants import (
7784 OCRMode ,
7885 PartitionStrategy ,
7986)
80- from unstructured .partition .utils .sorting import coord_has_valid_points , sort_page_elements
87+ from unstructured .partition .utils .sorting import (
88+ coord_has_valid_points ,
89+ sort_page_elements ,
90+ )
8191from unstructured .patches .pdfminer import patch_psparser
8292from unstructured .utils import first , requires_dependencies
8393
@@ -282,25 +292,34 @@ def partition_pdf_or_image(
282292 line_overlap = pdfminer_line_overlap ,
283293 word_margin = pdfminer_word_margin ,
284294 )
285- extracted_elements = []
295+
296+ extracted_elements : list [list [Element ]] = []
286297 pdf_text_extractable = False
298+
287299 if not is_image :
288300 try :
289- extracted_elements = extractable_elements (
290- filename = filename ,
291- file = spooled_to_bytes_io_if_needed (file ),
292- languages = languages ,
293- metadata_last_modified = metadata_last_modified or last_modified ,
294- starting_page_number = starting_page_number ,
295- password = password ,
296- pdfminer_config = pdfminer_config ,
297- ** kwargs ,
298- )
299- pdf_text_extractable = any (
300- isinstance (el , Text ) and el .text .strip ()
301- for page_elements in extracted_elements
302- for el in page_elements
303- )
301+ if is_pdf_too_complex (filename = filename , file = file ):
302+ logger .info (
303+ "PDF is too complex for text extraction based on heuristic checks. "
304+ "Falling back to hi_res strategy without text extraction."
305+ )
306+
307+ else :
308+ extracted_elements = extractable_elements (
309+ filename = filename ,
310+ file = spooled_to_bytes_io_if_needed (file ),
311+ languages = languages ,
312+ metadata_last_modified = metadata_last_modified or last_modified ,
313+ starting_page_number = starting_page_number ,
314+ password = password ,
315+ pdfminer_config = pdfminer_config ,
316+ ** kwargs ,
317+ )
318+ pdf_text_extractable = any (
319+ isinstance (el , Text ) and el .text .strip ()
320+ for page_elements in extracted_elements
321+ for el in page_elements
322+ )
304323 except Exception as e :
305324 logger .debug (e )
306325 logger .info ("PDF text extraction failed, skip text extraction..." )
@@ -318,15 +337,15 @@ def partition_pdf_or_image(
318337 file .seek (0 )
319338
320339 if languages is None :
321- print ( "Warning: No languages specified, defaulting to English." )
340+ logger . warning ( " No languages specified, defaulting to English." )
322341 languages = ["eng" ]
323342 ocr_languages = prepare_languages_for_tesseract (languages )
324343
325344 if strategy == PartitionStrategy .HI_RES :
326345 # NOTE(robinson): Catches a UserWarning that occurs when detection is called
327346 with warnings .catch_warnings ():
328347 warnings .simplefilter ("ignore" )
329- elements = _partition_pdf_or_image_local (
348+ return _partition_pdf_or_image_local (
330349 filename = filename ,
331350 file = spooled_to_bytes_io_if_needed (file ),
332351 is_image = is_image ,
@@ -353,17 +372,14 @@ def partition_pdf_or_image(
353372 # NOTE(crag): do not call _process_uncategorized_text_elements here, because
354373 # extracted elements (which are text blocks outside of OD-determined blocks)
355374 # are likely not Titles and should not be identified as such.
356- return elements
357375
358376 elif strategy == PartitionStrategy .FAST :
359- out_elements = _partition_pdf_with_pdfparser (
377+ return _partition_pdf_with_pdfparser (
360378 extracted_elements = extracted_elements ,
361379 include_page_breaks = include_page_breaks ,
362380 ** kwargs ,
363381 )
364382
365- return out_elements
366-
367383 elif strategy == PartitionStrategy .OCR_ONLY :
368384 # NOTE(robinson): Catches file conversion warnings when running with PDFs
369385 with warnings .catch_warnings ():
@@ -379,9 +395,9 @@ def partition_pdf_or_image(
379395 password = password ,
380396 ** kwargs ,
381397 )
382- out_elements = _process_uncategorized_text_elements (elements )
398+ return _process_uncategorized_text_elements (elements )
383399
384- return out_elements
400+ raise ValueError ( f"Unsupported partitioning strategy: { strategy } " )
385401
386402
387403def extractable_elements (
@@ -575,6 +591,158 @@ def check_pdf_hi_res_max_pages_exceeded(
575591 )
576592
577593
594+ def is_pdf_too_complex (
595+ filename : str = "" ,
596+ file : Optional [bytes | IO [bytes ]] = None ,
597+ max_graphics_ops : int = 10_000 ,
598+ min_graphics_to_text_ratio : float = 20.0 ,
599+ min_file_size_bytes : int = 1 * 1024 * 1024 , # 1 MB
600+ min_raw_stream_bytes : int = 100_000 ,
601+ ) -> bool :
602+ """Check if a PDF is likely a complex vector drawing (e.g., CAD/engineering docs)
603+ that would be extremely slow or produce garbage results with PDFMiner text extraction.
604+
605+ Try to minimize overhead with early exits:
606+ 1. Avoid overhead by skipping files smaller than min_file_size_bytes.
607+ 2. For each page, decode the raw content stream bytes. Skip pages where the
608+ decoded stream is smaller than min_raw_stream_bytes.
609+ 3. For large streams, regex to count graphics without parsing the stream.
610+
611+ A page is flagged as too complex when it has a high number of graphics operators
612+ AND a high ratio of graphics-to-text operators.
613+
614+ Parameters
615+ ----------
616+ filename
617+ Path to a PDF file.
618+ file
619+ A file-like object or bytes.
620+ max_graphics_ops
621+ If any page exceeds this many graphics operators AND the graphics-to-text ratio
622+ exceeds `min_graphics_to_text_ratio`, the PDF is considered too complex.
623+ min_graphics_to_text_ratio
624+ Minimum ratio of graphics ops to text ops required (in conjunction with
625+ `max_graphics_ops`) to flag a page as too complex.
626+ min_file_size_bytes
627+ Skip the complexity check entirely for files smaller than this (default 2 MB).
628+ min_raw_stream_bytes
629+ Skip operator counting for pages whose decoded content stream is smaller than
630+ this (default 100 KB). Small streams can't have enough operators to trigger
631+ the threshold.
632+ """
633+
634+ # Regex patterns for counting graphics and text operators in PDF content streams.
635+ GRAPHICS_OPS_PATTERN = re .compile (
636+ rb"(?:^|(?<=\s))"
637+ rb"(?:m|l|c|v|y|h|re|S|s|f|F|f\*|B|B\*|b|b\*|n|W|W\*|cm|q|Q|Do|"
638+ rb"g|G|rg|RG|k|K|cs|CS|w|J|j|M|d|i|gs)"
639+ rb"(?=\s|$)" ,
640+ re .MULTILINE ,
641+ )
642+ TEXT_OPS_PATTERN = re .compile (
643+ rb"(?:^|(?<=\s))" rb"(?:Tj|TJ|'|\"|Tf|Td|TD|Tm|T\*|BT|ET)" rb"(?=\s|$)" ,
644+ re .MULTILINE ,
645+ )
646+
647+ original_pos : Optional [int ] = None
648+
649+ try :
650+ # Preserve file cursor position for file-like inputs
651+ if file is not None and not isinstance (file , bytes ) and hasattr (file , "tell" ):
652+ original_pos = file .tell ()
653+
654+ # Skip for small files
655+ if file is not None :
656+ if isinstance (file , bytes ):
657+ file_size = len (file )
658+ else :
659+ file .seek (0 , 2 )
660+ file_size = file .tell ()
661+ file .seek (original_pos or 0 )
662+ elif filename :
663+ file_size = os .path .getsize (filename )
664+ else :
665+ return False
666+
667+ if file_size < min_file_size_bytes :
668+ return False
669+
670+ # Build reader
671+ if file is not None :
672+ if isinstance (file , bytes ):
673+ reader = PdfReader (io .BytesIO (file ))
674+ else :
675+ file .seek (0 )
676+ reader = PdfReader (file )
677+ else :
678+ reader = PdfReader (filename )
679+
680+ if not reader .pages :
681+ return False
682+
683+ for page_index , page in enumerate (reader .pages ):
684+ contents = page .get ("/Contents" )
685+ if contents is None :
686+ continue
687+
688+ # Decode raw stream bytes (cheap relative to full ContentStream parsing)
689+ raw_data = b""
690+ try :
691+ if isinstance (contents , ArrayObject ):
692+ for item in contents :
693+ obj = item .get_object () if isinstance (item , IndirectObject ) else item
694+ if hasattr (obj , "get_data" ):
695+ raw_data += obj .get_data ()
696+ else :
697+ obj = (
698+ contents .get_object () if isinstance (contents , IndirectObject ) else contents
699+ )
700+ if hasattr (obj , "get_data" ):
701+ raw_data = obj .get_data ()
702+ except Exception :
703+ continue
704+
705+ # Skip pages with small content streams
706+ if len (raw_data ) < min_raw_stream_bytes :
707+ continue
708+
709+ # Regex count graphics and text operators without fully parsing the stream
710+ num_graphics_ops = len (GRAPHICS_OPS_PATTERN .findall (raw_data ))
711+
712+ # Early exit: if graphics ops don't even reach threshold, skip text counting
713+ if num_graphics_ops <= max_graphics_ops :
714+ continue
715+
716+ num_text_ops = len (TEXT_OPS_PATTERN .findall (raw_data ))
717+ ratio = num_graphics_ops / max (num_text_ops , 1 )
718+
719+ if ratio > min_graphics_to_text_ratio :
720+ logger .info (
721+ f"Page { page_index + 1 } has { num_graphics_ops } graphics ops, "
722+ f"{ num_text_ops } text ops (ratio: { ratio :.1f} ). "
723+ f"Exceeds thresholds (ops: { max_graphics_ops } , "
724+ f"ratio: { min_graphics_to_text_ratio } ). "
725+ "Flagging PDF as too complex for text extraction."
726+ )
727+ return True
728+
729+ except Exception as e :
730+ logger .debug (f"is_pdf_too_complex check failed: { e } " )
731+ return False
732+
733+ finally :
734+ # Restore original cursor position for file-like inputs
735+ if (
736+ file is not None
737+ and not isinstance (file , bytes )
738+ and hasattr (file , "seek" )
739+ and original_pos is not None
740+ ):
741+ file .seek (original_pos )
742+
743+ return False
744+
745+
578746@requires_dependencies ("unstructured_inference" )
579747def _partition_pdf_or_image_local (
580748 filename : str = "" ,
@@ -621,7 +789,10 @@ def _partition_pdf_or_image_local(
621789 )
622790 from unstructured .partition .pdf_image .analysis .tools import save_analysis_artifiacts
623791 from unstructured .partition .pdf_image .form_extraction import run_form_extraction
624- from unstructured .partition .pdf_image .ocr import process_data_with_ocr , process_file_with_ocr
792+ from unstructured .partition .pdf_image .ocr import (
793+ process_data_with_ocr ,
794+ process_file_with_ocr ,
795+ )
625796 from unstructured .partition .pdf_image .pdf_image_utils import (
626797 check_element_types_to_extract ,
627798 save_elements ,
@@ -722,7 +893,10 @@ def _partition_pdf_or_image_local(
722893
723894 extracted_layout , layouts_links = (
724895 process_data_with_pdfminer (
725- file = file , dpi = pdf_image_dpi , password = password , pdfminer_config = pdfminer_config
896+ file = file ,
897+ dpi = pdf_image_dpi ,
898+ password = password ,
899+ pdfminer_config = pdfminer_config ,
726900 )
727901 if pdf_text_extractable
728902 else ([], [])
@@ -943,7 +1117,8 @@ def _partition_pdf_or_image_with_ocr(
9431117 elements .extend (page_elements )
9441118 else :
9451119 for page_number , image in enumerate (
946- convert_pdf_to_images (filename , file , password = password ), start = starting_page_number
1120+ convert_pdf_to_images (filename , file , password = password ),
1121+ start = starting_page_number ,
9471122 ):
9481123 page_elements = _partition_pdf_or_image_with_ocr_from_image (
9491124 image = image ,
@@ -1190,7 +1365,9 @@ def document_to_element_list(
11901365 ** kwargs : Any ,
11911366) -> list [Element ]:
11921367 """Converts a DocumentLayout object to a list of unstructured elements."""
1193- from unstructured .partition .pdf_image .pdfminer_processing import get_links_in_element
1368+ from unstructured .partition .pdf_image .pdfminer_processing import (
1369+ get_links_in_element ,
1370+ )
11941371
11951372 elements : list [Element ] = []
11961373
0 commit comments