|
50 | 50 | ocr_data_to_elements, |
51 | 51 | spooled_to_bytes_io_if_needed, |
52 | 52 | ) |
53 | | -from unstructured.partition.common.lang import ( |
54 | | - check_language_args, |
55 | | - prepare_languages_for_tesseract, |
56 | | -) |
57 | | -from unstructured.partition.common.metadata import ( |
58 | | - apply_metadata, |
59 | | - get_last_modified_date, |
60 | | -) |
| 53 | +from unstructured.partition.common.lang import check_language_args, prepare_languages_for_tesseract |
| 54 | +from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date |
61 | 55 | from unstructured.partition.pdf_image.pdfminer_processing import ( |
62 | 56 | check_annotations_within_element, |
63 | 57 | get_uris, |
|
70 | 64 | open_pdfminer_pages_generator, |
71 | 65 | rect_to_bbox, |
72 | 66 | ) |
73 | | -from unstructured.partition.strategies import ( |
74 | | - determine_pdf_or_image_strategy, |
75 | | - validate_strategy, |
76 | | -) |
| 67 | +from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy |
77 | 68 | from unstructured.partition.text import element_from_text |
78 | 69 | from unstructured.partition.utils.config import env_config |
79 | 70 | from unstructured.partition.utils.constants import ( |
|
84 | 75 | OCRMode, |
85 | 76 | PartitionStrategy, |
86 | 77 | ) |
87 | | -from unstructured.partition.utils.sorting import ( |
88 | | - coord_has_valid_points, |
89 | | - sort_page_elements, |
90 | | -) |
| 78 | +from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements |
91 | 79 | from unstructured.patches.pdfminer import patch_psparser |
92 | 80 | from unstructured.utils import first, requires_dependencies |
93 | 81 |
|
|
116 | 104 | re.MULTILINE, |
117 | 105 | ) |
118 | 106 |
|
119 | | - |
120 | 107 | # increase the max pixels so high dpi values like 300 can still be under the PIL limit |
121 | 108 | PILImage.MAX_IMAGE_PIXELS = 5e8 |
122 | 109 |
|
@@ -590,12 +577,26 @@ def _get_pdf_page_number( |
590 | 577 | return number_of_pages |
591 | 578 |
|
592 | 579 |
|
| 580 | +def check_pdf_hi_res_max_pages_exceeded( |
| 581 | + filename: str = "", |
| 582 | + file: Optional[bytes | IO[bytes]] = None, |
| 583 | + pdf_hi_res_max_pages: int = None, |
| 584 | +) -> None: |
| 585 | + """Checks whether PDF exceeds pdf_hi_res_max_pages limit.""" |
| 586 | + if pdf_hi_res_max_pages: |
| 587 | + document_pages = _get_pdf_page_number(filename=filename, file=file) |
| 588 | + if document_pages > pdf_hi_res_max_pages: |
| 589 | + raise PageCountExceededError( |
| 590 | + document_pages=document_pages, pdf_hi_res_max_pages=pdf_hi_res_max_pages |
| 591 | + ) |
| 592 | + |
| 593 | + |
593 | 594 | def is_pdf_too_complex( |
594 | 595 | filename: str = "", |
595 | 596 | file: Optional[bytes | IO[bytes]] = None, |
596 | 597 | max_graphics_ops: int = 10_000, |
597 | 598 | min_graphics_to_text_ratio: float = 20.0, |
598 | | - min_file_size_bytes: int = 1 * 1024 * 1024, # 1 MB |
| 599 | + min_file_size_bytes: int = int(1 * 1024 * 1024), # 1 MB |
599 | 600 | min_raw_stream_bytes: int = 100_000, |
600 | 601 | ) -> bool: |
601 | 602 | """Check if a PDF is likely a complex vector drawing (e.g., CAD/engineering docs) |
@@ -729,20 +730,6 @@ def is_pdf_too_complex( |
729 | 730 | return False |
730 | 731 |
|
731 | 732 |
|
732 | | -def check_pdf_hi_res_max_pages_exceeded( |
733 | | - filename: str = "", |
734 | | - file: Optional[bytes | IO[bytes]] = None, |
735 | | - pdf_hi_res_max_pages: int = None, |
736 | | -) -> None: |
737 | | - """Checks whether PDF exceeds pdf_hi_res_max_pages limit.""" |
738 | | - if pdf_hi_res_max_pages: |
739 | | - document_pages = _get_pdf_page_number(filename=filename, file=file) |
740 | | - if document_pages > pdf_hi_res_max_pages: |
741 | | - raise PageCountExceededError( |
742 | | - document_pages=document_pages, pdf_hi_res_max_pages=pdf_hi_res_max_pages |
743 | | - ) |
744 | | - |
745 | | - |
746 | 733 | @requires_dependencies("unstructured_inference") |
747 | 734 | def _partition_pdf_or_image_local( |
748 | 735 | filename: str = "", |
@@ -789,10 +776,7 @@ def _partition_pdf_or_image_local( |
789 | 776 | ) |
790 | 777 | from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts |
791 | 778 | from unstructured.partition.pdf_image.form_extraction import run_form_extraction |
792 | | - from unstructured.partition.pdf_image.ocr import ( |
793 | | - process_data_with_ocr, |
794 | | - process_file_with_ocr, |
795 | | - ) |
| 779 | + from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr |
796 | 780 | from unstructured.partition.pdf_image.pdf_image_utils import ( |
797 | 781 | check_element_types_to_extract, |
798 | 782 | save_elements, |
@@ -893,10 +877,7 @@ def _partition_pdf_or_image_local( |
893 | 877 |
|
894 | 878 | extracted_layout, layouts_links = ( |
895 | 879 | process_data_with_pdfminer( |
896 | | - file=file, |
897 | | - dpi=pdf_image_dpi, |
898 | | - password=password, |
899 | | - pdfminer_config=pdfminer_config, |
| 880 | + file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config |
900 | 881 | ) |
901 | 882 | if pdf_text_extractable |
902 | 883 | else ([], []) |
@@ -1117,8 +1098,7 @@ def _partition_pdf_or_image_with_ocr( |
1117 | 1098 | elements.extend(page_elements) |
1118 | 1099 | else: |
1119 | 1100 | for page_number, image in enumerate( |
1120 | | - convert_pdf_to_images(filename, file, password=password), |
1121 | | - start=starting_page_number, |
| 1101 | + convert_pdf_to_images(filename, file, password=password), start=starting_page_number |
1122 | 1102 | ): |
1123 | 1103 | page_elements = _partition_pdf_or_image_with_ocr_from_image( |
1124 | 1104 | image=image, |
@@ -1365,9 +1345,7 @@ def document_to_element_list( |
1365 | 1345 | **kwargs: Any, |
1366 | 1346 | ) -> list[Element]: |
1367 | 1347 | """Converts a DocumentLayout object to a list of unstructured elements.""" |
1368 | | - from unstructured.partition.pdf_image.pdfminer_processing import ( |
1369 | | - get_links_in_element, |
1370 | | - ) |
| 1348 | + from unstructured.partition.pdf_image.pdfminer_processing import get_links_in_element |
1371 | 1349 |
|
1372 | 1350 | elements: list[Element] = [] |
1373 | 1351 |
|
|
0 commit comments