Skip to content

Commit e50241b

Browse files
committed
linting maybe
1 parent 4a50420 commit e50241b

1 file changed

Lines changed: 23 additions & 45 deletions

File tree

unstructured/partition/pdf.py

Lines changed: 23 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -50,14 +50,8 @@
5050
ocr_data_to_elements,
5151
spooled_to_bytes_io_if_needed,
5252
)
53-
from unstructured.partition.common.lang import (
54-
check_language_args,
55-
prepare_languages_for_tesseract,
56-
)
57-
from unstructured.partition.common.metadata import (
58-
apply_metadata,
59-
get_last_modified_date,
60-
)
53+
from unstructured.partition.common.lang import check_language_args, prepare_languages_for_tesseract
54+
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
6155
from unstructured.partition.pdf_image.pdfminer_processing import (
6256
check_annotations_within_element,
6357
get_uris,
@@ -70,10 +64,7 @@
7064
open_pdfminer_pages_generator,
7165
rect_to_bbox,
7266
)
73-
from unstructured.partition.strategies import (
74-
determine_pdf_or_image_strategy,
75-
validate_strategy,
76-
)
67+
from unstructured.partition.strategies import determine_pdf_or_image_strategy, validate_strategy
7768
from unstructured.partition.text import element_from_text
7869
from unstructured.partition.utils.config import env_config
7970
from unstructured.partition.utils.constants import (
@@ -84,10 +75,7 @@
8475
OCRMode,
8576
PartitionStrategy,
8677
)
87-
from unstructured.partition.utils.sorting import (
88-
coord_has_valid_points,
89-
sort_page_elements,
90-
)
78+
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
9179
from unstructured.patches.pdfminer import patch_psparser
9280
from unstructured.utils import first, requires_dependencies
9381

@@ -116,7 +104,6 @@
116104
re.MULTILINE,
117105
)
118106

119-
120107
# increase the max pixels so high dpi values like 300 can still be under the PIL limit
121108
PILImage.MAX_IMAGE_PIXELS = 5e8
122109

@@ -590,12 +577,26 @@ def _get_pdf_page_number(
590577
return number_of_pages
591578

592579

580+
def check_pdf_hi_res_max_pages_exceeded(
581+
filename: str = "",
582+
file: Optional[bytes | IO[bytes]] = None,
583+
pdf_hi_res_max_pages: int = None,
584+
) -> None:
585+
"""Checks whether PDF exceeds pdf_hi_res_max_pages limit."""
586+
if pdf_hi_res_max_pages:
587+
document_pages = _get_pdf_page_number(filename=filename, file=file)
588+
if document_pages > pdf_hi_res_max_pages:
589+
raise PageCountExceededError(
590+
document_pages=document_pages, pdf_hi_res_max_pages=pdf_hi_res_max_pages
591+
)
592+
593+
593594
def is_pdf_too_complex(
594595
filename: str = "",
595596
file: Optional[bytes | IO[bytes]] = None,
596597
max_graphics_ops: int = 10_000,
597598
min_graphics_to_text_ratio: float = 20.0,
598-
min_file_size_bytes: int = 1 * 1024 * 1024, # 1 MB
599+
min_file_size_bytes: int = int(1 * 1024 * 1024), # 1 MB
599600
min_raw_stream_bytes: int = 100_000,
600601
) -> bool:
601602
"""Check if a PDF is likely a complex vector drawing (e.g., CAD/engineering docs)
@@ -729,20 +730,6 @@ def is_pdf_too_complex(
729730
return False
730731

731732

732-
def check_pdf_hi_res_max_pages_exceeded(
733-
filename: str = "",
734-
file: Optional[bytes | IO[bytes]] = None,
735-
pdf_hi_res_max_pages: int = None,
736-
) -> None:
737-
"""Checks whether PDF exceeds pdf_hi_res_max_pages limit."""
738-
if pdf_hi_res_max_pages:
739-
document_pages = _get_pdf_page_number(filename=filename, file=file)
740-
if document_pages > pdf_hi_res_max_pages:
741-
raise PageCountExceededError(
742-
document_pages=document_pages, pdf_hi_res_max_pages=pdf_hi_res_max_pages
743-
)
744-
745-
746733
@requires_dependencies("unstructured_inference")
747734
def _partition_pdf_or_image_local(
748735
filename: str = "",
@@ -789,10 +776,7 @@ def _partition_pdf_or_image_local(
789776
)
790777
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
791778
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
792-
from unstructured.partition.pdf_image.ocr import (
793-
process_data_with_ocr,
794-
process_file_with_ocr,
795-
)
779+
from unstructured.partition.pdf_image.ocr import process_data_with_ocr, process_file_with_ocr
796780
from unstructured.partition.pdf_image.pdf_image_utils import (
797781
check_element_types_to_extract,
798782
save_elements,
@@ -893,10 +877,7 @@ def _partition_pdf_or_image_local(
893877

894878
extracted_layout, layouts_links = (
895879
process_data_with_pdfminer(
896-
file=file,
897-
dpi=pdf_image_dpi,
898-
password=password,
899-
pdfminer_config=pdfminer_config,
880+
file=file, dpi=pdf_image_dpi, password=password, pdfminer_config=pdfminer_config
900881
)
901882
if pdf_text_extractable
902883
else ([], [])
@@ -1117,8 +1098,7 @@ def _partition_pdf_or_image_with_ocr(
11171098
elements.extend(page_elements)
11181099
else:
11191100
for page_number, image in enumerate(
1120-
convert_pdf_to_images(filename, file, password=password),
1121-
start=starting_page_number,
1101+
convert_pdf_to_images(filename, file, password=password), start=starting_page_number
11221102
):
11231103
page_elements = _partition_pdf_or_image_with_ocr_from_image(
11241104
image=image,
@@ -1365,9 +1345,7 @@ def document_to_element_list(
13651345
**kwargs: Any,
13661346
) -> list[Element]:
13671347
"""Converts a DocumentLayout object to a list of unstructured elements."""
1368-
from unstructured.partition.pdf_image.pdfminer_processing import (
1369-
get_links_in_element,
1370-
)
1348+
from unstructured.partition.pdf_image.pdfminer_processing import get_links_in_element
13711349

13721350
elements: list[Element] = []
13731351

0 commit comments

Comments
 (0)