-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathpdf_utils.py
More file actions
119 lines (95 loc) · 3.54 KB
/
pdf_utils.py
File metadata and controls
119 lines (95 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from __future__ import annotations
import io
import logging
from typing import cast, Optional, BinaryIO, Union
from pypdf import PdfReader
from pypdf.errors import FileNotDecryptedError, PdfReadError
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client._hooks.custom.validation_errors import FileValidationError
# Client logger, looked up by the name constant imported from the custom hooks' common module.
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
# Loading PDFs with strict=False can make pypdf emit a lot of warnings.
# We don't need to display these, so only ERROR and above pass the "pypdf" logger.
# NOTE: this threshold also drops debug/info/warning records sent through pdf_logger itself.
pdf_logger = logging.getLogger("pypdf")
pdf_logger.setLevel(logging.ERROR)
class PDFValidationError(FileValidationError):
    """Raised when a PDF file fails validation.

    Specializes FileValidationError by always passing ``file_type="PDF"``.
    """

    def __init__(self, message: str) -> None:
        """Create the error with ``message`` and the file type fixed to "PDF"."""
        super().__init__(message, file_type="PDF")
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Read a PDF, falling back to raw byte extraction if direct parsing fails.

    The input is first handed to ``read_pdf_raw``. If that yields no reader,
    the raw bytes are searched for the ``%PDF-`` header and the ``%%EOF``
    trailer, and only that span is parsed, dropping any bytes that precede
    the header or follow the trailer.

    Args:
        pdf_file: The PDF content, either as bytes or as a file-like object
            supporting ``seek()`` and ``read()``.

    Returns:
        A validated PdfReader, or None if no readable PDF could be extracted.

    Raises:
        IOError: If ``pdf_file`` is neither bytes nor an object with a
            ``read()`` method, or if seeking/reading the stream fails.
    """
    reader = read_pdf_raw(pdf_file=pdf_file)
    if reader is not None:
        return reader

    # TODO(klaijan) - remove once debugged
    # NOTE(review): pdf_logger is set to ERROR in this module, so this debug
    # record is dropped unless the "pypdf" logger level is lowered elsewhere.
    pdf_logger.debug("Primary PdfReader parse failed, attempting multipart and raw extraction fallbacks.")

    raw = _read_all_bytes(pdf_file)

    # Best-effort fallback: any failure here (including a non-bytes payload
    # returned by read()) is logged and reported as "no PDF found".
    try:
        sliced = _slice_embedded_pdf(raw)
        if sliced is not None:
            pdf = PdfReader(io.BytesIO(sliced), strict=False)
            return check_pdf(pdf)
    except Exception as e:
        pdf_logger.debug("%%PDF- slicing fallback failed: %s", e)

    return None


def _read_all_bytes(pdf_file: Union[BinaryIO, bytes]) -> bytes:
    """Return the full content of ``pdf_file``, rewinding a stream to its start first."""
    if isinstance(pdf_file, bytes):
        return pdf_file
    if hasattr(pdf_file, "read"):
        try:
            pdf_file.seek(0)
            return pdf_file.read()
        except Exception as e:
            raise IOError(f"Failed to read file stream: {e}") from e
    raise IOError("Expected bytes or a file-like object with 'read()' method")


def _slice_embedded_pdf(raw: bytes) -> Optional[bytes]:
    """Return the span from the first ``%PDF-`` to the last ``%%EOF``, or None without a header."""
    header = b"%PDF-"
    trailer = b"%%EOF"

    start = raw.find(header)
    if start == -1:
        return None

    # Search from the header onwards and take the LAST marker: a PDF may
    # contain several %%EOF markers, and stopping at the first would truncate it.
    eof = raw.rfind(trailer, start)
    if eof == -1:
        # No trailer at all: keep everything after the header rather than
        # computing a bogus end offset from find()'s -1 sentinel.
        return raw[start:]
    return raw[start:eof + len(trailer)]
def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Parse ``pdf_file`` with pypdf (non-strict) and validate the result.

    Args:
        pdf_file: The PDF content, as raw bytes or a binary stream.

    Returns:
        The validated PdfReader, or None if parsing or validation raised
        any exception (the failure is logged at debug level).
    """
    try:
        # Wrap in-memory bytes in a stream before handing them to PdfReader.
        stream = io.BytesIO(pdf_file) if isinstance(pdf_file, bytes) else pdf_file
        return check_pdf(PdfReader(stream, strict=False))
    except (PdfReadError, UnicodeDecodeError) as exc:
        pdf_logger.debug("Read pdf failed: %s", exc)
    except PDFValidationError as exc:
        pdf_logger.debug("Check pdf failed: %s", exc)
    except Exception as exc:
        pdf_logger.debug("An unexpected error occurred: %s", exc)
    # Every handled failure falls through to the same "not a readable PDF" result.
    return None
def check_pdf(pdf: PdfReader) -> PdfReader:
    """Validate that ``pdf`` can be read and return it unchanged.

    Accesses the metadata, the root object and every page so that problems
    surface here rather than later.

    Args:
        pdf: The reader to validate.

    Returns:
        The same ``pdf`` object when all checks pass.

    Raises:
        PDFValidationError: If the file is encrypted, or if pypdf reports a
            read error while the document is being accessed.
    """
    try:
        # Raises if the file is encrypted.
        _ = pdf.metadata
        # Raises if the file's root object is corrupted.
        _ = pdf.root_object
        # Raises if any of the file's pages are corrupted.
        _ = list(pdf.pages)
    except FileNotDecryptedError as e:
        raise PDFValidationError(
            "File is encrypted. Please decrypt it with password.",
        ) from e
    except PdfReadError as e:
        raise PDFValidationError(
            f"File does not appear to be a valid PDF. Error: {e}",
        ) from e
    return pdf