-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathpdf_utils.py
More file actions
75 lines (56 loc) · 2.13 KB
/
pdf_utils.py
File metadata and controls
75 lines (56 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from __future__ import annotations
import io
import logging
from typing import cast, Optional, BinaryIO, Union
from pypdf import PdfReader
from pypdf.errors import FileNotDecryptedError, PdfReadError
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client._hooks.custom.validation_errors import FileValidationError
# Module logger, registered under the client's shared logger name.
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
# Loading PDFs with strict=False can make pypdf dump a lot of warnings.
# We don't need to display these, so the "pypdf" logger only lets
# ERROR and above through.
pdf_logger = logging.getLogger("pypdf")
pdf_logger.setLevel(logging.ERROR)
class PDFValidationError(FileValidationError):
    """Exception for PDF validation errors.

    Thin specialisation of ``FileValidationError`` that always passes
    ``file_type="PDF"`` to the base class.
    """

    def __init__(self, message: str):
        """Create the error.

        Args:
            message: Description of why the PDF failed validation.
        """
        super().__init__(message, file_type="PDF")
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Reads the given PDF file.

    Args:
        pdf_file: The PDF file to be read, either as a binary stream or as
            the raw bytes of the document.

    Returns:
        The PdfReader object if the file is a PDF, None otherwise.
    """
    # Raw bytes are wrapped in an in-memory buffer so the reader always gets
    # a stream. The isinstance check already narrows the type, so no cast is
    # needed, and the caller-visible parameter is left unmodified.
    stream: BinaryIO = io.BytesIO(pdf_file) if isinstance(pdf_file, bytes) else pdf_file

    try:
        # strict=False: parse leniently (pypdf warnings are silenced at
        # module level via the "pypdf" logger).
        return PdfReader(stream, strict=False)
    except (PdfReadError, UnicodeDecodeError):
        # A parse failure means "not a PDF" for callers; report it as None
        # instead of propagating the error.
        return None
def check_pdf(pdf: PdfReader) -> PdfReader:
    """Validate that an already-opened PDF can actually be used.

    Check if PDF is:
    - Encrypted
    - Has corrupted pages
    - Has corrupted root object

    Args:
        pdf: The reader to validate.

    Returns:
        The same ``pdf`` object, unchanged, when every check passes.

    Raises:
        PDFValidationError: If the file is encrypted or corrupted.
    """
    try:
        # This will raise if the file is encrypted
        pdf.metadata  # pylint: disable=pointless-statement

        # This will raise if the file's root object is corrupted
        pdf.root_object  # pylint: disable=pointless-statement

        # This will raise if the file's pages are corrupted.
        # The list() call forces every page to be resolved up front.
        list(pdf.pages)
    except FileNotDecryptedError as e:
        # Must stay ahead of the PdfReadError handler: in pypdf the
        # "not decrypted" error derives from PdfReadError, so the broader
        # clause would otherwise swallow it and report the wrong message.
        raise PDFValidationError(
            "File is encrypted. Please decrypt it with password.",
        ) from e
    except PdfReadError as e:
        raise PDFValidationError(
            f"File does not appear to be a valid PDF. Error: {e}",
        ) from e

    return pdf