-
Notifications
You must be signed in to change notification settings - Fork 20
Expand file tree
/
Copy pathtest_pdf_utils.py
More file actions
66 lines (52 loc) · 1.83 KB
/
test_pdf_utils.py
File metadata and controls
66 lines (52 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from __future__ import annotations
import io
import pytest
from pypdf import PdfReader
from unstructured_client._hooks.custom.pdf_utils import check_pdf, read_pdf, PDFValidationError
from _test_unstructured_client.unit_utils import sample_docs_path
def _open_pdf(pdf_path: str) -> PdfReader:
    """Read the file at ``pdf_path`` into memory and wrap it in a ``PdfReader``.

    The reader is built on an in-memory ``BytesIO`` copy of the file's bytes,
    so it does not depend on the file handle staying open.
    """
    with open(pdf_path, "rb") as pdf_file:
        buffer = io.BytesIO(pdf_file.read())
    return PdfReader(buffer)
def test_check_pdf_with_valid_pdf():
    """``check_pdf`` returns a ``PdfReader`` for the ``list-item-example-1.pdf`` sample."""
    reader = _open_pdf(sample_docs_path("list-item-example-1.pdf"))
    assert isinstance(check_pdf(reader), PdfReader)
# TODO(klaijan) - add pdf file when file is ready
@pytest.mark.parametrize(
    "pdf_name, expected_error_message",
    [
        (
            "failing-encrypted.pdf",
            "File is encrypted. Please decrypt it with password.",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_raises_pdf_validation_error(
    pdf_name: str, expected_error_message: str
):
    """Each invalid sample PDF makes ``check_pdf`` raise ``PDFValidationError``.

    The ``message`` attribute of the raised error must equal the expected
    text for that sample exactly.
    """
    broken_pdf = _open_pdf(sample_docs_path(pdf_name))

    with pytest.raises(PDFValidationError) as raised:
        check_pdf(broken_pdf)

    # Compare the message only after the ``raises`` block has captured the
    # exception; a statement placed after the raising call inside the block
    # would never execute.
    assert raised.value.message == expected_error_message
# TODO(klaijan) - remove the skip marker once the sample file is ready and the
# placeholder ".pdf" file name below has been replaced with the real one.
@pytest.mark.skip(reason="sample PDF for the read_pdf test is not ready yet")
def test_check_read_pdf():
    """Raw PDF bytes passed through ``read_pdf`` should be accepted by ``check_pdf``.

    Kept as a real, skipped test (rather than source text inside a string
    literal) so it stays syntax-checked and shows up in pytest's skip report
    until the sample file exists.
    """
    pdf_path = sample_docs_path(".pdf")
    with open(pdf_path, "rb") as f:
        pdf_content = f.read()
    pdf = read_pdf(pdf_content)
    result = check_pdf(pdf)
    assert isinstance(result, PdfReader)