Skip to content

Commit f6fcba4

Browse files
authored
Add a check for complex pdfs (#4268)
This checks whether a PDF file is likely a complex document (such as mini-holistic-3-v1-Eng_Civil-Structural-Drawing_p001.pdf) that consists mostly of vector graphics, by comparing the ratio of vector graphics to text elements. To limit the overhead added to every file, the check only runs on files above a minimum file size.
1 parent 4da154b commit f6fcba4

4 files changed

Lines changed: 264 additions & 31 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## 0.21.12
2+
- **Add check for complex documents**: Adds a check for complex documents with a high ratio of vector objects so that pdfminer can be avoided for them
3+
14
## 0.21.11
25

36
### Enhancements
@@ -14,7 +17,6 @@
1417
### Enhancements
1518
- **Optimize PDF render mode patching performance**: Optimized `_patch_current_chars_with_render_mode` in `CustomPDFPageInterpreter` to avoid O(N²) re-scanning by tracking the last-patched index, so each `do_TJ`/`do_Tj` call only processes newly-added characters.
1619

17-
1820
## 0.21.7
1921

2022
### Enhancements

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import base64
4+
import io
45
import logging
56
import math
67
import os
@@ -1542,6 +1543,79 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
15421543
)
15431544

15441545

1546+
def test_is_pdf_too_complex_skips_small_file_size():
    """Input smaller than `min_file_size_bytes` is not reported as too complex."""
    # -- 4 bytes of input against a 10-byte minimum file size --
    too_complex = pdf.is_pdf_too_complex(file=b"tiny", min_file_size_bytes=10)

    assert not too_complex
1548+
1549+
1550+
def test_is_pdf_too_complex_detects_vector_heavy_page():
    """A content stream with far more `m` tokens than `Tj` tokens is flagged as too complex."""
    # -- 120 `m` tokens followed by 2 `Tj` tokens, separated by single spaces --
    stream_bytes = b" ".join([b"m"] * 120 + [b"Tj"] * 2)

    class _FakeContentStream:
        def get_data(self):
            return stream_bytes

    reader = mock.Mock(pages=[{"/Contents": _FakeContentStream()}])
    # -- thresholds chosen so the 120-vs-2 token mix above exceeds both limits; the size
    # -- minimums are set to 1 so neither size gate short-circuits the check
    thresholds = {
        "max_graphics_ops": 100,
        "min_graphics_to_text_ratio": 20.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 1,
    }

    # -- `PdfReader` is patched so no real PDF parsing happens --
    with mock.patch.object(pdf, "PdfReader", return_value=reader):
        too_complex = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)

        assert too_complex
1566+
1567+
1568+
def test_is_pdf_too_complex_skips_pages_without_contents():
    """A page whose `/Contents` entry is `None` does not make the document too complex."""
    reader = mock.Mock(pages=[{"/Contents": None}])

    # -- `PdfReader` is patched so the single content-less page above is what gets inspected --
    with mock.patch.object(pdf, "PdfReader", return_value=reader):
        too_complex = pdf.is_pdf_too_complex(
            file=b"x" * 20, min_file_size_bytes=1, min_raw_stream_bytes=1
        )

        assert not too_complex
1578+
1579+
1580+
def test_is_pdf_too_complex_skips_small_content_streams():
    """A content stream shorter than `min_raw_stream_bytes` is not flagged as too complex."""

    class _FakeContentStream:
        def get_data(self):
            # -- 4 bytes, below the 20-byte `min_raw_stream_bytes` used below --
            return b"m Tj"

    reader = mock.Mock(pages=[{"/Contents": _FakeContentStream()}])
    # -- the op-count and ratio thresholds are permissive (1 and 1.0); only the raw
    # -- stream size minimum is larger than the stream itself
    thresholds = {
        "max_graphics_ops": 1,
        "min_graphics_to_text_ratio": 1.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 20,
    }

    with mock.patch.object(pdf, "PdfReader", return_value=reader):
        too_complex = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)

        assert not too_complex
1596+
1597+
1598+
def test_is_pdf_too_complex_restores_file_cursor_position():
    """The file-like object's cursor is at its starting offset again after the check."""
    start_offset = 7
    stream = io.BytesIO(b"x" * 20)
    # -- start mid-stream so a cursor reset to 0 would be distinguishable from a restore --
    stream.seek(start_offset)
    reader = mock.Mock(pages=[])

    with mock.patch.object(pdf, "PdfReader", return_value=reader):
        too_complex = pdf.is_pdf_too_complex(
            file=stream, min_file_size_bytes=1, min_raw_stream_bytes=1
        )

        assert not too_complex

    assert stream.tell() == start_offset
1613+
1614+
1615+
def test_is_pdf_too_complex_returns_false_for_normal_pdf():
    """The `layout-parser-paper.pdf` example document is not flagged as too complex."""
    pdf_path = example_doc_path("pdf/layout-parser-paper.pdf")

    # -- all thresholds are left at the function's defaults here --
    assert not pdf.is_pdf_too_complex(filename=pdf_path)
1617+
1618+
15451619
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
15461620
# TODO (yao): investigate why we need this test. The LayoutElement definition suggests bbox
15471621
# can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes.

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.21.11" # pragma: no cover
1+
__version__ = "0.21.12" # pragma: no cover

0 commit comments

Comments
 (0)