|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import base64 |
| 4 | +import io |
4 | 5 | import logging |
5 | 6 | import math |
6 | 7 | import os |
@@ -1542,6 +1543,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_ |
1542 | 1543 | ) |
1543 | 1544 |
|
1544 | 1545 |
|
def test_is_pdf_too_complex_skips_small_file_size():
    """A 4-byte payload with `min_file_size_bytes=10` is not reported as too complex."""
    verdict = pdf.is_pdf_too_complex(file=b"tiny", min_file_size_bytes=10)

    assert not verdict
| 1548 | + |
| 1549 | + |
def test_is_pdf_too_complex_detects_vector_heavy_page():
    """A page stream with 120 `m` tokens and 2 `Tj` tokens is reported as too complex."""
    # NOTE(review): presumably `m` is counted as a graphics operator and `Tj` as a text
    # operator by the implementation -- confirm against `pdf.is_pdf_too_complex`.
    stream_tokens = [b"m"] * 120 + [b"Tj"] * 2

    class VectorHeavyStream:
        def get_data(self):
            return b" ".join(stream_tokens)

    fake_reader = mock.Mock()
    fake_reader.pages = [{"/Contents": VectorHeavyStream()}]

    # Size gates are set to 1 byte so only the operator thresholds decide the outcome here.
    thresholds = {
        "max_graphics_ops": 100,
        "min_graphics_to_text_ratio": 20.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 1,
    }

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        verdict = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)

    assert verdict
| 1566 | + |
| 1567 | + |
def test_is_pdf_too_complex_skips_pages_without_contents():
    """A single page whose `/Contents` entry is None is not reported as too complex."""
    page_without_contents = {"/Contents": None}
    fake_reader = mock.Mock()
    fake_reader.pages = [page_without_contents]

    patched_reader = mock.patch.object(pdf, "PdfReader", return_value=fake_reader)
    with patched_reader:
        verdict = pdf.is_pdf_too_complex(
            file=b"x" * 20,
            min_file_size_bytes=1,
            min_raw_stream_bytes=1,
        )

    assert not verdict
| 1578 | + |
| 1579 | + |
def test_is_pdf_too_complex_skips_small_content_streams():
    """A 4-byte content stream with `min_raw_stream_bytes=20` is not reported as too complex."""

    class TinyStream:
        def get_data(self):
            return b"m Tj"

    fake_reader = mock.Mock()
    fake_reader.pages = [{"/Contents": TinyStream()}]

    # The operator thresholds are set as low as 1 so that, per the test's intent, only the
    # raw-stream size gate should keep this page from being flagged.
    thresholds = {
        "max_graphics_ops": 1,
        "min_graphics_to_text_ratio": 1.0,
        "min_file_size_bytes": 1,
        "min_raw_stream_bytes": 20,
    }

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        verdict = pdf.is_pdf_too_complex(file=b"x" * 20, **thresholds)

    assert not verdict
| 1596 | + |
| 1597 | + |
def test_is_pdf_too_complex_restores_file_cursor_position():
    """The file-like argument's cursor is back at its starting offset after the call."""
    start_offset = 7
    stream = io.BytesIO(b"x" * 20)
    stream.seek(start_offset)

    # A reader with no pages: the call is expected to return a falsy result.
    fake_reader = mock.Mock()
    fake_reader.pages = []

    with mock.patch.object(pdf, "PdfReader", return_value=fake_reader):
        verdict = pdf.is_pdf_too_complex(
            file=stream,
            min_file_size_bytes=1,
            min_raw_stream_bytes=1,
        )

    assert not verdict
    assert stream.tell() == start_offset
| 1613 | + |
| 1614 | + |
1545 | 1615 | def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): |
1546 | 1616 | # TODO (yao): investigate why we need this test. The LayoutElement definition suggests bbox |
1547 | 1617 | # can't be None and it has to be a Rectangle object that has x1, y1, x2, y2 attributes. |
|
0 commit comments