Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions docs/document.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1246,6 +1246,14 @@ For details on **embedded files** refer to Appendix 3.

Check whether the document can be saved incrementally. Use it to choose the right option without encountering exceptions.

.. method:: repair()

Repair document.

* Slow for large documents.
* Does nothing on non-PDF documents.
* New in v1.27.0

.. method:: scrub(attached_files=True, clean_pages=True, embedded_files=True, hidden_text=True, javascript=True, metadata=True, redactions=True, redact_images=0, remove_links=True, reset_fields=True, reset_responses=True, thumbnails=True, xml_metadata=True)

* New in v1.16.14
Expand All @@ -1267,7 +1275,7 @@ For details on **embedded files** refer to Appendix 3.
:arg bool xml_metadata: Remove XML metadata.


.. method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0)
.. method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0, compression_effort=0, raise_on_repair=False)

* Changed in v1.18.7
* Changed in v1.19.0
Expand Down Expand Up @@ -1318,8 +1326,19 @@ For details on **embedded files** refer to Appendix 3.

:arg int use_objstms: *(new in v1.24.0)* compression option that converts eligible PDF object definitions to information that is stored in some other object's :data:`stream` data. Depending on the `deflate` parameter value, the converted object definitions will be compressed -- which can lead to very significant file size reductions.

.. warning:: The method does not check whether a file of that name already exists; it will therefore not ask for confirmation and will overwrite the file. It is your responsibility as a programmer to handle this.
.. warning:: The method does not check whether a file of that name already exists; it will therefore not ask for confirmation and will overwrite the file. It is your responsibility as a programmer to handle this.

:arg int compression_effort:

* 0 for default.
* 1 for minimum effort.
* 100 for maximum effort.

:arg bool raise_on_repair: *(new in v1.27.0)* If true, raise an exception if saving the document triggered a repair.
This is useful because a repair can cause changes to be lost.

Also see `Document.repair()`.

.. note::

**File size reduction**
Expand Down
15 changes: 15 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4670,6 +4670,7 @@ def ez_save(
preserve_metadata=1,
use_objstms=1,
compression_effort=0,
raise_on_repair=False,
):
'''
Save PDF using some different defaults
Expand All @@ -4694,6 +4695,7 @@ def ez_save(
preserve_metadata=preserve_metadata,
use_objstms=use_objstms,
compression_effort=compression_effort,
raise_on_repair=raise_on_repair,
)

def find_bookmark(self, bm):
Expand Down Expand Up @@ -6202,6 +6204,14 @@ def reload_page(self, page: Page) -> Page:
f'{refs_old=} {m_internal_old=:#x} {m_internal_new=:#x}'
return page

def repair(self):
    '''
    Repair this document if it is a PDF; otherwise do nothing.
    '''
    # required=False: presumably a non-PDF document yields a wrapper with an
    # empty m_internal instead of raising, which is what the guard below
    # relies on.
    pdf_doc = _as_pdf_document(self, required=False)
    if not pdf_doc.m_internal:
        return
    mupdf.pdf_check_document(pdf_doc)

def resolve_link(self, uri=None, chapters=0):
"""Calculate internal link destination.

Expand Down Expand Up @@ -6481,9 +6491,11 @@ def save(
preserve_metadata=1,
use_objstms=0,
compression_effort=0,
raise_on_repair=False,
):
# From %pythonprepend save
#
is_repaired_pre = self.is_repaired
"""Save PDF to file, pathlib.Path or file pointer."""
if self.is_closed or self.is_encrypted:
raise ValueError("document closed or encrypted")
Expand Down Expand Up @@ -6547,6 +6559,9 @@ def save(
#log( f'{type(out)=} {type(out.this)=}')
mupdf.pdf_write_document(pdf, out, opts)
out.fz_close_output()
if raise_on_repair:
if self.is_repaired and not is_repaired_pre:
raise Exception(f'Document save did a repair')

def save_snapshot(self, filename):
"""Save a file snapshot suitable for journalling."""
Expand Down
Binary file added tests/resources/test_4790.pdf
Binary file not shown.
78 changes: 78 additions & 0 deletions tests/test_pagedelete.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,81 @@ def test_4462():
document.save(path2)
with pymupdf.open(path2) as document:
assert len(document) == 2


def test_4790():
# Regression test (presumably for issue 4790, going by the test and resource
# names): with test_4790.pdf, a page deletion is lost when save() ends up
# repairing the document, whereas repairing before the deletion keeps it.
#
# NOTE(review): every section reads pymupdf.TOOLS.mupdf_warnings() and expects
# either nothing or exactly 'repairing PDF document', so the order of the
# calls matters - presumably reading the warnings also clears them; confirm.
path = os.path.normpath(f'{__file__}/../../tests/resources/test_4790.pdf')
path2 = os.path.normpath(f'{__file__}/../../tests/test_4790_out.pdf')
print()
# Page number handed to delete_pages(); the input document has two pages.
page_to_delete = 1

# Reproduce the problem: delete a page and save without repairing first.
# The save emits a repair warning and the reopened output still has both
# pages, i.e. the deletion was lost.
with pymupdf.open(path) as document:
wt = pymupdf.TOOLS.mupdf_warnings()
assert not wt, f'{wt=}'
assert len(document) == 2, f'{len(document)=}'
document.delete_pages(page_to_delete)
assert len(document) == 1, f'{len(document)=}'
document.save(path2)
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document', f'{wt=}'
with pymupdf.open(path2) as document:
# Expect incorrect result.
assert len(document) == 2, f'{len(document)=}'

# Call mupdf.pdf_repair_xref() before delete_pages(); this works around the
# problem. The repair warning is now expected right after the explicit
# repair call rather than after save().
with pymupdf.open(path) as document:
document_pdf = pymupdf._as_pdf_document(document)
pymupdf.mupdf.pdf_repair_xref(document_pdf)
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document', f'{wt=}'
document.delete_pages(page_to_delete)
document.save(path2)
with pymupdf.open(path2) as document:
# Expect correct result.
assert len(document) == 1

# Call mupdf.pdf_check_document() before delete_pages(); this works around
# the problem.
with pymupdf.open(path) as document:
document_pdf = pymupdf._as_pdf_document(document)
pymupdf.mupdf.pdf_check_document(document_pdf)
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document', f'{wt=}'
document.delete_pages(page_to_delete)
document.save(path2)
with pymupdf.open(path2) as document:
# Expect correct result.
assert len(document) == 1

# Check that document is marked as repaired after save: is_repaired is false
# on the freshly opened file and true once save() has run, even though no
# edits were made.
with pymupdf.open(path) as document:
assert not document.is_repaired, f'{document.is_repaired=}'
document.save(path2)
assert document.is_repaired, f'{document.is_repaired=}'
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document', f'{wt=}'

# Check that raise_on_repair=True works: save() must raise for this file.
# The repair warning is still expected afterwards.
with pymupdf.open(path) as document:
try:
document.save(path2, raise_on_repair=True)
except Exception as e:
print(f'Received expected exception: {e}', flush=1)
else:
assert 0, 'Did not get expected exception.'
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document'

# Check that Document.repair() works: after an explicit repair, the later
# save(raise_on_repair=True) is expected not to raise and the page deletion
# must survive in the output file.
with pymupdf.open(path) as document:
document.repair()
wt = pymupdf.TOOLS.mupdf_warnings()
assert wt == 'repairing PDF document'
document.delete_pages(page_to_delete)
document.save(path2, raise_on_repair=True)
with pymupdf.open(path2) as document:
# Expect correct result.
assert len(document) == 1, f'{len(document)=}'

23 changes: 19 additions & 4 deletions tests/test_textextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,14 +252,29 @@ def test_3197():
b'Related Tickers\nTTM\n12/31/2023\n12/31/2022\n12/31/2021\n12/31/2020\n14,918,000\n14,918,000\n6,853,000\n15,787,000\n24,269,000\n-17,628,000\n-17,628,000\n-4,347,000\n2,745,000\n-18,615,000\n2,584,000\n2,584,000\n2,511,000\n-23,498,000\n2,315,000\n25,110,000\n25,110,000\n25,340,000\n20,737,000\n25,935,000\n-8,236,000\n-8,236,000\n-6,866,000\n-6,227,000\n-5,742,000\n51,659,000\n51,659,000\n45,470,000\n27,901,000\n65,900,000\n-41,965,000\n-41,965,000\n-45,655,000\n-54,164,000\n-60,514,000\n-335,000\n-335,000\n-484,000\n--\n--\n6,682,000\n6,682,000\n-13,000\n9,560,000\n18,527,000\n \nYahoo Finance Plus Essential\naccess required.\nUnlock Access\nBreakdown\nOperating Cash\nFlow\nInvesting Cash\nFlow\nFinancing Cash\nFlow\nEnd Cash Position\nCapital Expenditure\nIssuance of Debt\nRepayment of Debt\nRepurchase of\nCapital Stock\nFree Cash Flow\n12/31/2020 - 6/1/1972\nGM\nGeneral Motors Compa\xe2\x80\xa6\n39.49 +1.23%\n\xc2\xa0\nRIVN\nRivian Automotive, Inc.\n15.39 -3.15%\n\xc2\xa0\nNIO\nNIO Inc.\n5.97 +0.17%\n\xc2\xa0\nSTLA\nStellantis N.V.\n25.63 +0.91%\n\xc2\xa0\nLCID\nLucid Group, Inc.\n3.7000 +0.54%\n\xc2\xa0\nTSLA\nTesla, Inc.\n194.77 +0.52%\n\xc2\xa0\nTM\nToyota Motor Corporati\xe2\x80\xa6\n227.09 +0.14%\n\xc2\xa0\nXPEV\nXPeng Inc.\n9.08 +0.89%\n\xc2\xa0\nFSR\nFisker Inc.\n0.5579 -11.46%\n\xc2\xa0\nCopyright \xc2\xa9 2024 Yahoo.\nAll rights reserved.\nPOPULAR QUOTES\nTesla\nDAX Index\nKOSPI\nDow Jones\nS&P BSE SENSEX\nSPDR S&P 500 ETF Trust\nEXPLORE MORE\nCredit Score Management\nHousing Market\nActive vs. Passive Investing\nShort Selling\nToday\xe2\x80\x99s Mortgage Rates\nHow Much Mortgage Can You Afford\nABOUT\nData Disclaimer\nHelp\nSuggestions\nSitemap\n',
]

num_errors = 0
with pymupdf.open(path) as document:
for i, page in enumerate(document):
text = page.get_text()
#print(f'{i=}:')

text_utf8 = text.encode('utf8')
#print(f' {text_utf8=}')
#print(f' {text_utf8_expected[i]=}')
assert text_utf8 == text_utf8_expected[i]

if text_utf8 != text_utf8_expected[i]:
num_errors += 1
print(f'Error, {i=}.')
import difflib
print(f' {text_utf8_expected[i]=}')
print(f' {text_utf8=}')
text_expected = text_utf8_expected[i].decode('utf8')
diff = difflib.unified_diff(
text_expected.split('\n'),
text.split('\n'),
lineterm='',
)
print(f'Diff expected => actual:')
print(textwrap.indent('\n'.join(diff), ' '))

assert not num_errors, f'{num_errors=}'


def test_document_text():
Expand Down