diff --git a/docs/document.rst b/docs/document.rst
index 352347da9..0b1a6af08 100644
--- a/docs/document.rst
+++ b/docs/document.rst
@@ -1246,6 +1246,14 @@ For details on **embedded files** refer to Appendix 3.

     Check whether the document can be saved incrementally. Use it to choose the right option without encountering exceptions.

+  .. method:: repair()
+
+    Repair document.
+
+    * Slow for large documents.
+    * Does nothing on non-PDF documents.
+    * New in v1.27.0
+
   .. method:: scrub(attached_files=True, clean_pages=True, embedded_files=True, hidden_text=True, javascript=True, metadata=True, redactions=True, redact_images=0, remove_links=True, reset_fields=True, reset_responses=True, thumbnails=True, xml_metadata=True)

     * New in v1.16.14
@@ -1267,7 +1275,7 @@ For details on **embedded files** refer to Appendix 3.
     :arg bool xml_metadata: Remove XML metadata.


-  .. method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0)
+  .. method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0, compression_effort=0, raise_on_repair=False)

     * Changed in v1.18.7
     * Changed in v1.19.0
@@ -1318,8 +1326,19 @@ For details on **embedded files** refer to Appendix 3.

     :arg int use_objstms: *(new in v1.24.0)* compression option that converts eligible PDF object definitions to information that is stored in some other object's :data:`stream` data. Depending on the `deflate` parameter value, the converted object definitions will be compressed -- which can lead to very significant file size reductions.

-    .. warning:: The method does not check, whether a file of that name already exists, will hence not ask for confirmation, and overwrite the file. It is your responsibility as a programmer to handle this.
+    .. warning:: The method does not check, whether a file of that name already exists, will hence not ask for confirmation, and overwrite the file. It is your responsibility as a programmer to handle this.

+    :arg int compression_effort:
+
+      * 0 for default.
+      * 1 for minimum effort.
+      * 100 for maximum effort.
+
+    :arg bool raise_on_repair: *(new in v1.27.0)* If true, raise an exception if saving caused the document to be repaired.
+      This is useful because repairs can cause changes to be lost.
+
+      Also see :meth:`Document.repair`.
+
   .. note::

     **File size reduction**
diff --git a/src/__init__.py b/src/__init__.py
index 4b5184601..14953b10e 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -4670,6 +4670,7 @@ def ez_save(
             preserve_metadata=1,
             use_objstms=1,
             compression_effort=0,
+            raise_on_repair=False,
             ):
         '''
         Save PDF using some different defaults
@@ -4694,6 +4695,7 @@ def ez_save(
                 preserve_metadata=preserve_metadata,
                 use_objstms=use_objstms,
                 compression_effort=compression_effort,
+                raise_on_repair=raise_on_repair,
                 )

     def find_bookmark(self, bm):
@@ -6202,6 +6204,17 @@ def reload_page(self, page: Page) -> Page:
                     f'{refs_old=} {m_internal_old=:#x} {m_internal_new=:#x}'
         return page

+    def repair(self):
+        '''
+        If we are a PDF document, does repair.
+
+        Calls `mupdf.pdf_check_document()`; does nothing for non-PDF
+        documents. Can be slow for large documents.
+        '''
+        pdf = _as_pdf_document(self, required=False)
+        if pdf.m_internal:
+            mupdf.pdf_check_document(pdf)
+
     def resolve_link(self, uri=None, chapters=0):
         """Calculate internal link destination.

@@ -6481,9 +6494,12 @@ def save(
             preserve_metadata=1,
             use_objstms=0,
             compression_effort=0,
+            raise_on_repair=False,
             ):
         # From %pythonprepend save
         #
         """Save PDF to file, pathlib.Path or file pointer."""
+        # Remember whether a repair had already happened before saving.
+        is_repaired_pre = self.is_repaired
         if self.is_closed or self.is_encrypted:
             raise ValueError("document closed or encrypted")
@@ -6547,6 +6563,9 @@ def save(
             #log( f'{type(out)=} {type(out.this)=}')
             mupdf.pdf_write_document(pdf, out, opts)
             out.fz_close_output()
+        if raise_on_repair:
+            if self.is_repaired and not is_repaired_pre:
+                raise Exception('Document save did a repair')

     def save_snapshot(self, filename):
         """Save a file snapshot suitable for journalling."""
diff --git a/tests/resources/test_4790.pdf b/tests/resources/test_4790.pdf
new file mode 100644
index 000000000..fe4675bd0
Binary files /dev/null and b/tests/resources/test_4790.pdf differ
diff --git a/tests/test_pagedelete.py b/tests/test_pagedelete.py
index 65f42e4b6..2bacc10c5 100644
--- a/tests/test_pagedelete.py
+++ b/tests/test_pagedelete.py
@@ -113,3 +113,81 @@ def test_4462():
         document.save(path2)
     with pymupdf.open(path2) as document:
         assert len(document) == 2
+
+
+def test_4790():
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4790.pdf')
+    path2 = os.path.normpath(f'{__file__}/../../tests/test_4790_out.pdf')
+    print()
+    page_to_delete = 1
+
+    # Reproduce the problem.
+    with pymupdf.open(path) as document:
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert not wt, f'{wt=}'
+        assert len(document) == 2, f'{len(document)=}'
+        document.delete_pages(page_to_delete)
+        assert len(document) == 1, f'{len(document)=}'
+        document.save(path2)
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document', f'{wt=}'
+    with pymupdf.open(path2) as document:
+        # Expect incorrect result.
+        assert len(document) == 2, f'{len(document)=}'
+
+    # Call mupdf.pdf_repair_xref() before delete_pages(); this works around the
+    # problem.
+    with pymupdf.open(path) as document:
+        document_pdf = pymupdf._as_pdf_document(document)
+        pymupdf.mupdf.pdf_repair_xref(document_pdf)
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document', f'{wt=}'
+        document.delete_pages(page_to_delete)
+        document.save(path2)
+    with pymupdf.open(path2) as document:
+        # Expect correct result.
+        assert len(document) == 1
+
+    # Call mupdf.pdf_check_document() before delete_pages(); this works around
+    # the problem.
+    with pymupdf.open(path) as document:
+        document_pdf = pymupdf._as_pdf_document(document)
+        pymupdf.mupdf.pdf_check_document(document_pdf)
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document', f'{wt=}'
+        document.delete_pages(page_to_delete)
+        document.save(path2)
+    with pymupdf.open(path2) as document:
+        # Expect correct result.
+        assert len(document) == 1
+
+    # Check that document is marked as repaired after save.
+    with pymupdf.open(path) as document:
+        assert not document.is_repaired, f'{document.is_repaired=}'
+        document.save(path2)
+        assert document.is_repaired, f'{document.is_repaired=}'
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document', f'{wt=}'
+
+    # Check that raise_on_repair=True works.
+    with pymupdf.open(path) as document:
+        try:
+            document.save(path2, raise_on_repair=True)
+        except Exception as e:
+            print(f'Received expected exception: {e}', flush=1)
+        else:
+            assert 0, 'Did not get expected exception.'
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document'
+
+    # Check that Document.repair() works.
+    with pymupdf.open(path) as document:
+        document.repair()
+        wt = pymupdf.TOOLS.mupdf_warnings()
+        assert wt == 'repairing PDF document'
+        document.delete_pages(page_to_delete)
+        document.save(path2, raise_on_repair=True)
+    with pymupdf.open(path2) as document:
+        # Expect correct result.
+        assert len(document) == 1, f'{len(document)=}'
+
diff --git a/tests/test_textextract.py b/tests/test_textextract.py
index cdb6e4bc7..ab66f8aef 100644
--- a/tests/test_textextract.py
+++ b/tests/test_textextract.py
@@ -252,14 +252,30 @@ def test_3197():
             b'Related Tickers\nTTM\n12/31/2023\n12/31/2022\n12/31/2021\n12/31/2020\n14,918,000\n14,918,000\n6,853,000\n15,787,000\n24,269,000\n-17,628,000\n-17,628,000\n-4,347,000\n2,745,000\n-18,615,000\n2,584,000\n2,584,000\n2,511,000\n-23,498,000\n2,315,000\n25,110,000\n25,110,000\n25,340,000\n20,737,000\n25,935,000\n-8,236,000\n-8,236,000\n-6,866,000\n-6,227,000\n-5,742,000\n51,659,000\n51,659,000\n45,470,000\n27,901,000\n65,900,000\n-41,965,000\n-41,965,000\n-45,655,000\n-54,164,000\n-60,514,000\n-335,000\n-335,000\n-484,000\n--\n--\n6,682,000\n6,682,000\n-13,000\n9,560,000\n18,527,000\n \nYahoo Finance Plus Essential\naccess required.\nUnlock Access\nBreakdown\nOperating Cash\nFlow\nInvesting Cash\nFlow\nFinancing Cash\nFlow\nEnd Cash Position\nCapital Expenditure\nIssuance of Debt\nRepayment of Debt\nRepurchase of\nCapital Stock\nFree Cash Flow\n12/31/2020 - 6/1/1972\nGM\nGeneral Motors Compa\xe2\x80\xa6\n39.49 +1.23%\n\xc2\xa0\nRIVN\nRivian Automotive, Inc.\n15.39 -3.15%\n\xc2\xa0\nNIO\nNIO Inc.\n5.97 +0.17%\n\xc2\xa0\nSTLA\nStellantis N.V.\n25.63 +0.91%\n\xc2\xa0\nLCID\nLucid Group, Inc.\n3.7000 +0.54%\n\xc2\xa0\nTSLA\nTesla, Inc.\n194.77 +0.52%\n\xc2\xa0\nTM\nToyota Motor Corporati\xe2\x80\xa6\n227.09 +0.14%\n\xc2\xa0\nXPEV\nXPeng Inc.\n9.08 +0.89%\n\xc2\xa0\nFSR\nFisker Inc.\n0.5579 -11.46%\n\xc2\xa0\nCopyright \xc2\xa9 2024 Yahoo.\nAll rights reserved.\nPOPULAR QUOTES\nTesla\nDAX Index\nKOSPI\nDow Jones\nS&P BSE SENSEX\nSPDR S&P 500 ETF Trust\nEXPLORE MORE\nCredit Score Management\nHousing Market\nActive vs. Passive Investing\nShort Selling\nToday\xe2\x80\x99s Mortgage Rates\nHow Much Mortgage Can You Afford\nABOUT\nData Disclaimer\nHelp\nSuggestions\nSitemap\n',
             ]

+    num_errors = 0
     with pymupdf.open(path) as document:
         for i, page in enumerate(document):
             text = page.get_text()
-            #print(f'{i=}:')
+
             text_utf8 = text.encode('utf8')
-            #print(f' {text_utf8=}')
-            #print(f' {text_utf8_expected[i]=}')
-            assert text_utf8 == text_utf8_expected[i]
+
+            if text_utf8 != text_utf8_expected[i]:
+                num_errors += 1
+                print(f'Error, {i=}.')
+                import difflib
+                import textwrap
+                print(f' {text_utf8_expected[i]=}')
+                print(f' {text_utf8=}')
+                text_expected = text_utf8_expected[i].decode('utf8')
+                diff = difflib.unified_diff(
+                        text_expected.split('\n'),
+                        text.split('\n'),
+                        lineterm='',
+                        )
+                print('Diff expected => actual:')
+                print(textwrap.indent('\n'.join(diff), ' '))
+
+    assert not num_errors, f'{num_errors=}'


 def test_document_text():