From d6f851a2c0149b9d4e34904649fc23160a2019cc Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Wed, 4 Feb 2026 10:40:56 +0000 Subject: [PATCH 1/5] tests/test_textextract.py:test_3197(): improve output on error. --- tests/test_textextract.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/test_textextract.py b/tests/test_textextract.py index cdb6e4bc7..ab66f8aef 100644 --- a/tests/test_textextract.py +++ b/tests/test_textextract.py @@ -252,14 +252,29 @@ def test_3197(): b'Related Tickers\nTTM\n12/31/2023\n12/31/2022\n12/31/2021\n12/31/2020\n14,918,000\n14,918,000\n6,853,000\n15,787,000\n24,269,000\n-17,628,000\n-17,628,000\n-4,347,000\n2,745,000\n-18,615,000\n2,584,000\n2,584,000\n2,511,000\n-23,498,000\n2,315,000\n25,110,000\n25,110,000\n25,340,000\n20,737,000\n25,935,000\n-8,236,000\n-8,236,000\n-6,866,000\n-6,227,000\n-5,742,000\n51,659,000\n51,659,000\n45,470,000\n27,901,000\n65,900,000\n-41,965,000\n-41,965,000\n-45,655,000\n-54,164,000\n-60,514,000\n-335,000\n-335,000\n-484,000\n--\n--\n6,682,000\n6,682,000\n-13,000\n9,560,000\n18,527,000\n \nYahoo Finance Plus Essential\naccess required.\nUnlock Access\nBreakdown\nOperating Cash\nFlow\nInvesting Cash\nFlow\nFinancing Cash\nFlow\nEnd Cash Position\nCapital Expenditure\nIssuance of Debt\nRepayment of Debt\nRepurchase of\nCapital Stock\nFree Cash Flow\n12/31/2020 - 6/1/1972\nGM\nGeneral Motors Compa\xe2\x80\xa6\n39.49 +1.23%\n\xc2\xa0\nRIVN\nRivian Automotive, Inc.\n15.39 -3.15%\n\xc2\xa0\nNIO\nNIO Inc.\n5.97 +0.17%\n\xc2\xa0\nSTLA\nStellantis N.V.\n25.63 +0.91%\n\xc2\xa0\nLCID\nLucid Group, Inc.\n3.7000 +0.54%\n\xc2\xa0\nTSLA\nTesla, Inc.\n194.77 +0.52%\n\xc2\xa0\nTM\nToyota Motor Corporati\xe2\x80\xa6\n227.09 +0.14%\n\xc2\xa0\nXPEV\nXPeng Inc.\n9.08 +0.89%\n\xc2\xa0\nFSR\nFisker Inc.\n0.5579 -11.46%\n\xc2\xa0\nCopyright \xc2\xa9 2024 Yahoo.\nAll rights reserved.\nPOPULAR QUOTES\nTesla\nDAX Index\nKOSPI\nDow Jones\nS&P BSE SENSEX\nSPDR S&P 500 ETF Trust\nEXPLORE 
MORE\nCredit Score Management\nHousing Market\nActive vs. Passive Investing\nShort Selling\nToday\xe2\x80\x99s Mortgage Rates\nHow Much Mortgage Can You Afford\nABOUT\nData Disclaimer\nHelp\nSuggestions\nSitemap\n', ] + num_errors = 0 with pymupdf.open(path) as document: for i, page in enumerate(document): text = page.get_text() - #print(f'{i=}:') + text_utf8 = text.encode('utf8') - #print(f' {text_utf8=}') - #print(f' {text_utf8_expected[i]=}') - assert text_utf8 == text_utf8_expected[i] + + if text_utf8 != text_utf8_expected[i]: + num_errors += 1 + print(f'Error, {i=}.') + import difflib + print(f' {text_utf8_expected[i]=}') + print(f' {text_utf8=}') + text_expected = text_utf8_expected[i].decode('utf8') + diff = difflib.unified_diff( + text_expected.split('\n'), + text.split('\n'), + lineterm='', + ) + print(f'Diff expected => actual:') + print(textwrap.indent('\n'.join(diff), ' ')) + + assert not num_errors, f'{num_errors=}' def test_document_text(): From b7a47b36dd0b0dcc07f437e0df70f660e284371b Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Wed, 4 Feb 2026 11:09:51 +0000 Subject: [PATCH 2/5] tests/: added new test_4790(). tests/test_pagedelete.py:test_4790(): New test, expects the current incorrect behaviour, checks workarounds fix the issue, that new Document.save() flag works, and that new Document.repair() works. tests/resources/test_4790.pdf: new input file. 
--- tests/resources/test_4790.pdf | Bin 0 -> 15540 bytes tests/test_pagedelete.py | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/resources/test_4790.pdf diff --git a/tests/resources/test_4790.pdf b/tests/resources/test_4790.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fe4675bd0427c51a8c6f5e482a5dbc0055f79ef0 GIT binary patch literal 15540 zcmeIZWl&vRmoB<-cMtB)MmFvi+zApaKsN5~?hq`vyF;*G!JXg^L4v!xbI9A@SN-eT(!YuA#RYpyj{)gE&`&v?cd)XEa?nb}ylQ4pzf>-y$V5ZTCB$!v`*P!I(K zfa>mcV4%36lcANZDNxza6zoXG{&riHOh^a?5p47MuYR2W`vZSEzV&th+c^EBJ5T}q z+1yaf)=lTFFDn@bD?1B_48#j!;bhkXs)8MDogIwd-p%#byMf}iHcoHPI_i*d{96kd z2lroh^nmJyMvg!=ptysr-T%|uY@H09z+|j{|3bFEf8u*{D<`l6@V(VrHwmz@?PoAh z9&BUkWJbox&MPG3=;Q!4v_?U6%lMF>V6()E9;~#+Z${ev3=o@Iv|eI(Ex4=5%06V(OF1-YXb0)A#3lEKr)95&{`(uJ zb3>v2>pG$wP0w*C2X~!VkY0NjFN}?nMvz6-03ixYm=wHXV1>-#|fMc65|8=h_a8 zjy|M3&$|{oGRx^Em`?5(Ic1BawsZT7KKLA@!TTA>qevog;^mtIIx$&f{HZG;SS#od zav((}3poL%esy$^ogAGUP-;*!fbRgCB|!buHu5J3CPoGgbU=+*BLKBCyc-f35J-pb z=10~EK~}PD*DqoiH@w5NFhdbCI*0MQ`}^B*toysQgbGCGcbr3J#do=U-U%x_e4uGjO z*#Eu`|Bq$;k7fN|vn-B35dPm;v)KM48je;ik)a1?lbv>6xTp|p>JCUJaqs78UrCK( zA;VJ0aW7DeQP>)cA-AdG5%3zIj_|I@IJSHJC;)4Yp0I66BcRJ|r_m=>PT>#Ep1Vxg96s@NoeEH6gH);}ni@DCI` z{5KTz@KF*p>MaX{k)SjB&}km!tZYY8D3_Y-8|XjTMdgdS`ZUr!LKVXwLrG&Nvwlfx zsKbp+B;*S*(Sft~FHw-=@BHd7&ZwniWC1pI0;)M1IsM^~GS+`Mp`xKR7-(+I0hDNBjexPSbFl)Y!RDrBZ^V(6iw!7d?&PQpb`ZC$HB(+H&zFD+gJe*Z$~Tu0Qv|3 zfJXrUAejE_e;VJ;0Eo8>D4e(b_4yU@<>lqA@!g-DVI%bxK^7Dc79#wr&FtWI?o}>0 zjjUyaKG4yHL{-Q0$+ommIp*2-0i8p0u@uDhVWln3-KxS2e_;zcuATz>W6Zlv_?@H1 zg)zeHpZ8fWIy=pxr$DD6$0L7f0NsQE0dWw)9 zVX{b)?$MI!bdxV)3z?;e9IDB6D0PXP*(&jqyoBu9_%!4DU{#sxBP13C;EE571&)b}6lSxquFp?v+XNF3nOQwshW99dt1^XR7;);# zoQ|dz#TO&4A~J-YQ_R+SRa#kiOb@Nu*cs)kFc(y^TcJfl%FIoYwL3#&H^({Q|q8Q%aV3l6WA!K%(vrczs4g09&&w-J-=oBq3&!;SOW z{-L0ici8OnG-XMGobIKCwPi?D!c2ei~D#W^=I^!@QnA~OuQ zu33>00{Y0HVLm4JbE1W2%1R$tv35ack*8YZsu|h63e0ChwbPos_?xw)@mJrspSf}V 
zSV-Lw{O9(No!gwxlXb#}m)01bdBF!ic4}rAQW)dh9!~KEzIz=@sZm!Jjb`v7qj@2z z5vb}~;D7raO+kqg_-J7BjzB>V$>6o(Z5lwrQT&-8klp`2ecoFC+8|%wwzn(E&^t85 zDy5$f4CJLGqErk7I2O`2NdjB_HZRPZlaWLF9W>mG@+3Y(xozYelTrtXpZ(H}Ui@6L z-5yf$W+k?vH@`1Q?1_H5XdR(-hm?LNC>u*B_--WGBP?xP+&yN86psIui_>{w4ra5j zF#PG^o-acQF*Q!J!#cyBJ$(5EUEoZK!X4hr;A*5VY!?SJS0=mf@g0V&YqY`wE-^4q z-{_=vD2XjajZ|E%H&%^vut>KAS@|8WUn~kFx5M_AnTY+hNz6jxy4aY_Mn8JNr3pKQ z43?pYxB<$JQDkR&KOzka^3QSSPN>eui{{suEj$nTnBul z1b*-TEaDM*vHJ{V@fdCRLiH?CrJ=m4FFG;jOxjMl(d9EV`0~@6s%lqD)__RD` zr5<>u11oi9D1U;6K9?c*J3+*i2X=`=%%NPkpOQ=Y;ul%wgZ4l?Rkxr9cQ0a&4N(tk5crg;Q{b^A4SykFp{(>eGdDC>yz(TWD_x4FHC`UGN1QyL8Ib8Vvr&622nCNu=v_t{+2wM+xoFeV zESLHgVcPGCsnaGyw(^l*N+{(y@`t!LJ$y@{HTp%R#M4zj<`B4e0-hF?Q+q_(ptLL@0lhBh))_2G7H(iz77vww zSu+&vA1r{C{ZGaW|IgLGygD@TfLyIiOg5|z-*KVFg{~>Dm_kFOHe1ks^#MD2g+>LL z&TWR^Jj=uWCa4Y6>^Zr?B^5R`oxLqDwx~49@J>A6HqH4x!~Gn;AlHJY%AAO5*0=Z# zp?B(%A!CXYa)`TL(tAHptvxnjc&72-VqnJtPw<9$&y^evJPV&Iz0mwMa}PGKqG5F9 zYxyjC<;>&B6M7a!Ltgz&kC$YdZ^nykOu56Iuz*3cqe?5>qQ6L->k&%#2YYPV zuAyM4SPYR;jV8vWA8xd2_M%rMa*;X~ulOBE0-B++2FSW%eHxCTGs**h6QSI^H|R`& z93~P+PjdiRTducK5~D%U3c;zLS^Ti`MpheT7Rrl+)%Kub?M=Ilt@5!ouPh0WcoaspOxyNZR=$np_S=1IU zF84r*QV~Yjc1jIUDx^%btBmtqBzQ;!5-EFZ1TC}_SVDr86&wZ5Aj;|bc2K1Ci_u?z zN<{YOs^UqF5h>JrRr?Dn>%mbD%I}dt^q3F;U+SiBr_Fj`C%QdXr*O3xByowVQ`F5J z(?z?bMCVBr)+CSS2BWBs%460*##c8~4MEou>LEQrJ0*!WZXM4J>`!Ayy)deX>;92z zVbbO=8*ob4l#zkw_8#elvHdt^Qsfoj#$mP218@-9OH4t!!6YxCT3A2LYi zM!QCmY@>~DUvN+}Ya)zwE?cE<_!2vRD0BkTtUYEeQJ&84T;b;!WPT;jR?&Ez8m5S; zH5EF|rxz+G6UW0?$c#+gLF33k)xnw>Oy>Ljz%Qvh1HoB5bEm<(3lZx3E`B9@RVB&< zlsT}r^%nazgv+wogu`;?i*mUvN|X(<$ebtFv8aCSe^oeeChsJLbdL?f9{gTgcVL2W z>RS?l*KhENHwH&_s5R}n# zMu&kL51eHvpZ(B=GtMBDO833KBh5T|^c+>EFuVpT*a?q(K+C#{P zIx~zgvwkvAha6LL?i{BG>v*G+mcE`A(3gtpQ*_RO&A)%A^kF^DGHW&_;aGd;hm8^{+tyD&fDWzvNa?S~ zHFFHAR*&NTJRg3>EpZ#rz4LqKuAvj7k-|&+FP4y{Ol|Pu_lTl=S&L53ypFs3`r_LV z93!+Y<9ta`Jf4=c@QL@Flrap@5R%*3TfeVM%p`RI)xMDekOX{WhUzY?Gm$xQ9lA@G z^8sqx3B6^c`TEIQ-`;WhUnjxd+G^ynWULt7&)DQ%R^?#Sc!cQM(}U54s=U^sPq4ql 
zI@>Nd%ZEEMLu6^%eV;K7Z_>zsL4RaA@GG;GzV?)Z?0y`IC}m0PYLdKL_ z>dbyG9pocV|0;8sEM#K;ctjwbV<6%h#z!D+*KbgsBk7AKbbycq_9DEj@W?lZ4?sR^ zfmHGd-C&KepPPTb!_1kG;^Q>O_taq9va3UGn%HhIXKy~BXWU7eMe$nkul=JxGWLJN zo`pstgdtlEV_@z(SXs7a5-z6d~u;^w$*TnAhr}&gNG3=Z?h# zynIC@`qzX1(a!(t!9l$L!QBs5Z;PK6LH2>{=t(NV`0?x2VVrG^fHtH}L39)uhC@@N z2l-UG4=MM%qi8xuvpoiXypVUi(7xxbatprX{cM|eanRJKLhNnJV#)hSnQAkM`zd+N#NmQUB?tED-VfZP9Wy?Zra>8Hj2zk9wj2l z3R2_JUhs8QE1UWl{3MSe9u5yhAxazv%1Uz53lB+Ae26*k7pC8oRN=!-2&PzAMT?jp|(5#$Br;Y1rGtuU4ZEa?T*?l-Z3W4;J>z+zsuTxIrRU> zF&z%xH}g{^1B%+%*!~IWfd1&3s$_b9i}HXt{%Rrvf&Ovp|18P_`V(6Er=kDXDg@~N zPZa{h#sm8I$^FM&`fsQZ(9!=vg^>K?TY(X###yv<-BAn}cB61wH-_M&fk^z*Cfv1O zjcp7qM$RbxDrLU%OVp8xvh3^`i?%yPaNZdfrSX*aAu=S&O$E`Q9%)krE<@dCaD%s8 z@KD0()#`g2&$d^%@t70%We{4}Y~+5H;lQBklTVNJUK^j2?{oW{BNqW235k&rq7i!1 zT+lp#*bK18^D^T1UOcpj+hpEY{JUvV#nW>_p~ycHocs)Ix=8j=)r>X`u8CW66pUh# zcm=s>ZQ7zm?9R5=H8;Mz1MF=ti5a`TEblP^?QYXUst!^$t(6tyy=IJYVwJo*qfPm5$#dSH--6y(JR^<3r?dH;->RLJAbna4w-nOvnnTr?p8>~RWTd~ z&w8BD-|$sxM1h{@Ex*uyKn!oSe<5<=Cu2~NoBn0ApieBEn#L22t^ahr7rq+y-SiT7 z8b?dh=2nP&7-^t|m;t@W(J78~Gy2Dz&lMT)Bg%?qXn1m?FySP@zD-bcn7X%pKheBS z>N?jPiZIgFI!zv&)wIG3%_h-x9ZrQV63Wm=ynQVQWM~0SPT&Zx^AcS5LFB_+=P|y) zO3efV%zmGb>=*f(Wnus422{OS9_Thbiux_#QGz)xjuIJq8CeTODt&8ub*?{3H?8I2Q3 z!M-Zm1F5bHYPt}2Ki3itf0_M;B|#gss2s|hs)^A4@%^}m{P%`Al3=eawy%VKkoDIN znh(YPY)HHHM>b~PywMY}5K4de6BS%uGR&=~9Nvx%Pz?J6tW@96IwcMt)Y*ePgM)>h z2<0=)xn+j)P+`$KEG4`tI-!B;Y4m%m2HM3;k#;gZa6p2++Nj)}5`_F8dzF~VgakwQ z^cL=vyhtrZYztBSvVc7_n{Xz$pP~zrowHpoXI@U3T~TqA{W$pp^7U~XUvy^%>LBXl z;R2b3A-L5^*Y^>y@x zBEk#OtFdV3@Kq^eh_~j0nR()Za_Gp4>`Z-bKXOsP&H72_;(V}xMqD@fWrinDCTIqN zlJ^eH)wlppR`rf|IQ1Ou>5k#cojo<_`x(Vwv*Wlk0dyPznpsv3BVr1U8QvtYE_jgA5eLob7Clbf_ zC@swpSskfQX`9Y^BVCZg1 zy+}?ZQK9*X!=D0E$lcTwcH8B524!nK_xS+}H%$MBMYAL7>?2WwGVN7dmtZ$NSAwG- zbp+H0&yR~czFrMytbJYT)9Gk|r7Nc3fP*9Es_SbSQ#{qtxwsr8IeJKP;LH@_o7J-Q&Uxf&t0 zZR6(y^DX!*lSVY&2d~FFgpUm!^ck<3YAPeM7nQB+S3)}@h*l4?t4ijgi>#28v>pq^ z>6dQ4523j7utJG4Cy`_w9;pmBT_$N2js~NJNfHD_n$DuniIKXp#$I2jbh^+~1lUhd 
z#)Gk%Bp%)1FuL+qS`;+;p_vDo$>)))sMr}P;rvGoo3mkhd#F)QdUHfiUE7W}>lW_C z;>qL>xKq!H;(RO=0I!1db>gIG?>(>S=*tqTvK9nSL_gREThO~_4+ttn@?^W{_j-1% zYA46Aq-D;p?n;&&^0v>=`}ly2sT(ND0NCO>Yz}I<3-!Dll+hFkpW$T^#Ftx&I=;PP z(UM)co$3$I8clWj?@mrVHnWdO%lHbJIp`zm*?831CNzsVD~r`QU)nf_Nu#wpjoYFI(kROAuKL`)LvIkEfn6yrLK8=bwr2(zmswBG6LmV4`vgb z6oiOCK13h8 zOdET8+1*4UiabvnR53nVk84zyMzt;wXGxSj%xB}%)1QIMi$5daY6c4`7Y3gEgwr-a z-a(+ayYe?@ck>q7LeUWSdCwg1iisLR^&U@a`}R)yfB=NTw%;oj60Qp2OO?49Ti_ce zTyD@!3UI7Jj&kO;Hu%KDU=*lvWZpbiYt}#@;;L@8n?|yg*~Mws9Rs8q6OQh%Idl+w z3Yu&_i)84>qbp)HNTfEbC4k~^4|o}tC=WEWqob8TJ!|~#-B_9%#w@ z{c_H!Mn8wJOJir0ONOm{kr!;a7-oeM*$C);mcUXMD=gSNYOzs@8!-LxkkRnN9)Cr2 zbIq^p;&%d3kS0|J|9QXLSigzsOi+k6wT}&!btqpn4sy!nLQl)tOJtF%`R0bc?s|r4 zo83O;XRW{}-~e`{6;pbqc-CNJQcR-5V9v^Ac`xbSlTHgwMTz-j9u54h8}9pxPb(U@ z+P}tp&BL;%9n}Z{v{}E^%$yyfNkF0UX(q6@HR7g>*Em_W#v?viIM1XQ*5CXswW>Bi zP#S)I#%5zl`5MAhBMzZ0T^_MM{qS?Pvmx}H(X?FF7bkURu{ODVw1)4^vmqXy^jZv` z9&_T463+{*9@YdFD{+GQrI?_6pI_mfOOh=ic1>0TJ3)Hb~k{{ zw_jWyN1w~F>z@N57yAHRyw)X;s0po3`D--edySYa%d3Y|d1$ai3^K6!dx$e$VZ^r)KJ833i zWkcqZQ-tA_96%Wrw)Z-M_GR-2J~pC=EgP}1+M(^Fib~gHhS|Syrs9bE3^Wb8PbQ?| zC#GTR_#9zl;Ri7VGTL9BL_A2#Ds5;edaC_-+EX%}OZ-MC zRjdk#k7TB!H zR0iK9n~;p$GP$tx4k+3r7v769#i#aoy*C4AYGt0d4hcSd9yk~Ys3vsq9xC@i{-|CT ziVi<)9ASj%CQ_2b%kNtO!G)YJc?+>uD_*Cc+Lw0^C_cHbhKm!Q^qDi$UA zgm>aRf;#d2Tg3E5H)VRYwyZw6Xb7GJdTOtE_Ti^(VNA#wj&{MIIOW#>L5#NBrGmL7 zgvweeAw<=)cI4=FFXxn!4hrwqqRz2L1v`8)D?oE&PdRlwc2~IlW6?A=Y}>CK*0Aud&f*C!mwL)dS_S zC=xlli{3kX>T~2n#FgW#Fy5MoxmCc=`K!n~`nGp@VPJx(m-=x4(?p;^R9BEuY?DWK zmuPal(_Z!Ot!##vYo|*;)#|TZ?JTmxB~2PId(6Kgxef9C;h39B8p}D; zWRyYX4!QKNA4de+Vl6knSi8+ey%0J&CKqi9HCpVriV!z$u`JgCv0-e3plXr}a>HKm zf0068%cLSs4bCA-7Fc-GcdmjsNZL5JVx?#=mtaiddOWQc=n@NaSZQWR^WD39Hc!6W zef{}iR|l7Fg8|dt?Yd#V;6ox#ae)_xES`x$ABA(7P@xDRhML(?c<=<@oZ8Ds3A_kx zT*=D#tUxPjf$6@0(soOYDLm48Q@3GJ624>2mRv=S$hR!<&|9E&(h(N&I4!vQl?l*! 
zYJ!vGuYQmu&P0D8syptjx~aA(U;SB~oCvpdeBilfA~~ z9@ujD@-n6iHQH($K3tZ#UA$HrZytMp1Z$L4u4^MS3F%yqFnTcj1GMXZc4lyyzoW~b2-fk=O?kIA%ehJv!rAntB8-oT>K;qAX(uLiWz!*#(Sa> zv;>|F4YxG%{OHHFJlYVmp`w(WG_VmFHDAss&6Ra&usgYcU>Ty%^L z`p3UeSVd8Qc}LG-=Yflp0gQwy?^DzxM-+t*RK2f?2!dKA);ivUY3u1tPiz9)6;s;gvAV_o}`mL`uIt%W6NRN*5vEV`cVEBn`v? zo+bYAQ+myhmLytGT=#dA9L+PoWJ*z;+wYh|HnCg?+u|(=PcW~%vT-hn)1_WiK1oUl zEehjCi;f=+KdU-HWy0?CqSmEh!!seN%bBn|79ThlE&$_1G+o$@RFt7{tssAZXo z@y+x5S=5aVB6&@1tGF0f!`RIIFHw5KW+kj!EEU~=lxmr0onis}reuG(2z zv^8l@T_+8kuk&~8JGxxkCR8yqqGL88h0RNug_2G1C*6V4P`(>LT!&Atj}=4jT0hl* zQw*V1n6=W!uKNu)V~l)wvul<^upU53Czi@&5@W|)SHwww@{>=L4kLiASH+2%dJYy^ z(K^&F{d;4YvPHG`V_>5u$>0+AtXI32tjHXE5SwBX)*Z1f9Cq%9PzN<&c{;nOd$39n zc^XF)ManrHrECc<@rb$Q^##fjtc4qUbCaxJF}_w0=}!`w3_i4|jbq!~uEj=ATi)mR zWrqM;`*`@CGv=QstB`StG`>UMXz_-K3(S0Zv+uDBJ*(A26AQ=!hpDk7j;TyfKl4=D zyo|zr(9RG0X;Ro+T$>4!FMxz%rRD?ByTetl&Xfy;a7c7z!uObdOxCIq z!43Bg4iw833hDyPrJ(v<mL%%53ALtq!y^joLRctSt} z{AfO*I-U7FhSir!U5vof2d|@pQjl()o85Sruocns ztzwXIy2Y91t(0D^6yUt67qy)K|f>{Y9Pl@mk+3)kO+(jeZN}#^tdX92`v7(&K60 zVt_#{>XJ3q@@mo&Lk>8GRmPh8tkqRM_T7>8JHRyvU_kA zd`-cZl~bVdL= zz2E(M!^(^AhDpq05Z|vEq<7GdJO;T@; zP@?goP*jBj(MyeGB)@d{jZzSQI`y#5)38-6m`A*P@=7OePJOK&_-tDv!x8 zV1$mYfBO9FY4~*hdW=~k7?!xz5=EdiJ$XS48BEd0iezie3(Za!teHE;%eyi4CCJG{ zB8JaQJKgjLstwu(nIS*@yKg54WkXV^y8hZUUDP8=XvFuTp%=zU2C?M5rOm99P&h`u ztILV635$RhmU1=ZwiSgyr1@(H@9c+~y&?7)9&9!Tss(<3qi^)fAA-gJR+O zyq7J-c!nDdc1)=YHr0~ARpRlBvFKe0xr}zlem5keivs~E@_72=K_fdM7Xm*Aw5cUv zHI3)G7i_FCTYZSm$LeDoibT|C8RTbxnZu2De60DYKaf|zG`R) zAw{<`VsRRzL(Etem6S$bID#PP&9K{dut3){T4?`1d*bp#Qj&X3tMu@Z7QH(|@Mk;o zyTi4rH5+O;mH6xlw7w%=xseYWV!G1IOndSRU%!6k&-1jiO)R+k7Y92%%x$`gk=pI8#T{G2)7yNuQ(wKlvyAWVBc@6C%3JE0 zp(NEgw6_OxX(2JHK|$vfMi>Ga)@2eylTA{vD$ZEe=tov|#B^H8zzH3n=-1OR;Hyo} zFZkpm=K2ra?i*>V=;bVkzkLcF&x_v99T7bi9q{^X;?{ARYv1m=0BNP;g(moAiO7k6 z;`c5aaI>r1z9@VNA^QH3%wNj;MHdf77?_ z$w$BN%$|uCb9rJ&+@k{)@|rU<6UTQoj?x;_fhz8+J=Z7MBh!a=dD4sh{fifJ%BaPw8sEmr*>+MQBGfXN8!fVSQpisa%szS3N7%Ae8D# 
zJjNCsB~d00TAlb*z+8)-0%vS@%R(}Ci)DG*HDC|6RT>rk=$#)t7aIFSO$7vBQJ|gz zs=XkIEI9_*7Jt&&&h=faY2d?ZHAI1UabVVr2c2z}jIJ^F80^QR9>gE2T}@8Ru+p6i zXCb|PNn8Y#j(cqjn-hodPrs10=9^zm(eHRoPW@l}g7K97PE`F^d$0j7qZ~v4q4h3p zk(fG>*+KDh5vG806;!jedV6QQ%9@tur+BEV2Sx{&4dxZwmFk3Uml!~lb@@9;qVj~s zQa!{us~I=V>-d;lf)ln-7+SKe0XrKvWO7noh}=ZOb60XFzy#VHH|j&~%4_*GTD&Z^ zUKxJc3hov29pm0SDp_PUk-0DFQ{rnI{~aOx5u|T&@0n&*h;YU?fhREDi_~(7?-&B> z5)qP^B+AKos)}mUpzkxM4tUrc_HH*k_cW57ITBnG!VyEXzEBu*c?Lr_udSwBb!&(3y=re}>i z^5E#S1kons##}ye5M`h`N9?Q3R?*@M{HSJDWMz(Z5&Tg-HdGig6K$HbqR6E8PJrqI z%Fv}mp#=qLUZ#coM_|=E#+4|R!zg>VJYQvZ{e%-8D2c};@7gjb=*GTcFlx$Acr>Z! z#;=jI%s&FhFECFW(q3rwm_C|7O3t-HmqnU~!5i)iemwojvsupT5`L3duv3|Y@{Y7Q z=_k3_N&q51n^e&9S|D08ySEh0%th^uFX1GOOMpdke4jPS=6BwY=@cugf=84csG#?Q zzd20KABgc9ov$_|k;%+Zo3giKzmLs#MIyAKd$JK*NO#JZr{>zr7kvBj68(ZgMa*oG zyn`W|mPZk$%M zLm})xMp2A4Rr`il8Cd5`P;4MmDpFA{CHst(t}e7ETp6f$HlPk}E^Wwruf)8#bU@T& z@|P@y6Rd2xAWgLnkxaoUJ6o-c;|O$$;xwc^!cqTq-5z@O~ zlD?u%!5Li-Eza>T%5kwpp+-UrNn$j*S9c})TO7SDfjmim0bVyKnukt}UDaLk_s zAdGNbi_69-*KTOBq{*c7hs2n!@xRxMX+o4_s#fV0vUMd z7c=(dx+jVk^Cxx7_C!Mm#5c5%?tjfe{`XV|GN6_!*aWDl!$t-odn-?svH1*k(_wup zSJeZmnR|fWF8&n-d8OC<4DF0RCai)eKmHnb$Bap5EA+) zlVRiJ1*+QGzEw2*mGkg6iis`RzvV#yWh8V2B-lkoc{sVm#5lNkBqT*S*g#xdY}^uD zVw|iX9!_ydAptH?P6>7~UU4=qc1cMI9#IZS5H| z&x4(zOk-(7XBbp3o_v9Tz8lKaVV{(B-ofSU24vO24c$CsB~dSZ7&EsSF*6e^G8FY! z`3O Date: Thu, 5 Feb 2026 09:47:27 +0000 Subject: [PATCH 3/5] src/__init__.py: added `raise_on_repair` arg to Document.save(). Allows caller to detect potential data loss in saved file. 
--- src/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/__init__.py b/src/__init__.py index 4b5184601..f97b874d6 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -4670,6 +4670,7 @@ def ez_save( preserve_metadata=1, use_objstms=1, compression_effort=0, + raise_on_repair=False, ): ''' Save PDF using some different defaults @@ -4694,6 +4695,7 @@ def ez_save( preserve_metadata=preserve_metadata, use_objstms=use_objstms, compression_effort=compression_effort, + raise_on_repair=raise_on_repair, ) def find_bookmark(self, bm): @@ -6481,9 +6483,11 @@ def save( preserve_metadata=1, use_objstms=0, compression_effort=0, + raise_on_repair=False, ): # From %pythonprepend save # + is_repaired_pre = self.is_repaired """Save PDF to file, pathlib.Path or file pointer.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") @@ -6547,6 +6551,9 @@ def save( #log( f'{type(out)=} {type(out.this)=}') mupdf.pdf_write_document(pdf, out, opts) out.fz_close_output() + if raise_on_repair: + if self.is_repaired and not is_repaired_pre: + raise Exception(f'Document save did a repair') def save_snapshot(self, filename): """Save a file snapshot suitable for journalling.""" From 129f91fea722b30dc56bb12e324e5fcbeb0581a7 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 5 Feb 2026 10:08:02 +0000 Subject: [PATCH 4/5] src/__init__.py: added Document.repair(). --- src/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/__init__.py b/src/__init__.py index f97b874d6..14953b10e 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -6204,6 +6204,14 @@ def reload_page(self, page: Page) -> Page: f'{refs_old=} {m_internal_old=:#x} {m_internal_new=:#x}' return page + def repair(self): + ''' + If we are a PDF document, does repair. 
+ ''' + pdf = _as_pdf_document(self, required=False) + if pdf.m_internal: + mupdf.pdf_check_document(pdf) + def resolve_link(self, uri=None, chapters=0): """Calculate internal link destination. From 4c82deb0d3019fc72447c0eb21658517cce1e273 Mon Sep 17 00:00:00 2001 From: Julian Smith Date: Thu, 5 Feb 2026 10:46:00 +0000 Subject: [PATCH 5/5] docs/document.rst: add raise_on_repair and repair(). Document Document.save()'s new `raise_on_repair` arg. Document new Document.repair(). Also document Document.save()'s `compression_effort` arg. --- docs/document.rst | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/document.rst b/docs/document.rst index 352347da9..0b1a6af08 100644 --- a/docs/document.rst +++ b/docs/document.rst @@ -1246,6 +1246,14 @@ For details on **embedded files** refer to Appendix 3. Check whether the document can be saved incrementally. Use it to choose the right option without encountering exceptions. + .. method:: repair() + + Repair document. + + * Slow for large documents. + * Does nothing on non-PDF documents. + * New in v1.27.0 + .. method:: scrub(attached_files=True, clean_pages=True, embedded_files=True, hidden_text=True, javascript=True, metadata=True, redactions=True, redact_images=0, remove_links=True, reset_fields=True, reset_responses=True, thumbnails=True, xml_metadata=True) * New in v1.16.14 @@ -1267,7 +1275,7 @@ For details on **embedded files** refer to Appendix 3. :arg bool xml_metadata: Remove XML metadata. - .. method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0) + .. 
method:: save(outfile, garbage=0, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=0, linear=False, pretty=False, no_new_id=False, encryption=PDF_ENCRYPT_NONE, permissions=-1, owner_pw=None, user_pw=None, use_objstms=0, compression_effort=0, raise_on_repair=False) * Changed in v1.18.7 * Changed in v1.19.0 @@ -1318,8 +1326,19 @@ For details on **embedded files** refer to Appendix 3. :arg int use_objstms: *(new in v1.24.0)* compression option that converts eligible PDF object definitions to information that is stored in some other object's :data:`stream` data. Depending on the `deflate` parameter value, the converted object definitions will be compressed -- which can lead to very significant file size reductions. - .. warning:: The method does not check, whether a file of that name already exists, will hence not ask for confirmation, and overwrite the file. It is your responsibility as a programmer to handle this. + .. warning:: The method does not check, whether a file of that name already exists, will hence not ask for confirmation, and overwrite the file. It is your responsibility as a programmer to handle this. + :arg int compression_effort: + + * 0 for default + * 1 for minimum effort. + * 100 for maximum effort. + + :arg bool raise_on_repair: *(new in v1.27.0)* If true we raise an exception if the save caused a repair. + This is useful because repairs can cause changes to be lost. + + Also see `Document.repair()`. + .. note:: **File size reduction**