diff --git a/changes.txt b/changes.txt index febe272d8..fbf36ad62 100644 --- a/changes.txt +++ b/changes.txt @@ -2,6 +2,24 @@ Change Log ========== +**Changes in version 1.26.1 ()** + +* Use MuPDF-1.26.2. + +* Fixed issues: + + * **Fixed** `4520 <https://github.com/pymupdf/PyMuPDF/issues/4520>`_: show_pdf_page does not like empty pages created by new_page + * **Fixed** `4524 <https://github.com/pymupdf/PyMuPDF/issues/4524>`_: fitz.get_text ignores 'pages' kwarg + * **Fixed** `4412 <https://github.com/pymupdf/PyMuPDF/issues/4412>`_: Regression? Spurious error? in insert_pdf in v1.25.4 + +* Other: + + * Partial fix for `4503 <https://github.com/pymupdf/PyMuPDF/issues/4503>`_: Undetected character styles + * New method `Document.rewrite_images()`, useful for reducing file size, changing image formats, or converting color spaces. + * `Page.get_text()`: restrict positional args to match docs. + * Removed bogus definition of class `Shape`. + + **Changes in version 1.26.0 (2025-05-22)** * Use MuPDF-1.26.1. diff --git a/scripts/test.py b/scripts/test.py index 32e5da997..03428023a 100755 --- a/scripts/test.py +++ b/scripts/test.py @@ -91,6 +91,7 @@ --help -h Show help. + -i <implementations> Set PyMuPDF implementations to test. <implementations> must contain only these individual characters: @@ -109,7 +110,6 @@ specified PyMuPDF will download its default mupdf .tgz.] -M 0|1 - --build-mupdf 0|1 Whether to rebuild mupdf when we build PyMuPDF. Default is 1. @@ -233,8 +233,6 @@ Is prepended to command line args. 
''' -import gh_release - import glob import os import platform @@ -253,6 +251,13 @@ finally: del sys.path[0] +try: + sys.path.insert(0, f'{pymupdf_dir_abs}/scripts') + import gh_release +finally: + del sys.path[0] + + pymupdf_dir = pipcl.relpath(pymupdf_dir_abs) log = pipcl.log0 @@ -491,7 +496,7 @@ def main(argv): cibuildwheel(env_extra, cibw_name, cibw_pyodide) elif command.startswith('install.'): - name = command.lstrip('install.') + name = command[len('install.'):] run(f'pip install --force-reinstall {name}') have_installed = True diff --git a/setup.py b/setup.py index e8c958ecb..3d6a01f2e 100755 --- a/setup.py +++ b/setup.py @@ -575,6 +575,12 @@ def get_mupdf(path=None, sha=None): darwin = sys.platform.startswith( 'darwin') windows = platform.system() == 'Windows' or platform.system().startswith('CYGWIN') msys2 = platform.system().startswith('MSYS_NT-') + +if os.environ.get('PYODIDE') == '1': + if os.environ.get('OS') != 'pyodide': + log('PYODIDE=1, setting OS=pyodide.') + os.environ['OS'] = 'pyodide' + pyodide = os.environ.get('OS') == 'pyodide' diff --git a/src/__init__.py b/src/__init__.py index 1a63b48cf..db0e69a13 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -19424,18 +19424,24 @@ def handle_args(): arg = args[0] if isinstance( arg, (list, tuple)) and len( arg) == 2: p1, p2 = arg - return *p1, *p2 + ret = *p1, *p2 + assert len(ret) == 4 + return ret if isinstance( arg, (list, tuple)) and len( arg) == 3: a, b, c = arg a = make_tuple(a) b = make_tuple(b) c = make_tuple(c) ret = *a, *b, *c + assert len(ret) == 4 return ret - arg = make_tuple( arg) - return arg + ret = make_tuple( arg) + assert len(ret) == 4, f'{arg=} {ret=}' + return ret elif len(args) == 2: - return get_xy( args[0]) + get_xy( args[1]) + ret = get_xy( args[0]) + get_xy( args[1]) + assert len(ret) == 4 + return ret elif len(args) == 3: x0, y0 = get_xy( args[0]) if (x0, y0) != (None, None): diff --git a/src/utils.py b/src/utils.py index 84c657bce..f4f45a0b9 100644 --- a/src/utils.py 
+++ b/src/utils.py @@ -900,6 +900,7 @@ def get_image_rects(page: pymupdf.Page, name, transform=False) -> list: def get_text( page: pymupdf.Page, option: str = "text", + *, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, diff --git a/tests/resources/test_3624_expected.png b/tests/resources/test_3624_expected.png index 2173a0e44..bbfa9bc75 100644 Binary files a/tests/resources/test_3624_expected.png and b/tests/resources/test_3624_expected.png differ diff --git a/tests/resources/test_4503.pdf b/tests/resources/test_4503.pdf new file mode 100644 index 000000000..307762ed7 Binary files /dev/null and b/tests/resources/test_4503.pdf differ diff --git a/tests/resources/test_4546.pdf b/tests/resources/test_4546.pdf new file mode 100644 index 000000000..e5f2ece53 Binary files /dev/null and b/tests/resources/test_4546.pdf differ diff --git a/tests/test_general.py b/tests/test_general.py index a923ad19c..0c1620a43 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -19,6 +19,7 @@ import sys import textwrap import time +import util import gentle_compare @@ -419,8 +420,8 @@ def test_2238(): wt_expected += 'trying to repair broken xref\n' wt_expected += 'repairing PDF document' assert wt == wt_expected, f'{wt=}' - first_page = doc.load_page(0).get_text('text', pymupdf.INFINITE_RECT()) - last_page = doc.load_page(-1).get_text('text', pymupdf.INFINITE_RECT()) + first_page = doc.load_page(0).get_text('text', clip=pymupdf.INFINITE_RECT()) + last_page = doc.load_page(-1).get_text('text', clip=pymupdf.INFINITE_RECT()) print(f'first_page={first_page!r}') print(f'last_page={last_page!r}') @@ -1710,9 +1711,14 @@ def test_3624(): print(f'Saving to {path_png=}.') pixmap.save(path_png) rms = gentle_compare.pixmaps_rms(path_png_expected, path_png) + print(f'{rms=}') # We get small differences in sysinstall tests, where some thirdparty # libraries can differ. 
- assert rms < 1 + if rms > 1: + pixmap_diff = gentle_compare.pixmaps_diff(path_png_expected, path_png) + path_png_diff = os.path.normpath(f'{__file__}/../../tests/test_3624_diff.png') + pixmap_diff.save(path_png_diff) + assert 0, f'{rms=}' def test_4043(): @@ -1874,3 +1880,17 @@ def show(items): {'depth': 0, 'locked': 0, 'number': 7, 'on': 1, 'text': 'layer_7', 'type': 'checkbox'}, ] + +def test_4533(): + if 1: + print(f'test_4533(): doing nothing because known to segv.') + return + path = util.download( + 'https://github.com/user-attachments/files/20497146/NineData_user_manual_V3.0.5.pdf', + 'test_4533.pdf', + size=16864501, + ) + print(f'Opening {path=}.', flush=1) + with pymupdf.open(path) as document: + print(f'Have opened {path=}.', flush=1) + print(f'{len(document)=}', flush=1) diff --git a/tests/test_textextract.py b/tests/test_textextract.py index 60368441b..64491667d 100644 --- a/tests/test_textextract.py +++ b/tests/test_textextract.py @@ -823,3 +823,82 @@ def test_4363(): print(f'Found:\n {text!r}') assert 0 + +def test_4546(): + # This issue will not be fixed (in mupdf) because the test input is faulty. + # + path = os.path.normpath(f'{__file__}/../../tests/resources/test_4546.pdf') + with pymupdf.open(path) as document: + page = document[0] + text = page.get_text()[:200] + + # We can't actually test with 1.23.5 because it uses `fitz.` not `pymupdf.`. + expected_1_23_5 = b'JOB No.: \nShipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\n\xe5\x9d\x80) \nSINORICH TRANSPORT LIMITED\nADD:7C,WEST BLDG.,ZHONGQU\nMANSION,211 ZHONGSHAN\nRD. SHANTOU,515041 CN\nTEL:0754-88570001 FAX:0754-88572709\nS/O No. '.decode() + + # This output is different from expected_1_23_5. + expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. 
\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode() + + print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}') + print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}') + + print(f'{pymupdf.version=}') + print(f'text is:\n{textwrap.indent(text, " ")}') + print(f'{text=}') + print(f'{text.encode()=}') + + if pymupdf.mupdf_version_tuple >= (1, 26, 1): + assert text == expected_mupdf_1_26_1 + else: + print(f'No expected output for {pymupdf.mupdf_version_tuple=}') + + +def test_4503(): + # Check detection of strikeout text. Behaviour is improved with + # mupdf>=1.26.2, but not perfect. + # + path = os.path.normpath(f'{__file__}/../../tests/resources/test_4503.pdf') + span_0 = None + text_0 = None + print() + print(f'{pymupdf.mupdf_version_tuple=}') + with pymupdf.open(path) as document: + page = document[0] + # Specify TEXT_COLLECT_STYLES so we collect char_flags, which contains + # FZ_STEXT_STRIKEOUT etc. 
+ # + text = page.get_text('rawdict', flags=pymupdf.TEXTFLAGS_RAWDICT | pymupdf.TEXT_COLLECT_STYLES) + for i, block in enumerate(text['blocks']): + print(f'block {i}:') + for j, line in enumerate(block['lines']): + print(f' line {j}:') + for k, span in enumerate(line['spans']): + text = '' + for char in span['chars']: + text += char['c'] + print(f' span {k}: {span["flags"]=:#x} {span["char_flags"]=:#x}: {text!r}') + if 'the right to request the state to review' in text: + span_0 = span + text_0 = text + assert span_0 + #print(f'{span_0=}') + print(f'{span_0["flags"]=:#x}') + print(f'{span_0["char_flags"]=:#x}') + print(f'{text_0=}') + strikeout = span_0['char_flags'] & pymupdf.mupdf.FZ_STEXT_STRIKEOUT + print(f'{strikeout=}') + + if pymupdf.mupdf_version_tuple >= (1, 26, 2): + # 2025-06-09: This is still incorrect - the span should include the + # following text 'and, if appropriate,'. It looks like following spans + # are: + # strikeout=0: 'and, ' + # strikeout=1: 'if ' + # strikeout=0: 'appropri' + # strikeout=1: 'ate,' + # + assert strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be set in {span_0["char_flags"]=:#x}.' + assert text_0 == 'the right to request the state to review ' + else: + # Expecting the bug. + assert not strikeout, f'Expected bit 0 (FZ_STEXT_STRIKEOUT) to be unset in {span_0["char_flags"]=:#x}.' + assert text_0 == 'notice the right to request the state to review and, if appropriate,'