pymupdf · julian-smith-artifex-com · Aug 19, 2025 · Aug 18, 2025 · Aug 18, 2025 · Aug 18, 2025
diff --git a/pipcl.py b/pipcl.py
@@ -1200,6 +1200,10 @@ def next( self, eof=ArgsRaise):
                 assert command is None, 'Two commands specified: {command} and {arg}.'
                 command = arg
 
+            elif arg in ('windows-vs', 'windows-python', 'show-sysconfig'):
+                assert command is None, f'Two commands specified: {command} and {arg}.'
+                command = arg
+
             elif arg == '--all':                                opt_all = True
             elif arg == '--compile':                            pass
             elif arg == '--dist-dir' or arg == '-d':            opt_dist_dir = args.next()
@@ -1212,12 +1216,6 @@ def next( self, eof=ArgsRaise):
             elif arg == '--single-version-externally-managed':  pass
             elif arg == '--verbose' or arg == '-v':             g_verbose += 1
 
-            elif arg == 'windows-vs':
-                command = arg
-                break
-            elif arg == 'windows-python':
-                command = arg
-                break
             else:
                raise Exception(f'Unrecognised arg: {arg}')
 
@@ -1268,6 +1266,39 @@ def next( self, eof=ArgsRaise):
             vs = wdev.WindowsVS(year=year, grade=grade, version=version)
             print(f'Visual Studio is:\n{vs.description_ml("    ")}')
 
+        elif command == 'show-sysconfig':
+            show_sysconfig()
+            for mod in platform, sys:
+                log0(f'{mod.__name__}:')
+                for n in dir(mod):
+                    if n.startswith('_'):
+                        continue
+                    log0(f'{mod.__name__}.{n}')
+                    if mod is platform and n == 'uname':
+                        continue
+                    if mod is platform and n == 'pdb':
+                        continue
+                    if mod is sys and n in ('breakpointhook', 'exit'):
+                        # We don't want to call these.
+                        continue
+                    v = getattr(mod, n)
+                    if callable(v):
+                        try:
+                            v = v()
+                        except Exception:
+                            pass
+                        else:
+                            #print(f'{n=}', flush=1)
+                            try:
+                                print(f'    {mod.__name__}.{n}()={v!r}')
+                            except Exception:
+                                print(f'    Failed to print value of {mod.__name__}.{n}().')
+                    else:
+                        try:
+                            print(f'    {mod.__name__}.{n}={v!r}')
+                        except Exception:
+                            print(f'    Failed to print value of {mod.__name__}.{n}.')
+
         else:
             assert 0, f'Unrecognised command: {command}'
 
@@ -1928,6 +1959,34 @@ def base_linker(vs=None, pythonflags=None, cpp=False, use_env=True):
     return linker, pythonflags
 
 
+def git_info( directory):
+    '''
+    Returns `(sha, comment, diff, branch)`, all items are str or None if not
+    available.
+
+    directory:
+        Root of git checkout.
+    '''
+    sha, comment, diff, branch = None, None, None, None
+    e, out = run(
+            f'cd {directory} && (PAGER= git show --pretty=oneline|head -n 1 && git diff)',
+            capture=1,
+            check=0
+            )
+    if not e:
+        sha, _ = out.split(' ', 1)
+        comment, diff = _.split('\n', 1)
+    e, out = run(
+            f'cd {directory} && git rev-parse --abbrev-ref HEAD',
+            capture=1,
+            check=0
+            )
+    if not e:
+        branch = out.strip()
+    log(f'git_info(): directory={directory!r} returning branch={branch!r} sha={sha!r} comment={comment!r}')
+    return sha, comment, diff, branch
+
+
 def git_items( directory, submodules=False):
     '''
     Returns list of paths for all files known to git within a `directory`.
@@ -2976,19 +3035,41 @@ def swig_get(swig, quick, swig_local='pipcl-swig-git'):
     if swig and swig.startswith('git:'):
         assert platform.system() != 'Windows'
         swig_local = os.path.abspath(swig_local)
-        swig_binary = f'{swig_local}/install/bin/swig'
+        # Note that {swig_local}/install/bin/swig doesn't work on macOS because
+        # {swig_local}/INSTALL is a file and the fs is case-insensitive.
+        swig_binary = f'{swig_local}/install-dir/bin/swig'
         if quick and os.path.isfile(swig_binary):
             log1(f'{quick=} and {swig_binary=} already exists, so not downloading/building.')
         else:
             # Clone swig.
+            swig_env_extra = None
             git_get(
                     swig,
                     swig_local,
                     default_remote='https://github.com/swig/swig.git',
                     branch='master',
                     )
+            if darwin():
+                run(f'brew install automake')
+                run(f'brew install pcre2')
+                # Default bison doesn't work, and Brew's bison is not added to $PATH.
+                #
+                # > bison is keg-only, which means it was not symlinked into /opt/homebrew,
+                # > because macOS already provides this software and installing another version in
+                # > parallel can cause all kinds of trouble.
+                # > 
+                # > If you need to have bison first in your PATH, run:
+                # >   echo 'export PATH="/opt/homebrew/opt/bison/bin:$PATH"' >> ~/.zshrc
+                #
+                run(f'brew install bison')
+                PATH = os.environ['PATH']
+                PATH = f'/opt/homebrew/opt/bison/bin:{PATH}'
+                swig_env_extra = dict(PATH=PATH)
             # Build swig.
-            run(f'cd {swig_local} && ./autogen.sh && ./configure --prefix={swig_local}/install && make && make install')
+            run(f'cd {swig_local} && ./autogen.sh', env_extra=swig_env_extra)
+            run(f'cd {swig_local} && ./configure --prefix={swig_local}/install-dir', env_extra=swig_env_extra)
+            run(f'cd {swig_local} && make', env_extra=swig_env_extra)
+            run(f'cd {swig_local} && make install', env_extra=swig_env_extra)
         assert os.path.isfile(swig_binary)
         return swig_binary
     else:

diff --git a/scripts/test.py b/scripts/test.py
@@ -217,6 +217,8 @@
                 --swig 'git:--branch master https://github.com/swig/swig.git'
                 --swig 'git:--branch master'
                 --swig git:
+
+            2025-08-18: This fixes building with py_limited_api on python-3.13.
 
     --swig-quick 0|1
         If 1 and `--swig` starts with 'git:', we do not update/build swig if

diff --git a/tests/resources/test_3806-expected.png b/tests/resources/test_3806-expected.png
diff --git a/tests/resources/test_3806.pdf b/tests/resources/test_3806.pdf
diff --git a/tests/test_pixmap.py b/tests/test_pixmap.py
@@ -582,3 +582,21 @@ def test_4445():
     wt = pymupdf.TOOLS.mupdf_warnings()
     print(f'{wt=}')
     assert wt == 'broken xref subsection, proceeding anyway.\nTrailer Size is off-by-one. Ignoring.'
+
+
+def test_3806():
+    print()
+    print(f'{pymupdf.mupdf_version=}')
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3806.pdf')
+    path_png_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_3806-expected.png')
+    path_png = os.path.normpath(f'{__file__}/../../tests/test_3806.png')
+
+    with pymupdf.open(path) as document:
+        pixmap = document[0].get_pixmap()
+        pixmap.save(path_png)
+        rms = gentle_compare.pixmaps_rms(path_png_expected, pixmap)
+        print(f'{rms=}')
+        if pymupdf.mupdf_version_tuple >= (1, 27):
+            assert rms < 0.1
+        else:
+            assert rms > 50
diff --git a/tests/test_textextract.py b/tests/test_textextract.py
@@ -469,6 +469,11 @@ def test_4139():
 
 def test_4245():
     path = os.path.normpath(f'{__file__}/../../tests/resources/test_4245.pdf')
+    with pymupdf.open(path) as document:
+        page = document[0]
+        regions = page.search_for('Bart Simpson')
+        print(f'{regions=}')
+        page.add_highlight_annot(regions)
     with pymupdf.open(path) as document:
         page = document[0]
         regions = page.search_for('Bart Simpson')
@@ -649,50 +654,70 @@ def test_extendable_textpage():
     # 2025-01-28:
     #
     # We can create a pdf with two pages whose text is adjacent when stitched
-    # together vertically.
+    # together vertically:
     #
-    # We can append page to stext_page ok.
+    # Page 1:
+    # 
+    #     aaaa
+    #    
+    #     bbbb
+    #     cccc
+    #     
+    #     dddd
+    #     
+    # Page 2:
+    #     
+    #     eeee
+    #     
+    #     ffff
+    #     gggg
+    #     
+    #     hhhh
     #
-    # Extracted spans are adjacent vertically as hoped.
     #
-    # But... We always get a separate block for each page, even though the y
-    # coordinates are adjacent and so we would expect stext_page to return a
-    # single block. This is all with `sort=True`.
+    # Create a single textpage covering both pages. Extracted text should then
+    # look like this; in particular the `dddd` and `eeee` lines must end up in
+    # the same block:
     #
-    # Maybe sort=true doesn't ever join adjacent blocks??
+    #     aaaa
+    #    
+    #     bbbb
+    #     cccc
+    #     
+    #     dddd
+    #     eeee
+    #     
+    #     ffff
+    #     gggg
+    #     
+    #     hhhh
     #
     print()
 
     path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
     with pymupdf.open(filetype='pdf') as document:
         document.new_page()
         document.new_page()
-        document.save(path)
-
-    # Create document with two pages and text where a paragraph spans the two
-    # pages.
-    #
-    with pymupdf.open(path) as document:
         page0 = document[0]
         page1 = document[1]
         y = 100
+        line_height = 9.6
         for i in range(4):
-            page0.insert_text((100, y+9.6), 'abcd'[i] * 16)
-            page1.insert_text((100, y+9.6), 'efgh'[i] * 16)
-            y += 9.6
+            page0.insert_text((100, y+line_height), 'abcd'[i] * 16)
+            page1.insert_text((100, y+line_height), 'efgh'[i] * 16)
+            y += line_height
             if i%2 == 0:
-                y += 9.6*1
-        rect = (100, 100, 200, y)
-        rect2 = pymupdf.mupdf.FzRect(*rect)
-        document[0].draw_rect((100, 100, 200, y), (1, 0, 0))
-        document[1].draw_rect((100, 100, 200, y), (1, 0, 0))
-        path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
-        document.save(path2)
+                y += line_height
+        rect = pymupdf.mupdf.FzRect(100, 100, 200, y)
+        document[0].draw_rect(rect, (1, 0, 0))
+        document[1].draw_rect(rect, (1, 0, 0))
+        document.save(path)
 
-    # Create a stext page for both pages of our document, using direct calls to
-    # MuPDF for now.
+    # Create a stext page for the text regions in both pages of our document,
+    # using direct calls to MuPDF.
+    #
 
-    with pymupdf.Document(path2) as document:
+    with pymupdf.Document(path) as document:
 
         # Notes:
         #
@@ -701,9 +726,9 @@ def test_extendable_textpage():
         # a new block, because pen position for new device is (0, 0) and this
         # will usually be treated as a paragraph gap to the first text.
         #
-        # At the moment we use infinite mediabox when using
-        # fz_new_stext_page()'s to create the stext device. I don't know what a
-        # non-infinite mediabox would be useful for.
+        # At the moment we use infinite mediabox when creating the
+        # fz_stext_page. I don't know what a non-infinite mediabox would be
+        # useful for.
         #
         # FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
         # to modify it to be in stext pagae coordinates (i.e. adding ctm.f
@@ -713,31 +738,40 @@ def test_extendable_textpage():
         # include each page's entire contents.
         #
 
-        ctm = pymupdf.mupdf.FzMatrix()
+        # We use our knowledge of the text rect in each page to manipulate ctm
+        # so that the stext contains text starting at (0, 0) and extending
+        # downwards.
+        #
+        y = 0
         cookie = pymupdf.mupdf.FzCookie()
 
         stext_page = pymupdf.mupdf.FzStextPage(
                 pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE),  # mediabox
                 )
         stext_options = pymupdf.mupdf.FzStextOptions()
         #stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
-        #stext_options.clip = rect2.internal()
+        #stext_options.clip = rect.internal()
         device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)
 
-        # Append second page to stext_page and prepare ctm for any later page.
+        # Add first page to stext_page at (0, y), and update <y> for the next
+        # page.
         page = document[0]
+        ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
         pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
-        ctm.f += rect2.y1 - rect2.y0
+        y += rect.y1 - rect.y0
 
-        # Append second page to stext_page and prepare for any later page.
+        # Add second page to stext_page at (0, y), and update <y> for the next
+        # page.
         page = document[1]
+        ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
         pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
-        ctm.f += rect2.y1 - rect2.y0
+        y += rect.y1 - rect.y0
 
         # We've finished adding text to stext_page.
         pymupdf.mupdf.fz_close_device(device)
 
-        # Read text from stext_page.
+        # Create a pymupdf.TextPage() for <stext_page> so we can use
+        # text_page.extractDICT() etc.
         text_page = pymupdf.TextPage(stext_page)
 
         # Read text from stext_page using text_page.extractDICT().
@@ -748,30 +782,32 @@ def test_extendable_textpage():
         pno = 0
         ydelta = 0
         for block in d['blocks']:
-            print(f'block')
+            print(f'block {block["bbox"]=}')
             for line in block['lines']:
-                print(f'    line')
+                print(f'    line {line["bbox"]=}')
                 for span in line['spans']:
-                    print(f'        span')
+                    print(f'        span {span["bbox"]=}')
                     bbox = span['bbox']
                     x0, y0, x1, y1 = bbox
                     dy = y0 - y0_prev if y0_prev else 0
                     y0_prev = y0
                     print(f'                {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
                     if 'eee' in span['text']:
                         pno = 1
-                        ydelta = rect2.y1 - rect2.y0
+                        ydelta = rect.y1 - rect.y0
                     y0 -= ydelta
                     y1 -= ydelta
+                    # Debugging - draw green rects on the original document,
+                    # translating final block info back into page coordinates.
                     document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))
 
-        print('\n\n\n\n')
+        print('\n\n')
 
         print(f'Using text_page.extractText()')
         text = text_page.extractText(True)
         print(f'{text}')
 
-        print('\n\n\n\n')
+        print('\n\n')
         print(f'Using extractBLOCKS')
         text = list()
         for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
@@ -780,7 +816,7 @@ def test_extendable_textpage():
             print(f'    {line=}')
             text.append(line)
 
-        print("\n\n\n")
+        print("\n\n")
         print(f'extractBLOCKS joined by newlines:')
         print('\n'.join(text))