Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 89 additions & 8 deletions pipcl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,10 @@ def next( self, eof=ArgsRaise):
assert command is None, 'Two commands specified: {command} and {arg}.'
command = arg

elif arg in ('windows-vs', 'windows-python', 'show-sysconfig'):
assert command is None, 'Two commands specified: {command} and {arg}.'
command = arg

elif arg == '--all': opt_all = True
elif arg == '--compile': pass
elif arg == '--dist-dir' or arg == '-d': opt_dist_dir = args.next()
Expand All @@ -1212,12 +1216,6 @@ def next( self, eof=ArgsRaise):
elif arg == '--single-version-externally-managed': pass
elif arg == '--verbose' or arg == '-v': g_verbose += 1

elif arg == 'windows-vs':
command = arg
break
elif arg == 'windows-python':
command = arg
break
else:
raise Exception(f'Unrecognised arg: {arg}')

Expand Down Expand Up @@ -1268,6 +1266,39 @@ def next( self, eof=ArgsRaise):
vs = wdev.WindowsVS(year=year, grade=grade, version=version)
print(f'Visual Studio is:\n{vs.description_ml(" ")}')

elif command == 'show-sysconfig':
show_sysconfig()
for mod in platform, sys:
log0(f'{mod.__name__}:')
for n in dir(mod):
if n.startswith('_'):
continue
log0(f'{mod.__name__}.{n}')
if mod is platform and n == 'uname':
continue
if mod is platform and n == 'pdb':
continue
if mod is sys and n in ('breakpointhook', 'exit'):
# We don't want to call these.
continue
v = getattr(mod, n)
if callable(v):
try:
v = v()
except Exception:
pass
else:
#print(f'{n=}', flush=1)
try:
print(f' {mod.__name__}.{n}()={v!r}')
except Exception:
print(f' Failed to print value of {mod.__name__}.{n}().')
else:
try:
print(f' {mod.__name__}.{n}={v!r}')
except Exception:
print(f' Failed to print value of {mod.__name__}.{n}.')

else:
assert 0, f'Unrecognised command: {command}'

Expand Down Expand Up @@ -1928,6 +1959,34 @@ def base_linker(vs=None, pythonflags=None, cpp=False, use_env=True):
return linker, pythonflags


def git_info( directory):
    '''
    Returns `(sha, comment, diff, branch)`, all items are str or None if not
    available.

    directory:
        Root of git checkout.

    `sha` and `comment` are taken from the first line of `git show
    --pretty=oneline`, which has the form `<sha> <comment>`. `diff` is the
    output of `git diff` that follows that line. `branch` is the stripped
    output of `git rev-parse --abbrev-ref HEAD`.

    If the first command's output does not have the expected shape (for
    example it is empty), `sha`, `comment` and `diff` are all left as None
    instead of raising.
    '''
    sha, comment, diff, branch = None, None, None, None
    e, out = run(
            f'cd {directory} && (PAGER= git show --pretty=oneline|head -n 1 && git diff)',
            capture=1,
            check=0
            )
    if not e:
        # First line is `<sha> <comment>`; everything after the first newline
        # is the diff. Use partition() so that unexpected output cannot raise
        # ValueError from tuple unpacking.
        first_line, newline, rest = out.partition('\n')
        sha_text, space, comment_text = first_line.partition(' ')
        if space and newline:
            sha, comment, diff = sha_text, comment_text, rest
    e, out = run(
            f'cd {directory} && git rev-parse --abbrev-ref HEAD',
            capture=1,
            check=0
            )
    if not e:
        branch = out.strip()
    log(f'git_info(): directory={directory!r} returning branch={branch!r} sha={sha!r} comment={comment!r}')
    return sha, comment, diff, branch


def git_items( directory, submodules=False):
'''
Returns list of paths for all files known to git within a `directory`.
Expand Down Expand Up @@ -2976,19 +3035,41 @@ def swig_get(swig, quick, swig_local='pipcl-swig-git'):
if swig and swig.startswith('git:'):
assert platform.system() != 'Windows'
swig_local = os.path.abspath(swig_local)
swig_binary = f'{swig_local}/install/bin/swig'
# Note that {swig_local}/install/bin/swig doesn't work on macOS because
# {swig_local}/INSTALL is a file and the fs is case-insensitive.
swig_binary = f'{swig_local}/install-dir/bin/swig'
if quick and os.path.isfile(swig_binary):
log1(f'{quick=} and {swig_binary=} already exists, so not downloading/building.')
else:
# Clone swig.
swig_env_extra = None
git_get(
swig,
swig_local,
default_remote='https://github.com/swig/swig.git',
branch='master',
)
if darwin():
run(f'brew install automake')
run(f'brew install pcre2')
# Default bison doesn't work, and Brew's bison is not added to $PATH.
#
# > bison is keg-only, which means it was not symlinked into /opt/homebrew,
# > because macOS already provides this software and installing another version in
# > parallel can cause all kinds of trouble.
# >
# > If you need to have bison first in your PATH, run:
# > echo 'export PATH="/opt/homebrew/opt/bison/bin:$PATH"' >> ~/.zshrc
#
run(f'brew install bison')
PATH = os.environ['PATH']
PATH = f'/opt/homebrew/opt/bison/bin:{PATH}'
swig_env_extra = dict(PATH=PATH)
# Build swig.
run(f'cd {swig_local} && ./autogen.sh && ./configure --prefix={swig_local}/install && make && make install')
run(f'cd {swig_local} && ./autogen.sh', env_extra=swig_env_extra)
run(f'cd {swig_local} && ./configure --prefix={swig_local}/install-dir', env_extra=swig_env_extra)
run(f'cd {swig_local} && make', env_extra=swig_env_extra)
run(f'cd {swig_local} && make install', env_extra=swig_env_extra)
assert os.path.isfile(swig_binary)
return swig_binary
else:
Expand Down
2 changes: 2 additions & 0 deletions scripts/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,8 @@
--swig 'git:--branch master https://github.com/swig/swig.git'
--swig 'git:--branch master'
--swig git:

2025-08-18: This fixes building with py_limited_api on python-3.13.

--swig-quick 0|1
If 1 and `--swig` starts with 'git:', we do not update/build swig if
Expand Down
Binary file added tests/resources/test_3806-expected.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/resources/test_3806.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions tests/test_pixmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,3 +582,21 @@ def test_4445():
wt = pymupdf.TOOLS.mupdf_warnings()
print(f'{wt=}')
assert wt == 'broken xref subsection, proceeding anyway.\nTrailer Size is off-by-one. Ignoring.'


def test_3806():
    '''
    Renders the first page of tests/resources/test_3806.pdf to a pixmap,
    saves it as tests/test_3806.png and compares it with the expected image
    tests/resources/test_3806-expected.png using gentle_compare.pixmaps_rms().

    With MuPDF >= 1.27 the rendering is expected to match closely (rms <
    0.1); with earlier versions we expect a large difference (rms > 50).
    '''
    print()
    print(f'{pymupdf.mupdf_version=}')

    # Input document, expected rendering, and where we write our rendering.
    path_pdf = os.path.normpath(f'{__file__}/../../tests/resources/test_3806.pdf')
    path_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_3806-expected.png')
    path_out = os.path.normpath(f'{__file__}/../../tests/test_3806.png')

    with pymupdf.open(path_pdf) as doc:
        first_page = doc[0]
        pixmap = first_page.get_pixmap()
        pixmap.save(path_out)
        rms = gentle_compare.pixmaps_rms(path_expected, pixmap)
    print(f'{rms=}')

    if pymupdf.mupdf_version_tuple < (1, 27):
        # Older MuPDF is expected to render this page differently.
        assert rms > 50
    else:
        assert rms < 0.1
122 changes: 79 additions & 43 deletions tests/test_textextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,11 @@ def test_4139():

def test_4245():
path = os.path.normpath(f'{__file__}/../../tests/resources/test_4245.pdf')
with pymupdf.open(path) as document:
page = document[0]
regions = page.search_for('Bart Simpson')
print(f'{regions=}')
page.add_highlight_annot(regions)
with pymupdf.open(path) as document:
page = document[0]
regions = page.search_for('Bart Simpson')
Expand Down Expand Up @@ -649,50 +654,70 @@ def test_extendable_textpage():
# 2025-01-28:
#
# We can create a pdf with two pages whose text is adjacent when stitched
# together vertically.
# together vertically:
#
# We can append page to stext_page ok.
# Page 1:
#
# aaaa
#
# bbbb
# cccc
#
# dddd
#
# Page 2:
#
# eeee
#
# ffff
# gggg
#
# hhhh
#
# Extracted spans are adjacent vertically as hoped.
#
# But... We always get a separate block for each page, even though the y
# coordinates are adjacent and so we would expect stext_page to return a
# single block. This is all with `sort=True`.
# Create a textpage for both of these pages. Then when extracting text,
# we need to get (specifically the `dddd` and `eeee` sequences need to be
# treated as the same block):
#
# Maybe sort=true doesn't ever join adjacent blocks??
# aaaa
#
# bbbb
# cccc
#
# dddd
# eeee
#
# ffff
# gggg
#
# hhhh
#
print()

path = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage.pdf')
with pymupdf.open(filetype='pdf') as document:
document.new_page()
document.new_page()
document.save(path)

# Create document with two pages and text where a paragraph spans the two
# pages.
#
with pymupdf.open(path) as document:
page0 = document[0]
page1 = document[1]
y = 100
line_height = 9.6
for i in range(4):
page0.insert_text((100, y+9.6), 'abcd'[i] * 16)
page1.insert_text((100, y+9.6), 'efgh'[i] * 16)
y += 9.6
page0.insert_text((100, y+line_height), 'abcd'[i] * 16)
page1.insert_text((100, y+line_height), 'efgh'[i] * 16)
y += line_height
if i%2 == 0:
y += 9.6*1
rect = (100, 100, 200, y)
rect2 = pymupdf.mupdf.FzRect(*rect)
document[0].draw_rect((100, 100, 200, y), (1, 0, 0))
document[1].draw_rect((100, 100, 200, y), (1, 0, 0))
path2 = os.path.normpath(f'{__file__}/../../tests/test_extendable_textpage2.pdf')
document.save(path2)
y += line_height
rect = pymupdf.mupdf.FzRect(100, 100, 200, y)
document[0].draw_rect(rect, (1, 0, 0))
document[1].draw_rect(rect, (1, 0, 0))
document.save(path)

# Create a stext page for both pages of our document, using direct calls to
# MuPDF for now.
# Create a stext page for the text regions in both pages of our document,
# using direct calls to MuPDF.
#

with pymupdf.Document(path2) as document:
with pymupdf.Document(path) as document:

# Notes:
#
Expand All @@ -701,9 +726,9 @@ def test_extendable_textpage():
# a new block, because pen position for new device is (0, 0) and this
# will usually be treated as a paragraph gap to the first text.
#
# At the moment we use infinite mediabox when using
# fz_new_stext_page()'s to create the stext device. I don't know what a
# non-infinite mediabox would be useful for.
# At the moment we use infinite mediabox when creating the
# fz_stext_page. I don't know what a non-infinite mediabox would be
# useful for.
#
# FZ_STEXT_CLIP_RECT isn't useful at the moment, because we would need
# to modify it to be in stext page coordinates (i.e. adding ctm.f
Expand All @@ -713,31 +738,40 @@ def test_extendable_textpage():
# include each page's entire contents.
#

ctm = pymupdf.mupdf.FzMatrix()
# We use our knowledge of the text rect in each page to manipulate ctm
# so that the stext contains text starting at (0, 0) and extending
# downwards.
#
y = 0
cookie = pymupdf.mupdf.FzCookie()

stext_page = pymupdf.mupdf.FzStextPage(
pymupdf.mupdf.FzRect(pymupdf.mupdf.FzRect.Fixed_INFINITE), # mediabox
)
stext_options = pymupdf.mupdf.FzStextOptions()
#stext_options.flags |= pymupdf.mupdf.FZ_STEXT_CLIP_RECT
#stext_options.clip = rect2.internal()
#stext_options.clip = rect.internal()
device = pymupdf.mupdf.fz_new_stext_device(stext_page, stext_options)

# Append second page to stext_page and prepare ctm for any later page.
# Add first page to stext_page at (0, y), and update <y> for the next
# page.
page = document[0]
ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
ctm.f += rect2.y1 - rect2.y0
y += rect.y1 - rect.y0

# Append second page to stext_page and prepare for any later page.
# Add second page to stext_page at (0, y), and update <y> for the next
# page.
page = document[1]
ctm = pymupdf.mupdf.FzMatrix(1, 0, 0, 1, -rect.x0, -rect.y0 + y)
pymupdf.mupdf.fz_run_page(page.this, device, ctm, cookie)
ctm.f += rect2.y1 - rect2.y0
y += rect.y1 - rect.y0

# We've finished adding text to stext_page.
pymupdf.mupdf.fz_close_device(device)

# Read text from stext_page.
# Create a pymupdf.TextPage() for <stext_page> so we can use
# text_page.extractDICT() etc.
text_page = pymupdf.TextPage(stext_page)

# Read text from stext_page using text_page.extractDICT().
Expand All @@ -748,30 +782,32 @@ def test_extendable_textpage():
pno = 0
ydelta = 0
for block in d['blocks']:
print(f'block')
print(f'block {block["bbox"]=}')
for line in block['lines']:
print(f' line')
print(f' line {line["bbox"]=}')
for span in line['spans']:
print(f' span')
print(f' span {span["bbox"]=}')
bbox = span['bbox']
x0, y0, x1, y1 = bbox
dy = y0 - y0_prev if y0_prev else 0
y0_prev = y0
print(f' {dy=: 5.2f} height={y1-y0:.02f} {x0:.02f} {y0:.02f} {x1:.02f} {y1:.02f} {span["text"]=}')
if 'eee' in span['text']:
pno = 1
ydelta = rect2.y1 - rect2.y0
ydelta = rect.y1 - rect.y0
y0 -= ydelta
y1 -= ydelta
# Debugging - add green lines on original document
# translating final blocks info into original coors.
document[pno].draw_rect((x0, y0, x1, y1), (0, 1, 0))

print('\n\n\n\n')
print('\n\n')

print(f'Using text_page.extractText()')
text = text_page.extractText(True)
print(f'{text}')

print('\n\n\n\n')
print('\n\n')
print(f'Using extractBLOCKS')
text = list()
for x0, y0, x1, y1, line, no, type_ in text_page.extractBLOCKS():
Expand All @@ -780,7 +816,7 @@ def test_extendable_textpage():
print(f' {line=}')
text.append(line)

print("\n\n\n")
print("\n\n")
print(f'extractBLOCKS joined by newlines:')
print('\n'.join(text))

Expand Down