Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 126 additions & 3 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,129 @@
Matrix,
TEXTFLAGS_TEXT,
TEXT_FONT_BOLD,
TEXT_FONT_ITALIC,
TEXT_FONT_MONOSPACED,
TEXT_FONT_SUPERSCRIPT,
TEXT_COLLECT_STYLES,
TOOLS,
EMPTY_RECT,
sRGB_to_pdf,
Point,
message,
mupdf,
)

# Module-level state and constants shared by the PyMuPDF interface code below.
EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
# text page of the page being processed; assigned in make_chars() from
# page.get_textpage() and read by Table.to_markdown()
TEXTPAGE = None
# character style bits, tested against span["char_flags"] in extract_cells()
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
# text extraction flags passed to page.get_textpage() in make_chars()
FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES

white_spaces = set(string.whitespace) # for checking white space only cells


def _bbox_outside(bbox, cell):
    """Return True if rectangle 'bbox' does not intersect rectangle 'cell'.

    Both arguments are indexable as (x0, y0, x1, y1). Rectangles that merely
    touch at an edge are treated as intersecting.
    """
    return (
        bbox[0] > cell[2]
        or bbox[2] < cell[0]
        or bbox[1] > cell[3]
        or bbox[3] < cell[1]
    )


def extract_cells(textpage, cell, markdown=False):
    """Extract text from a rect-like 'cell' as plain or MD style text.

    This function should ultimately be used to extract text from a table cell.
    Markdown output will only work correctly if extraction flag bit
    TEXT_COLLECT_STYLES is set.

    Blocks, lines and spans whose bbox does not intersect the cell are
    skipped. Within the remaining spans, only characters with more than 50%
    of their bbox area inside the cell are used. Text lines are separated by
    "<br>" (markdown) or a line feed (plain text); the separator is only
    written when a line actually contributes text to the cell.

    Args:
        textpage: A PyMuPDF TextPage object. Must have been created with
            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
        markdown: If True, return text formatted for Markdown.

    Returns:
        A string with the text extracted from the cell, stripped of leading
        and trailing white space.
    """
    line_break = "<br>" if markdown else "\n"
    text = ""
    for block in textpage.extractRAWDICT()["blocks"]:
        if block["type"] != 0:  # not a text block
            continue
        if _bbox_outside(block["bbox"], cell):
            continue  # skip block outside cell

        for line in block["lines"]:
            if _bbox_outside(line["bbox"], cell):
                continue  # skip line outside cell

            # A line break is owed if the cell already has text. It is only
            # written once this line is known to contribute something, so a
            # line that merely touches the cell cannot leave a stray break.
            pending_break = bool(text)

            # strikeout detection only works with horizontal text
            horizontal = line["dir"] in ((0, 1), (1, 0))

            for span in line["spans"]:
                if _bbox_outside(span["bbox"], cell):
                    continue  # skip spans outside cell

                # only include chars with more than 50% bbox overlap
                kept = []
                for char in span["chars"]:
                    bbox = Rect(char["bbox"])
                    if abs(bbox & cell) > 0.5 * abs(bbox):
                        kept.append(char["c"])
                span_text = "".join(kept)

                if not span_text:
                    continue  # skip empty span

                if pending_break:  # must be a new line in the cell
                    text += line_break
                    pending_break = False

                if not markdown:  # no MD styling
                    text += span_text
                    continue

                # Build the opening markers and the mirrored closing markers.
                prefix = ""
                suffix = ""
                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
                    prefix += "~~"
                    suffix = "~~" + suffix
                if span["char_flags"] & TEXT_BOLD:
                    prefix += "**"
                    suffix = "**" + suffix
                if span["flags"] & TEXT_FONT_ITALIC:
                    prefix += "_"
                    suffix = "_" + suffix
                if span["flags"] & TEXT_FONT_MONOSPACED:
                    prefix += "`"
                    suffix = "`" + suffix

                # NOTE(review): trailing white space is dropped for spans of
                # more than two characters, presumably so that closing
                # markers directly follow the text - confirm.
                if len(span["chars"]) > 2:
                    span_text = span_text.rstrip()

                # if span continues previous styling: extend cell text
                if (ls := len(suffix)) and text.endswith(suffix):
                    text = text[:-ls] + span_text + suffix
                else:  # append the span with new styling
                    if not span_text.strip():
                        text += " "
                    else:
                        text += prefix + span_text + suffix

    return text.strip()


# -------------------------------------------------------------------
# End of PyMuPDF interface code
# -------------------------------------------------------------------
Expand Down Expand Up @@ -1382,7 +1493,18 @@ def to_markdown(self, clean=False, fill_empty=True):
output = "|"
rows = self.row_count
cols = self.col_count
cells = self.extract()[:] # make local copy of table text content

# cell coordinates
cell_boxes = [[c for c in r.cells] for r in self.rows]

# cell text strings
cells = [[None for i in range(cols)] for j in range(rows)]
for i, row in enumerate(cell_boxes):
for j, cell in enumerate(row):
if cell is not None:
cells[i][j] = extract_cells(
TEXTPAGE, cell_boxes[i][j], markdown=True
)

if fill_empty: # fill "None" cells where possible

Expand Down Expand Up @@ -1420,7 +1542,8 @@ def to_markdown(self, clean=False, fill_empty=True):
for i, cell in enumerate(row):
# replace None cells with empty string
# use HTML line break tag
cell = "" if not cell else cell.replace("\n", "<br>")
if cell is None:
cell = ""
if clean: # remove sensitive syntax
cell = html.escape(cell.replace("-", "&#45;"))
line += cell + "|"
Expand Down Expand Up @@ -1944,7 +2067,7 @@ def make_chars(page, clip=None):
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
doctop_base = page_height * page.number
for block in blocks:
Expand Down
36 changes: 36 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import os
import platform
import sys
Expand Down Expand Up @@ -45,6 +46,17 @@ def get_fds():

JM_annot_id_stem = pymupdf.JM_annot_id_stem

def get_members(a):
    """Return a dict mapping each public attribute name of 'a' to its value.

    Names reported by dir(a) that start with an underscore are left out.
    """
    return {n: getattr(a, n) for n in dir(a) if not n.startswith('_')}

# Allow post-test checking that pymupdf._globals has not changed.
_globals_pre = get_members(pymupdf._globals)

# Run the test.
rep = yield

Expand All @@ -59,6 +71,11 @@ def get_fds():

assert not pymupdf.TOOLS.set_small_glyph_heights()

_globals_post = get_members(pymupdf._globals)
if _globals_post != _globals_pre:
print(f'Test has changed pymupdf._globals from {_globals_pre=} to {_globals_post=}')
assert 0

log_items = pymupdf._log_items()
assert not log_items, f'log() was called; {len(log_items)=}.'

Expand All @@ -84,3 +101,22 @@ def get_fds():
if next_fd_after != next_fd_before:
print(f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.')
#assert 0, f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.'

if 0:
# This code can be useful to track down test failures caused by other
# tests modifying global state.
#
# We run a particular test manually after each test returns.
sys.path.insert(0, os.path.dirname(__file__))
try:
import test_tables
finally:
del sys.path[0]
print(f'### Calling test_tables.test_md_styles().')
try:
test_tables.test_md_styles()
except Exception as e:
print(f'### test_tables.test_md_styles() failed: {e}')
raise
else:
print(f'### test_tables.test_md_styles() passed.')
Binary file added tests/resources/test-styled-table.pdf
Binary file not shown.
41 changes: 22 additions & 19 deletions tests/test_font.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,25 +139,28 @@ def test_mupdf_subset_fonts2():

def test_3677():
    """Check the font names reported for test_3677.pdf with subset names on.

    Subset font names are switched on via TOOLS.set_subset_fontnames(True)
    for the duration of the test and switched off again in a 'finally'
    clause, so the global setting does not leak into other tests even if
    an assertion fails.
    """
    pymupdf.TOOLS.set_subset_fontnames(True)
    try:
        path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
        font_names_expected = [
            'BCDEEE+Aptos',
            'BCDFEE+Aptos',
            'BCDGEE+Calibri-Light',
            'BCDHEE+Calibri-Light',
        ]
        font_names = []
        with pymupdf.open(path) as document:
            for page in document:
                for block in page.get_text('dict')['blocks']:
                    # only text blocks that carry lines have spans
                    if block['type'] != 0 or 'lines' not in block:
                        continue
                    for line in block['lines']:
                        for span in line['spans']:
                            font_name = span['font']
                            print(font_name)
                            font_names.append(font_name)
        assert font_names == font_names_expected, f'{font_names=}'
    finally:
        # always restore the global setting
        pymupdf.TOOLS.set_subset_fontnames(False)


def test_3933():
Expand Down
1 change: 1 addition & 0 deletions tests/test_pylint.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def test_pylint():
W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation)
R1734: Consider using [] instead of list() (use-list-literal)
R1727: Boolean condition '0 and g_exceptions_verbose' will always evaluate to '0' (condition-evals-to-constant)
R1726: (simplifiable-condition)
'''
)

Expand Down
10 changes: 10 additions & 0 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,13 @@ def test_4017():
["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
]
assert tables[-1].extract() == expected_b


def test_md_styles():
    """Test output of table with MD-styled cells.

    The first table found on the first page of test-styled-table.pdf is
    converted to Markdown and compared with the expected string. The
    document is opened in a 'with' statement so it is closed even when the
    assertion fails.
    """
    filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
    text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
    with pymupdf.open(filename) as doc:
        page = doc[0]
        tabs = page.find_tables()[0]
        assert tabs.to_markdown() == text
Loading