diff --git a/src/table.py b/src/table.py
index 4d95ffe57..3de8b2c86 100644
--- a/src/table.py
+++ b/src/table.py
@@ -89,18 +89,129 @@
Matrix,
TEXTFLAGS_TEXT,
TEXT_FONT_BOLD,
+ TEXT_FONT_ITALIC,
+ TEXT_FONT_MONOSPACED,
TEXT_FONT_SUPERSCRIPT,
+ TEXT_COLLECT_STYLES,
TOOLS,
EMPTY_RECT,
sRGB_to_pdf,
Point,
message,
+ mupdf,
)
EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
TEXTPAGE = None
+TEXT_BOLD = mupdf.FZ_STEXT_BOLD  # bit tested against span["char_flags"] (bold)
+TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT  # bit tested against span["char_flags"] (strikeout)
+FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES  # extraction flags passed to page.get_textpage()
+
white_spaces = set(string.whitespace) # for checking white space only cells
+
+
+def extract_cells(textpage, cell, markdown=False):
+    """Extract text from a rect-like 'cell' as plain or MD style text.
+
+    Only characters whose bbox lies by more than 50% inside the cell are used.
+    Lines are joined by "<br>" (markdown) or a newline (plain). Markdown styling
+    will only work correctly if extraction flag bit TEXT_COLLECT_STYLES is set.
+
+    Args:
+        textpage: A PyMuPDF TextPage object. Must have been created with
+            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
+        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
+        markdown: If True, return text formatted for Markdown.
+
+    Returns:
+        The text found in the cell, stripped of leading / trailing white space.
+    """
+    text = ""
+    for block in textpage.extractRAWDICT()["blocks"]:
+        if block["type"] != 0:  # only blocks of type 0 are processed
+            continue
+        block_bbox = block["bbox"]
+        if (
+            0
+            or block_bbox[0] > cell[2]
+            or block_bbox[2] < cell[0]
+            or block_bbox[1] > cell[3]
+            or block_bbox[3] < cell[1]
+        ):
+            continue  # skip block outside cell
+        for line in block["lines"]:
+            lbbox = line["bbox"]
+            if (
+                0
+                or lbbox[0] > cell[2]
+                or lbbox[2] < cell[0]
+                or lbbox[1] > cell[3]
+                or lbbox[3] < cell[1]
+            ):
+                continue  # skip line outside cell
+
+            if text:  # must be a new line in the cell
+                text += "<br>" if markdown else "\n"
+            # strikeout detection only works with horizontal text
+            # NOTE(review): confirm that dir (0, 1) is meant to count as horizontal
+            horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)
+
+            for span in line["spans"]:
+                sbbox = span["bbox"]
+                if (
+                    0
+                    or sbbox[0] > cell[2]
+                    or sbbox[2] < cell[0]
+                    or sbbox[1] > cell[3]
+                    or sbbox[3] < cell[1]
+                ):
+                    continue  # skip spans outside cell
+
+                # only include chars with more than 50% bbox overlap
+                span_text = ""
+                for char in span["chars"]:
+                    bbox = Rect(char["bbox"])
+                    if abs(bbox & cell) > 0.5 * abs(bbox):
+                        span_text += char["c"]
+
+                if not span_text:
+                    continue  # skip empty span
+
+                if not markdown:  # no MD styling
+                    text += span_text
+                    continue
+
+                prefix = ""  # opening MD markers
+                suffix = ""  # closing MD markers, built in reverse order of prefix
+                if horizontal and span["char_flags"] & TEXT_STRIKEOUT:
+                    prefix += "~~"
+                    suffix = "~~" + suffix
+                if span["char_flags"] & TEXT_BOLD:
+                    prefix += "**"
+                    suffix = "**" + suffix
+                if span["flags"] & TEXT_FONT_ITALIC:
+                    prefix += "_"
+                    suffix = "_" + suffix
+                if span["flags"] & TEXT_FONT_MONOSPACED:
+                    prefix += "`"
+                    suffix = "`" + suffix
+
+                if len(span["chars"]) > 2:  # trailing spaces are kept for very short spans
+                    span_text = span_text.rstrip()
+
+                # if span continues previous styling: extend cell text
+                if (ls := len(suffix)) and text.endswith(suffix):
+                    text = text[:-ls] + span_text + suffix
+                else:  # append the span with new styling
+                    if not span_text.strip():
+                        text += " "
+                    else:
+                        text += prefix + span_text + suffix
+
+    return text.strip()
+
+
# -------------------------------------------------------------------
# End of PyMuPDF interface code
# -------------------------------------------------------------------
@@ -1382,7 +1493,18 @@ def to_markdown(self, clean=False, fill_empty=True):
output = "|"
rows = self.row_count
cols = self.col_count
- cells = self.extract()[:] # make local copy of table text content
+
+ # cell coordinates
+ cell_boxes = [[c for c in r.cells] for r in self.rows]
+
+ # cell text strings
+ cells = [[None for i in range(cols)] for j in range(rows)]
+ for i, row in enumerate(cell_boxes):
+ for j, cell in enumerate(row):
+ if cell is not None:
+ cells[i][j] = extract_cells(
+ TEXTPAGE, cell_boxes[i][j], markdown=True
+ )
if fill_empty: # fill "None" cells where possible
@@ -1420,7 +1542,8 @@ def to_markdown(self, clean=False, fill_empty=True):
for i, cell in enumerate(row):
# replace None cells with empty string
# use HTML line break tag
-                cell = "" if not cell else cell.replace("\n", "<br>")
+ if cell is None:
+ cell = ""
if clean: # remove sensitive syntax
                     cell = html.escape(cell.replace("-", "&#45;"))
line += cell + "|"
@@ -1944,7 +2067,7 @@ def make_chars(page, clip=None):
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
- TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
+ TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
doctop_base = page_height * page.number
for block in blocks:
diff --git a/tests/conftest.py b/tests/conftest.py
index 7e123435e..4017de580 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,4 @@
+import copy
import os
import platform
import sys
@@ -45,6 +46,17 @@ def get_fds():
JM_annot_id_stem = pymupdf.JM_annot_id_stem
+    def get_members(a):
+        """Return {name: value} for every attribute of `a` not starting with '_'."""
+        return {
+            name: getattr(a, name)
+            for name in dir(a)
+            if not name.startswith('_')
+        }
+
+ # Allow post-test checking that pymupdf._globals has not changed.
+ _globals_pre = get_members(pymupdf._globals)
+
# Run the test.
rep = yield
@@ -59,6 +71,11 @@ def get_fds():
assert not pymupdf.TOOLS.set_small_glyph_heights()
+ _globals_post = get_members(pymupdf._globals)
+ if _globals_post != _globals_pre:
+ print(f'Test has changed pymupdf._globals from {_globals_pre=} to {_globals_post=}')
+ assert 0
+
log_items = pymupdf._log_items()
assert not log_items, f'log() was called; {len(log_items)=}.'
@@ -84,3 +101,22 @@ def get_fds():
if next_fd_after != next_fd_before:
print(f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.')
#assert 0, f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.'
+
+ if 0:
+ # This code can be useful to track down test failures caused by other
+ # tests modifying global state.
+ #
+        # We run a particular test manually after each test returns.
+ sys.path.insert(0, os.path.dirname(__file__))
+ try:
+ import test_tables
+ finally:
+ del sys.path[0]
+ print(f'### Calling test_tables.test_md_styles().')
+ try:
+ test_tables.test_md_styles()
+ except Exception as e:
+ print(f'### test_tables.test_md_styles() failed: {e}')
+ raise
+ else:
+ print(f'### test_tables.test_md_styles() passed.')
diff --git a/tests/resources/test-styled-table.pdf b/tests/resources/test-styled-table.pdf
new file mode 100644
index 000000000..67f7e8a49
Binary files /dev/null and b/tests/resources/test-styled-table.pdf differ
diff --git a/tests/test_font.py b/tests/test_font.py
index 4d5958cd9..d9f38f5d8 100644
--- a/tests/test_font.py
+++ b/tests/test_font.py
@@ -139,25 +139,28 @@ def test_mupdf_subset_fonts2():
def test_3677():
pymupdf.TOOLS.set_subset_fontnames(True)
- path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
- font_names_expected = [
- 'BCDEEE+Aptos',
- 'BCDFEE+Aptos',
- 'BCDGEE+Calibri-Light',
- 'BCDHEE+Calibri-Light',
- ]
- font_names = list()
- with pymupdf.open(path) as document:
- for page in document:
- for block in page.get_text('dict')['blocks']:
- if block['type'] == 0:
- if 'lines' in block.keys():
- for line in block['lines']:
- for span in line['spans']:
- font_name=span['font']
- print(font_name)
- font_names.append(font_name)
- assert font_names == font_names_expected, f'{font_names=}'
+ try:
+ path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
+ font_names_expected = [
+ 'BCDEEE+Aptos',
+ 'BCDFEE+Aptos',
+ 'BCDGEE+Calibri-Light',
+ 'BCDHEE+Calibri-Light',
+ ]
+ font_names = list()
+ with pymupdf.open(path) as document:
+ for page in document:
+ for block in page.get_text('dict')['blocks']:
+ if block['type'] == 0:
+ if 'lines' in block.keys():
+ for line in block['lines']:
+ for span in line['spans']:
+ font_name=span['font']
+ print(font_name)
+ font_names.append(font_name)
+ assert font_names == font_names_expected, f'{font_names=}'
+ finally:
+ pymupdf.TOOLS.set_subset_fontnames(False)
def test_3933():
diff --git a/tests/test_pylint.py b/tests/test_pylint.py
index 0f0c40fa9..a3b48ae6f 100644
--- a/tests/test_pylint.py
+++ b/tests/test_pylint.py
@@ -37,6 +37,7 @@ def test_pylint():
W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation)
R1734: Consider using [] instead of list() (use-list-literal)
R1727: Boolean condition '0 and g_exceptions_verbose' will always evaluate to '0' (condition-evals-to-constant)
+ R1726: (simplifiable-condition)
'''
)
diff --git a/tests/test_tables.py b/tests/test_tables.py
index 4fb959f4c..745369c06 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -423,3 +423,13 @@ def test_4017():
["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
]
assert tables[-1].extract() == expected_b
+
+
+def test_md_styles():
+    """Test MD styling and "<br>" line breaks in to_markdown() table output."""
+    filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables()[0]  # first table found on the page
+    text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
+    assert tabs.to_markdown() == text