pymupdf · JorjMcKie · Jun 26, 2025 · Jun 11, 2025 · Jun 23, 2025 · Jun 23, 2025
diff --git a/src/table.py b/src/table.py
@@ -89,18 +89,129 @@
     Matrix,
     TEXTFLAGS_TEXT,
     TEXT_FONT_BOLD,
+    TEXT_FONT_ITALIC,
+    TEXT_FONT_MONOSPACED,
     TEXT_FONT_SUPERSCRIPT,
+    TEXT_COLLECT_STYLES,
     TOOLS,
     EMPTY_RECT,
     sRGB_to_pdf,
     Point,
     message,
+    mupdf,
 )
 
 EDGES = []  # vector graphics from PyMuPDF
 CHARS = []  # text characters from PyMuPDF
 TEXTPAGE = None
+TEXT_BOLD = mupdf.FZ_STEXT_BOLD
+TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
+FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES
+
 white_spaces = set(string.whitespace)  # for checking white space only cells
+
+
+def extract_cells(textpage, cell, markdown=False):
+    """Return the text inside a rect-like 'cell', plain or Markdown-styled.
+
+    This function should ultimately be used to extract text from a table cell.
+
+    Every character whose bbox overlaps the cell by more than 50% is
+    collected. Consecutive text lines are joined by a line break in plain
+    mode, or by an HTML "<br>" tag in Markdown mode.
+
+    In Markdown mode, spans are wrapped in "~~" (strikeout, horizontal
+    lines only), "**" (bold), "_" (italic) and "`" (monospaced). A span
+    whose closing markers match the end of the text collected so far is
+    merged into the preceding markers instead of opening new ones.
+    Markdown output will only work correctly if extraction flag bit
+    TEXT_COLLECT_STYLES is set.
+
+    Args:
+        textpage: A PyMuPDF TextPage object. Must have been created with
+            TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES.
+        cell: A tuple (x0, y0, x1, y1) defining the cell's bbox.
+        markdown: If True, return text formatted for Markdown.
+
+    Returns:
+        A string with the text extracted from the cell, stripped of
+        leading and trailing white space.
+    """
+    newline = "<br>" if markdown else "\n"
+
+    def outside(bbox):
+        """Check whether 'bbox' lies completely outside the cell."""
+        return (
+            bbox[0] > cell[2]
+            or bbox[2] < cell[0]
+            or bbox[1] > cell[3]
+            or bbox[3] < cell[1]
+        )
+
+    text = ""
+    for block in textpage.extractRAWDICT()["blocks"]:
+        # only text blocks (type 0) touching the cell are of interest
+        if block["type"] != 0 or outside(block["bbox"]):
+            continue
+
+        for line in block["lines"]:
+            if outside(line["bbox"]):
+                continue
+
+            if text:  # cell already has text: this must be a new line
+                text += newline
+
+            # strikeout detection only works with horizontal text
+            horizontal = line["dir"] in ((0, 1), (1, 0))
+
+            for span in line["spans"]:
+                if outside(span["bbox"]):
+                    continue
+
+                # only include chars with more than 50% bbox overlap
+                span_text = "".join(
+                    char["c"]
+                    for char in span["chars"]
+                    if abs((rect := Rect(char["bbox"])) & cell) > 0.5 * abs(rect)
+                )
+                if not span_text:
+                    continue  # no char of this span belongs to the cell
+
+                if not markdown:  # plain text: no styling needed
+                    text += span_text
+                    continue
+
+                # (marker, is-set) pairs, outermost marker first
+                char_flags = span["char_flags"]
+                font_flags = span["flags"]
+                markers = (
+                    ("~~", horizontal and char_flags & TEXT_STRIKEOUT),
+                    ("**", char_flags & TEXT_BOLD),
+                    ("_", font_flags & TEXT_FONT_ITALIC),
+                    ("`", font_flags & TEXT_FONT_MONOSPACED),
+                )
+                active = [marker for marker, is_set in markers if is_set]
+                prefix = "".join(active)
+                suffix = "".join(reversed(active))  # close in reverse order
+
+                # trailing white space is dropped for spans of 3+ chars
+                # (counted before the overlap filter above)
+                if len(span["chars"]) > 2:
+                    span_text = span_text.rstrip()
+
+                if suffix and text.endswith(suffix):
+                    # span continues previous styling: extend cell text
+                    text = text[: -len(suffix)] + span_text + suffix
+                elif span_text.strip():
+                    # append the span with new styling
+                    text += prefix + span_text + suffix
+                else:
+                    # white space only: add a blank without any markers
+                    text += " "
+
+    return text.strip()
+
+
 # -------------------------------------------------------------------
 # End of PyMuPDF interface code
 # -------------------------------------------------------------------
@@ -1382,7 +1493,18 @@ def to_markdown(self, clean=False, fill_empty=True):
         output = "|"
         rows = self.row_count
         cols = self.col_count
-        cells = self.extract()[:]  # make local copy of table text content
+
+        # cell coordinates
+        cell_boxes = [[c for c in r.cells] for r in self.rows]
+
+        # cell text strings
+        cells = [[None for i in range(cols)] for j in range(rows)]
+        for i, row in enumerate(cell_boxes):
+            for j, cell in enumerate(row):
+                if cell is not None:
+                    cells[i][j] = extract_cells(
+                        TEXTPAGE, cell_boxes[i][j], markdown=True
+                    )
 
         if fill_empty:  # fill "None" cells where possible
 
@@ -1420,7 +1542,8 @@ def to_markdown(self, clean=False, fill_empty=True):
             for i, cell in enumerate(row):
                 # replace None cells with empty string
                 # use HTML line break tag
-                cell = "" if not cell else cell.replace("\n", "<br>")
+                if cell is None:
+                    cell = ""
                 if clean:  # remove sensitive syntax
                     cell = html.escape(cell.replace("-", "&#45;"))
                 line += cell + "|"
@@ -1944,7 +2067,7 @@ def make_chars(page, clip=None):
     page_number = page.number + 1
     page_height = page.rect.height
     ctm = page.transformation_matrix
-    TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
+    TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS)
     blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
     doctop_base = page_height * page.number
     for block in blocks:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import platform
 import sys
@@ -45,6 +46,17 @@ def get_fds():
 
     JM_annot_id_stem = pymupdf.JM_annot_id_stem
 
+    def get_members(a):
+        """Return the public (non-underscore) attributes of `a` as a dict."""
+        return {
+            name: getattr(a, name)
+            for name in dir(a)
+            if not name.startswith('_')
+        }
+
+    # Allow post-test checking that pymupdf._globals has not changed.
+    _globals_pre = get_members(pymupdf._globals)
+
     # Run the test.
     rep = yield
 
@@ -59,6 +71,11 @@ def get_fds():
 
     assert not pymupdf.TOOLS.set_small_glyph_heights()
 
+    _globals_post = get_members(pymupdf._globals)
+    if _globals_post != _globals_pre:
+        print(f'Test has changed pymupdf._globals from {_globals_pre=} to {_globals_post=}')
+        assert 0
+
     log_items = pymupdf._log_items()
     assert not log_items, f'log() was called; {len(log_items)=}.'
 
@@ -84,3 +101,22 @@ def get_fds():
     if next_fd_after != next_fd_before:
         print(f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.')
         #assert 0, f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.'
+
+    if 0:
+        # This code can be useful to track down test failures caused by other
+        # tests modifying global state.
+        #
+        # We run a particular test manually after each test returns.
+        sys.path.insert(0, os.path.dirname(__file__))
+        try:
+            import test_tables
+        finally:
+            del sys.path[0]
+        print(f'### Calling test_tables.test_md_styles().')
+        try:
+            test_tables.test_md_styles()
+        except Exception as e:
+            print(f'### test_tables.test_md_styles() failed: {e}')
+            raise
+        else:
+            print(f'### test_tables.test_md_styles() passed.')
diff --git a/tests/resources/test-styled-table.pdf b/tests/resources/test-styled-table.pdf
diff --git a/tests/test_font.py b/tests/test_font.py
@@ -139,25 +139,28 @@ def test_mupdf_subset_fonts2():
 
 def test_3677():
     pymupdf.TOOLS.set_subset_fontnames(True)
-    path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
-    font_names_expected = [
-            'BCDEEE+Aptos',
-            'BCDFEE+Aptos',
-            'BCDGEE+Calibri-Light',
-            'BCDHEE+Calibri-Light',
-            ]
-    font_names = list()
-    with pymupdf.open(path) as document:
-        for page in document:
-             for block in page.get_text('dict')['blocks']:
-                    if block['type'] == 0:
-                        if 'lines' in block.keys():
-                            for line in block['lines']:
-                                for span in line['spans']:
-                                    font_name=span['font']
-                                    print(font_name)
-                                    font_names.append(font_name)
-    assert font_names == font_names_expected, f'{font_names=}'
+    try:
+        path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf')
+        font_names_expected = [
+                'BCDEEE+Aptos',
+                'BCDFEE+Aptos',
+                'BCDGEE+Calibri-Light',
+                'BCDHEE+Calibri-Light',
+                ]
+        font_names = []
+        with pymupdf.open(path) as document:
+            for page in document:
+                for block in page.get_text('dict')['blocks']:
+                    # Only text blocks (type 0) that carry lines are scanned.
+                    if block['type'] != 0 or 'lines' not in block:
+                        continue
+                    for line in block['lines']:
+                        for span in line['spans']:
+                            print(span['font'])
+                            font_names.append(span['font'])
+        assert font_names == font_names_expected, f'{font_names=}'
+    finally:
+        pymupdf.TOOLS.set_subset_fontnames(False)  # undo the True set above
 
 
 def test_3933():

diff --git a/tests/test_pylint.py b/tests/test_pylint.py
@@ -37,6 +37,7 @@ def test_pylint():
             W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation)
             R1734: Consider using [] instead of list() (use-list-literal)
             R1727: Boolean condition '0 and g_exceptions_verbose' will always evaluate to '0' (condition-evals-to-constant)
+            R1726: (simplifiable-condition)
             '''
             )
 

diff --git a/tests/test_tables.py b/tests/test_tables.py
@@ -423,3 +423,13 @@ def test_4017():
             ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
         ]
         assert tables[-1].extract() == expected_b
+
+
+def test_md_styles():
+    """Check that to_markdown() emits MD styling for the first table on page 0."""
+    filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf")
+    text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
+    with pymupdf.open(filename) as doc:  # close the document even on failure
+        page = doc[0]
+        tabs = page.find_tables()[0]
+        assert tabs.to_markdown() == text