pymupdf · JorjMcKie · May 16, 2025 · May 14, 2025
diff --git a/src/table.py b/src/table.py
@@ -79,6 +79,7 @@
 from collections.abc import Sequence
 from dataclasses import dataclass
 from operator import itemgetter
+import weakref
 
 # -------------------------------------------------------------------
 # Start of PyMuPDF interface code
@@ -87,6 +88,8 @@
     Rect,
     Matrix,
     TEXTFLAGS_TEXT,
+    TEXT_FONT_BOLD,
+    TEXT_FONT_SUPERSCRIPT,
     TOOLS,
     EMPTY_RECT,
     sRGB_to_pdf,
@@ -1061,7 +1064,7 @@ def get_center(word):
         if not overlap:
             condensed_bboxes.append(bbox)
 
-    if len(condensed_bboxes) == 0:
+    if not condensed_bboxes:
         return []
 
     condensed_rects = map(bbox_to_rect, condensed_bboxes)
@@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:
 
         return table_arr
 
-    def to_markdown(self, clean=True):
+    def to_markdown(self, clean=False, fill_empty=True):
         """Output table content as a string in Github-markdown format.
 
-        If clean is true, markdown syntax is removed from cell content."""
+        If "clean" is true, cell text is HTML-escaped and hyphens become
+        "&#45;" to neutralize markdown-sensitive characters.
+        If "fill_empty" is true, None cells are filled with the value to
+        their left, then with the value above, to approximate row and
+        column spans. Line breaks in cell text are output as "<br>" tags.
+        """
         output = "|"
+        rows = self.row_count
+        cols = self.col_count
+        cells = self.extract()[:]  # make local copy of table text content
+
+        if fill_empty:  # fill "None" cells where possible
+
+            # for rows, copy content from left to right
+            for j in range(rows):
+                for i in range(cols - 1):
+                    if cells[j][i + 1] is None:
+                        cells[j][i + 1] = cells[j][i]
 
-        # generate header string and MD underline
+            # for columns, copy top to bottom
+            for i in range(cols):
+                for j in range(rows - 1):
+                    if cells[j + 1][i] is None:
+                        cells[j + 1][i] = cells[j][i]
+
+        # generate header string and MD separator
         for i, name in enumerate(self.header.names):
-            if name is None or name == "":  # generate a name if empty
+            if not name:  # generate a name if empty
                 name = f"Col{i+1}"
-            name = name.replace("\n", " ")  # remove any line breaks
-            if clean:  # remove sensitive syntax
-                name = html.escape(name.replace("-", "&#45;"))
+            if clean:  # escape first, so "&#45;" is not escaped again
+                name = html.escape(name).replace("-", "&#45;")
+            name = name.replace("\n", "<br>")  # use HTML line breaks
             output += name + "|"
 
         output += "\n"
+        # insert GitHub header line separator
         output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
 
         # skip first row in details if header is part of the table
         j = 0 if self.header.external else 1
 
         # iterate over detail rows
-        for row in self.extract()[j:]:
+        for row in cells[j:]:
             line = "|"
             for i, cell in enumerate(row):
-                # output None cells with empty string
-                cell = "" if cell is None else cell.replace("\n", " ")
-                if clean:  # remove sensitive syntax
-                    cell = html.escape(cell.replace("-", "&#45;"))
+                cell = cell or ""  # None / empty cells become ""
+                if clean:  # escape first, so "&#45;" is not escaped again
+                    cell = html.escape(cell).replace("-", "&#45;")
+                # use HTML line break tag (added after escaping to keep it)
+                cell = cell.replace("\n", "<br>")
                 line += cell + "|"
@@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
         page = self.page
         y_delta = y_tolerance
 
-        def top_row_is_bold(bbox):
-            """Check if row 0 has bold text anywhere.
+        def top_row_bg_color(self):
+            """Tell whether the top row is visually set apart from above.
+
+            Renders the top row bbox and an equally sized bbox directly
+            above it and compares the most frequent pixel color of both.
+            Returns True if they differ (top row is taken as the header).
+            """
+            row_rect = Rect(self.rows[0].bbox)
+            shift_up = (0, -row_rect.height, 0, -row_rect.height)
+            above_rect = row_rect + shift_up  # same size, moved up one row
+            row_color = page.get_pixmap(clip=row_rect).color_topusage()[1]
+            above_color = page.get_pixmap(clip=above_rect).color_topusage()[1]
+            return row_color != above_color
 
-            If this is true, then any non-bold text in lines above disqualify
-            these lines as header.
+        def row_has_bold(bbox):
+            """Tell whether any text span inside "bbox" is bold.
 
-            bbox is the (potentially repaired) row 0 bbox.
+            Used by the caller for table rows: a bold top row is taken as
+            the (internal) column header row if one of these holds:
+            * the text line directly above the table is not bold
+            * the second table row has no bold span
 
-            Returns True or False
+            Returns True if at least one span has the bold flag set.
             """
-            for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
-                for l in b["lines"]:
-                    for s in l["spans"]:
-                        if s["flags"] & 16:
-                            return True
-            return False
+            textdict = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)
+            lines = [ln for b in textdict["blocks"] for ln in b["lines"]]
+            spans = [s for ln in lines for s in ln["spans"]]
+            return any(s["flags"] & TEXT_FONT_BOLD for s in spans)
 
         try:
             row = self.rows[0]
@@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
         # return this if we determine that the top row is the header
         header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
 
-        # one-line tables have no extra header
+        # 1-line tables have no extra header
         if len(self.rows) < 2:
             return header_top_row
 
-        # x-ccordinates of columns between x0 and x1 of the table
+        # 1-column tables have no extra header
         if len(cells) < 2:
             return header_top_row
 
-        col_x = [
-            c[2] if c is not None else None for c in cells[:-1]
-        ]  # column (x) coordinates
+        # assume top row is the header if second row is empty
+        row2 = self.rows[1]  # second row
+        if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
+            return header_top_row
 
         # Special check: is top row bold?
-        # If first line above table is not bold, but top-left table cell is bold,
-        # we take first table row as header
-        top_row_bold = top_row_is_bold(bbox)
+        top_row_bold = row_has_bold(bbox)
+
+        # assume top row is header if it contains bold text and the
+        # 2nd row contains no bold text at all
+        if top_row_bold and not row_has_bold(row2.bbox):
+            return header_top_row
+
+        if top_row_bg_color(self):
+            # if area above top row has a different background color,
+            # then top row is already the header
+            return header_top_row
 
-        # clip = area above table
+        # column coordinates (x1 values) in top row
+        col_x = [c[2] if c is not None else None for c in cells[:-1]]
+
+        # clip = page area above the table
         # We will inspect this area for text qualifying as column header.
         clip = +bbox  # take row 0 bbox
         clip.y0 = 0  # start at top of page
         clip.y1 = bbox.y0  # end at top of table
 
-        spans = []  # the text spans inside clip
-        for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
-            for l in b["lines"]:
-                for s in l["spans"]:
-                    if (
-                        not s["flags"] & 1 and s["text"].strip()
-                    ):  # ignore superscripts and empty text
-                        spans.append(s)
+        blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
+        # non-empty, non-superscript spans above table, sorted descending by y1
+        spans = sorted(
+            [
+                s
+                for b in blocks
+                for l in b["lines"]
+                for s in l["spans"]
+                if not (
+                    white_spaces.issuperset(s["text"])
+                    or s["flags"] & TEXT_FONT_SUPERSCRIPT
+                )
+            ],
+            key=lambda s: s["bbox"][3],
+            reverse=True,
+        )
 
         select = []  # y1 coordinates above, sorted descending
         line_heights = []  # line heights above, sorted descending
         line_bolds = []  # bold indicator per line above, same sorting
 
-        # spans sorted descending
-        spans.sort(key=lambda s: s["bbox"][3], reverse=True)
         # walk through the spans and fill above 3 lists
         for i in range(len(spans)):
             s = spans[i]
             y1 = s["bbox"][3]  # span bottom
             h = y1 - s["bbox"][1]  # span bbox height
-            bold = s["flags"] & 16
+            bold = s["flags"] & TEXT_FONT_BOLD
 
             # use first item to start the lists
             if i == 0:
@@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
                 line_bolds.append(bold)
                 continue
 
-            # get last items from the 3 lists
+            # get previous items from the 3 lists
             y0 = select[-1]
             h0 = line_heights[-1]
             bold0 = line_bolds[-1]
@@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
         if select == []:  # nothing above the table?
             return header_top_row
 
-        select = select[:5]  # only accept up to 5 lines in any header
+        select = select[:5]  # accept up to 5 lines for an external header
 
-        # take top row as header if text above table is too far apart
+        # assume top row as header if text above is too far away
         if bbox.y0 - select[0] >= line_heights[0]:
             return header_top_row
 
-        # if top table row is bold, but line above is not:
+        # accept top row as header if bold, but line above is not
         if top_row_bold and not line_bolds[0]:
             return header_top_row
 
@@ -1738,7 +1795,7 @@ class TableFinder:
     """
 
     def __init__(self, page, settings=None):
-        self.page = page
+        self.page = weakref.proxy(page)
         self.settings = TableSettings.resolve(settings)
         self.edges = self.get_edges()
         self.intersections = edges_to_intersections(
@@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
 # We are ignoring Bézier curves completely and are converting everything
 # else to lines.
 # ------------------------------------------------------------------------
-def make_edges(page, clip=None, tset=None, add_lines=None):
+def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
     snap_x = tset.snap_x_tolerance
     snap_y = tset.snap_y_tolerance
     min_length = tset.edge_min_length
@@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
             return True
         return False
 
-    def clean_graphics():
+    def clean_graphics(npaths=None):
         """Detect and join rectangles of "connected" vector graphics."""
-
-        paths = []  # paths relevant for table detection
-        for p in page.get_drawings():
-            # ignore fill-only graphics if they do not simulate lines,
-            # which means one of width or height are small.
+        if npaths is None:  # nothing passed in: extract from the page
+            allpaths = page.get_drawings()
+        else:  # accept passed-in vector graphics
+            allpaths = npaths[:]  # slice copy of the caller's sequence
+        paths = []  # paths relevant for table detection
+        for p in allpaths:
+            # If only looking at lines, we ignore fill-only paths,
+            # except simulated lines (i.e. small width or height).
             if (
-                p["type"] == "f"
-                and lines_strict
+                lines_strict
+                and p["type"] == "f"
                 and p["rect"].width > snap_x
                 and p["rect"].height > snap_y
             ):
@@ -2038,7 +2098,7 @@ def clean_graphics():
 
         return new_rects, paths
 
-    bboxes, paths = clean_graphics()
+    bboxes, paths = clean_graphics(npaths=paths)
 
     def is_parallel(p1, p2):
         """Check if line is roughly axis-parallel."""
@@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
         if line_dict:
             EDGES.append(line_to_edge(line_dict))
 
+    # user-specified rectangles: add each of their four sides as an edge
+    if add_boxes is None:
+        add_boxes = []
+    else:
+        assert isinstance(add_boxes, (tuple, list))
+    for box in add_boxes:
+        r = Rect(box)
+        # walk around the rectangle: left, bottom, right, top side
+        sides = (
+            (r.tl, r.bl),
+            (r.bl, r.br),
+            (r.br, r.tr),
+            (r.tr, r.tl),
+        )
+        for pt_a, pt_b in sides:
+            line_dict = make_line(path, pt_a, pt_b, clip)
+            if line_dict:
+                EDGES.append(line_to_edge(line_dict))
+
 
 def page_rotation_set0(page):
     """Nullify page rotation.
@@ -2290,7 +2369,9 @@ def find_tables(
     text_x_tolerance=3,
     text_y_tolerance=3,
     strategy=None,  # offer abbreviation
-    add_lines=None,  # optional user-specified lines
+    add_lines=None,  # user-specified lines
+    add_boxes=None,  # user-specified rectangles
+    paths=None,  # accept vector graphics as parameter
 ):
     global CHARS, EDGES
     CHARS = []
@@ -2344,7 +2425,12 @@ def find_tables(
 
     make_chars(page, clip=clip)  # create character list of page
     make_edges(
-        page, clip=clip, tset=tset, add_lines=add_lines
+        page,
+        clip=clip,
+        tset=tset,
+        paths=paths,
+        add_lines=add_lines,
+        add_boxes=add_boxes,
     )  # create lines and curves
     tables = TableFinder(page, settings=tset)