From 447c4badefdf6e985e1ca4c1d6456f81e902b897 Mon Sep 17 00:00:00 2001
From: "Jorj X. McKie" <jorj.x.mckie@outlook.de>
Date: Wed, 14 May 2025 11:51:39 -0400
Subject: [PATCH] Table Detection Improvements

* Support new detection parameter "add_boxes" which allows specifying "virtual" rectangles to help detection.

* Support new parameter "paths" to allow specifying previously extracted vector graphics.

* Several minor improvements. In particular, line breaks ("\n") inside table cells are now exported as HTML "<br>" tags instead of being replaced by spaces.
---
 src/table.py         | 202 ++++++++++++++++++++++++++++++-------------
 tests/test_tables.py | 128 ++++++++++++++++++++-------
 2 files changed, 241 insertions(+), 89 deletions(-)
diff --git a/src/table.py b/src/table.py
index 16b64fb32..4d95ffe57 100644
--- a/src/table.py
+++ b/src/table.py
@@ -79,6 +79,7 @@
 from collections.abc import Sequence
 from dataclasses import dataclass
 from operator import itemgetter
+import weakref
 
 # -------------------------------------------------------------------
 # Start of PyMuPDF interface code
@@ -87,6 +88,8 @@
     Rect,
     Matrix,
     TEXTFLAGS_TEXT,
+    TEXT_FONT_BOLD,
+    TEXT_FONT_SUPERSCRIPT,
     TOOLS,
     EMPTY_RECT,
     sRGB_to_pdf,
@@ -1061,7 +1064,7 @@ def get_center(word):
         if not overlap:
             condensed_bboxes.append(bbox)
 
-    if len(condensed_bboxes) == 0:
+    if not condensed_bboxes:
         return []
 
     condensed_rects = map(bbox_to_rect, condensed_bboxes)
@@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:
 
         return table_arr
 
-    def to_markdown(self, clean=True):
+    def to_markdown(self, clean=False, fill_empty=True):
         """Output table content as a string in Github-markdown format.
 
-        If clean is true, markdown syntax is removed from cell content."""
+        If "clean" then markdown syntax is removed from cell content.
+        If "fill_empty" then cells containing None are replaced by the
+        value to the left (rows) or above (columns) in an effort to
+        approximate row and column spans.
+
+        """
         output = "|"
+        rows = self.row_count
+        cols = self.col_count
+        cells = self.extract()[:]  # make local copy of table text content
+
+        if fill_empty:  # fill "None" cells where possible
+
+            # for rows, copy content from left to right
+            for j in range(rows):
+                for i in range(cols - 1):
+                    if cells[j][i + 1] is None:
+                        cells[j][i + 1] = cells[j][i]
 
-        # generate header string and MD underline
+            # for columns, copy top to bottom
+            for i in range(cols):
+                for j in range(rows - 1):
+                    if cells[j + 1][i] is None:
+                        cells[j + 1][i] = cells[j][i]
+
+        # generate header string and MD separator
         for i, name in enumerate(self.header.names):
-            if name is None or name == "":  # generate a name if empty
+            if not name:  # generate a name if empty
                 name = f"Col{i+1}"
-            name = name.replace("\n", " ")  # remove any line breaks
+            name = name.replace("\n", "<br>")  # use HTML line breaks
             if clean:  # remove sensitive syntax
                 name = html.escape(name.replace("-", "&#45;"))
             output += name + "|"
 
         output += "\n"
+        # insert GitHub header line separator
         output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
 
         # skip first row in details if header is part of the table
         j = 0 if self.header.external else 1
 
         # iterate over detail rows
-        for row in self.extract()[j:]:
+        for row in cells[j:]:
             line = "|"
             for i, cell in enumerate(row):
-                # output None cells with empty string
-                cell = "" if cell is None else cell.replace("\n", " ")
+                # replace None cells with empty string
+                # use HTML line break tag
+                cell = "" if not cell else cell.replace("\n", "<br>")
                 if clean:  # remove sensitive syntax
                     cell = html.escape(cell.replace("-", "&#45;"))
                 line += cell + "|"
@@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
         page = self.page
         y_delta = y_tolerance
 
-        def top_row_is_bold(bbox):
-            """Check if row 0 has bold text anywhere.
+        def top_row_bg_color(self):
+            """
+            Compare top row background color with color of same-sized bbox
+            above. If different, return True indicating that the original
+            table top row is already the header.
+            """
+            bbox0 = Rect(self.rows[0].bbox)
+            bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height)  # area above
+            top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
+            top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
+            if top_color0 != top_colort:
+                return True  # top row is header
+            return False
 
-            If this is true, then any non-bold text in lines above disqualify
-            these lines as header.
+        def row_has_bold(bbox):
+            """Check if a row contains some bold text.
 
-            bbox is the (potentially repaired) row 0 bbox.
+            If e.g. true for the top row, then it will be used as (internal)
+            column header row if any of the following is true:
+            * the previous (above) text line has no bold span
+            * the second table row text has no bold span
 
-            Returns True or False
+            Returns True if any spans are bold else False.
             """
-            for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
-                for l in b["lines"]:
-                    for s in l["spans"]:
-                        if s["flags"] & 16:
-                            return True
-            return False
+            blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
+            spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
+
+            return any(s["flags"] & TEXT_FONT_BOLD for s in spans)
 
         try:
             row = self.rows[0]
@@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
         # return this if we determine that the top row is the header
         header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
 
-        # one-line tables have no extra header
+        # 1-line tables have no extra header
         if len(self.rows) < 2:
             return header_top_row
 
-        # x-ccordinates of columns between x0 and x1 of the table
+        # 1-column tables have no extra header
         if len(cells) < 2:
             return header_top_row
 
-        col_x = [
-            c[2] if c is not None else None for c in cells[:-1]
-        ]  # column (x) coordinates
+        # assume top row is the header if second row is empty
+        row2 = self.rows[1]  # second row
+        if all(c is None for c in row2.cells):  # no valid cell bboxes in row2
+            return header_top_row
 
         # Special check: is top row bold?
-        # If first line above table is not bold, but top-left table cell is bold,
-        # we take first table row as header
-        top_row_bold = top_row_is_bold(bbox)
+        top_row_bold = row_has_bold(bbox)
+
+        # assume top row is header if it contains bold text and the
+        # 2nd row contains no bold text at all
+        if top_row_bold and not row_has_bold(row2.bbox):
+            return header_top_row
+
+        if top_row_bg_color(self):
+            # if area above top row has a different background color,
+            # then top row is already the header
+            return header_top_row
 
-        # clip = area above table
+        # column coordinates (x1 values) in top row
+        col_x = [c[2] if c is not None else None for c in cells[:-1]]
+
+        # clip = page area above the table
         # We will inspect this area for text qualifying as column header.
         clip = +bbox  # take row 0 bbox
         clip.y0 = 0  # start at top of page
         clip.y1 = bbox.y0  # end at top of table
 
-        spans = []  # the text spans inside clip
-        for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
-            for l in b["lines"]:
-                for s in l["spans"]:
-                    if (
-                        not s["flags"] & 1 and s["text"].strip()
-                    ):  # ignore superscripts and empty text
-                        spans.append(s)
+        blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
+        # non-empty, non-superscript spans above table, sorted descending by y1
+        spans = sorted(
+            [
+                s
+                for b in blocks
+                for l in b["lines"]
+                for s in l["spans"]
+                if not (
+                    white_spaces.issuperset(s["text"])
+                    or s["flags"] & TEXT_FONT_SUPERSCRIPT
+                )
+            ],
+            key=lambda s: s["bbox"][3],
+            reverse=True,
+        )
 
         select = []  # y1 coordinates above, sorted descending
         line_heights = []  # line heights above, sorted descending
         line_bolds = []  # bold indicator per line above, same sorting
 
-        # spans sorted descending
-        spans.sort(key=lambda s: s["bbox"][3], reverse=True)
         # walk through the spans and fill above 3 lists
         for i in range(len(spans)):
             s = spans[i]
             y1 = s["bbox"][3]  # span bottom
             h = y1 - s["bbox"][1]  # span bbox height
-            bold = s["flags"] & 16
+            bold = s["flags"] & TEXT_FONT_BOLD
 
             # use first item to start the lists
             if i == 0:
@@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
                 line_bolds.append(bold)
                 continue
 
-            # get last items from the 3 lists
+            # get previous items from the 3 lists
             y0 = select[-1]
             h0 = line_heights[-1]
             bold0 = line_bolds[-1]
@@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
         if select == []:  # nothing above the table?
             return header_top_row
 
-        select = select[:5]  # only accept up to 5 lines in any header
+        select = select[:5]  # accept up to 5 lines for an external header
 
-        # take top row as header if text above table is too far apart
+        # assume top row as header if text above is too far away
         if bbox.y0 - select[0] >= line_heights[0]:
             return header_top_row
 
-        # if top table row is bold, but line above is not:
+        # accept top row as header if bold, but line above is not
         if top_row_bold and not line_bolds[0]:
             return header_top_row
 
@@ -1738,7 +1795,7 @@ class TableFinder:
     """
 
     def __init__(self, page, settings=None):
-        self.page = page
+        self.page = weakref.proxy(page)
         self.settings = TableSettings.resolve(settings)
         self.edges = self.get_edges()
         self.intersections = edges_to_intersections(
@@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
 # We are ignoring Bézier curves completely and are converting everything
 # else to lines.
 # ------------------------------------------------------------------------
-def make_edges(page, clip=None, tset=None, add_lines=None):
+def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
     snap_x = tset.snap_x_tolerance
     snap_y = tset.snap_y_tolerance
     min_length = tset.edge_min_length
@@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
             return True
         return False
 
-    def clean_graphics():
+    def clean_graphics(npaths=None):
         """Detect and join rectangles of "connected" vector graphics."""
-
-        paths = []  # paths relevant for table detection
-        for p in page.get_drawings():
-            # ignore fill-only graphics if they do not simulate lines,
-            # which means one of width or height are small.
+        if npaths is None:
+            allpaths = page.get_drawings()
+        else:  # accept passed-in vector graphics
+            allpaths = npaths[:]  # work on a shallow copy of the passed-in list
+        paths = []  # paths relevant for table detection
+        for p in allpaths:
+            # If only looking at lines, we ignore fill-only paths,
+            # except simulated lines (i.e. small width or height).
             if (
-                p["type"] == "f"
-                and lines_strict
+                lines_strict
+                and p["type"] == "f"
                 and p["rect"].width > snap_x
                 and p["rect"].height > snap_y
             ):
@@ -2038,7 +2098,7 @@ def clean_graphics():
 
         return new_rects, paths
 
-    bboxes, paths = clean_graphics()
+    bboxes, paths = clean_graphics(npaths=paths)
 
     def is_parallel(p1, p2):
         """Check if line is roughly axis-parallel."""
@@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
         if line_dict:
             EDGES.append(line_to_edge(line_dict))
 
+    if add_boxes is not None:  # add user-specified rectangles
+        assert isinstance(add_boxes, (tuple, list))
+    else:
+        add_boxes = []
+    for box in add_boxes:
+        r = Rect(box)
+        line_dict = make_line(path, r.tl, r.bl, clip)
+        if line_dict:
+            EDGES.append(line_to_edge(line_dict))
+        line_dict = make_line(path, r.bl, r.br, clip)
+        if line_dict:
+            EDGES.append(line_to_edge(line_dict))
+        line_dict = make_line(path, r.br, r.tr, clip)
+        if line_dict:
+            EDGES.append(line_to_edge(line_dict))
+        line_dict = make_line(path, r.tr, r.tl, clip)
+        if line_dict:
+            EDGES.append(line_to_edge(line_dict))
+
 
 def page_rotation_set0(page):
     """Nullify page rotation.
@@ -2290,7 +2369,9 @@ def find_tables(
     text_x_tolerance=3,
     text_y_tolerance=3,
     strategy=None,  # offer abbreviation
-    add_lines=None,  # optional user-specified lines
+    add_lines=None,  # user-specified lines
+    add_boxes=None,  # user-specified rectangles
+    paths=None,  # accept vector graphics as parameter
 ):
     global CHARS, EDGES
     CHARS = []
@@ -2344,7 +2425,12 @@ def find_tables(
 
     make_chars(page, clip=clip)  # create character list of page
     make_edges(
-        page, clip=clip, tset=tset, add_lines=add_lines
+        page,
+        clip=clip,
+        tset=tset,
+        paths=paths,
+        add_lines=add_lines,
+        add_boxes=add_boxes,
     )  # create lines and curves
     tables = TableFinder(page, settings=tset)
 
diff --git a/tests/test_tables.py b/tests/test_tables.py
index ab1533825..4fb959f4c 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -182,7 +182,10 @@ def test_2979():
 
     wt = pymupdf.TOOLS.mupdf_warnings()
     if pymupdf.mupdf_version_tuple >= (1, 26, 0):
-        assert wt == 'bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...'
+        assert (
+            wt
+            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
+        )
     else:
         assert not wt
 
@@ -294,14 +297,55 @@ def test_markdown():
     text = (
         "|Header1|Header2|Header3|\n"
         "|---|---|---|\n"
-        "|Col11 Col12|Col21 Col22|Col31 Col32 Col33|\n"
-        "|Col13|Col23|Col34 Col35|\n"
+        "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
+        "|Col13|Col23|Col34<br>Col35|\n"
         "|Col14|Col24|Col36|\n"
-        "|Col15|Col25 Col26||\n\n"
+        "|Col15|Col25<br>Col26||\n\n"
     )
     assert tab.to_markdown() == text
 
 
+def test_paths_param():
+    """Confirm acceptance of supplied vector graphics list."""
+    filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    tabs = page.find_tables(paths=[])  # causes all tables to be missed
+    assert tabs.tables == []
+
+
+def test_boxes_param():
+    """Confirm acceptance of supplied boxes list."""
+    filename = os.path.join(scriptdir, "resources", "small-table.pdf")
+    doc = pymupdf.open(filename)
+    page = doc[0]
+    paths = page.get_drawings()
+    box0 = page.cluster_drawings(drawings=paths)[0]
+    boxes = [box0]
+    words = page.get_text("words")
+    x_vals = [w[0] - 5 for w in words if w[4] in ("min", "max", "avg")]
+    for x in x_vals:
+        r = +box0
+        r.x1 = x
+        boxes.append(r)
+
+    y_vals = sorted(set([round(w[3]) for w in words]))
+    for y in y_vals[:-1]:  # skip last one to avoid empty row
+        r = +box0
+        r.y1 = y
+        boxes.append(r)
+
+    tabs = page.find_tables(paths=[], add_boxes=boxes)
+    tab = tabs.tables[0]
+    assert tab.extract() == [
+        ["Boiling Points °C", "min", "max", "avg"],
+        ["Noble gases", "-269", "-62", "-170.5"],
+        ["Nonmetals", "-253", "4827", "414.1"],
+        ["Metalloids", "335", "3900", "741.5"],
+        ["Metals", "357", ">5000", "2755.9"],
+    ]
+
+
 def test_dotted_grid():
     """Confirm dotted lines are detected as gridlines."""
     filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
@@ -317,43 +361,65 @@ def test_dotted_grid():
 
 
 def test_4017():
-    path = os.path.normpath(f'{__file__}/../../tests/resources/test_4017.pdf')
+    path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf")
     with pymupdf.open(path) as document:
         page = document[0]
-        
+
         tables = page.find_tables(add_lines=None)
         print(f"{len(tables.tables)=}.")
         tables_text = list()
         for i, table in enumerate(tables):
-            print(f'## {i=}.')
+            print(f"## {i=}.")
             t = table.extract()
             for tt in t:
-                print(f'    {tt}')
-        
+                print(f"    {tt}")
+
         # 2024-11-29: expect current incorrect output for last two tables.
-        
+
         expected_a = [
-                ['Class A/B Overcollateralization', '131.44%', '>=', '122.60%', '', 'PASS'],
-                [None, None, None, None, None, 'PASS'],
-                ['Class D Overcollateralization', '112.24%', '>=', '106.40%', '', 'PASS'],
-                [None, None, None, None, None, 'PASS'],
-                ['Event of Default', '156.08%', '>=', '102.50%', '', 'PASS'],
-                [None, None, None, None, None, 'PASS'],
-                ['Class A/B Interest Coverage', 'N/A', '>=', '120.00%', '', 'N/A'],
-                [None, None, None, None, None, 'N/A'],
-                ['Class D Interest Coverage', 'N/A', '>=', '105.00%', '', 'N/A'],
-                ]
+            ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
+            [None, None, None, None, None, "PASS"],
+            ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
+            [None, None, None, None, None, "N/A"],
+            ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
+        ]
         assert tables[-2].extract() == expected_a
-                
+
         expected_b = [
-                ["Moody's Maximum Rating Factor Test", '2,577', '<=', '3,250', '', 'PASS', '2,581'],
-                [None, None, None, None, None, 'PASS', None],
-                ['Minimum Floating Spread', '3.5006%', '>=', '2.0000%', '', 'PASS', '3.4871%'],
-                [None, None, None, None, None, 'PASS', None],
-                ['Minimum Weighted Average S&P Recovery\nRate Test', '40.50%', '>=', '40.00%', '', 'PASS', '40.40%'],
-                [None, None, None, None, None, 'PASS', None],
-                ['Weighted Average Life', '4.83', '<=', '9.00', '', 'PASS', '4.92'],
-                ]
+            [
+                "Moody's Maximum Rating Factor Test",
+                "2,577",
+                "<=",
+                "3,250",
+                "",
+                "PASS",
+                "2,581",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            [
+                "Minimum Floating Spread",
+                "3.5006%",
+                ">=",
+                "2.0000%",
+                "",
+                "PASS",
+                "3.4871%",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            [
+                "Minimum Weighted Average S&P Recovery\nRate Test",
+                "40.50%",
+                ">=",
+                "40.00%",
+                "",
+                "PASS",
+                "40.40%",
+            ],
+            [None, None, None, None, None, "PASS", None],
+            ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
+        ]
         assert tables[-1].extract() == expected_b
-                
-