From 447c4badefdf6e985e1ca4c1d6456f81e902b897 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Wed, 14 May 2025 11:51:39 -0400 Subject: [PATCH] Table Detection Improvements * Support new detection parameter "add_boxes" which allows specifying "virtual" rectangles to help detection. * Support new parameter "paths" to allow specifying previously extracted vector graphics. * Several minor improvements, especially we now export line breaks inside table cells as HTML "<br>" tags instead of replacing "\n" by spaces. --- src/table.py | 202 ++++++++++++++++++++++++++++++------------- tests/test_tables.py | 128 ++++++++++++++++++++------- 2 files changed, 241 insertions(+), 89 deletions(-) diff --git a/src/table.py b/src/table.py index 16b64fb32..4d95ffe57 100644 --- a/src/table.py +++ b/src/table.py @@ -79,6 +79,7 @@ from collections.abc import Sequence from dataclasses import dataclass from operator import itemgetter +import weakref # ------------------------------------------------------------------- # Start of PyMuPDF interface code @@ -87,6 +88,8 @@ Rect, Matrix, TEXTFLAGS_TEXT, + TEXT_FONT_BOLD, + TEXT_FONT_SUPERSCRIPT, TOOLS, EMPTY_RECT, sRGB_to_pdf, @@ -1061,7 +1064,7 @@ def get_center(word): if not overlap: condensed_bboxes.append(bbox) - if len(condensed_bboxes) == 0: + if not condensed_bboxes: return [] condensed_rects = map(bbox_to_rect, condensed_bboxes) @@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool: return table_arr - def to_markdown(self, clean=True): + def to_markdown(self, clean=False, fill_empty=True): """Output table content as a string in Github-markdown format. - If clean is true, markdown syntax is removed from cell content.""" + If "clean" then markdown syntax is removed from cell content. + If "fill_empty" then cell content None is replaced by the values + above (columns) or left (rows) in an effort to approximate row and + columns spans. 
+ + """ output = "|" + rows = self.row_count + cols = self.col_count + cells = self.extract()[:] # make local copy of table text content + + if fill_empty: # fill "None" cells where possible + + # for rows, copy content from left to right + for j in range(rows): + for i in range(cols - 1): + if cells[j][i + 1] is None: + cells[j][i + 1] = cells[j][i] - # generate header string and MD underline + # for columns, copy top to bottom + for i in range(cols): + for j in range(rows - 1): + if cells[j + 1][i] is None: + cells[j + 1][i] = cells[j][i] + + # generate header string and MD separator for i, name in enumerate(self.header.names): - if name is None or name == "": # generate a name if empty + if not name: # generate a name if empty name = f"Col{i+1}" - name = name.replace("\n", " ") # remove any line breaks + name = name.replace("\n", "<br>") # use HTML line breaks if clean: # remove sensitive syntax name = html.escape(name.replace("-", "-")) output += name + "|" output += "\n" + # insert GitHub header line separator output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n" # skip first row in details if header is part of the table j = 0 if self.header.external else 1 # iterate over detail rows - for row in self.extract()[j:]: + for row in cells[j:]: line = "|" for i, cell in enumerate(row): - # output None cells with empty string - cell = "" if cell is None else cell.replace("\n", " ") + # replace None cells with empty string + # use HTML line break tag + cell = "" if not cell else cell.replace("\n", "<br>") if clean: # remove sensitive syntax cell = html.escape(cell.replace("-", "-")) line += cell + "|" @@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3): page = self.page y_delta = y_tolerance - def top_row_is_bold(bbox): - """Check if row 0 has bold text anywhere. + def top_row_bg_color(self): + """ + Compare top row background color with color of same-sized bbox + above. If different, return True indicating that the original + table top row is already the header. + """ + bbox0 = Rect(self.rows[0].bbox) + bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above + top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1] + top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1] + if top_color0 != top_colort: + return True # top row is header + return False - If this is true, then any non-bold text in lines above disqualify - these lines as header. + def row_has_bold(bbox): + """Check if a row contains some bold text. - bbox is the (potentially repaired) row 0 bbox. + If e.g. true for the top row, then it will be used as (internal) + column header row if any of the following is true: + * the previous (above) text line has no bold span + * the second table row text has no bold span - Returns True or False + Returns True if any spans are bold else False. 
""" - for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]: - for l in b["lines"]: - for s in l["spans"]: - if s["flags"] & 16: - return True - return False + blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"] + spans = [s for b in blocks for l in b["lines"] for s in l["spans"]] + + return any(s["flags"] & TEXT_FONT_BOLD for s in spans) try: row = self.rows[0] @@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox): # return this if we determine that the top row is the header header_top_row = TableHeader(bbox, cells, self.extract()[0], False) - # one-line tables have no extra header + # 1-line tables have no extra header if len(self.rows) < 2: return header_top_row - # x-ccordinates of columns between x0 and x1 of the table + # 1-column tables have no extra header if len(cells) < 2: return header_top_row - col_x = [ - c[2] if c is not None else None for c in cells[:-1] - ] # column (x) coordinates + # assume top row is the header if second row is empty + row2 = self.rows[1] # second row + if all(c is None for c in row2.cells): # no valid cell bboxes in row2 + return header_top_row # Special check: is top row bold? - # If first line above table is not bold, but top-left table cell is bold, - # we take first table row as header - top_row_bold = top_row_is_bold(bbox) + top_row_bold = row_has_bold(bbox) + + # assume top row is header if it is bold and any cell + # of 2nd row is non-bold + if top_row_bold and not row_has_bold(row2.bbox): + return header_top_row + + if top_row_bg_color(self): + # if area above top row has a different background color, + # then top row is already the header + return header_top_row - # clip = area above table + # column coordinates (x1 values) in top row + col_x = [c[2] if c is not None else None for c in cells[:-1]] + + # clip = page area above the table # We will inspect this area for text qualifying as column header. 
clip = +bbox # take row 0 bbox clip.y0 = 0 # start at top of page clip.y1 = bbox.y0 # end at top of table - spans = [] # the text spans inside clip - for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]: - for l in b["lines"]: - for s in l["spans"]: - if ( - not s["flags"] & 1 and s["text"].strip() - ): # ignore superscripts and empty text - spans.append(s) + blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"] + # non-empty, non-superscript spans above table, sorted descending by y1 + spans = sorted( + [ + s + for b in blocks + for l in b["lines"] + for s in l["spans"] + if not ( + white_spaces.issuperset(s["text"]) + or s["flags"] & TEXT_FONT_SUPERSCRIPT + ) + ], + key=lambda s: s["bbox"][3], + reverse=True, + ) select = [] # y1 coordinates above, sorted descending line_heights = [] # line heights above, sorted descending line_bolds = [] # bold indicator per line above, same sorting - # spans sorted descending - spans.sort(key=lambda s: s["bbox"][3], reverse=True) # walk through the spans and fill above 3 lists for i in range(len(spans)): s = spans[i] y1 = s["bbox"][3] # span bottom h = y1 - s["bbox"][1] # span bbox height - bold = s["flags"] & 16 + bold = s["flags"] & TEXT_FONT_BOLD # use first item to start the lists if i == 0: @@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox): line_bolds.append(bold) continue - # get last items from the 3 lists + # get previous items from the 3 lists y0 = select[-1] h0 = line_heights[-1] bold0 = line_bolds[-1] @@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox): if select == []: # nothing above the table? 
return header_top_row - select = select[:5] # only accept up to 5 lines in any header + select = select[:5] # accept up to 5 lines for an external header - # take top row as header if text above table is too far apart + # assume top row as header if text above is too far away if bbox.y0 - select[0] >= line_heights[0]: return header_top_row - # if top table row is bold, but line above is not: + # accept top row as header if bold, but line above is not if top_row_bold and not line_bolds[0]: return header_top_row @@ -1738,7 +1795,7 @@ class TableFinder: """ def __init__(self, page, settings=None): - self.page = page + self.page = weakref.proxy(page) self.settings = TableSettings.resolve(settings) self.edges = self.get_edges() self.intersections = edges_to_intersections( @@ -1942,7 +1999,7 @@ def make_chars(page, clip=None): # We are ignoring Bézier curves completely and are converting everything # else to lines. # ------------------------------------------------------------------------ -def make_edges(page, clip=None, tset=None, add_lines=None): +def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None): snap_x = tset.snap_x_tolerance snap_y = tset.snap_y_tolerance min_length = tset.edge_min_length @@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2): return True return False - def clean_graphics(): + def clean_graphics(npaths=None): """Detect and join rectangles of "connected" vector graphics.""" - - paths = [] # paths relevant for table detection - for p in page.get_drawings(): - # ignore fill-only graphics if they do not simulate lines, - # which means one of width or height are small. + if npaths is None: + allpaths = page.get_drawings() + else: # accept passed-in vector graphics + allpaths = npaths[:] # paths relevant for table detection + paths = [] + for p in allpaths: + # If only looking at lines, we ignore fill-only paths, + # except simulated lines (i.e. small width or height). 
if ( - p["type"] == "f" - and lines_strict + lines_strict + and p["type"] == "f" and p["rect"].width > snap_x and p["rect"].height > snap_y ): @@ -2038,7 +2098,7 @@ def clean_graphics(): return new_rects, paths - bboxes, paths = clean_graphics() + bboxes, paths = clean_graphics(npaths=paths) def is_parallel(p1, p2): """Check if line is roughly axis-parallel.""" @@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip): if line_dict: EDGES.append(line_to_edge(line_dict)) + if add_boxes is not None: # add user-specified rectangles + assert isinstance(add_boxes, (tuple, list)) + else: + add_boxes = [] + for box in add_boxes: + r = Rect(box) + line_dict = make_line(path, r.tl, r.bl, clip) + if line_dict: + EDGES.append(line_to_edge(line_dict)) + line_dict = make_line(path, r.bl, r.br, clip) + if line_dict: + EDGES.append(line_to_edge(line_dict)) + line_dict = make_line(path, r.br, r.tr, clip) + if line_dict: + EDGES.append(line_to_edge(line_dict)) + line_dict = make_line(path, r.tr, r.tl, clip) + if line_dict: + EDGES.append(line_to_edge(line_dict)) + def page_rotation_set0(page): """Nullify page rotation. 
@@ -2290,7 +2369,9 @@ def find_tables( text_x_tolerance=3, text_y_tolerance=3, strategy=None, # offer abbreviation - add_lines=None, # optional user-specified lines + add_lines=None, # user-specified lines + add_boxes=None, # user-specified rectangles + paths=None, # accept vector graphics as parameter ): global CHARS, EDGES CHARS = [] @@ -2344,7 +2425,12 @@ def find_tables( make_chars(page, clip=clip) # create character list of page make_edges( - page, clip=clip, tset=tset, add_lines=add_lines + page, + clip=clip, + tset=tset, + paths=paths, + add_lines=add_lines, + add_boxes=add_boxes, ) # create lines and curves tables = TableFinder(page, settings=tset) diff --git a/tests/test_tables.py b/tests/test_tables.py index ab1533825..4fb959f4c 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -182,7 +182,10 @@ def test_2979(): wt = pymupdf.TOOLS.mupdf_warnings() if pymupdf.mupdf_version_tuple >= (1, 26, 0): - assert wt == 'bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...' + assert ( + wt + == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..." + ) else: assert not wt @@ -294,14 +297,55 @@ def test_markdown(): text = ( "|Header1|Header2|Header3|\n" "|---|---|---|\n" - "|Col11 Col12|Col21 Col22|Col31 Col32 Col33|\n" - "|Col13|Col23|Col34 Col35|\n" + "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n" + "|Col13|Col23|Col34<br>Col35|\n" "|Col14|Col24|Col36|\n" - "|Col15|Col25 Col26||\n\n" + "|Col15|Col25<br>Col26||\n\n" ) assert tab.to_markdown() == text +def test_paths_param(): + """Confirm acceptance of supplied vector graphics list.""" + filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf") + doc = pymupdf.open(filename) + page = doc[0] + tabs = page.find_tables(paths=[]) # will cause all tables are missed + assert tabs.tables == [] + + +def test_boxes_param(): + """Confirm acceptance of supplied boxes list.""" + filename = os.path.join(scriptdir, "resources", "small-table.pdf") + doc = pymupdf.open(filename) + page = doc[0] + paths = page.get_drawings() + box0 = page.cluster_drawings(drawings=paths)[0] + boxes = [box0] + words = page.get_text("words") + x_vals = [w[0] - 5 for w in words if w[4] in ("min", "max", "avg")] + for x in x_vals: + r = +box0 + r.x1 = x + boxes.append(r) + + y_vals = sorted(set([round(w[3]) for w in words])) + for y in y_vals[:-1]: # skip last one to avoid empty row + r = +box0 + r.y1 = y + boxes.append(r) + + tabs = page.find_tables(paths=[], add_boxes=boxes) + tab = tabs.tables[0] + assert tab.extract() == [ + ["Boiling Points °C", "min", "max", "avg"], + ["Noble gases", "-269", "-62", "-170.5"], + ["Nonmetals", "-253", "4827", "414.1"], + ["Metalloids", "335", "3900", "741.5"], + ["Metals", "357", ">5000", "2755.9"], + ] + + def test_dotted_grid(): """Confirm dotted lines are detected as gridlines.""" filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf") @@ -317,43 +361,65 @@ def test_dotted_grid(): def test_4017(): - path = os.path.normpath(f'{__file__}/../../tests/resources/test_4017.pdf') + path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf") with pymupdf.open(path) as document: page = document[0] - + tables = page.find_tables(add_lines=None) print(f"{len(tables.tables)=}.") tables_text = list() for i, table in enumerate(tables): - print(f'## {i=}.') + print(f"## {i=}.") t = table.extract() for tt in t: - print(f' {tt}') - + print(f" {tt}") + # 2024-11-29: expect current 
incorrect output for last two tables. - + expected_a = [ - ['Class A/B Overcollateralization', '131.44%', '>=', '122.60%', '', 'PASS'], - [None, None, None, None, None, 'PASS'], - ['Class D Overcollateralization', '112.24%', '>=', '106.40%', '', 'PASS'], - [None, None, None, None, None, 'PASS'], - ['Event of Default', '156.08%', '>=', '102.50%', '', 'PASS'], - [None, None, None, None, None, 'PASS'], - ['Class A/B Interest Coverage', 'N/A', '>=', '120.00%', '', 'N/A'], - [None, None, None, None, None, 'N/A'], - ['Class D Interest Coverage', 'N/A', '>=', '105.00%', '', 'N/A'], - ] + ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"], + [None, None, None, None, None, "PASS"], + ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"], + [None, None, None, None, None, "PASS"], + ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"], + [None, None, None, None, None, "PASS"], + ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"], + [None, None, None, None, None, "N/A"], + ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"], + ] assert tables[-2].extract() == expected_a - + expected_b = [ - ["Moody's Maximum Rating Factor Test", '2,577', '<=', '3,250', '', 'PASS', '2,581'], - [None, None, None, None, None, 'PASS', None], - ['Minimum Floating Spread', '3.5006%', '>=', '2.0000%', '', 'PASS', '3.4871%'], - [None, None, None, None, None, 'PASS', None], - ['Minimum Weighted Average S&P Recovery\nRate Test', '40.50%', '>=', '40.00%', '', 'PASS', '40.40%'], - [None, None, None, None, None, 'PASS', None], - ['Weighted Average Life', '4.83', '<=', '9.00', '', 'PASS', '4.92'], - ] + [ + "Moody's Maximum Rating Factor Test", + "2,577", + "<=", + "3,250", + "", + "PASS", + "2,581", + ], + [None, None, None, None, None, "PASS", None], + [ + "Minimum Floating Spread", + "3.5006%", + ">=", + "2.0000%", + "", + "PASS", + "3.4871%", + ], + [None, None, None, None, None, "PASS", None], + [ + "Minimum Weighted 
Average S&P Recovery\nRate Test", + "40.50%", + ">=", + "40.00%", + "", + "PASS", + "40.40%", + ], + [None, None, None, None, None, "PASS", None], + ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"], + ] assert tables[-1].extract() == expected_b - -