diff --git a/src/table.py b/src/table.py
index 16b64fb32..4d95ffe57 100644
--- a/src/table.py
+++ b/src/table.py
@@ -79,6 +79,7 @@
from collections.abc import Sequence
from dataclasses import dataclass
from operator import itemgetter
+import weakref
# -------------------------------------------------------------------
# Start of PyMuPDF interface code
@@ -87,6 +88,8 @@
Rect,
Matrix,
TEXTFLAGS_TEXT,
+ TEXT_FONT_BOLD,
+ TEXT_FONT_SUPERSCRIPT,
TOOLS,
EMPTY_RECT,
sRGB_to_pdf,
@@ -1061,7 +1064,7 @@ def get_center(word):
if not overlap:
condensed_bboxes.append(bbox)
- if len(condensed_bboxes) == 0:
+ if not condensed_bboxes:
return []
condensed_rects = map(bbox_to_rect, condensed_bboxes)
@@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:
return table_arr
- def to_markdown(self, clean=True):
+ def to_markdown(self, clean=False, fill_empty=True):
"""Output table content as a string in Github-markdown format.
- If clean is true, markdown syntax is removed from cell content."""
+ If "clean" then markdown syntax is removed from cell content.
+ If "fill_empty" then cell content None is replaced by the values
+ above (columns) or left (rows) in an effort to approximate row and
+ column spans.
+
+ """
output = "|"
+ rows = self.row_count
+ cols = self.col_count
+ cells = self.extract()[:] # make local copy of table text content
+
+ if fill_empty: # fill "None" cells where possible
+
+ # for rows, copy content from left to right
+ for j in range(rows):
+ for i in range(cols - 1):
+ if cells[j][i + 1] is None:
+ cells[j][i + 1] = cells[j][i]
- # generate header string and MD underline
+ # for columns, copy top to bottom
+ for i in range(cols):
+ for j in range(rows - 1):
+ if cells[j + 1][i] is None:
+ cells[j + 1][i] = cells[j][i]
+
+ # generate header string and MD separator
for i, name in enumerate(self.header.names):
- if name is None or name == "": # generate a name if empty
+ if not name: # generate a name if empty
name = f"Col{i+1}"
- name = name.replace("\n", " ") # remove any line breaks
+ name = name.replace("\n", "<br>") # use HTML line breaks
if clean: # remove sensitive syntax
name = html.escape(name.replace("-", "&#45;"))
output += name + "|"
output += "\n"
+ # insert GitHub header line separator
output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"
# skip first row in details if header is part of the table
j = 0 if self.header.external else 1
# iterate over detail rows
- for row in self.extract()[j:]:
+ for row in cells[j:]:
line = "|"
for i, cell in enumerate(row):
- # output None cells with empty string
- cell = "" if cell is None else cell.replace("\n", " ")
+ # replace None cells with empty string
+ # use HTML line break tag
+ cell = "" if not cell else cell.replace("\n", "<br>")
if clean: # remove sensitive syntax
cell = html.escape(cell.replace("-", "&#45;"))
line += cell + "|"
@@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
page = self.page
y_delta = y_tolerance
- def top_row_is_bold(bbox):
- """Check if row 0 has bold text anywhere.
+ def top_row_bg_color(self):
+ """
+ Compare top row background color with color of same-sized bbox
+ above. If different, return True indicating that the original
+ table top row is already the header.
+ """
+ bbox0 = Rect(self.rows[0].bbox)
+ bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
+ top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
+ top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
+ if top_color0 != top_colort:
+ return True # top row is header
+ return False
- If this is true, then any non-bold text in lines above disqualify
- these lines as header.
+ def row_has_bold(bbox):
+ """Check if a row contains some bold text.
- bbox is the (potentially repaired) row 0 bbox.
+ If e.g. true for the top row, then it will be used as (internal)
+ column header row if any of the following is true:
+ * the previous (above) text line has no bold span
+ * the second table row text has no bold span
- Returns True or False
+ Returns True if any spans are bold else False.
"""
- for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
- for l in b["lines"]:
- for s in l["spans"]:
- if s["flags"] & 16:
- return True
- return False
+ blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
+ spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
+
+ return any(s["flags"] & TEXT_FONT_BOLD for s in spans)
try:
row = self.rows[0]
@@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
# return this if we determine that the top row is the header
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)
- # one-line tables have no extra header
+ # 1-line tables have no extra header
if len(self.rows) < 2:
return header_top_row
- # x-ccordinates of columns between x0 and x1 of the table
+ # 1-column tables have no extra header
if len(cells) < 2:
return header_top_row
- col_x = [
- c[2] if c is not None else None for c in cells[:-1]
- ] # column (x) coordinates
+ # assume top row is the header if second row is empty
+ row2 = self.rows[1] # second row
+ if all(c is None for c in row2.cells): # no valid cell bboxes in row2
+ return header_top_row
# Special check: is top row bold?
- # If first line above table is not bold, but top-left table cell is bold,
- # we take first table row as header
- top_row_bold = top_row_is_bold(bbox)
+ top_row_bold = row_has_bold(bbox)
+
+ # assume top row is header if it contains bold text and
+ # the 2nd row contains no bold text at all
+ if top_row_bold and not row_has_bold(row2.bbox):
+ return header_top_row
+
+ if top_row_bg_color(self):
+ # if area above top row has a different background color,
+ # then top row is already the header
+ return header_top_row
- # clip = area above table
+ # column coordinates (x1 values) in top row
+ col_x = [c[2] if c is not None else None for c in cells[:-1]]
+
+ # clip = page area above the table
# We will inspect this area for text qualifying as column header.
clip = +bbox # take row 0 bbox
clip.y0 = 0 # start at top of page
clip.y1 = bbox.y0 # end at top of table
- spans = [] # the text spans inside clip
- for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
- for l in b["lines"]:
- for s in l["spans"]:
- if (
- not s["flags"] & 1 and s["text"].strip()
- ): # ignore superscripts and empty text
- spans.append(s)
+ blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
+ # non-empty, non-superscript spans above table, sorted descending by y1
+ spans = sorted(
+ [
+ s
+ for b in blocks
+ for l in b["lines"]
+ for s in l["spans"]
+ if not (
+ white_spaces.issuperset(s["text"])
+ or s["flags"] & TEXT_FONT_SUPERSCRIPT
+ )
+ ],
+ key=lambda s: s["bbox"][3],
+ reverse=True,
+ )
select = [] # y1 coordinates above, sorted descending
line_heights = [] # line heights above, sorted descending
line_bolds = [] # bold indicator per line above, same sorting
- # spans sorted descending
- spans.sort(key=lambda s: s["bbox"][3], reverse=True)
# walk through the spans and fill above 3 lists
for i in range(len(spans)):
s = spans[i]
y1 = s["bbox"][3] # span bottom
h = y1 - s["bbox"][1] # span bbox height
- bold = s["flags"] & 16
+ bold = s["flags"] & TEXT_FONT_BOLD
# use first item to start the lists
if i == 0:
@@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
line_bolds.append(bold)
continue
- # get last items from the 3 lists
+ # get previous items from the 3 lists
y0 = select[-1]
h0 = line_heights[-1]
bold0 = line_bolds[-1]
@@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
if select == []: # nothing above the table?
return header_top_row
- select = select[:5] # only accept up to 5 lines in any header
+ select = select[:5] # accept up to 5 lines for an external header
- # take top row as header if text above table is too far apart
+ # assume top row as header if text above is too far away
if bbox.y0 - select[0] >= line_heights[0]:
return header_top_row
- # if top table row is bold, but line above is not:
+ # accept top row as header if bold, but line above is not
if top_row_bold and not line_bolds[0]:
return header_top_row
@@ -1738,7 +1795,7 @@ class TableFinder:
"""
def __init__(self, page, settings=None):
- self.page = page
+ self.page = weakref.proxy(page)
self.settings = TableSettings.resolve(settings)
self.edges = self.get_edges()
self.intersections = edges_to_intersections(
@@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
# We are ignoring Bézier curves completely and are converting everything
# else to lines.
# ------------------------------------------------------------------------
-def make_edges(page, clip=None, tset=None, add_lines=None):
+def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
snap_x = tset.snap_x_tolerance
snap_y = tset.snap_y_tolerance
min_length = tset.edge_min_length
@@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
return True
return False
- def clean_graphics():
+ def clean_graphics(npaths=None):
"""Detect and join rectangles of "connected" vector graphics."""
-
- paths = [] # paths relevant for table detection
- for p in page.get_drawings():
- # ignore fill-only graphics if they do not simulate lines,
- # which means one of width or height are small.
+ if npaths is None:
+ allpaths = page.get_drawings()
+ else: # accept passed-in vector graphics
+ allpaths = npaths[:] # shallow copy of the passed-in list
+ paths = [] # paths relevant for table detection
+ for p in allpaths:
+ # If only looking at lines, we ignore fill-only paths,
+ # except simulated lines (i.e. small width or height).
if (
- p["type"] == "f"
- and lines_strict
+ lines_strict
+ and p["type"] == "f"
and p["rect"].width > snap_x
and p["rect"].height > snap_y
):
@@ -2038,7 +2098,7 @@ def clean_graphics():
return new_rects, paths
- bboxes, paths = clean_graphics()
+ bboxes, paths = clean_graphics(npaths=paths)
def is_parallel(p1, p2):
"""Check if line is roughly axis-parallel."""
@@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
if line_dict:
EDGES.append(line_to_edge(line_dict))
+ if add_boxes is not None: # add user-specified rectangles
+ assert isinstance(add_boxes, (tuple, list))
+ else:
+ add_boxes = []
+ for box in add_boxes:
+ r = Rect(box)
+ line_dict = make_line(path, r.tl, r.bl, clip)
+ if line_dict:
+ EDGES.append(line_to_edge(line_dict))
+ line_dict = make_line(path, r.bl, r.br, clip)
+ if line_dict:
+ EDGES.append(line_to_edge(line_dict))
+ line_dict = make_line(path, r.br, r.tr, clip)
+ if line_dict:
+ EDGES.append(line_to_edge(line_dict))
+ line_dict = make_line(path, r.tr, r.tl, clip)
+ if line_dict:
+ EDGES.append(line_to_edge(line_dict))
+
def page_rotation_set0(page):
"""Nullify page rotation.
@@ -2290,7 +2369,9 @@ def find_tables(
text_x_tolerance=3,
text_y_tolerance=3,
strategy=None, # offer abbreviation
- add_lines=None, # optional user-specified lines
+ add_lines=None, # user-specified lines
+ add_boxes=None, # user-specified rectangles
+ paths=None, # accept vector graphics as parameter
):
global CHARS, EDGES
CHARS = []
@@ -2344,7 +2425,12 @@ def find_tables(
make_chars(page, clip=clip) # create character list of page
make_edges(
- page, clip=clip, tset=tset, add_lines=add_lines
+ page,
+ clip=clip,
+ tset=tset,
+ paths=paths,
+ add_lines=add_lines,
+ add_boxes=add_boxes,
) # create lines and curves
tables = TableFinder(page, settings=tset)
diff --git a/tests/test_tables.py b/tests/test_tables.py
index ab1533825..4fb959f4c 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -182,7 +182,10 @@ def test_2979():
wt = pymupdf.TOOLS.mupdf_warnings()
if pymupdf.mupdf_version_tuple >= (1, 26, 0):
- assert wt == 'bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...'
+ assert (
+ wt
+ == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
+ )
else:
assert not wt
@@ -294,14 +297,55 @@ def test_markdown():
text = (
"|Header1|Header2|Header3|\n"
"|---|---|---|\n"
- "|Col11 Col12|Col21 Col22|Col31 Col32 Col33|\n"
- "|Col13|Col23|Col34 Col35|\n"
+ "|Col11<br>Col12|Col21<br>Col22|Col31<br>Col32<br>Col33|\n"
+ "|Col13|Col23|Col34<br>Col35|\n"
"|Col14|Col24|Col36|\n"
- "|Col15|Col25 Col26||\n\n"
+ "|Col15|Col25<br>Col26||\n\n"
)
assert tab.to_markdown() == text
+def test_paths_param():
+ """Confirm acceptance of supplied vector graphics list."""
+ filename = os.path.join(scriptdir, "resources", "strict-yes-no.pdf")
+ doc = pymupdf.open(filename)
+ page = doc[0]
+ tabs = page.find_tables(paths=[]) # an empty path list causes all tables to be missed
+ assert tabs.tables == []
+
+
+def test_boxes_param():
+ """Confirm acceptance of supplied boxes list."""
+ filename = os.path.join(scriptdir, "resources", "small-table.pdf")
+ doc = pymupdf.open(filename)
+ page = doc[0]
+ paths = page.get_drawings()
+ box0 = page.cluster_drawings(drawings=paths)[0]
+ boxes = [box0]
+ words = page.get_text("words")
+ x_vals = [w[0] - 5 for w in words if w[4] in ("min", "max", "avg")]
+ for x in x_vals:
+ r = +box0
+ r.x1 = x
+ boxes.append(r)
+
+ y_vals = sorted(set([round(w[3]) for w in words]))
+ for y in y_vals[:-1]: # skip last one to avoid empty row
+ r = +box0
+ r.y1 = y
+ boxes.append(r)
+
+ tabs = page.find_tables(paths=[], add_boxes=boxes)
+ tab = tabs.tables[0]
+ assert tab.extract() == [
+ ["Boiling Points °C", "min", "max", "avg"],
+ ["Noble gases", "-269", "-62", "-170.5"],
+ ["Nonmetals", "-253", "4827", "414.1"],
+ ["Metalloids", "335", "3900", "741.5"],
+ ["Metals", "357", ">5000", "2755.9"],
+ ]
+
+
def test_dotted_grid():
"""Confirm dotted lines are detected as gridlines."""
filename = os.path.join(scriptdir, "resources", "dotted-gridlines.pdf")
@@ -317,43 +361,65 @@ def test_dotted_grid():
def test_4017():
- path = os.path.normpath(f'{__file__}/../../tests/resources/test_4017.pdf')
+ path = os.path.normpath(f"{__file__}/../../tests/resources/test_4017.pdf")
with pymupdf.open(path) as document:
page = document[0]
-
+
tables = page.find_tables(add_lines=None)
print(f"{len(tables.tables)=}.")
tables_text = list()
for i, table in enumerate(tables):
- print(f'## {i=}.')
+ print(f"## {i=}.")
t = table.extract()
for tt in t:
- print(f' {tt}')
-
+ print(f" {tt}")
+
# 2024-11-29: expect current incorrect output for last two tables.
-
+
expected_a = [
- ['Class A/B Overcollateralization', '131.44%', '>=', '122.60%', '', 'PASS'],
- [None, None, None, None, None, 'PASS'],
- ['Class D Overcollateralization', '112.24%', '>=', '106.40%', '', 'PASS'],
- [None, None, None, None, None, 'PASS'],
- ['Event of Default', '156.08%', '>=', '102.50%', '', 'PASS'],
- [None, None, None, None, None, 'PASS'],
- ['Class A/B Interest Coverage', 'N/A', '>=', '120.00%', '', 'N/A'],
- [None, None, None, None, None, 'N/A'],
- ['Class D Interest Coverage', 'N/A', '>=', '105.00%', '', 'N/A'],
- ]
+ ["Class A/B Overcollateralization", "131.44%", ">=", "122.60%", "", "PASS"],
+ [None, None, None, None, None, "PASS"],
+ ["Class D Overcollateralization", "112.24%", ">=", "106.40%", "", "PASS"],
+ [None, None, None, None, None, "PASS"],
+ ["Event of Default", "156.08%", ">=", "102.50%", "", "PASS"],
+ [None, None, None, None, None, "PASS"],
+ ["Class A/B Interest Coverage", "N/A", ">=", "120.00%", "", "N/A"],
+ [None, None, None, None, None, "N/A"],
+ ["Class D Interest Coverage", "N/A", ">=", "105.00%", "", "N/A"],
+ ]
assert tables[-2].extract() == expected_a
-
+
expected_b = [
- ["Moody's Maximum Rating Factor Test", '2,577', '<=', '3,250', '', 'PASS', '2,581'],
- [None, None, None, None, None, 'PASS', None],
- ['Minimum Floating Spread', '3.5006%', '>=', '2.0000%', '', 'PASS', '3.4871%'],
- [None, None, None, None, None, 'PASS', None],
- ['Minimum Weighted Average S&P Recovery\nRate Test', '40.50%', '>=', '40.00%', '', 'PASS', '40.40%'],
- [None, None, None, None, None, 'PASS', None],
- ['Weighted Average Life', '4.83', '<=', '9.00', '', 'PASS', '4.92'],
- ]
+ [
+ "Moody's Maximum Rating Factor Test",
+ "2,577",
+ "<=",
+ "3,250",
+ "",
+ "PASS",
+ "2,581",
+ ],
+ [None, None, None, None, None, "PASS", None],
+ [
+ "Minimum Floating Spread",
+ "3.5006%",
+ ">=",
+ "2.0000%",
+ "",
+ "PASS",
+ "3.4871%",
+ ],
+ [None, None, None, None, None, "PASS", None],
+ [
+ "Minimum Weighted Average S&P Recovery\nRate Test",
+ "40.50%",
+ ">=",
+ "40.00%",
+ "",
+ "PASS",
+ "40.40%",
+ ],
+ [None, None, None, None, None, "PASS", None],
+ ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"],
+ ]
assert tables[-1].extract() == expected_b
-
-