Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 144 additions & 58 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
from collections.abc import Sequence
from dataclasses import dataclass
from operator import itemgetter
import weakref

# -------------------------------------------------------------------
# Start of PyMuPDF interface code
Expand All @@ -87,6 +88,8 @@
Rect,
Matrix,
TEXTFLAGS_TEXT,
TEXT_FONT_BOLD,
TEXT_FONT_SUPERSCRIPT,
TOOLS,
EMPTY_RECT,
sRGB_to_pdf,
Expand Down Expand Up @@ -1061,7 +1064,7 @@ def get_center(word):
if not overlap:
condensed_bboxes.append(bbox)

if len(condensed_bboxes) == 0:
if not condensed_bboxes:
return []

condensed_rects = map(bbox_to_rect, condensed_bboxes)
Expand Down Expand Up @@ -1367,33 +1370,57 @@ def char_in_bbox(char, bbox) -> bool:

return table_arr

def to_markdown(self, clean=True):
def to_markdown(self, clean=False, fill_empty=True):
"""Output table content as a string in Github-markdown format.

If clean is true, markdown syntax is removed from cell content."""
If "clean" then markdown syntax is removed from cell content.
If "fill_empty" then cell content None is replaced by the values
above (columns) or to the left (rows) in an effort to approximate row and
column spans.

"""
output = "|"
rows = self.row_count
cols = self.col_count
cells = self.extract()[:] # make local copy of table text content

if fill_empty: # fill "None" cells where possible

# for rows, copy content from left to right
for j in range(rows):
for i in range(cols - 1):
if cells[j][i + 1] is None:
cells[j][i + 1] = cells[j][i]

# generate header string and MD underline
# for columns, copy top to bottom
for i in range(cols):
for j in range(rows - 1):
if cells[j + 1][i] is None:
cells[j + 1][i] = cells[j][i]

# generate header string and MD separator
for i, name in enumerate(self.header.names):
if name is None or name == "": # generate a name if empty
if not name: # generate a name if empty
name = f"Col{i+1}"
name = name.replace("\n", " ") # remove any line breaks
name = name.replace("\n", "<br>") # use HTML line breaks
if clean: # remove sensitive syntax
name = html.escape(name.replace("-", "&#45;"))
output += name + "|"

output += "\n"
# insert GitHub header line separator
output += "|" + "|".join("---" for i in range(self.col_count)) + "|\n"

# skip first row in details if header is part of the table
j = 0 if self.header.external else 1

# iterate over detail rows
for row in self.extract()[j:]:
for row in cells[j:]:
line = "|"
for i, cell in enumerate(row):
# output None cells with empty string
cell = "" if cell is None else cell.replace("\n", " ")
# replace None cells with empty string
# use HTML line break tag
cell = "" if not cell else cell.replace("\n", "<br>")
if clean: # remove sensitive syntax
cell = html.escape(cell.replace("-", "&#45;"))
line += cell + "|"
Expand Down Expand Up @@ -1462,22 +1489,34 @@ def _get_header(self, y_tolerance=3):
page = self.page
y_delta = y_tolerance

def top_row_is_bold(bbox):
"""Check if row 0 has bold text anywhere.
def top_row_bg_color(self):
"""
Compare top row background color with color of same-sized bbox
above. If different, return True indicating that the original
table top row is already the header.
"""
bbox0 = Rect(self.rows[0].bbox)
bboxt = bbox0 + (0, -bbox0.height, 0, -bbox0.height) # area above
top_color0 = page.get_pixmap(clip=bbox0).color_topusage()[1]
top_colort = page.get_pixmap(clip=bboxt).color_topusage()[1]
if top_color0 != top_colort:
return True # top row is header
return False

If this is true, then any non-bold text in lines above disqualify
these lines as header.
def row_has_bold(bbox):
"""Check if a row contains some bold text.

bbox is the (potentially repaired) row 0 bbox.
If e.g. true for the top row, then it will be used as (internal)
column header row if any of the following is true:
* the previous (above) text line has no bold span
* the second table row text has no bold span

Returns True or False
Returns True if any spans are bold else False.
"""
for b in page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]:
for l in b["lines"]:
for s in l["spans"]:
if s["flags"] & 16:
return True
return False
blocks = page.get_text("dict", flags=TEXTFLAGS_TEXT, clip=bbox)["blocks"]
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]

return any(s["flags"] & TEXT_FONT_BOLD for s in spans)

try:
row = self.rows[0]
Expand All @@ -1489,50 +1528,68 @@ def top_row_is_bold(bbox):
# return this if we determine that the top row is the header
header_top_row = TableHeader(bbox, cells, self.extract()[0], False)

# one-line tables have no extra header
# 1-line tables have no extra header
if len(self.rows) < 2:
return header_top_row

# x-coordinates of columns between x0 and x1 of the table
# 1-column tables have no extra header
if len(cells) < 2:
return header_top_row

col_x = [
c[2] if c is not None else None for c in cells[:-1]
] # column (x) coordinates
# assume top row is the header if second row is empty
row2 = self.rows[1] # second row
if all(c is None for c in row2.cells): # no valid cell bboxes in row2
return header_top_row

# Special check: is top row bold?
# If the first line above the table is not bold, but the top table row
# contains bold text, we take the first table row as header
top_row_bold = top_row_is_bold(bbox)
top_row_bold = row_has_bold(bbox)

# assume top row is header if it contains bold text and the
# 2nd row contains no bold text at all
if top_row_bold and not row_has_bold(row2.bbox):
return header_top_row

if top_row_bg_color(self):
# if area above top row has a different background color,
# then top row is already the header
return header_top_row

# clip = area above table
# column coordinates (x1 values) in top row
col_x = [c[2] if c is not None else None for c in cells[:-1]]

# clip = page area above the table
# We will inspect this area for text qualifying as column header.
clip = +bbox # take row 0 bbox
clip.y0 = 0 # start at top of page
clip.y1 = bbox.y0 # end at top of table

spans = [] # the text spans inside clip
for b in page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]:
for l in b["lines"]:
for s in l["spans"]:
if (
not s["flags"] & 1 and s["text"].strip()
): # ignore superscripts and empty text
spans.append(s)
blocks = page.get_text("dict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
# non-empty, non-superscript spans above table, sorted descending by y1
spans = sorted(
[
s
for b in blocks
for l in b["lines"]
for s in l["spans"]
if not (
white_spaces.issuperset(s["text"])
or s["flags"] & TEXT_FONT_SUPERSCRIPT
)
],
key=lambda s: s["bbox"][3],
reverse=True,
)

select = [] # y1 coordinates above, sorted descending
line_heights = [] # line heights above, sorted descending
line_bolds = [] # bold indicator per line above, same sorting

# spans sorted descending
spans.sort(key=lambda s: s["bbox"][3], reverse=True)
# walk through the spans and fill above 3 lists
for i in range(len(spans)):
s = spans[i]
y1 = s["bbox"][3] # span bottom
h = y1 - s["bbox"][1] # span bbox height
bold = s["flags"] & 16
bold = s["flags"] & TEXT_FONT_BOLD

# use first item to start the lists
if i == 0:
Expand All @@ -1541,7 +1598,7 @@ def top_row_is_bold(bbox):
line_bolds.append(bold)
continue

# get last items from the 3 lists
# get previous items from the 3 lists
y0 = select[-1]
h0 = line_heights[-1]
bold0 = line_bolds[-1]
Expand All @@ -1565,13 +1622,13 @@ def top_row_is_bold(bbox):
if select == []: # nothing above the table?
return header_top_row

select = select[:5] # only accept up to 5 lines in any header
select = select[:5] # accept up to 5 lines for an external header

# take top row as header if text above table is too far apart
# assume top row as header if text above is too far away
if bbox.y0 - select[0] >= line_heights[0]:
return header_top_row

# if top table row is bold, but line above is not:
# accept top row as header if bold, but line above is not
if top_row_bold and not line_bolds[0]:
return header_top_row

Expand Down Expand Up @@ -1738,7 +1795,7 @@ class TableFinder:
"""

def __init__(self, page, settings=None):
self.page = page
self.page = weakref.proxy(page)
self.settings = TableSettings.resolve(settings)
self.edges = self.get_edges()
self.intersections = edges_to_intersections(
Expand Down Expand Up @@ -1942,7 +1999,7 @@ def make_chars(page, clip=None):
# We are ignoring Bézier curves completely and are converting everything
# else to lines.
# ------------------------------------------------------------------------
def make_edges(page, clip=None, tset=None, add_lines=None):
def make_edges(page, clip=None, tset=None, paths=None, add_lines=None, add_boxes=None):
snap_x = tset.snap_x_tolerance
snap_y = tset.snap_y_tolerance
min_length = tset.edge_min_length
Expand Down Expand Up @@ -1994,16 +2051,19 @@ def are_neighbors(r1, r2):
return True
return False

def clean_graphics():
def clean_graphics(npaths=None):
"""Detect and join rectangles of "connected" vector graphics."""

paths = [] # paths relevant for table detection
for p in page.get_drawings():
# ignore fill-only graphics if they do not simulate lines,
# which means one of width or height are small.
if npaths is None:
allpaths = page.get_drawings()
else: # accept passed-in vector graphics
allpaths = npaths[:] # paths relevant for table detection
paths = []
for p in allpaths:
# If only looking at lines, we ignore fill-only paths,
# except simulated lines (i.e. small width or height).
if (
p["type"] == "f"
and lines_strict
lines_strict
and p["type"] == "f"
and p["rect"].width > snap_x
and p["rect"].height > snap_y
):
Expand Down Expand Up @@ -2038,7 +2098,7 @@ def clean_graphics():

return new_rects, paths

bboxes, paths = clean_graphics()
bboxes, paths = clean_graphics(npaths=paths)

def is_parallel(p1, p2):
"""Check if line is roughly axis-parallel."""
Expand Down Expand Up @@ -2209,6 +2269,25 @@ def make_line(p, p1, p2, clip):
if line_dict:
EDGES.append(line_to_edge(line_dict))

if add_boxes is not None: # add user-specified rectangles
assert isinstance(add_boxes, (tuple, list))
else:
add_boxes = []
for box in add_boxes:
r = Rect(box)
line_dict = make_line(path, r.tl, r.bl, clip)
if line_dict:
EDGES.append(line_to_edge(line_dict))
line_dict = make_line(path, r.bl, r.br, clip)
if line_dict:
EDGES.append(line_to_edge(line_dict))
line_dict = make_line(path, r.br, r.tr, clip)
if line_dict:
EDGES.append(line_to_edge(line_dict))
line_dict = make_line(path, r.tr, r.tl, clip)
if line_dict:
EDGES.append(line_to_edge(line_dict))


def page_rotation_set0(page):
"""Nullify page rotation.
Expand Down Expand Up @@ -2290,7 +2369,9 @@ def find_tables(
text_x_tolerance=3,
text_y_tolerance=3,
strategy=None, # offer abbreviation
add_lines=None, # optional user-specified lines
add_lines=None, # user-specified lines
add_boxes=None, # user-specified rectangles
paths=None, # accept vector graphics as parameter
):
global CHARS, EDGES
CHARS = []
Expand Down Expand Up @@ -2344,7 +2425,12 @@ def find_tables(

make_chars(page, clip=clip) # create character list of page
make_edges(
page, clip=clip, tset=tset, add_lines=add_lines
page,
clip=clip,
tset=tset,
paths=paths,
add_lines=add_lines,
add_boxes=add_boxes,
) # create lines and curves
tables = TableFinder(page, settings=tset)

Expand Down
Loading