Skip to content

Commit d6fbd18

Browse files
authored
Update table.py
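Parameters
----------
footnote : {"none", "last_single_cell"}, optional
    If "last_single_cell" and the table’s final row has exactly one non-empty cell, that row is treated as a footnote: it is removed from the table and its text is returned alongside the table data.

Returns
-------
list | (list, str | None)
    The table rows when footnote is "none"; otherwise a tuple of the table rows and the footnote text (None if no footnote was found).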
1 parent 29a327a commit d6fbd18

1 file changed

Lines changed: 45 additions & 9 deletions

File tree

src/table.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1332,9 +1332,35 @@ def row_count(self) -> int: # PyMuPDF extension
13321332
def col_count(self) -> int: # PyMuPDF extension
13331333
return max([len(r.cells) for r in self.rows])
13341334

1335-
def extract(self, **kwargs) -> list:
1335+
def extract(
1336+
self,
1337+
*,
1338+
footnote: str = "none",
1339+
**kwargs,
1340+
) -> list | tuple[list, str | None]:
1341+
"""
1342+
Extract the table’s text content.
1343+
1344+
Parameters
1345+
----------
1346+
footnote : {"none", "last_single_cell"}, optional
1347+
• "none" (default) – return the table exactly as on page.
1348+
• "last_single_cell" – if the final physical row contains exactly
1349+
one non-empty cell, treat that row as a foot-note: remove it from
1350+
the table and return its text alongside the table data.
1351+
1352+
Other **kwargs are forwarded to `extract_text()`.
1353+
1354+
Returns
1355+
-------
1356+
list
1357+
When *footnote="none"* – the table content as a list of rows.
1358+
(list, str | None)
1359+
When *footnote="last_single_cell"* – the table content **and**
1360+
the extracted foot-note text (or *None* if no foot-note found).
1361+
"""
13361362
chars = CHARS
1337-
table_arr = []
1363+
table_arr: list[list[str | None]] = []
13381364

13391365
def char_in_bbox(char, bbox) -> bool:
13401366
v_mid = (char["top"] + char["bottom"]) / 2
@@ -1344,19 +1370,19 @@ def char_in_bbox(char, bbox) -> bool:
13441370
(h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
13451371
)
13461372

1373+
# -----------------------------------------
1374+
# Build raw rows × columns string matrix
1375+
# -----------------------------------------
13471376
for row in self.rows:
13481377
arr = []
1349-
row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
1378+
row_chars = [c for c in chars if char_in_bbox(c, row.bbox)]
13501379

13511380
for cell in row.cells:
13521381
if cell is None:
13531382
cell_text = None
13541383
else:
1355-
cell_chars = [
1356-
char for char in row_chars if char_in_bbox(char, cell)
1357-
]
1358-
1359-
if len(cell_chars):
1384+
cell_chars = [c for c in row_chars if char_in_bbox(c, cell)]
1385+
if cell_chars:
13601386
kwargs["x_shift"] = cell[0]
13611387
kwargs["y_shift"] = cell[1]
13621388
if "layout" in kwargs:
@@ -1368,7 +1394,17 @@ def char_in_bbox(char, bbox) -> bool:
13681394
arr.append(cell_text)
13691395
table_arr.append(arr)
13701396

1371-
return table_arr
1397+
# -----------------------------------------
1398+
# Optional foot-note post-processing
1399+
# -----------------------------------------
1400+
footnote_txt: str | None = None
1401+
if footnote == "last_single_cell" and table_arr:
1402+
non_empty = [c for c in table_arr[-1] if c and str(c).strip()]
1403+
if len(non_empty) == 1:
1404+
footnote_txt = non_empty[0]
1405+
table_arr = table_arr[:-1]
1406+
1407+
return (table_arr, footnote_txt) if footnote != "none" else table_arr
13721408

13731409
def to_markdown(self, clean=False, fill_empty=True):
13741410
"""Output table content as a string in Github-markdown format.

0 commit comments

Comments
 (0)