@@ -1332,9 +1332,35 @@ def row_count(self) -> int: # PyMuPDF extension
def col_count(self) -> int:  # PyMuPDF extension
    """Return the number of cells in the widest row of the table.

    The count is the maximum of ``len(row.cells)`` over ``self.rows``,
    so it stays correct if rows differ in length.

    Returns
    -------
    int
        The largest cell count of any row, or 0 when the table has no rows.
    """
    # ``default=0`` keeps an empty ``self.rows`` from raising ValueError,
    # which a bare ``max()`` over an empty sequence would do.
    return max((len(row.cells) for row in self.rows), default=0)
13341334
1335- def extract (self , ** kwargs ) -> list :
1335+ def extract (
1336+ self ,
1337+ * ,
1338+ footnote : str = "none" ,
1339+ ** kwargs ,
1340+ ) -> list | tuple [list , str | None ]:
1341+ """
1342+ Extract the table’s text content.
1343+
1344+ Parameters
1345+ ----------
1346+ footnote : {"none", "last_single_cell"}, optional
1347+ • "none" (default) – return the table exactly as on page.
1348+ • "last_single_cell" – if the final physical row contains exactly
1349+ one non-empty cell, treat that row as a foot-note: remove it from
1350+ the table and return its text alongside the table data.
1351+
1352+ Other **kwargs are forwarded to `extract_text()`.
1353+
1354+ Returns
1355+ -------
1356+ list
1357+ When *footnote="none"* – the table content as a list of rows.
1358+ (list, str | None)
1359+ When *footnote* is anything other than "none" – the table content
1360+ **and** the foot-note text (*None* if none found or mode unrecognised).
1361+ """
13361362 chars = CHARS
1337- table_arr = []
1363+ table_arr : list [ list [ str | None ]] = []
13381364
13391365 def char_in_bbox (char , bbox ) -> bool :
13401366 v_mid = (char ["top" ] + char ["bottom" ]) / 2
@@ -1344,19 +1370,19 @@ def char_in_bbox(char, bbox) -> bool:
13441370 (h_mid >= x0 ) and (h_mid < x1 ) and (v_mid >= top ) and (v_mid < bottom )
13451371 )
13461372
1373+ # -----------------------------------------
1374+ # Build raw rows × columns string matrix
1375+ # -----------------------------------------
13471376 for row in self .rows :
13481377 arr = []
1349- row_chars = [char for char in chars if char_in_bbox (char , row .bbox )]
1378+ row_chars = [c for c in chars if char_in_bbox (c , row .bbox )]
13501379
13511380 for cell in row .cells :
13521381 if cell is None :
13531382 cell_text = None
13541383 else :
1355- cell_chars = [
1356- char for char in row_chars if char_in_bbox (char , cell )
1357- ]
1358-
1359- if len (cell_chars ):
1384+ cell_chars = [c for c in row_chars if char_in_bbox (c , cell )]
1385+ if cell_chars :
13601386 kwargs ["x_shift" ] = cell [0 ]
13611387 kwargs ["y_shift" ] = cell [1 ]
13621388 if "layout" in kwargs :
@@ -1368,7 +1394,17 @@ def char_in_bbox(char, bbox) -> bool:
13681394 arr .append (cell_text )
13691395 table_arr .append (arr )
13701396
1371- return table_arr
1397+ # -----------------------------------------
1398+ # Optional foot-note post-processing
1399+ # -----------------------------------------
1400+ footnote_txt : str | None = None
1401+ if footnote == "last_single_cell" and table_arr :
1402+ non_empty = [c for c in table_arr [- 1 ] if c and str (c ).strip ()]
1403+ if len (non_empty ) == 1 :
1404+ footnote_txt = non_empty [0 ]
1405+ table_arr = table_arr [:- 1 ]
1406+
1407+ return (table_arr , footnote_txt ) if footnote != "none" else table_arr
13721408
13731409 def to_markdown (self , clean = False , fill_empty = True ):
13741410 """Output table content as a string in Github-markdown format.
0 commit comments