Skip to content

Commit 004447f

Browse files
committed
Fix: Add extend_edges function to fix table extraction when one strategy is "text" and the other is non-text
1 parent fdc472b commit 004447f

3 files changed

Lines changed: 88 additions & 0 deletions

File tree

src/table.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from collections.abc import Sequence
8080
from dataclasses import dataclass
8181
from operator import itemgetter
82+
from typing import Literal
8283
import weakref
8384
import pymupdf
8485
from pymupdf import mupdf
@@ -2036,6 +2037,16 @@ def __init__(self, page, settings=None):
20362037
self.textpage = None
20372038
self.settings = TableSettings.resolve(settings)
20382039
self.edges = self.get_edges()
2040+
if (
2041+
self.settings.horizontal_strategy == "text"
2042+
and self.settings.vertical_strategy != "text"
2043+
):
2044+
extend_edges(self.edges, "h", self.settings.intersection_x_tolerance)
2045+
elif (
2046+
self.settings.vertical_strategy == "text"
2047+
and self.settings.horizontal_strategy != "text"
2048+
):
2049+
extend_edges(self.edges, "v", self.settings.intersection_y_tolerance)
20392050
self.intersections = edges_to_intersections(
20402051
self.edges,
20412052
self.settings.intersection_x_tolerance,
@@ -2726,3 +2737,68 @@ def find_tables(
27262737
for table in tbf.tables:
27272738
table.textpage = TEXTPAGE
27282739
return tbf
2740+
2741+
2742+
def extend_edges(
    edges: list,
    extend_orientation: Literal["h", "v"],
    intersection_tolerance: float,
) -> None:
    """
    Extend edges of one orientation to the nearest perpendicular edges.

    Each edge is a dict with the keys "orientation" ("h" or "v"), "x0",
    "x1", "top" and "bottom". The dicts inside *edges* are modified in
    place; the list itself is neither reordered nor resized.

    For every edge of the requested orientation, the perpendicular edges
    whose span covers the edge's position (within *intersection_tolerance*)
    are collected. If at least two are found, the edge's start ("x0" for
    horizontal, "top" for vertical) is moved to the closest perpendicular
    edge at or before it, and its end ("x1" / "bottom") is moved to the
    closest perpendicular edge at or after it. "At" means within
    *intersection_tolerance*. A side with no such perpendicular edge is
    left unchanged.

    Args:
        edges: Edge dicts of both orientations.
        extend_orientation: "h" extends horizontal edges; any other value
            extends vertical edges.
        intersection_tolerance: Maximum distance at which an edge end is
            still considered to touch a perpendicular edge.
    """
    v_edges = sorted(
        (edge for edge in edges if edge["orientation"] == "v"),
        key=itemgetter("x0", "top"),
    )
    h_edges = sorted(
        (edge for edge in edges if edge["orientation"] == "h"),
        key=itemgetter("top", "x0"),
    )

    if extend_orientation == "h":
        edges_to_extend = h_edges
        other_edges = v_edges
        first_prop_to_extend, second_prop_to_extend = "x0", "x1"
        loc_prop = "top"
        loc_prop_others = "x0"
        first_prop_range, second_prop_range = "top", "bottom"
    else:
        edges_to_extend = v_edges
        other_edges = h_edges
        first_prop_to_extend, second_prop_to_extend = "top", "bottom"
        loc_prop = "x0"
        loc_prop_others = "top"
        first_prop_range, second_prop_range = "x0", "x1"

    for edge_to_extend in edges_to_extend:
        loc = edge_to_extend[loc_prop]
        # Positions of the perpendicular edges whose span covers `loc`.
        # `other_edges` is sorted by `loc_prop_others` first, so this list
        # is in ascending order.
        crossing_locs = [
            edge[loc_prop_others]
            for edge in other_edges
            if (loc - edge[second_prop_range] <= intersection_tolerance)
            and (edge[first_prop_range] - loc <= intersection_tolerance)
        ]
        # At least two perpendicular edges are required before extending.
        if len(crossing_locs) < 2:
            continue

        # Capture both ends up front so that moving the first end cannot
        # influence the search for the second one.
        first_val_to_extend = edge_to_extend[first_prop_to_extend]
        second_val_to_extend = edge_to_extend[second_prop_to_extend]

        # Extend first value (left for horizontal, top for vertical): find
        # the first perpendicular edge lying clearly after the start and
        # snap to the one just before it.
        for i, crossing_loc in enumerate(crossing_locs):
            if first_val_to_extend - crossing_loc < -intersection_tolerance:
                if i != 0:
                    edge_to_extend[first_prop_to_extend] = crossing_locs[i - 1]
                break

        # Extend second value (right for horizontal, bottom for vertical):
        # mirror image of the search above. The comparison must use
        # `> intersection_tolerance`; with `> -intersection_tolerance` an end
        # already touching a perpendicular edge would skip it and jump to
        # the following one.
        last = len(crossing_locs) - 1
        for i in range(last, -1, -1):
            if second_val_to_extend - crossing_locs[i] > intersection_tolerance:
                if i != last:
                    edge_to_extend[second_prop_to_extend] = crossing_locs[i + 1]
                break
21.6 KB
Binary file not shown.

tests/test_tables.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,3 +458,15 @@ def test_md_styles():
458458
tabs = page.find_tables()[0]
459459
text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~<br>~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**<br>**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**<br>**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~<br>~~**(3,1)**~~|Zelle (3,2)|\n\n"""
460460
assert tabs.to_markdown() == text
461+
462+
463+
def test_one_strat_text_the_other_strat_non_text():
    """
    Check table detection when only one of the two strategies is "text".

    Both combinations are run against the first page of the same resource
    PDF: horizontal "text" with vertical "lines_strict", and the reverse.
    Each must yield exactly one table with the expected cell contents.
    """
    filename = os.path.join(scriptdir, "resources", "text-lines-tables.pdf")
    doc = pymupdf.open(filename)
    page = doc[0]

    # Horizontal edges come from text, vertical edges from drawn lines.
    tabs = page.find_tables(
        horizontal_strategy="text", vertical_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["AAAA", "BBBB"], ["", ""], ["CCCC", "DDDD"]]

    # Vertical edges come from text, horizontal edges from drawn lines.
    # Use the `.tables` list here as well, so that `len()` and indexing act
    # on a plain list exactly as in the first case.
    tabs = page.find_tables(
        vertical_strategy="text", horizontal_strategy="lines_strict"
    ).tables
    assert len(tabs) == 1
    assert tabs[0].extract() == [["1111", "2222"], ["3333", "4444"]]

0 commit comments

Comments
 (0)