|
79 | 79 | from collections.abc import Sequence |
80 | 80 | from dataclasses import dataclass |
81 | 81 | from operator import itemgetter |
| 82 | +from typing import Literal |
82 | 83 | import weakref |
83 | 84 | import pymupdf |
84 | 85 | from pymupdf import mupdf |
@@ -2036,6 +2037,16 @@ def __init__(self, page, settings=None): |
2036 | 2037 | self.textpage = None |
2037 | 2038 | self.settings = TableSettings.resolve(settings) |
2038 | 2039 | self.edges = self.get_edges() |
| 2040 | + if ( |
| 2041 | + self.settings.horizontal_strategy == "text" |
| 2042 | + and self.settings.vertical_strategy != "text" |
| 2043 | + ): |
| 2044 | + extend_edges(self.edges, "h", self.settings.intersection_x_tolerance) |
| 2045 | + elif ( |
| 2046 | + self.settings.vertical_strategy == "text" |
| 2047 | + and self.settings.horizontal_strategy != "text" |
| 2048 | + ): |
| 2049 | + extend_edges(self.edges, "v", self.settings.intersection_y_tolerance) |
2039 | 2050 | self.intersections = edges_to_intersections( |
2040 | 2051 | self.edges, |
2041 | 2052 | self.settings.intersection_x_tolerance, |
@@ -2726,3 +2737,68 @@ def find_tables( |
2726 | 2737 | for table in tbf.tables: |
2727 | 2738 | table.textpage = TEXTPAGE |
2728 | 2739 | return tbf |
| 2740 | + |
| 2741 | + |
def extend_edges(
    edges: list,
    extend_orientation: Literal["h", "v"],
    intersection_tolerance: float,
) -> None:
    """
    Extend edges of one orientation to the nearest perpendicular edges.

    Every edge in ``edges`` is a dict with at least the keys
    ``"orientation"`` (``"h"`` or ``"v"``), ``"x0"``, ``"x1"``, ``"top"``
    and ``"bottom"``. Edges whose orientation equals ``extend_orientation``
    are modified in place; the perpendicular edges are only read.

    For each edge to extend, the perpendicular edges whose extent covers
    the edge's position (within ``intersection_tolerance``) are collected.
    If there are at least two of them, then

    * the edge's start (``x0`` for horizontal, ``top`` for vertical edges)
      is moved to the closest perpendicular edge lying at or before the
      start, and
    * the edge's end (``x1`` / ``bottom``) is moved to the closest
      perpendicular edge lying at or after the end.

    "At" means within ``intersection_tolerance`` on either side, so an edge
    that already touches a perpendicular edge is snapped to it instead of
    being pushed on to the next one. A side is left untouched when every
    perpendicular edge lies on the same side of that end point, i.e. the
    edge is not enclosed on that side.

    Args:
        edges: list of edge dicts; modified in place.
        extend_orientation: ``"h"`` to extend horizontal edges; any other
            value extends vertical edges.
        intersection_tolerance: maximum distance at which two coordinates
            are treated as coincident.

    Returns:
        None. The matching dicts in ``edges`` are updated in place.
    """
    if extend_orientation == "h":
        target_orientation, perp_orientation = "h", "v"
        start_key, end_key = "x0", "x1"
        loc_key = "top"  # where a horizontal edge sits
        perp_loc_key = "x0"  # where a vertical edge sits
        perp_start_key, perp_end_key = "top", "bottom"
    else:
        target_orientation, perp_orientation = "v", "h"
        start_key, end_key = "top", "bottom"
        loc_key = "x0"
        perp_loc_key = "top"
        perp_start_key, perp_end_key = "x0", "x1"

    targets = [e for e in edges if e["orientation"] == target_orientation]
    # The neighbour lookups below rely on the perpendicular edges being
    # ordered by their position along the direction of extension.
    perpendicular = sorted(
        (e for e in edges if e["orientation"] == perp_orientation),
        key=itemgetter(perp_loc_key),
    )

    for edge in targets:
        loc = edge[loc_key]
        # Positions of the perpendicular edges whose extent reaches this edge.
        crossing = [
            other[perp_loc_key]
            for other in perpendicular
            if loc - other[perp_end_key] <= intersection_tolerance
            and other[perp_start_key] - loc <= intersection_tolerance
        ]
        if len(crossing) < 2:
            continue

        start, end = edge[start_key], edge[end_key]

        # Start side (left / top): find the first perpendicular edge clearly
        # beyond the start; its predecessor is the nearest one at or before it.
        for i, pos in enumerate(crossing):
            if start - pos < -intersection_tolerance:
                if i > 0:
                    edge[start_key] = crossing[i - 1]
                break

        # End side (right / bottom): mirror image of the above. Find the last
        # perpendicular edge clearly before the end; its successor is the
        # nearest one at or after it.
        last = len(crossing) - 1
        for i in range(last, -1, -1):
            if end - crossing[i] > intersection_tolerance:
                if i < last:
                    edge[end_key] = crossing[i + 1]
                break
0 commit comments