|
81 | 81 | from dataclasses import dataclass |
82 | 82 | from operator import itemgetter |
83 | 83 | import weakref |
| 84 | +import pathlib |
| 85 | + |
84 | 86 | import pymupdf |
85 | 87 | from pymupdf import mupdf |
86 | | -import pathlib |
87 | 88 |
|
88 | 89 | # ------------------------------------------------------------------- |
89 | 90 | # Start of PyMuPDF interface code |
|
94 | 95 | # Optionally use the TGIF table grid finder. |
95 | 96 | # This replaces fz_find_table_within_bounds. |
96 | 97 | USE_TGIF = os.getenv("USE_TGIF", "0") |
| 98 | +EXTRACTOR_V4 = None # Keep pylint happy. |
97 | 99 | if USE_TGIF == "1": |
98 | 100 | print("Using TGIFVx for table grid extraction.") |
99 | | - import pymupdf.tgif |
| 101 | + import pymupdf.tgif # pylint: disable=import-error |
100 | 102 | elif USE_TGIF == "4": |
101 | 103 | print("Using TGEV4 for table grid extraction.") |
102 | | - import numpy as np |
103 | 104 | from pymupdf.TableGridExtractorV4 import TableGridExtractorV4 |
104 | 105 |
|
105 | 106 | EXTRACTOR_V4 = TableGridExtractorV4( |
|
112 | 113 | # filter_empty_lines=not args.no_filter_empty, |
113 | 114 | ) |
114 | 115 | else: |
115 | | - print("Using legacy table grid extraction.") |
| 116 | + if os.environ.get('PYMUPDF_LEGACY_TABLE_DIAGNOSTIC') != '0': |
| 117 | + print("Using legacy table grid extraction.") |
116 | 118 |
|
117 | 119 | EDGES = [] # vector graphics from PyMuPDF |
118 | 120 | CHARS = [] # text characters from PyMuPDF |
@@ -191,14 +193,14 @@ def get_table_cells_from_rect_tgif1(page, word_rects, rect): |
191 | 193 | bound = mupdf.FzRect(*rect) |
192 | 194 |
|
193 | 195 | try: |
194 | | - r, xpos, ypos = pymupdf.tgif.fz_visual_table_grid_finder(page, bound) |
| 196 | + r, xpos, ypos = pymupdf.tgif.fz_visual_table_grid_finder(page, bound) # pylint: disable=no-member |
195 | 197 | x_count = int(xpos.m_internal.len) |
196 | 198 | x_values = [xpos.list(i).pos for i in range(x_count)] |
197 | 199 | y_count = int(ypos.m_internal.len) |
198 | 200 | y_values = [ypos.list(i).pos for i in range(y_count)] |
199 | 201 | if xpos.m_internal.max_uncertainty > 0 or ypos.m_internal.max_uncertainty > 0: |
200 | 202 | print(f"{page.number=}: grid with uncertainty for {bound=}") |
201 | | - except: |
| 203 | + except Exception: |
202 | 204 | return cells |
203 | 205 | for i in range(y_count - 1): |
204 | 206 | for j in range(x_count - 1): |
@@ -2793,11 +2795,11 @@ def find_tables( |
2793 | 2795 | if my_boxes: |
2794 | 2796 | word_rects = [pymupdf.Rect(w[:4]) for w in TEXTPAGE.extractWORDS()] |
2795 | 2797 | tp2 = page.get_textpage(flags=TABLE_DETECTOR_FLAGS) |
2796 | | - for rect in my_boxes: |
2797 | | - cells = make_table_from_bbox( |
2798 | | - page, tp2, word_rects, rect |
2799 | | - ) # pylint: disable=E0606 |
2800 | | - tbf.tables.append(Table(page, cells)) |
| 2798 | + for rect in my_boxes: |
| 2799 | + cells = make_table_from_bbox( |
| 2800 | + page, tp2, word_rects, rect |
| 2801 | + ) # pylint: disable=E0606 |
| 2802 | + tbf.tables.append(Table(page, cells)) |
2801 | 2803 | except Exception as e: |
2802 | 2804 | pymupdf.message("find_tables: exception occurred: %s" % str(e)) |
2803 | 2805 | return None |
|
0 commit comments