diff --git a/src/table.py b/src/table.py index 4d95ffe57..3de8b2c86 100644 --- a/src/table.py +++ b/src/table.py @@ -89,18 +89,129 @@ Matrix, TEXTFLAGS_TEXT, TEXT_FONT_BOLD, + TEXT_FONT_ITALIC, + TEXT_FONT_MONOSPACED, TEXT_FONT_SUPERSCRIPT, + TEXT_COLLECT_STYLES, TOOLS, EMPTY_RECT, sRGB_to_pdf, Point, message, + mupdf, ) EDGES = [] # vector graphics from PyMuPDF CHARS = [] # text characters from PyMuPDF TEXTPAGE = None +TEXT_BOLD = mupdf.FZ_STEXT_BOLD +TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT +FLAGS = TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES + white_spaces = set(string.whitespace) # for checking white space only cells + + +def extract_cells(textpage, cell, markdown=False): + """Extract text from a rect-like 'cell' as plain or MD style text. + + This function should ultimately be used to extract text from a table cell. + Markdown output will only work correctly if extraction flag bit + TEXT_COLLECT_STYLES is set. + + Args: + textpage: A PyMuPDF TextPage object. Must have been created with + TEXTFLAGS_TEXT | TEXT_COLLECT_STYLES. + cell: A tuple (x0, y0, x1, y1) defining the cell's bbox. + markdown: If True, return text formatted for Markdown. + + Returns: + A string with the text extracted from the cell. + """ + text = "" + for block in textpage.extractRAWDICT()["blocks"]: + if block["type"] != 0: + continue + block_bbox = block["bbox"] + if ( + 0 + or block_bbox[0] > cell[2] + or block_bbox[2] < cell[0] + or block_bbox[1] > cell[3] + or block_bbox[3] < cell[1] + ): + continue # skip block outside cell + for line in block["lines"]: + lbbox = line["bbox"] + if ( + 0 + or lbbox[0] > cell[2] + or lbbox[2] < cell[0] + or lbbox[1] > cell[3] + or lbbox[3] < cell[1] + ): + continue # skip line outside cell + + if text: # must be a new line in the cell + text += "
" if markdown else "\n" + + # strikeout detection only works with horizontal text + horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0) + + for span in line["spans"]: + sbbox = span["bbox"] + if ( + 0 + or sbbox[0] > cell[2] + or sbbox[2] < cell[0] + or sbbox[1] > cell[3] + or sbbox[3] < cell[1] + ): + continue # skip spans outside cell + + # only include chars with more than 50% bbox overlap + span_text = "" + for char in span["chars"]: + bbox = Rect(char["bbox"]) + if abs(bbox & cell) > 0.5 * abs(bbox): + span_text += char["c"] + + if not span_text: + continue # skip empty span + + if not markdown: # no MD styling + text += span_text + continue + + prefix = "" + suffix = "" + if horizontal and span["char_flags"] & TEXT_STRIKEOUT: + prefix += "~~" + suffix = "~~" + suffix + if span["char_flags"] & TEXT_BOLD: + prefix += "**" + suffix = "**" + suffix + if span["flags"] & TEXT_FONT_ITALIC: + prefix += "_" + suffix = "_" + suffix + if span["flags"] & TEXT_FONT_MONOSPACED: + prefix += "`" + suffix = "`" + suffix + + if len(span["chars"]) > 2: + span_text = span_text.rstrip() + + # if span continues previous styling: extend cell text + if (ls := len(suffix)) and text.endswith(suffix): + text = text[:-ls] + span_text + suffix + else: # append the span with new styling + if not span_text.strip(): + text += " " + else: + text += prefix + span_text + suffix + + return text.strip() + + # ------------------------------------------------------------------- # End of PyMuPDF interface code # ------------------------------------------------------------------- @@ -1382,7 +1493,18 @@ def to_markdown(self, clean=False, fill_empty=True): output = "|" rows = self.row_count cols = self.col_count - cells = self.extract()[:] # make local copy of table text content + + # cell coordinates + cell_boxes = [[c for c in r.cells] for r in self.rows] + + # cell text strings + cells = [[None for i in range(cols)] for j in range(rows)] + for i, row in enumerate(cell_boxes): + for j, cell 
in enumerate(row): + if cell is not None: + cells[i][j] = extract_cells( + TEXTPAGE, cell_boxes[i][j], markdown=True + ) if fill_empty: # fill "None" cells where possible @@ -1420,7 +1542,8 @@ def to_markdown(self, clean=False, fill_empty=True): for i, cell in enumerate(row): # replace None cells with empty string # use HTML line break tag - cell = "" if not cell else cell.replace("\n", "
") + if cell is None: + cell = "" if clean: # remove sensitive syntax cell = html.escape(cell.replace("-", "-")) line += cell + "|" @@ -1944,7 +2067,7 @@ def make_chars(page, clip=None): page_number = page.number + 1 page_height = page.rect.height ctm = page.transformation_matrix - TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT) + TEXTPAGE = page.get_textpage(clip=clip, flags=FLAGS) blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"] doctop_base = page_height * page.number for block in blocks: diff --git a/tests/conftest.py b/tests/conftest.py index 7e123435e..4017de580 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import copy import os import platform import sys @@ -45,6 +46,17 @@ def get_fds(): JM_annot_id_stem = pymupdf.JM_annot_id_stem + def get_members(a): + ret = dict() + for n in dir(a): + if not n.startswith('_'): + v = getattr(a, n) + ret[n] = v + return ret + + # Allow post-test checking that pymupdf._globals has not changed. + _globals_pre = get_members(pymupdf._globals) + # Run the test. rep = yield @@ -59,6 +71,11 @@ def get_fds(): assert not pymupdf.TOOLS.set_small_glyph_heights() + _globals_post = get_members(pymupdf._globals) + if _globals_post != _globals_pre: + print(f'Test has changed pymupdf._globals from {_globals_pre=} to {_globals_post=}') + assert 0 + log_items = pymupdf._log_items() assert not log_items, f'log() was called; {len(log_items)=}.' @@ -84,3 +101,22 @@ def get_fds(): if next_fd_after != next_fd_before: print(f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.') #assert 0, f'Test has leaked fds, {next_fd_before=} {next_fd_after=}. {args=} {kwargs=}.' + + if 0: + # This code can be useful to track down test failures caused by other + # tests modifying global state. + # + # We run a particular test manually after each test returns. 
+ sys.path.insert(0, os.path.dirname(__file__)) + try: + import test_tables + finally: + del sys.path[0] + print(f'### Calling test_tables.test_md_styles().') + try: + test_tables.test_md_styles() + except Exception as e: + print(f'### test_tables.test_md_styles() failed: {e}') + raise + else: + print(f'### test_tables.test_md_styles() passed.') diff --git a/tests/resources/test-styled-table.pdf b/tests/resources/test-styled-table.pdf new file mode 100644 index 000000000..67f7e8a49 Binary files /dev/null and b/tests/resources/test-styled-table.pdf differ diff --git a/tests/test_font.py b/tests/test_font.py index 4d5958cd9..d9f38f5d8 100644 --- a/tests/test_font.py +++ b/tests/test_font.py @@ -139,25 +139,28 @@ def test_mupdf_subset_fonts2(): def test_3677(): pymupdf.TOOLS.set_subset_fontnames(True) - path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf') - font_names_expected = [ - 'BCDEEE+Aptos', - 'BCDFEE+Aptos', - 'BCDGEE+Calibri-Light', - 'BCDHEE+Calibri-Light', - ] - font_names = list() - with pymupdf.open(path) as document: - for page in document: - for block in page.get_text('dict')['blocks']: - if block['type'] == 0: - if 'lines' in block.keys(): - for line in block['lines']: - for span in line['spans']: - font_name=span['font'] - print(font_name) - font_names.append(font_name) - assert font_names == font_names_expected, f'{font_names=}' + try: + path = os.path.abspath(f'{__file__}/../../tests/resources/test_3677.pdf') + font_names_expected = [ + 'BCDEEE+Aptos', + 'BCDFEE+Aptos', + 'BCDGEE+Calibri-Light', + 'BCDHEE+Calibri-Light', + ] + font_names = list() + with pymupdf.open(path) as document: + for page in document: + for block in page.get_text('dict')['blocks']: + if block['type'] == 0: + if 'lines' in block.keys(): + for line in block['lines']: + for span in line['spans']: + font_name=span['font'] + print(font_name) + font_names.append(font_name) + assert font_names == font_names_expected, f'{font_names=}' + finally: + 
pymupdf.TOOLS.set_subset_fontnames(False) def test_3933(): diff --git a/tests/test_pylint.py b/tests/test_pylint.py index 0f0c40fa9..a3b48ae6f 100644 --- a/tests/test_pylint.py +++ b/tests/test_pylint.py @@ -37,6 +37,7 @@ def test_pylint(): W1309: Using an f-string that does not have any interpolated variables (f-string-without-interpolation) R1734: Consider using [] instead of list() (use-list-literal) R1727: Boolean condition '0 and g_exceptions_verbose' will always evaluate to '0' (condition-evals-to-constant) + R1726: (simplifiable-condition) ''' ) diff --git a/tests/test_tables.py b/tests/test_tables.py index 4fb959f4c..745369c06 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -423,3 +423,13 @@ def test_4017(): ["Weighted Average Life", "4.83", "<=", "9.00", "", "PASS", "4.92"], ] assert tables[-1].extract() == expected_b + + +def test_md_styles(): + """Test output of table with MD-styled cells.""" + filename = os.path.join(scriptdir, "resources", "test-styled-table.pdf") + doc = pymupdf.open(filename) + page = doc[0] + tabs = page.find_tables()[0] + text = """|Column 1|Column 2|Column 3|\n|---|---|---|\n|Zelle (0,0)|**Bold (0,1)**|Zelle (0,2)|\n|~~Strikeout (1,0), Zeile 1~~
~~Hier kommt Zeile 2.~~|Zelle (1,1)|~~Strikeout (1,2)~~|\n|**`Bold-monospaced`**
**`(2,0)`**|_Italic (2,1)_|**_Bold-italic_**
**_(2,2)_**|\n|Zelle (3,0)|~~**Bold-strikeout**~~
~~**(3,1)**~~|Zelle (3,2)|\n\n""" + assert tabs.to_markdown() == text