From 81db75107a06ef93664a294965e3f417b8aab2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Wed, 27 May 2026 13:41:17 +0300 Subject: [PATCH] Fix bug in to_wikitext() with caption attributes When translating a WikiNode TABLE_CAPTION with attributes back into wikitext, we omitted a final `|` pipe character necessary for proper parsing of caption attributes. Note that this is not necessary for TABLE attributes. This bug was only recently discovered because we didn't do anything with table captions before... --- src/wikitextprocessor/node_expand.py | 5 ++++- src/wikitextprocessor/parser.py | 10 +++++++--- tests/test_node_expand.py | 8 +++++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/wikitextprocessor/node_expand.py b/src/wikitextprocessor/node_expand.py index 419932ec..825b7af0 100644 --- a/src/wikitextprocessor/node_expand.py +++ b/src/wikitextprocessor/node_expand.py @@ -139,7 +139,10 @@ def recurse(node: Union[GeneralNode, WikiNodeListArgs]) -> str: parts.append(recurse(node.children)) parts.append("\n|}\n") elif kind == NodeKind.TABLE_CAPTION: - parts.append("\n|+ {}\n".format(to_attrs(node))) + if tc_attrs := to_attrs(node): + parts.append("\n|+ {} |\n".format(tc_attrs)) + else: + parts.append("\n|+\n") parts.append(recurse(node.children)) elif kind == NodeKind.TABLE_ROW: parts.append("\n|- {}\n".format(to_attrs(node))) diff --git a/src/wikitextprocessor/parser.py b/src/wikitextprocessor/parser.py index 59b84dfd..a8fac10c 100644 --- a/src/wikitextprocessor/parser.py +++ b/src/wikitextprocessor/parser.py @@ -1414,7 +1414,8 @@ def table_start_fn(ctx: "Wtp", token: str) -> None: # something=other, something="other", something = 'other' attr_assignment_pair = ( - r"""\s*[^"'>/=\0-\037\s]+""" r"""\s*=\s*("[^"]*"|'[^']*'|[^"'<>`\s]+)""" + r"""\s*[^"'>/=\0-\037\s]+""" + r"""\s*=\s*("[^"]*"|'[^']*'|[^"'<>`\s]+)""" ) attr_assignments_re = re.compile( @@ -1428,7 +1429,7 @@ def check_for_attributes(ctx: "Wtp", node: WikiNode) -> tuple[bool, str]: # Old behavior added here to return earlier without needing # to use regex matching; if the old version worked, why not? - # If this fail, then resort to the reverse parsing + regex. + # If this fails, then resort to the reverse parsing + regex. _parser_merge_str_children(ctx) if len(node.children) == 1 and isinstance(node.children[0], str): ret = node.children.pop() @@ -1623,7 +1624,10 @@ def table_cell_fn(ctx: "Wtp", token: str) -> None: if len(node.children) == 1 and isinstance( attrs := node.children[0], str ): - node.children.pop() + # At this point of parsing, we're just behind the start + # of one of the above node types; if they are followed + # by a `|`, that means the first child is an attr section + node.children.pop(0) # Using the walrus operator and pop()ing without return # is just to make the type-checker happy without using # an assert that attrs is definitely a str... diff --git a/tests/test_node_expand.py b/tests/test_node_expand.py index 3fa1ba50..5b5f38db 100644 --- a/tests/test_node_expand.py +++ b/tests/test_node_expand.py @@ -154,7 +154,13 @@ def test_table2(self): self.backcvt('{| class="x"\n|}', '\n{| class="x"\n\n|}\n') def test_tablecaption1(self): - self.backcvt("{|\n|+\ncapt\n|}", "\n{| \n\n|+ \n\ncapt\n\n|}\n") + self.backcvt("{|\n|+\ncapt\n|}", "\n{| \n\n|+\n\ncapt\n\n|}\n") + + def test_tablecaption2(self): + self.backcvt( + "{|\n |+ class='foo' |\ncapt\n|}", + '\n{| \n\n|+ class="foo" |\n\ncapt\n\n|}\n', + ) def test_tablerowcell1(self): self.backcvt(