Skip to content

Commit ecb885e

Browse files
Do not linktrail if following text is not [a-z]?
See wiktextract issue #1604 (tatuylonen/wiktextract#1604) and https://en.wikipedia.org/wiki/Help:Wikitext#Blend_link. This should not be merged as is, because it will create problems in other extractors that might rely on different behavior. In the best-case scenario, there might be two different camps: 1) languages that use spaces and want to do linktrailing; 2) languages without spaces that can't do linktrailing. If this is the case, we might be able to get away with a kludge that checks whether the script of the last character in the link matches the script of the first character after the link.
1 parent 9905b1f commit ecb885e

3 files changed

Lines changed: 17 additions & 8 deletions

File tree

src/wikitextprocessor/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
10291029
and not node.children[-1].children
10301030
and not ctx.suppress_special
10311031
):
1032-
m = re.match(r"(?s)(\w+)(.*)", token)
1032+
m = re.match(r"(?s)([a-z]+)(.*)", token)
10331033
if m:
10341034
node.children[-1].children.append(m.group(1))
10351035
token = m.group(2)

src/wikitextprocessor/parserfns.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from collections.abc import Callable, Sequence
1010
from datetime import datetime, timezone
1111
from pathlib import Path
12-
from typing import TYPE_CHECKING, Optional, Union
12+
from typing import TYPE_CHECKING, Any, Optional, Union
1313

1414
import dateparser
1515

@@ -1185,7 +1185,7 @@ def parse_timestamp(
11851185
if not dt:
11861186
dt = "now"
11871187

1188-
settings: dateparser._Settings = {"RETURN_AS_TIMEZONE_AWARE": True}
1188+
settings: dict[str, Any] = {"RETURN_AS_TIMEZONE_AWARE": True}
11891189
if loc in ("", "0"):
11901190
dt += " UTC"
11911191

@@ -1206,15 +1206,15 @@ def parse_timestamp(
12061206
# php's strtotime() (which is the original function used)
12071207
# but we can handle special cases here and hope
12081208
# people on wiktionary don't go crazy with weird formatting
1209-
t = dateparser.parse(dt, settings=settings)
1209+
t = dateparser.parse(dt, settings=settings) # type: ignore
12101210
if t is None:
12111211
m = re.match(
12121212
r"([^+]*)\s*(\+\s*\d+\s*(day|year|month)s?)\s*$", orig_dt
12131213
)
12141214
if m:
1215-
main_date = dateparser.parse(m.group(1), settings=settings)
1216-
add_time = dateparser.parse(m.group(2), settings=settings)
1217-
now = dateparser.parse("now", settings=settings)
1215+
main_date = dateparser.parse(m.group(1), settings=settings) # type: ignore
1216+
add_time = dateparser.parse(m.group(2), settings=settings) # type: ignore
1217+
now = dateparser.parse("now", settings=settings) # type: ignore
12181218
if main_date and add_time is not None and now is not None:
12191219
# this is just a kludge: dateparser parses "+2 days" as
12201220
# "2 days AGO". The now-datetime object is used to check

tests/test_parser.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1101,7 +1101,7 @@ def test_link12(self):
11011101
self.assertEqual(link.kind, NodeKind.LINK)
11021102
self.assertEqual(link.largs, [["foo"], ["\n[bar"]])
11031103

1104-
def test_link_trailing(self):
1104+
def test_link_trailing_1(self):
11051105
tree = self.parse("test", "[[Help]]ing heal")
11061106
self.assertEqual(len(tree.children), 2)
11071107
a, b = tree.children
@@ -1110,6 +1110,15 @@ def test_link_trailing(self):
11101110
self.assertEqual(a.children, ["ing"])
11111111
self.assertEqual(b, " heal")
11121112

1113+
def test_link_trailing_not_latin(self):
1114+
tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
1115+
self.assertEqual(len(tree.children), 2)
1116+
a, b = tree.children
1117+
self.assertEqual(a.kind, NodeKind.LINK)
1118+
self.assertEqual(a.largs, [["appellāre"]])
1119+
self.assertEqual(a.children, [])
1120+
self.assertEqual(b, "の直説法所相現在第 foo")
1121+
11131122
def test_url1(self):
11141123
tree = self.parse("test", "this https://wikipedia.com link")
11151124
self.assertEqual(len(tree.children), 3)

0 commit comments

Comments
 (0)