Skip to content

Commit 9d9a410

Browse files
Merge pull request #414 from tatuylonen/linktrailing
Make linktrailing regex Wtp attribute.
2 parents 59dc20b + 980bb47 commit 9d9a410

5 files changed

Lines changed: 32 additions & 9 deletions

File tree

.github/workflows/lint.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,3 @@ jobs:
2424
- run: python -m mypy -p wikitextprocessor
2525
- run: python -m ruff check .
2626
- run: python -m ruff format --diff .
27-
- uses: crate-ci/typos@v1

src/wikitextprocessor/core.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ class Wtp:
282282
"notes", # NOTE error messages
283283
"wiki_notices", # WIKI error messages
284284
"wikidata_session",
285+
"linktrailing_re",
285286
)
286287

287288
def __init__(
@@ -355,6 +356,14 @@ def __init__(
355356
if not quiet:
356357
logger.setLevel(logging.DEBUG)
357358
self.wikidata_session: Session | None = None
359+
# Default regex pattern, will sometimes cause trouble.
360+
# Linktrailing is when you have [[a li]]nk that consumes the
361+
# trailing suffix so that the whole word is blue. Languages
362+
# without spaces, like Japanese, should use the English
363+
# [a-z] pattern, other languages their own if `\w+` actually
364+
# causes problems in them.
365+
# Will be modified later in wiktextract wxr through WiktionaryConfig.
366+
self.linktrailing_re = re.compile(r"(?s)(\w+)(.*)")
358367

359368
def create_db(self) -> None:
360369
from .wikidata import init_wikidata_cache

src/wikitextprocessor/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1029,7 +1029,7 @@ def text_fn(ctx: "Wtp", token: str) -> None:
10291029
and not node.children[-1].children
10301030
and not ctx.suppress_special
10311031
):
1032-
m = re.match(r"(?s)(\w+)(.*)", token)
1032+
m = ctx.linktrailing_re.match(token)
10331033
if m:
10341034
node.children[-1].children.append(m.group(1))
10351035
token = m.group(2)

src/wikitextprocessor/parserfns.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from collections.abc import Callable, Sequence
1010
from datetime import datetime, timezone
1111
from pathlib import Path
12-
from typing import TYPE_CHECKING, Optional, Union
12+
from typing import TYPE_CHECKING, Any, Optional, Union
1313

1414
import dateparser
1515

@@ -1187,7 +1187,7 @@ def parse_timestamp(
11871187
if not dt:
11881188
dt = "now"
11891189

1190-
settings: dateparser._Settings = {"RETURN_AS_TIMEZONE_AWARE": True}
1190+
settings: dict[str, Any] = {"RETURN_AS_TIMEZONE_AWARE": True}
11911191
if loc in ("", "0"):
11921192
dt += " UTC"
11931193

@@ -1208,15 +1208,15 @@ def parse_timestamp(
12081208
# php's strtotime() (which is the original function used)
12091209
# but we can handle special cases here and hope
12101210
# people on wiktionary don't go crazy with weird formatting
1211-
t = dateparser.parse(dt, settings=settings)
1211+
t = dateparser.parse(dt, settings=settings) # type: ignore
12121212
if t is None:
12131213
m = re.match(
12141214
r"([^+]*)\s*(\+\s*\d+\s*(day|year|month)s?)\s*$", orig_dt
12151215
)
12161216
if m:
1217-
main_date = dateparser.parse(m.group(1), settings=settings)
1218-
add_time = dateparser.parse(m.group(2), settings=settings)
1219-
now = dateparser.parse("now", settings=settings)
1217+
main_date = dateparser.parse(m.group(1), settings=settings) # type: ignore
1218+
add_time = dateparser.parse(m.group(2), settings=settings) # type: ignore
1219+
now = dateparser.parse("now", settings=settings) # type: ignore
12201220
if main_date and add_time is not None and now is not None:
12211221
# this is just a kludge: dateparser parses "+2 days" as
12221222
# "2 days AGO". The now-datetime object is used to check

tests/test_parser.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
44

5+
import re
56
import unittest
67

78
from wikitextprocessor import Wtp
@@ -1101,7 +1102,7 @@ def test_link12(self):
11011102
self.assertEqual(link.kind, NodeKind.LINK)
11021103
self.assertEqual(link.largs, [["foo"], ["\n[bar"]])
11031104

1104-
def test_link_trailing(self):
1105+
def test_link_trailing_1(self):
11051106
tree = self.parse("test", "[[Help]]ing heal")
11061107
self.assertEqual(len(tree.children), 2)
11071108
a, b = tree.children
@@ -1110,6 +1111,20 @@ def test_link_trailing(self):
11101111
self.assertEqual(a.children, ["ing"])
11111112
self.assertEqual(b, " heal")
11121113

1114+
def test_link_trailing_not_latin(self):
1115+
_linktrailing_re = self.ctx.linktrailing_re
1116+
# Normally this alternative pattern would be provided by Wiktextract's
1117+
# WiktionaryConfig or something similar.
1118+
self.ctx.linktrailing_re = re.compile(r"(?s)([a-z]+)(.*)")
1119+
tree = self.parse("test", "[[appellāre]]の直説法所相現在第 foo")
1120+
self.ctx.linktrailing_re = _linktrailing_re
1121+
self.assertEqual(len(tree.children), 2)
1122+
a, b = tree.children
1123+
self.assertEqual(a.kind, NodeKind.LINK)
1124+
self.assertEqual(a.largs, [["appellāre"]])
1125+
self.assertEqual(a.children, [])
1126+
self.assertEqual(b, "の直説法所相現在第 foo")
1127+
11131128
def test_url1(self):
11141129
tree = self.parse("test", "this https://wikipedia.com link")
11151130
self.assertEqual(len(tree.children), 3)

0 commit comments

Comments
 (0)