Skip to content

Commit df6fd36

Browse files
✨Allow plugins to register inline terminator characters (#391)
The inline `text` rule used a hardcoded, unexpandable set of terminator characters, forcing plugins that need to trigger on non-terminator characters (e.g. `w` for GFM `www.` autolinks) to resort to core-rule post-processing workarounds. ## Changes - **`parser_inline.py`**: Defines the default terminator set as the module-level `_DEFAULT_TERMINATORS` (a `frozenset[str]`) and tracks plugin-registered characters per `ParserInline` instance in `_extra_terminator_chars` (a `set[str]`), alongside a pre-compiled `terminator_re: re.Pattern[str]` attribute. Exposes `add_terminator_char(ch)` to extend the set; the regex is rebuilt eagerly only when a genuinely new character is added, keeping zero per-call overhead in the hot path. - **`rules_inline/text.py`**: Drops the module-level `_TerminatorChars` set and `@functools.cache`-decorated factory. The `text` rule now reads `state.md.inline.terminator_re` directly. - **`docs/contributing.md`**: Updates the "Why is my inline rule not executed?" FAQ to document the new API. ## Usage ```python def gfm_autolink_plugin(md: MarkdownIt) -> None: md.inline.add_terminator_char("w") md.inline.ruler.push("gfm_autolink_www", _www_rule) ``` Fully backward-compatible — the default terminator set is unchanged. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> Co-authored-by: Chris Sewell <chrisj_sewell@hotmail.com>
1 parent d4ea0ca commit df6fd36

4 files changed

Lines changed: 119 additions & 43 deletions

File tree

docs/contributing.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,12 @@ __Note:__ Don't try to replace text with HTML markup! That's not secure.
118118

119119
### Why is my inline rule not executed?
120120

121-
The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.mjs), which can be tokens. We did not made this list extensible for performance reasons too.
121+
The inline parser skips pieces of text to optimize speed. It stops only on [a small set of chars](https://github.com/executablebooks/markdown-it-py/blob/master/markdown_it/parser_inline.py), which can be tokens.
122122

123-
If you are absolutely sure that something important is missing there - create a
124-
ticket and we will consider adding it as a new charcode.
123+
If your inline rule needs to trigger on a character that is not in the default terminator set, you can register it via `md.inline.add_terminator_char`:
124+
125+
```python
126+
def my_plugin(md: MarkdownIt) -> None:
127+
md.inline.add_terminator_char("w") # stop text rule on 'w'
128+
md.inline.ruler.push("my_rule", my_inline_rule)
129+
```

markdown_it/parser_inline.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from __future__ import annotations
44

55
from collections.abc import Callable
6+
import functools
7+
import re
68
from typing import TYPE_CHECKING
79

810
from . import rules_inline
@@ -15,6 +17,47 @@
1517
from markdown_it import MarkdownIt
1618

1719

20+
# Default set of characters that terminate a text token and allow inline rules to fire.
21+
# '{}$%@~+=:' reserved for extensions.
22+
# Note: Don't confuse with "Markdown ASCII Punctuation" chars.
23+
# http://spec.commonmark.org/0.15/#ascii-punctuation-character
24+
_DEFAULT_TERMINATORS: frozenset[str] = frozenset(
25+
{
26+
"\n",
27+
"!",
28+
"#",
29+
"$",
30+
"%",
31+
"&",
32+
"*",
33+
"+",
34+
"-",
35+
":",
36+
"<",
37+
"=",
38+
">",
39+
"@",
40+
"[",
41+
"\\",
42+
"]",
43+
"^",
44+
"_",
45+
"`",
46+
"{",
47+
"}",
48+
"~",
49+
}
50+
)
51+
52+
53+
# The regex for the default terminator set is built on first use. functools.cache
# memoises that single result, so every ParserInline that has not registered extra
# characters shares one compiled pattern and __init__ stays cheap.
@functools.cache
def _default_terminator_re() -> re.Pattern[str]:
    """Return the compiled character-class regex matching any default terminator."""
    char_class = re.escape("".join(_DEFAULT_TERMINATORS))
    return re.compile(f"[{char_class}]")
59+
60+
1861
# Parser rules
1962
RuleFuncInlineType = Callable[[StateInline, bool], bool]
2063
"""(state: StateInline, silent: bool) -> matched: bool)
@@ -61,6 +104,30 @@ def __init__(self) -> None:
61104
self.ruler2 = Ruler[RuleFuncInline2Type]()
62105
for name, rule2 in _rules2:
63106
self.ruler2.push(name, rule2)
107+
# Characters that stop the text rule, allowing other inline rules to fire.
108+
# _extra_terminator_chars holds only chars registered via add_terminator_char()
109+
# that are outside the defaults; it starts out empty, keeping __init__ cheap.
110+
self._extra_terminator_chars: set[str] = set()
111+
# Pre-compiled regex shared with all default instances (no copy in the common path).
112+
self.terminator_re: re.Pattern[str] = _default_terminator_re()
113+
114+
def add_terminator_char(self, ch: str) -> None:
    """Register a character that stops the ``text`` rule, allowing inline rules to fire.

    This lets plugins declare which characters their inline rules react to,
    mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation.

    Characters that are already terminators (defaults or previously registered)
    are ignored, so repeated calls do not rebuild ``terminator_re``.

    :param ch: A single character to add to the terminator set. If a longer
        string is passed, each of its characters is registered individually;
        an empty string is a no-op.
    :raises TypeError: If ``ch`` is not a ``str``.
    """
    if not isinstance(ch, str):
        raise TypeError(
            f"terminator character must be a str, not {type(ch).__name__}"
        )

    # Work per character so the bookkeeping set only ever holds single chars
    # and the "already registered" check stays accurate.
    new_chars = set(ch) - _DEFAULT_TERMINATORS - self._extra_terminator_chars
    if not new_chars:
        return

    combined = _DEFAULT_TERMINATORS | self._extra_terminator_chars | new_chars
    # Compile before touching instance state so a failure cannot leave the
    # registered set and the regex out of sync. Sorting makes the pattern
    # text deterministic regardless of set iteration order.
    pattern = re.compile("[" + re.escape("".join(sorted(combined))) + "]")

    self._extra_terminator_chars.update(new_chars)
    self.terminator_re = pattern
64131

65132
def skipToken(self, state: StateInline) -> None:
66133
"""Skip single token by running all rules in validation mode;

markdown_it/rules_inline/text.py

Lines changed: 1 addition & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,15 @@
1-
import functools
2-
import re
3-
41
# Skip text characters for text token, place those to pending buffer
52
# and increment current pos
63
from .state_inline import StateInline
74

85
# Rule to skip pure text
9-
# '{}$%@~+=:' reserved for extensions
10-
11-
# !!!! Don't confuse with "Markdown ASCII Punctuation" chars
12-
# http://spec.commonmark.org/0.15/#ascii-punctuation-character
13-
14-
15-
_TerminatorChars = {
16-
"\n",
17-
"!",
18-
"#",
19-
"$",
20-
"%",
21-
"&",
22-
"*",
23-
"+",
24-
"-",
25-
":",
26-
"<",
27-
"=",
28-
">",
29-
"@",
30-
"[",
31-
"\\",
32-
"]",
33-
"^",
34-
"_",
35-
"`",
36-
"{",
37-
"}",
38-
"~",
39-
}
40-
41-
42-
@functools.cache
43-
def _terminator_char_regex() -> re.Pattern[str]:
44-
return re.compile("[" + re.escape("".join(_TerminatorChars)) + "]")
456

467

478
def text(state: StateInline, silent: bool) -> bool:
489
pos = state.pos
4910
posMax = state.posMax
5011

51-
terminator_char = _terminator_char_regex().search(state.src, pos)
12+
terminator_char = state.md.inline.terminator_re.search(state.src, pos)
5213
pos = terminator_char.start() if terminator_char else posMax
5314

5415
if pos == state.pos:

tests/test_api/test_plugin_creation.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,46 @@ def _plugin(_md: MarkdownIt) -> None:
8989

9090
MarkdownIt().use(_plugin).parse("a")
9191
assert "plugin called" in capsys.readouterr().out
92+
93+
94+
def test_add_terminator_char():
    """Check that a char registered via add_terminator_char interrupts the text rule."""
    fired_at = []

    def w_rule(state, silent):
        # Consume exactly one 'w' and remember where it was found.
        if state.src[state.pos] == "w":
            fired_at.append(state.pos)
            state.pos += 1
            return True
        return False

    def _plugin(_md: MarkdownIt) -> None:
        _md.inline.add_terminator_char("w")
        _md.inline.ruler.before("text", "w_rule", w_rule)

    parser = MarkdownIt().use(_plugin)

    # Without the terminator, 'w' would be consumed as plain text;
    # with it, the rule fires exactly once, for the 'w' at position 1 in "awb".
    parser.render("awb")
    assert fired_at == [1]
115+
116+
117+
def test_add_terminator_char_idempotent():
    """Registering a char that is already a terminator must leave the regex untouched."""
    parser = MarkdownIt()
    regex_before = parser.inline.terminator_re

    # "\n" belongs to the default set, so this call has nothing new to add.
    parser.inline.add_terminator_char("\n")
    assert parser.inline.terminator_re is regex_before
125+
126+
127+
def test_add_terminator_char_rebuilds():
128+
"""add_terminator_char with a new char should rebuild the regex."""
129+
md = MarkdownIt()
130+
original_re = md.inline.terminator_re
131+
132+
md.inline.add_terminator_char("w")
133+
assert md.inline.terminator_re is not original_re
134+
assert "w" in md.inline._extra_terminator_chars

0 commit comments

Comments
 (0)