✨Allow plugins to register inline terminator characters (#391)

Copilot · chrisjsewell · web-flow · commit df6fd361099c · 2026-05-06T17:09:32.000+02:00
The inline `text` rule used a hardcoded, unexpandable set of terminator characters, forcing plugins that need to trigger on non-terminator characters (e.g. `w` for GFM `www.` autolinks) to resort to core-rule post-processing workarounds. ## Changes - **`parser_inline.py`**: Moves the default terminator set into this module as `_DEFAULT_TERMINATORS` (a `frozenset[str]`) and tracks per-parser additions on `ParserInline` as `_extra_terminator_chars` (a `set[str]`), with a pre-compiled `terminator_re: re.Pattern[str]` attribute that starts out as the shared, `@functools.cache`-backed default pattern. Exposes `add_terminator_char(ch)` to extend the set; the regex is rebuilt eagerly only when a genuinely new character is added, keeping zero per-call overhead in the hot path. - **`rules_inline/text.py`**: Drops the module-level `_TerminatorChars` set and `@functools.cache`-decorated factory (both now live in `parser_inline.py`). The `text` rule now reads `state.md.inline.terminator_re` directly. - **`docs/contributing.md`**: Updates the "Why is my inline rule not executed?" FAQ to document the new API. ## Usage ```python def gfm_autolink_plugin(md: MarkdownIt) -> None: md.inline.add_terminator_char("w") md.inline.ruler.push("gfm_autolink_www", _www_rule) ``` Fully backward-compatible — the default terminator set is unchanged. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: chrisjsewell <2997570+chrisjsewell@users.noreply.github.com> Co-authored-by: Chris Sewell <chrisj_sewell@hotmail.com>
diff --git a/docs/contributing.md b/docs/contributing.md
@@ -118,7 +118,12 @@ __Note:__ Don't try to replace text with HTML markup! That's not secure.
 
 ### Why is my inline rule not executed?
 
-The inline parser skips pieces of texts to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.mjs), which can be tokens. We did not made this list extensible for performance reasons too.
+The inline parser skips over runs of plain text to optimize speed. It stops only on [a small set of characters](https://github.com/executablebooks/markdown-it-py/blob/master/markdown_it/parser_inline.py), each of which may start a token.
 
-If you are absolutely sure that something important is missing there - create a
-ticket and we will consider adding it as a new charcode.
+If your inline rule needs to trigger on a character that is not in the default terminator set, you can register it via `md.inline.add_terminator_char`:
+
+```python
+def my_plugin(md: MarkdownIt) -> None:
+    md.inline.add_terminator_char("w")  # stop text rule on 'w'
+    md.inline.ruler.push("my_rule", my_inline_rule)
+```
diff --git a/markdown_it/parser_inline.py b/markdown_it/parser_inline.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 from collections.abc import Callable
+import functools
+import re
 from typing import TYPE_CHECKING
 
 from . import rules_inline
@@ -15,6 +17,47 @@
     from markdown_it import MarkdownIt
 
 
+# Characters on which the ``text`` rule stops by default, giving the other
+# inline rules a chance to run at that position.
+# '{}$%@~+=:' are reserved for extensions.
+# Note: this is NOT the "Markdown ASCII Punctuation" set, see
+# http://spec.commonmark.org/0.15/#ascii-punctuation-character
+#
+# Written as adjacent string literals (grouped in ASCII order); iterating the
+# concatenated string yields one set member per character.
+_DEFAULT_TERMINATORS: frozenset[str] = frozenset(
+    "\n"
+    "!#$%&*+-"
+    ":<=>@"
+    "[\\]^_`"
+    "{}~"
+)
+
+
+# The default pattern is built on first call rather than at import time;
+# ``functools.cache`` then hands the same compiled object to every caller, so
+# parsers that never register extra characters all share one pattern.
+@functools.cache
+def _default_terminator_re() -> re.Pattern[str]:
+    """Return the compiled character class matching any default terminator."""
+    char_class = re.escape("".join(_DEFAULT_TERMINATORS))
+    return re.compile(f"[{char_class}]")
+
+
 # Parser rules
 RuleFuncInlineType = Callable[[StateInline, bool], bool]
 """(state: StateInline, silent: bool) -> matched: bool)
@@ -61,6 +104,30 @@ def __init__(self) -> None:
         self.ruler2 = Ruler[RuleFuncInline2Type]()
         for name, rule2 in _rules2:
             self.ruler2.push(name, rule2)
+        # Characters that stop the text rule, allowing other inline rules to fire.
+        # _extra_terminator_chars holds only the chars registered through
+        # add_terminator_char() that are not already in _DEFAULT_TERMINATORS;
+        # it starts out empty, and the defaults are never copied per instance.
+        self._extra_terminator_chars: set[str] = set()
+        # Pre-compiled regex shared with all default instances (no copy in the common path).
+        self.terminator_re: re.Pattern[str] = _default_terminator_re()
+
+    def add_terminator_char(self, ch: str) -> None:
+        """Register a character that stops the ``text`` rule, allowing inline rules to fire.
+
+        This lets plugins declare which characters their inline rules react to,
+        mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation.
+
+        Registering a character that is already a terminator (a default one, or
+        one added earlier on this parser) is a no-op and leaves
+        ``terminator_re`` untouched.
+
+        :param ch: A single character to add to the terminator set.
+        :raises ValueError: If ``ch`` is not exactly one character long.
+        """
+        # The pattern is a regex character class, so a longer string would be
+        # split into several terminators while being stored as one set entry
+        # (defeating the duplicate check below), and an empty string would
+        # force a recompile that changes nothing.
+        if len(ch) != 1:
+            raise ValueError(
+                f"terminator must be a single character, got {ch!r}"
+            )
+        if ch in _DEFAULT_TERMINATORS or ch in self._extra_terminator_chars:
+            return
+        self._extra_terminator_chars.add(ch)
+        # Rebuild eagerly, here, so the text rule only ever reads a ready
+        # pattern and pays nothing per call.
+        chars = "".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars)
+        self.terminator_re = re.compile("[" + re.escape(chars) + "]")
 
     def skipToken(self, state: StateInline) -> None:
         """Skip single token by running all rules in validation mode;
diff --git a/markdown_it/rules_inline/text.py b/markdown_it/rules_inline/text.py
@@ -1,54 +1,15 @@
-import functools
-import re
-
 # Skip text characters for text token, place those to pending buffer
 # and increment current pos
 from .state_inline import StateInline
 
 # Rule to skip pure text
-# '{}$%@~+=:' reserved for extensions
-
-# !!!! Don't confuse with "Markdown ASCII Punctuation" chars
-# http://spec.commonmark.org/0.15/#ascii-punctuation-character
-
-
-_TerminatorChars = {
-    "\n",
-    "!",
-    "#",
-    "$",
-    "%",
-    "&",
-    "*",
-    "+",
-    "-",
-    ":",
-    "<",
-    "=",
-    ">",
-    "@",
-    "[",
-    "\\",
-    "]",
-    "^",
-    "_",
-    "`",
-    "{",
-    "}",
-    "~",
-}
-
-
-@functools.cache
-def _terminator_char_regex() -> re.Pattern[str]:
-    return re.compile("[" + re.escape("".join(_TerminatorChars)) + "]")
 
 
 def text(state: StateInline, silent: bool) -> bool:
     pos = state.pos
     posMax = state.posMax
 
-    terminator_char = _terminator_char_regex().search(state.src, pos)
+    terminator_char = state.md.inline.terminator_re.search(state.src, pos)
     pos = terminator_char.start() if terminator_char else posMax
 
     if pos == state.pos:
diff --git a/tests/test_api/test_plugin_creation.py b/tests/test_api/test_plugin_creation.py
@@ -89,3 +89,46 @@ def _plugin(_md: MarkdownIt) -> None:
 
     MarkdownIt().use(_plugin).parse("a")
     assert "plugin called" in capsys.readouterr().out
+
+
+def test_add_terminator_char():
+    """A character registered via add_terminator_char interrupts the text rule."""
+    seen = []
+
+    def w_rule(state, silent):
+        # Consume exactly one 'w' and remember where it was found.
+        if state.src[state.pos] == "w":
+            seen.append(state.pos)
+            state.pos += 1
+            return True
+        return False
+
+    def _register(parser: MarkdownIt) -> None:
+        parser.inline.add_terminator_char("w")
+        parser.inline.ruler.before("text", "w_rule", w_rule)
+
+    md = MarkdownIt().use(_register)
+
+    # If 'w' were not a terminator, the text rule would swallow all of "awb"
+    # in one go; with it registered, the custom rule runs once, at index 1.
+    md.render("awb")
+    assert seen == [1]
+
+
+def test_add_terminator_char_idempotent():
+    """add_terminator_char with an already-present char should not rebuild the regex."""
+    md = MarkdownIt()
+    original_re = md.inline.terminator_re
+
+    # '\n' is already in the default set – adding it again must not rebuild
+    md.inline.add_terminator_char("\n")
+    assert md.inline.terminator_re is original_re
+
+    # A char registered earlier on this parser takes a different branch than
+    # a default one; re-adding it must leave the pattern alone as well.
+    md.inline.add_terminator_char("w")
+    extended_re = md.inline.terminator_re
+    md.inline.add_terminator_char("w")
+    assert md.inline.terminator_re is extended_re
+
+
+def test_add_terminator_char_rebuilds():
+    """add_terminator_char with a new char should rebuild the regex."""
+    md = MarkdownIt()
+    original_re = md.inline.terminator_re
+    # Sanity check: 'w' is not a terminator out of the box.
+    assert original_re.search("w") is None
+
+    md.inline.add_terminator_char("w")
+    assert md.inline.terminator_re is not original_re
+    assert "w" in md.inline._extra_terminator_chars
+
+    # The rebuilt pattern must stop on the new char and still on the defaults.
+    assert md.inline.terminator_re.search("w") is not None
+    assert md.inline.terminator_re.search("*") is not None
+
+    # Registration is per parser: a fresh instance keeps the default pattern.
+    assert MarkdownIt().inline.terminator_re.search("w") is None