👌 Fix quadratic complexity in fragments_join / text_join (#389)

petricevich · chrisjsewell · web-flow · commit d4ea0ca7f44e · 2026-05-06T16:54:19.000+02:00
Optimize adjacent-token joining in both inline cleanup stages by
replacing repeated pairwise string concatenation with a single
`"".join(...)` over each contiguous run.

## Details

- `fragments_join` merges adjacent `text` tokens left behind after
emphasis/strikethrough post-processing and recalculates token levels
- `text_join` converts `text_special` tokens to `text` and performs the
final adjacent-text merge in the inline token stream

Both rules previously rebuilt growing strings incrementally, which can
become quadratic for long runs.

## Why

Tested on an adversarial ~190 KB document with ~30k intraword
underscores on a single line. With `tracemalloc` running:

|           | render time | peak Python alloc |
|-----------|-------------|-------------------|
| before    | 2.2s        | 4476 MB           |
| after     | 0.6s        | 23 MB             |

It's not just a contrived attack input - this kind of thing also shows
up naturally in Markdown produced by OCR pipelines, where tables of
identifiers / references can easily contain very long runs of
underscores or other delimiter characters.

## Tests

Added focused tests for both rules:

- `fragments_join`: verifies raw adjacent text fragments remain when
both join stages are disabled, and that `fragments_join` alone collapses
them when `text_join` is disabled
- `text_join`: verifies escaped characters remain as multiple
`text_special` tokens when `text_join` is disabled, and are converted
and merged into a single `text` token when enabled

## Result

No behavioral change in parser output, with less unnecessary work when
joining long runs of adjacent tokens.

---------

Co-authored-by: Chris Sewell <chrisj_sewell@hotmail.com>
diff --git a/markdown_it/rules_core/text_join.py b/markdown_it/rules_core/text_join.py
@@ -21,15 +21,33 @@ def text_join(state: StateCore) -> None:
 
         # convert text_special to text and join all adjacent text nodes
         new_tokens: list[Token] = []
-        for child_token in inline_token.children or []:
+        children = inline_token.children or []
+        i = 0
+        while i < len(children):
+            child_token = children[i]
             if child_token.type == "text_special":
                 child_token.type = "text"
             if (
                 child_token.type == "text"
                 and new_tokens
                 and new_tokens[-1].type == "text"
             ):
-                new_tokens[-1].content += child_token.content
+                # Collapse a run of adjacent text nodes in a single join, instead
+                # of pairwise `a + b` concatenation. The pairwise form is O(L*k)
+                # in the size of the run because each step rebuilds the growing
+                # prefix; "".join is O(L).
+                parts = [new_tokens[-1].content, child_token.content]
+                i += 1
+                while i < len(children):
+                    next_token = children[i]
+                    if next_token.type == "text_special":
+                        next_token.type = "text"
+                    if next_token.type != "text":
+                        break
+                    parts.append(next_token.content)
+                    i += 1
+                new_tokens[-1].content = "".join(parts)
             else:
                 new_tokens.append(child_token)
+                i += 1
         inline_token.children = new_tokens
diff --git a/markdown_it/rules_inline/fragments_join.py b/markdown_it/rules_inline/fragments_join.py
@@ -29,14 +29,25 @@ def fragments_join(state: StateInline) -> None:
             and curr + 1 < maximum
             and state.tokens[curr + 1].type == "text"
         ):
-            # collapse two adjacent text nodes
-            state.tokens[curr + 1].content = (
-                state.tokens[curr].content + state.tokens[curr + 1].content
-            )
-        else:
-            if curr != last:
-                state.tokens[last] = state.tokens[curr]
+            # Collapse a run of adjacent text nodes in a single join, instead
+            # of pairwise `a + b` concatenation. The pairwise form is O(L*k)
+            # in the size of the run because each step rebuilds the growing
+            # prefix; "".join is O(L).
+            parts = [state.tokens[curr].content]
+            curr += 1
+            while curr < maximum and state.tokens[curr].type == "text":
+                parts.append(state.tokens[curr].content)
+                curr += 1
+            merged = state.tokens[curr - 1]
+            merged.content = "".join(parts)
+            merged.level = level
+            state.tokens[last] = merged
             last += 1
+            continue
+
+        if curr != last:
+            state.tokens[last] = state.tokens[curr]
+        last += 1
         curr += 1
 
     if curr != last:
diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py
@@ -279,3 +279,66 @@ def test_table_tokens(data_regression):
     """
     )
     data_regression.check([t.as_dict() for t in tokens])
+
+
+def test_fragments_join_merges_adjacent_text_tokens():
+    """fragments_join should merge runs of adjacent text tokens into one.
+
+    Underscore characters flanked by word characters (e.g. ``a_b``) are not
+    valid emphasis delimiters in CommonMark, so the emphasis rule leaves each
+    ``_`` as a plain text token adjacent to the surrounding text tokens,
+    giving a run of five tokens: text("a"), text("_"), text("b c"),
+    text("_"), text("d").
+
+    Note: there is also a core-level ``text_join`` rule that collapses adjacent
+    text tokens as a fallback.  We disable it here so that the assertions are
+    sensitive only to ``fragments_join``.
+    """
+    src = "a_b c_d"
+
+    # --- both rules disabled: five separate text tokens must survive ---
+    md_both_off = MarkdownIt()
+    md_both_off.disable(["text_join", "fragments_join"])
+    children_both_off = md_both_off.parseInline(src)[0].children
+    assert children_both_off is not None
+    assert len(children_both_off) > 1, "expected multiple text tokens with no merging"
+    assert all(t.type == "text" for t in children_both_off)
+
+    # --- only fragments_join enabled (text_join still off): run must collapse ---
+    md_fj_on = MarkdownIt()
+    md_fj_on.disable("text_join")
+    children_fj_on = md_fj_on.parseInline(src)[0].children
+    assert children_fj_on is not None
+    assert len(children_fj_on) == 1
+    assert children_fj_on[0].type == "text"
+    assert children_fj_on[0].content == "a_b c_d"
+
+
+def test_text_join_merges_adjacent_text_special_tokens():
+    """text_join should convert text_special tokens and merge runs into one.
+
+    Backslash-escaped characters each produce a ``text_special`` token.
+    ``fragments_join`` only merges ``text`` tokens, so a run of
+    ``text_special`` tokens passes through it untouched.  ``text_join``
+    must then convert them to ``text`` and collapse the run in a single
+    pass rather than via pairwise concatenation.
+    """
+    # Three consecutive backslash escapes → three text_special tokens before
+    # text_join runs.
+    src = r"\*\*\*"
+
+    # --- text_join disabled: three text_special tokens must survive ---
+    md_off = MarkdownIt()
+    md_off.disable("text_join")
+    children_off = md_off.parseInline(src)[0].children
+    assert children_off is not None
+    assert len(children_off) > 1, "expected multiple text_special tokens before merging"
+    assert all(t.type == "text_special" for t in children_off)
+
+    # --- text_join enabled (default): must collapse to a single text token ---
+    md_on = MarkdownIt()
+    children_on = md_on.parseInline(src)[0].children
+    assert children_on is not None
+    assert len(children_on) == 1
+    assert children_on[0].type == "text"
+    assert children_on[0].content == "***"