Skip to content

Commit d4ea0ca

Browse files
👌 Fix quadratic complexity in fragments_join / text_join (#389)
Optimize adjacent-token joining in both inline cleanup stages by replacing repeated pairwise string concatenation with a single `"".join(...)` over each contiguous run.

## Details

- `fragments_join` merges adjacent `text` tokens left behind after emphasis/strikethrough post-processing and recalculates token levels
- `text_join` converts `text_special` tokens to `text` and performs the final adjacent-text merge in the inline token stream

Both rules previously rebuilt growing strings incrementally, which can become quadratic for long runs.

## Why

Tested on an adversarial ~190 KB document with ~30k intraword underscores on a single line. With `tracemalloc` running:

|        | render time | peak Python alloc |
|--------|-------------|-------------------|
| before | 2.2s        | 4476 MB           |
| after  | 0.6s        | 23 MB             |

It's not just a contrived attack input - this kind of thing also shows up naturally in Markdown produced by OCR pipelines, where tables of identifiers / references can easily contain very long runs of underscores or other delimiter characters.

## Tests

Added focused tests for both rules:

- `fragments_join`: verifies raw adjacent text fragments remain when both join stages are disabled, and that `fragments_join` alone collapses them when `text_join` is disabled
- `text_join`: verifies escaped characters remain as multiple `text_special` tokens when `text_join` is disabled, and are converted and merged into a single `text` token when enabled

## Result

No behavioral change in parser output, with less unnecessary work when joining long runs of adjacent tokens.

---------

Co-authored-by: Chris Sewell <chrisj_sewell@hotmail.com>
1 parent 8933147 commit d4ea0ca

3 files changed

Lines changed: 101 additions & 9 deletions

File tree

markdown_it/rules_core/text_join.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,33 @@ def text_join(state: StateCore) -> None:
2121

2222
# convert text_special to text and join all adjacent text nodes
2323
new_tokens: list[Token] = []
24-
for child_token in inline_token.children or []:
24+
children = inline_token.children or []
25+
i = 0
26+
while i < len(children):
27+
child_token = children[i]
2528
if child_token.type == "text_special":
2629
child_token.type = "text"
2730
if (
2831
child_token.type == "text"
2932
and new_tokens
3033
and new_tokens[-1].type == "text"
3134
):
32-
new_tokens[-1].content += child_token.content
35+
# Collapse a run of adjacent text nodes in a single join, instead
36+
# of pairwise `a + b` concatenation. The pairwise form is O(L*k)
37+
# in the size of the run because each step rebuilds the growing
38+
# prefix; "".join is O(L).
39+
parts = [new_tokens[-1].content, child_token.content]
40+
i += 1
41+
while i < len(children):
42+
next_token = children[i]
43+
if next_token.type == "text_special":
44+
next_token.type = "text"
45+
if next_token.type != "text":
46+
break
47+
parts.append(next_token.content)
48+
i += 1
49+
new_tokens[-1].content = "".join(parts)
3350
else:
3451
new_tokens.append(child_token)
52+
i += 1
3553
inline_token.children = new_tokens

markdown_it/rules_inline/fragments_join.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,25 @@ def fragments_join(state: StateInline) -> None:
2929
and curr + 1 < maximum
3030
and state.tokens[curr + 1].type == "text"
3131
):
32-
# collapse two adjacent text nodes
33-
state.tokens[curr + 1].content = (
34-
state.tokens[curr].content + state.tokens[curr + 1].content
35-
)
36-
else:
37-
if curr != last:
38-
state.tokens[last] = state.tokens[curr]
32+
# Collapse a run of adjacent text nodes in a single join, instead
33+
# of pairwise `a + b` concatenation. The pairwise form is O(L*k)
34+
# in the size of the run because each step rebuilds the growing
35+
# prefix; "".join is O(L).
36+
parts = [state.tokens[curr].content]
37+
curr += 1
38+
while curr < maximum and state.tokens[curr].type == "text":
39+
parts.append(state.tokens[curr].content)
40+
curr += 1
41+
merged = state.tokens[curr - 1]
42+
merged.content = "".join(parts)
43+
merged.level = level
44+
state.tokens[last] = merged
3945
last += 1
46+
continue
47+
48+
if curr != last:
49+
state.tokens[last] = state.tokens[curr]
50+
last += 1
4051
curr += 1
4152

4253
if curr != last:

tests/test_api/test_main.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,66 @@ def test_table_tokens(data_regression):
279279
"""
280280
)
281281
data_regression.check([t.as_dict() for t in tokens])
282+
283+
284+
def test_fragments_join_merges_adjacent_text_tokens():
285+
"""fragments_join should merge runs of adjacent text tokens into one.
286+
287+
Underscore characters flanked by word characters (e.g. ``a_b``) are not
288+
valid emphasis delimiters in CommonMark, so the emphasis rule leaves each
289+
``_`` as a plain text token adjacent to the surrounding text tokens,
290+
giving a run of five tokens: text("a"), text("_"), text("b c"),
291+
text("_"), text("d").
292+
293+
Note: there is also a core-level ``text_join`` rule that collapses adjacent
294+
text tokens as a fallback. We disable it here so that the assertions are
295+
sensitive only to ``fragments_join``.
296+
"""
297+
src = "a_b c_d"
298+
299+
# --- both rules disabled: five separate text tokens must survive ---
300+
md_both_off = MarkdownIt()
301+
md_both_off.disable(["text_join", "fragments_join"])
302+
children_both_off = md_both_off.parseInline(src)[0].children
303+
assert children_both_off is not None
304+
assert len(children_both_off) > 1, "expected multiple text tokens with no merging"
305+
assert all(t.type == "text" for t in children_both_off)
306+
307+
# --- only fragments_join enabled (text_join still off): run must collapse ---
308+
md_fj_on = MarkdownIt()
309+
md_fj_on.disable("text_join")
310+
children_fj_on = md_fj_on.parseInline(src)[0].children
311+
assert children_fj_on is not None
312+
assert len(children_fj_on) == 1
313+
assert children_fj_on[0].type == "text"
314+
assert children_fj_on[0].content == "a_b c_d"
315+
316+
317+
def test_text_join_merges_adjacent_text_special_tokens():
318+
"""text_join should convert text_special tokens and merge runs into one.
319+
320+
Backslash-escaped characters each produce a ``text_special`` token.
321+
``fragments_join`` only merges ``text`` tokens, so a run of
322+
``text_special`` tokens passes through it untouched. ``text_join``
323+
must then convert them to ``text`` and collapse the run in a single
324+
pass rather than via pairwise concatenation.
325+
"""
326+
# Three consecutive backslash escapes → three text_special tokens before
327+
# text_join runs.
328+
src = r"\*\*\*"
329+
330+
# --- text_join disabled: three text_special tokens must survive ---
331+
md_off = MarkdownIt()
332+
md_off.disable("text_join")
333+
children_off = md_off.parseInline(src)[0].children
334+
assert children_off is not None
335+
assert len(children_off) > 1, "expected multiple text_special tokens before merging"
336+
assert all(t.type == "text_special" for t in children_off)
337+
338+
# --- text_join enabled (default): must collapse to a single text token ---
339+
md_on = MarkdownIt()
340+
children_on = md_on.parseInline(src)[0].children
341+
assert children_on is not None
342+
assert len(children_on) == 1
343+
assert children_on[0].type == "text"
344+
assert children_on[0].content == "***"

0 commit comments

Comments
 (0)