Skip to content

Commit ec506e5

Browse files
nperez0111 and claude committed
refactor: improve markdown inline parser with tokenizer-array pattern
- Refactor parseInline() from a monolithic if/else chain to an ordered array of standalone tokenizer functions for better maintainability
- Batch consecutive plain-text characters into a single escapeHtml() call instead of escaping each character separately
- Improve the emphasis closing-delimiter check to reject single-char closers adjacent to the same delimiter on either side (not just before)
- Add blockquote lazy continuation support per the CommonMark spec
- Add blockquoteLazyContinuation test case

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7c3e17d commit ec506e5

3 files changed

Lines changed: 188 additions & 125 deletions

File tree

packages/core/src/api/parsers/markdown/markdownToHtml.ts

Lines changed: 160 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,130 @@ function isIntraword(text: string, i: number, delimLen: number): boolean {
3939

4040
// ─── Inline Parser ───────────────────────────────────────────────────────────
4141

42+
/**
 * Signature shared by every inline tokenizer.
 *
 * A tokenizer inspects `text` starting at index `i`. On a match it returns
 * the HTML to emit and `end`, the index at which scanning resumes; when the
 * syntax it handles does not start at `i`, it returns `null`.
 */
type InlineTokenizer = (text: string, i: number) => { html: string; end: number } | null;
46+
47+
/**
 * Tokenizer for backslash sequences.
 *
 * - A backslash followed by a newline is a hard line break (`<br>\n`).
 * - A backslash followed by any ASCII punctuation character emits that
 *   character literally (HTML-escaped), as CommonMark permits for every
 *   ASCII punctuation character, not only the Markdown-significant ones.
 * - Any other backslash (including one at the very end of the text) is not
 *   an escape; `null` is returned so the caller treats it as plain text.
 *
 * @param text Inline text being parsed.
 * @param i Index of the candidate backslash.
 * @returns The emitted HTML and the index after the escape, or `null`.
 */
function tryBackslashEscape(
  text: string,
  i: number
): { html: string; end: number } | null {
  // Need a backslash AND a following character to form an escape.
  if (text[i] !== "\\" || i + 1 >= text.length) {
    return null;
  }

  const next = text[i + 1];

  // Hard line break: backslash at end of line
  if (next === "\n") {
    return { html: "<br>\n", end: i + 2 };
  }

  // Every ASCII punctuation character is escapable (CommonMark "Backslash
  // escapes"); a narrower list leaves stray backslashes in front of
  // characters such as `"`, `<`, `&` or `:`.
  const escapable = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
  if (escapable.includes(next)) {
    return { html: escapeHtml(next), end: i + 2 };
  }

  return null;
}
63+
64+
/**
 * Tokenizer for inline code spans.
 *
 * Only positions holding a backtick are candidates; the actual parsing is
 * delegated to `parseInlineCode`, whose result is returned unchanged.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseInlineCode` returns, or `null` if `text[i]` is not a backtick.
 */
function tryInlineCode(
  text: string,
  i: number
): { html: string; end: number } | null {
  return text[i] === "`" ? parseInlineCode(text, i) : null;
}
71+
72+
/**
 * Tokenizer for images.
 *
 * A candidate must start with the two characters `![`; parsing is then
 * delegated to `parseImage`, whose result is returned unchanged.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseImage` returns, or `null` if `![` does not start at `i`.
 */
function tryImage(
  text: string,
  i: number
): { html: string; end: number } | null {
  const startsImage = text[i] === "!" && text[i + 1] === "[";
  return startsImage ? parseImage(text, i) : null;
}
79+
80+
/**
 * Tokenizer for links.
 *
 * Only positions holding `[` are candidates; parsing is delegated to
 * `parseLink`, whose result is returned unchanged.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseLink` returns, or `null` if `text[i]` is not `[`.
 */
function tryLink(
  text: string,
  i: number
): { html: string; end: number } | null {
  return text[i] === "[" ? parseLink(text, i) : null;
}
87+
88+
/**
 * Tokenizer for strikethrough.
 *
 * A candidate must start with `~~`; the span is then parsed by
 * `parseDelimited` with `<del>` / `</del>` as the wrapping tags.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseDelimited` returns, or `null` if `~~` does not start at `i`.
 */
function tryStrikethrough(
  text: string,
  i: number
): { html: string; end: number } | null {
  const opensStrike = text[i] === "~" && text[i + 1] === "~";
  if (!opensStrike) {
    return null;
  }
  return parseDelimited(text, i, "~~", "<del>", "</del>");
}
95+
96+
/**
 * Tokenizer for combined bold+italic: `***` or `___`.
 *
 * `***` is always a candidate. `___` is a candidate only when
 * `isIntraword(text, i, 3)` is false. The span is parsed by
 * `parseDelimited` with `<strong><em>` / `</em></strong>` as the tags.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseDelimited` returns, or `null` if no triple delimiter opens at `i`.
 */
function tryBoldItalic(
  text: string,
  i: number
): { html: string; end: number } | null {
  const tripleStar = text.startsWith("***", i);
  // The intraword check is evaluated only for an underscore run.
  const tripleUnderscore =
    !tripleStar && text.startsWith("___", i) && !isIntraword(text, i, 3);

  if (!tripleStar && !tripleUnderscore) {
    return null;
  }

  const marker = tripleStar ? "***" : "___";
  return parseDelimited(text, i, marker, "<strong><em>", "</em></strong>");
}
112+
113+
/**
 * Tokenizer for bold: `**` or `__`.
 *
 * `**` is always a candidate. `__` is a candidate only when
 * `isIntraword(text, i, 2)` is false. The span is parsed by
 * `parseDelimited` with `<strong>` / `</strong>` as the tags.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseDelimited` returns, or `null` if no double delimiter opens at `i`.
 */
function tryBold(
  text: string,
  i: number
): { html: string; end: number } | null {
  const marker = text.substring(i, i + 2);
  const opensBold =
    marker === "**" || (marker === "__" && !isIntraword(text, i, 2));

  return opensBold
    ? parseDelimited(text, i, marker, "<strong>", "</strong>")
    : null;
}
126+
127+
/**
 * Tokenizer for italic: `*` or `_`.
 *
 * `*` is always a candidate. `_` is a candidate only when
 * `isIntraword(text, i, 1)` is false. The span is parsed by
 * `parseDelimited` with `<em>` / `</em>` as the tags.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns Whatever `parseDelimited` returns, or `null` if no single delimiter opens at `i`.
 */
function tryItalic(
  text: string,
  i: number
): { html: string; end: number } | null {
  const marker = text[i];

  // Bail out unless this is `*`, or a `_` that is not intraword.
  if (marker !== "*" && (marker !== "_" || isIntraword(text, i, 1))) {
    return null;
  }

  return parseDelimited(text, i, marker, "<em>", "</em>");
}
136+
137+
/**
 * Tokenizer for a newline inside inline text (soft break).
 *
 * The newline is passed through to the output as-is.
 *
 * @param text Inline text being parsed.
 * @param i Index to test.
 * @returns `{ html: "\n", end: i + 1 }` when `text[i]` is a newline, otherwise `null`.
 */
function trySoftBreak(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "\n") {
    return null;
  }
  return { html: "\n", end: i + 1 };
}
146+
147+
/**
 * Characters that can start an inline syntax token: backslash, backtick,
 * `!`, `[`, `~`, `*`, `_` and newline. Any other character is plain text.
 */
const SPECIAL_CHARS = new Set<string>([
  "\\",
  "`",
  "!",
  "[",
  "~",
  "*",
  "_",
  "\n",
]);
149+
150+
/**
 * Inline tokenizers in priority order; the first one that returns a
 * non-null result wins. Longer emphasis delimiters are listed before
 * shorter ones so `***` is tried before `**`, and `**` before `*`.
 */
const inlineTokenizers: InlineTokenizer[] = [
  tryBackslashEscape,
  tryInlineCode,
  tryImage,
  tryLink,
  tryStrikethrough,
  // Emphasis, longest delimiter first.
  tryBoldItalic, // *** / ___
  tryBold, // ** / __
  tryItalic, // * / _
  trySoftBreak,
];
165+
42166
/**
43167
* Parse inline markdown syntax and return HTML.
44168
* Handles: bold, italic, bold+italic, strikethrough, inline code,
@@ -49,133 +173,29 @@ function parseInline(text: string): string {
49173
let i = 0;
50174

51175
while (i < text.length) {
52-
// Backslash escape
53-
if (text[i] === "\\" && i + 1 < text.length) {
54-
const next = text[i + 1];
55-
// Hard line break: backslash at end of line
56-
if (next === "\n") {
57-
result += "<br>\n";
58-
i += 2;
59-
continue;
60-
}
61-
// Escapable characters
62-
if ("\\`*_{}[]()#+-.!~|>".includes(next)) {
63-
result += escapeHtml(next);
64-
i += 2;
65-
continue;
66-
}
67-
}
68-
69-
// Inline code (highest priority for inline)
70-
if (text[i] === "`") {
71-
const codeResult = parseInlineCode(text, i);
72-
if (codeResult) {
73-
result += codeResult.html;
74-
i = codeResult.end;
75-
continue;
76-
}
77-
}
78-
79-
// Images ![alt](url)
80-
if (text[i] === "!" && text[i + 1] === "[") {
81-
const imgResult = parseImage(text, i);
82-
if (imgResult) {
83-
result += imgResult.html;
84-
i = imgResult.end;
85-
continue;
86-
}
87-
}
88-
89-
// Links [text](url)
90-
if (text[i] === "[") {
91-
const linkResult = parseLink(text, i);
92-
if (linkResult) {
93-
result += linkResult.html;
94-
i = linkResult.end;
95-
continue;
96-
}
97-
}
98-
99-
// Strikethrough ~~text~~
100-
if (text[i] === "~" && text[i + 1] === "~") {
101-
const strikeResult = parseDelimited(text, i, "~~", "<del>", "</del>");
102-
if (strikeResult) {
103-
result += strikeResult.html;
104-
i = strikeResult.end;
105-
continue;
106-
}
107-
}
108-
109-
// Bold+Italic ***text*** or ___text___
110-
if (
111-
(text[i] === "*" && text[i + 1] === "*" && text[i + 2] === "*") ||
112-
(text[i] === "_" &&
113-
text[i + 1] === "_" &&
114-
text[i + 2] === "_" &&
115-
!isIntraword(text, i, 3))
116-
) {
117-
const delimiter = text.substring(i, i + 3);
118-
const tripleResult = parseDelimited(
119-
text,
120-
i,
121-
delimiter,
122-
"<strong><em>",
123-
"</em></strong>"
124-
);
125-
if (tripleResult) {
126-
result += tripleResult.html;
127-
i = tripleResult.end;
128-
continue;
129-
}
130-
}
131-
132-
// Bold **text** or __text__
133-
if (
134-
(text[i] === "*" && text[i + 1] === "*") ||
135-
(text[i] === "_" && text[i + 1] === "_" && !isIntraword(text, i, 2))
136-
) {
137-
const delimiter = text.substring(i, i + 2);
138-
const boldResult = parseDelimited(
139-
text,
140-
i,
141-
delimiter,
142-
"<strong>",
143-
"</strong>"
144-
);
145-
if (boldResult) {
146-
result += boldResult.html;
147-
i = boldResult.end;
148-
continue;
149-
}
150-
}
151-
152-
// Italic *text* or _text_
153-
if (text[i] === "*" || (text[i] === "_" && !isIntraword(text, i, 1))) {
154-
const delimiter = text[i];
155-
const italicResult = parseDelimited(
156-
text,
157-
i,
158-
delimiter,
159-
"<em>",
160-
"</em>"
161-
);
162-
if (italicResult) {
163-
result += italicResult.html;
164-
i = italicResult.end;
165-
continue;
176+
// Try each tokenizer in priority order
177+
let matched = false;
178+
if (SPECIAL_CHARS.has(text[i])) {
179+
for (const tokenizer of inlineTokenizers) {
180+
const r = tokenizer(text, i);
181+
if (r) {
182+
result += r.html;
183+
i = r.end;
184+
matched = true;
185+
break;
186+
}
166187
}
167188
}
168189

169-
// Newline within paragraph (soft break)
170-
if (text[i] === "\n") {
171-
result += "\n";
190+
if (!matched) {
191+
// Batch consecutive plain-text characters and escape once
192+
const runStart = i;
172193
i++;
173-
continue;
194+
while (i < text.length && !SPECIAL_CHARS.has(text[i])) {
195+
i++;
196+
}
197+
result += escapeHtml(text.substring(runStart, i));
174198
}
175-
176-
// Regular character
177-
result += escapeHtml(text[i]);
178-
i++;
179199
}
180200

181201
return result;
@@ -355,11 +375,11 @@ function parseDelimited(
355375
}
356376

357377
// For single-char delimiters, don't accept closer if it's part of a
358-
// multi-char run (e.g., don't treat second * in ** as italic closer)
378+
// multi-char run (e.g., don't treat the * in ** as italic closer)
359379
if (
360380
len === 1 &&
361-
j > 0 &&
362-
text[j - 1] === delimiter[0]
381+
((j > 0 && text[j - 1] === delimiter[0]) ||
382+
(j + len < text.length && text[j + len] === delimiter[0]))
363383
) {
364384
j++;
365385
continue;
@@ -559,6 +579,21 @@ function tokenize(markdown: string): Token[] {
559579
quoteLines.push(lines[i].replace(/^\s{0,3}>\s?/, ""));
560580
i++;
561581
}
582+
// Lazy continuation: collect non-blank lines that don't start a new
583+
// block-level element (per CommonMark spec)
584+
while (i < lines.length) {
585+
const cur = lines[i];
586+
if (cur.trim() === "") {break;}
587+
// Stop on block-level markers
588+
if (/^\s{0,3}>/.test(cur)) {break;} // new blockquote
589+
if (/^(#{1,6})\s/.test(cur)) {break;} // heading
590+
if (/^(`{3,}|~{3,})/.test(cur)) {break;} // code fence
591+
if (/^(\s{0,3})([-*_])\s*(\2\s*){2,}$/.test(cur)) {break;} // hr
592+
if (/^\s*([-*+]|\d+[.)])\s+/.test(cur)) {break;} // list item
593+
if (/^\s*\|(.+\|)+\s*$/.test(cur)) {break;} // table
594+
quoteLines.push(cur);
595+
i++;
596+
}
562597
tokens.push({
563598
type: "blockquote",
564599
content: quoteLines.join("\n"),
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "This is a quote that continues here and here too",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textColor": "default",
15+
},
16+
"type": "quote",
17+
},
18+
]

tests/src/unit/core/formatConversion/parse/parseTestInstances.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,6 +1747,16 @@ still same paragraph`,
17471747
},
17481748
executeTest: testParseMarkdown,
17491749
},
1750+
// Blockquote with lazy continuation (no > on continuation lines)
1751+
{
1752+
testCase: {
1753+
name: "blockquoteLazyContinuation",
1754+
content: `> This is a quote
1755+
that continues here
1756+
and here too`,
1757+
},
1758+
executeTest: testParseMarkdown,
1759+
},
17501760
// Complex document
17511761
{
17521762
testCase: {

0 commit comments

Comments
 (0)