Skip to content

Commit 0580928

Browse files
nperez0111 and claude committed
fix(markdown): handle 6 common-pattern parser gaps from CommonMark spec
Surfaced by running the marked.js CommonMark 0.31.2 test suite against the parser as a diagnostic. Fixes patterns real users hit; intentionally skips the long tail of edge cases per the minimal-parser scope.

- Two-space hard line break (`text  \n`) now emits <br>
- ATX heading trailing whitespace and closing #s stripped
- Code span: internal newlines collapse to spaces, leading/trailing space stripped only when content has a non-space char
- Link and image titles parsed from `(url "title")` form
- Angle-bracket-wrapped URLs `(<url>)` strip the brackets for both links and images
- Paragraph lines with up to 3 leading spaces have the indent stripped; trailing whitespace at end of paragraph is also trimmed

Adds 7 regression tests pinned to specific CommonMark example numbers.

CommonMark normalized pass rate: 229 -> 280 (+51 cases).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0f746d5 commit 0580928

9 files changed

Lines changed: 284 additions & 38 deletions

File tree

packages/core/src/api/parsers/markdown/markdownToHtml.ts

Lines changed: 91 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,22 @@ function parseInline(text: string): string {
206206
let i = 0;
207207

208208
while (i < text.length) {
209+
// Hard line break: 2+ trailing spaces immediately before a newline.
210+
// (The other hard-break form, backslash + newline, is handled by
211+
// tryBackslashEscape.) Strip the trailing spaces from the accumulated
212+
// result before emitting the <br>.
213+
if (
214+
text[i] === "\n" &&
215+
i >= 2 &&
216+
text[i - 1] === " " &&
217+
text[i - 2] === " "
218+
) {
219+
result = result.replace(/ +$/, "");
220+
result += "<br>\n";
221+
i++;
222+
continue;
223+
}
224+
209225
// Try each tokenizer in priority order
210226
let matched = false;
211227
if (SPECIAL_CHARS.has(text[i])) {
@@ -258,11 +274,16 @@ function parseInlineCode(
258274
}
259275
if (closeCount === openCount) {
260276
let code = text.substring(i, closeStart);
261-
// Strip one leading and one trailing space if both exist
277+
// Per CommonMark: line endings inside a code span are converted to
278+
// single spaces, then if the result starts AND ends with a space and
279+
// is not all-spaces, one leading + trailing space is stripped (so
280+
// `` ` `foo` ` `` is `<code>`foo`</code>`).
281+
code = code.replace(/\n/g, " ");
262282
if (
263283
code.length >= 2 &&
264284
code[0] === " " &&
265-
code[code.length - 1] === " "
285+
code[code.length - 1] === " " &&
286+
/[^ ]/.test(code)
266287
) {
267288
code = code.substring(1, code.length - 1);
268289
}
@@ -295,17 +316,9 @@ function parseImage(
295316
if (parenEnd === -1) {return null;}
296317

297318
const alt = text.substring(altStart, altEnd);
298-
let urlContent = text.substring(urlStart, parenEnd).trim();
299-
let title: string | undefined;
300-
301-
// Check for title in quotes
302-
const titleMatch = urlContent.match(/^(\S+)\s+"([^"]*)"$/);
303-
if (titleMatch) {
304-
urlContent = titleMatch[1];
305-
title = titleMatch[2];
306-
}
307-
308-
const url = urlContent;
319+
const { url, title } = parseDestinationAndTitle(
320+
text.substring(urlStart, parenEnd),
321+
);
309322

310323
if (isVideoUrl(url)) {
311324
// Match remark-rehype behavior: data-name comes from the title, not alt
@@ -315,8 +328,10 @@ function parseImage(
315328
};
316329
}
317330

331+
const titleAttr =
332+
title !== undefined ? ` title="${escapeHtml(title)}"` : "";
318333
return {
319-
html: `<img src="${escapeHtml(url)}" alt="${escapeHtml(alt)}">`,
334+
html: `<img src="${escapeHtml(url)}" alt="${escapeHtml(alt)}"${titleAttr}>`,
320335
end: parenEnd + 1,
321336
};
322337
}
@@ -337,10 +352,14 @@ function parseLink(
337352
if (parenEnd === -1) {return null;}
338353

339354
const linkText = text.substring(textStart, textEnd);
340-
const url = extractDestination(text.substring(urlStart, parenEnd).trim());
355+
const { url, title } = parseDestinationAndTitle(
356+
text.substring(urlStart, parenEnd),
357+
);
341358

359+
const titleAttr =
360+
title !== undefined ? ` title="${escapeHtml(title)}"` : "";
342361
return {
343-
html: `<a href="${escapeHtml(url)}">${parseInline(linkText)}</a>`,
362+
html: `<a href="${escapeHtml(url)}"${titleAttr}>${parseInline(linkText)}</a>`,
344363
end: parenEnd + 1,
345364
};
346365
}
@@ -378,32 +397,56 @@ function findClosingParen(text: string, openPos: number): number {
378397
}
379398

380399
/**
381-
* Extract the destination URL from a link/image URL+title string.
382-
* Handles angle-bracket destinations and strips optional titles.
383-
* E.g., `<url>` → `url`, `url "title"` → `url`
400+
* Parse the inside of `(...)` from a link/image (the URL and optional title).
401+
 * Handles two URL forms:
402+
* - bare: `/uri` or `/uri "title"`
403+
* - angle-bracket: `<url>` or `<url> "title"` (brackets are stripped)
404+
* And three title-quote forms: `"..."`, `'...'`, `(...)`.
384405
*/
385-
function extractDestination(raw: string): string {
386-
// Angle-bracket destination: <url>
387-
if (raw.startsWith("<") && raw.endsWith(">")) {
388-
return raw.substring(1, raw.length - 1);
389-
}
406+
function parseDestinationAndTitle(raw: string): {
407+
url: string;
408+
title?: string;
409+
} {
410+
raw = raw.trim();
411+
let url: string;
412+
let rest: string;
413+
390414
if (raw.startsWith("<")) {
391415
const close = raw.indexOf(">");
392-
if (close !== -1) {
393-
return raw.substring(1, close);
416+
if (close === -1) {
417+
// Unmatched `<` — treat the whole thing as the URL minus the `<`.
418+
url = raw.substring(1);
419+
rest = "";
420+
} else {
421+
url = raw.substring(1, close);
422+
rest = raw.substring(close + 1).trim();
394423
}
395-
}
396-
// Split at first unescaped whitespace to separate destination from title
397-
for (let i = 0; i < raw.length; i++) {
398-
if (raw[i] === "\\" && i + 1 < raw.length) {
399-
i++; // skip escaped char
400-
continue;
424+
} else {
425+
// Split at first unescaped whitespace.
426+
let split = raw.length;
427+
for (let i = 0; i < raw.length; i++) {
428+
if (raw[i] === "\\" && i + 1 < raw.length) {
429+
i++;
430+
continue;
431+
}
432+
if (raw[i] === " " || raw[i] === "\t" || raw[i] === "\n") {
433+
split = i;
434+
break;
435+
}
401436
}
402-
if (raw[i] === " " || raw[i] === "\t" || raw[i] === "\n") {
403-
return raw.substring(0, i);
437+
url = raw.substring(0, split);
438+
rest = raw.substring(split).trim();
439+
}
440+
441+
let title: string | undefined;
442+
if (rest.length > 0) {
443+
const titleMatch = rest.match(/^"([^"]*)"$|^'([^']*)'$|^\(([^)]*)\)$/);
444+
if (titleMatch) {
445+
title = titleMatch[1] ?? titleMatch[2] ?? titleMatch[3];
404446
}
405447
}
406-
return raw;
448+
449+
return { url, title };
407450
}
408451

409452
function parseDelimited(
@@ -602,8 +645,11 @@ function tokenize(markdown: string): Token[] {
602645
continue;
603646
}
604647

605-
// ATX Heading
606-
const headingMatch = line.match(/^(#{1,6})\s+(.+?)(?:\s+#+)?$/);
648+
// ATX Heading.
649+
// - Closing `#` sequence requires a preceding space (so `### foo###`
650+
// keeps the trailing #s as text, while `### foo ###` strips them).
651+
// - Trailing whitespace is always stripped from the heading content.
652+
const headingMatch = line.match(/^(#{1,6})\s+(.+?)(?:\s+#+\s*|\s*)$/);
607653
if (headingMatch) {
608654
tokens.push({
609655
type: "heading",
@@ -843,9 +889,16 @@ function tokenize(markdown: string): Token[] {
843889
paraLines.push(nextLine);
844890
i++;
845891
}
892+
// CommonMark allows up to 3 leading spaces of indent on paragraph lines.
893+
// Also strip trailing whitespace from the final line so a trailing
894+
// hard-break sequence (` \n` at end of paragraph) doesn't leak as
895+
// literal trailing spaces in the rendered output.
846896
tokens.push({
847897
type: "paragraph",
848-
content: paraLines.join("\n"),
898+
content: paraLines
899+
.map((l) => l.replace(/^ {1,3}/, ""))
900+
.join("\n")
901+
.replace(/[ \t]+$/, ""),
849902
});
850903
prevLineWasBlank = false;
851904
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {
7+
"code": true,
8+
},
9+
"text": "foo bar baz",
10+
"type": "text",
11+
},
12+
],
13+
"id": "1",
14+
"props": {
15+
"backgroundColor": "default",
16+
"textAlignment": "left",
17+
"textColor": "default",
18+
},
19+
"type": "paragraph",
20+
},
21+
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Line one
8+
Line two",
9+
"type": "text",
10+
},
11+
],
12+
"id": "1",
13+
"props": {
14+
"backgroundColor": "default",
15+
"textAlignment": "left",
16+
"textColor": "default",
17+
},
18+
"type": "paragraph",
19+
},
20+
]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "foo",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"isToggleable": false,
15+
"level": 1,
16+
"textAlignment": "left",
17+
"textColor": "default",
18+
},
19+
"type": "heading",
20+
},
21+
]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "foo",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"isToggleable": false,
15+
"level": 3,
16+
"textAlignment": "left",
17+
"textColor": "default",
18+
},
19+
"type": "heading",
20+
},
21+
]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[
2+
{
3+
"children": [],
4+
"content": undefined,
5+
"id": "1",
6+
"props": {
7+
"backgroundColor": "default",
8+
"caption": "",
9+
"name": "alt",
10+
"showPreview": true,
11+
"textAlignment": "left",
12+
"url": "https://example.com/image.png",
13+
},
14+
"type": "image",
15+
},
16+
]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[
2+
{
3+
"children": [],
4+
"content": undefined,
5+
"id": "1",
6+
"props": {
7+
"backgroundColor": "default",
8+
"caption": "",
9+
"name": "alt text",
10+
"showPreview": true,
11+
"textAlignment": "left",
12+
"url": "https://example.com/image.png",
13+
},
14+
"type": "image",
15+
},
16+
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "aaa bbb",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textAlignment": "left",
15+
"textColor": "default",
16+
},
17+
"type": "paragraph",
18+
},
19+
]

0 commit comments

Comments
 (0)