Skip to content

Commit 0f746d5

Browse files
nperez0111 and claude committed
feat(markdown): pass raw HTML through to the parser output
Inline HTML tags, comments, CDATA sections, processing instructions, and declarations now pass through verbatim instead of being escaped to literal text. Block-level HTML (div, details, etc.) is emitted as a raw HTML block that runs until the next blank line, using the CommonMark type-6 start condition. Bare angle brackets that don't form a valid tag still get HTML-escaped, so plain text like "1 < 2" is unaffected.

Adds 7 markdown parse snapshot tests covering inline tags, attributes, void tags, block-level div, HTML comments, bare angle brackets, and block HTML interrupting a paragraph.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f9b9527 commit 0f746d5

9 files changed

Lines changed: 345 additions & 2 deletions

File tree

packages/core/src/api/parsers/markdown/markdownToHtml.ts

Lines changed: 92 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,40 @@ function trySoftBreak(
144144
return null;
145145
}
146146

147+
// Inline raw HTML: pass through tags, comments, CDATA, processing
// instructions, and declarations verbatim so authors can mix HTML into
// markdown (e.g. `text <em>foo</em> more`). Anything that doesn't match
// these shapes falls through and gets HTML-escaped as plain text.

// Open tag (optional attributes, optional self-closing slash) OR closing tag.
// A closing tag is only `</name` + optional whitespace + `>`: attributes or a
// trailing slash on a closing tag do not form a valid tag, so such input is
// left for the caller to escape.
const INLINE_HTML_TAG_RE =
  /^(?:<[a-zA-Z][a-zA-Z0-9-]*(?:\s+[a-zA-Z_:][a-zA-Z0-9_.:-]*(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?)*\s*\/?>|<\/[a-zA-Z][a-zA-Z0-9-]*\s*>)/;
// `<!-->` and `<!--->` are complete (empty) comments on their own; otherwise
// a comment runs from `<!--` to the first `-->`.
const HTML_COMMENT_RE = /^<!--(?:-?>|[\s\S]*?-->)/;
const HTML_CDATA_RE = /^<!\[CDATA\[[\s\S]*?\]\]>/;
const HTML_PI_RE = /^<\?[\s\S]*?\?>/;
const HTML_DECL_RE = /^<![A-Za-z][\s\S]*?>/;

// Tried in order; built once instead of on every call.
const INLINE_HTML_RES: readonly RegExp[] = [
  HTML_COMMENT_RE,
  HTML_CDATA_RE,
  HTML_PI_RE,
  HTML_DECL_RE,
  INLINE_HTML_TAG_RE,
];

/**
 * Inline tokenizer for raw HTML.
 *
 * @param text - The inline text being tokenized.
 * @param i - Index in `text` at which to attempt a match.
 * @returns The matched HTML (verbatim) and the index just past it, or `null`
 *   when `text[i]` is not `<` or no raw-HTML shape matches at `i`.
 */
function tryInlineHtml(
  text: string,
  i: number
): { html: string; end: number } | null {
  if (text[i] !== "<") {return null;}
  const rest = text.substring(i);
  for (const re of INLINE_HTML_RES) {
    const m = re.exec(rest);
    if (m) {
      return { html: m[0], end: i + m[0].length };
    }
  }
  return null;
}
178+
147179
/** Characters that can start an inline syntax token. */
148-
const SPECIAL_CHARS = new Set("\\`![~*_\n");
180+
const SPECIAL_CHARS = new Set("\\`![~*_\n<");
149181

150182
/**
151183
* Ordered array of inline tokenizers, tried in priority order.
@@ -160,6 +192,7 @@ const inlineTokenizers: InlineTokenizer[] = [
160192
tryBoldItalic, // *** / ___
161193
tryBold, // ** / __
162194
tryItalic, // * / _
195+
tryInlineHtml,
163196
trySoftBreak,
164197
];
165198

@@ -481,14 +514,46 @@ interface TableToken extends BlockToken {
481514
alignments: ("left" | "center" | "right" | null)[];
482515
}
483516

517+
/**
 * Block token holding a run of raw HTML source lines; `content` is the
 * lines joined with "\n" and is appended to the output unchanged.
 */
interface RawHtmlToken extends BlockToken {
  type: "rawHtml";
  content: string;
}
521+
484522
type Token =
485523
| HeadingToken
486524
| ParagraphToken
487525
| CodeBlockToken
488526
| BlockquoteToken
489527
| HorizontalRuleToken
490528
| ListItemToken
491-
| TableToken;
529+
| TableToken
530+
| RawHtmlToken;
531+
532+
/**
 * HTML block-level tag names (from the CommonMark type-6 list). When a line
 * starts with `<` followed by one of these tag names, the run of non-blank
 * lines is emitted verbatim as raw HTML rather than wrapped in a paragraph.
 *
 * Includes `search`, which newer revisions of the list add; `source` is kept
 * so input that already relied on it keeps producing a raw HTML block.
 */
const HTML_BLOCK_TAGS = new Set([
  "address", "article", "aside", "base", "basefont", "blockquote", "body",
  "caption", "center", "col", "colgroup", "dd", "details", "dialog", "dir",
  "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
  "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header",
  "hr", "html", "iframe", "legend", "li", "link", "main", "menu", "menuitem",
  "nav", "noframes", "ol", "optgroup", "option", "p", "param", "search",
  "section", "source", "summary", "table", "tbody", "td", "tfoot", "th",
  "thead", "title", "tr", "track", "ul",
]);

// Up to three spaces of indent, then `<!--`, `<?`, `<!X` (declaration), or
// `<![CDATA[`.
const HTML_BLOCK_SPECIAL_START_RE = /^ {0,3}<(!--|\?|![A-Za-z]|!\[CDATA\[)/;
// Up to three spaces of indent, then an opening or closing tag name that is
// followed by whitespace, `>`, `/>`, or end of line. Group 1 is the tag name.
const HTML_BLOCK_TAG_START_RE =
  /^ {0,3}<\/?([a-zA-Z][a-zA-Z0-9-]*)(?:\s|\/?>|$)/;

/**
 * Reports whether `line` opens a raw HTML block.
 *
 * @param line - A single source line (no trailing newline expected).
 * @returns `true` for a comment / processing instruction / declaration /
 *   CDATA opener, or for an opening or closing tag whose name
 *   (case-insensitive) is in `HTML_BLOCK_TAGS`; `false` otherwise, including
 *   lines indented by four or more spaces.
 */
function isHtmlBlockStart(line: string): boolean {
  // <!-- ..., <?..., <![CDATA[..., <!DOCTYPE, etc.
  if (HTML_BLOCK_SPECIAL_START_RE.test(line)) {
    return true;
  }
  const tagName = HTML_BLOCK_TAG_START_RE.exec(line)?.[1];
  if (tagName === undefined) {return false;}
  return HTML_BLOCK_TAGS.has(tagName.toLowerCase());
}
492557

493558
// ─── Block-Level Tokenizer ──────────────────────────────────────────────────
494559

@@ -735,6 +800,23 @@ function tokenize(markdown: string): Token[] {
735800
continue;
736801
}
737802

803+
// Block-level raw HTML: a line starting with `<tag>` (block-level tag),
804+
// `<!-- ... -->`, `<?...?>`, `<!DOCTYPE ...>`, or `<![CDATA[...]]>`.
805+
// Lines are emitted verbatim until the next blank line.
806+
if (isHtmlBlockStart(line)) {
807+
const htmlLines: string[] = [];
808+
while (i < lines.length && lines[i].trim() !== "") {
809+
htmlLines.push(lines[i]);
810+
i++;
811+
}
812+
tokens.push({
813+
type: "rawHtml",
814+
content: htmlLines.join("\n"),
815+
});
816+
prevLineWasBlank = false;
817+
continue;
818+
}
819+
738820
// Paragraph (default)
739821
const paraLines: string[] = [line];
740822
i++;
@@ -749,6 +831,7 @@ function tokenize(markdown: string): Token[] {
749831
if (/^(\s{0,3})([-*_])\s*(\2\s*){2,}$/.test(nextLine)) {break;}
750832
if (/^\s*([-*+]|\d+[.)])\s+/.test(nextLine)) {break;}
751833
if (/^\s*\|(.+\|)+\s*$/.test(nextLine)) {break;}
834+
if (isHtmlBlockStart(nextLine)) {break;}
752835
// Check if next-next line is setext marker
753836
if (
754837
i + 1 < lines.length &&
@@ -921,6 +1004,13 @@ function tokensToHtml(tokens: Token[]): string {
9211004
break;
9221005
}
9231006

1007+
case "rawHtml": {
1008+
const t = token as RawHtmlToken;
1009+
html += t.content;
1010+
i++;
1011+
break;
1012+
}
1013+
9241014
default:
9251015
i++;
9261016
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "1 < 2 and 3 > 0",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textAlignment": "left",
15+
"textColor": "default",
16+
},
17+
"type": "paragraph",
18+
},
19+
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Next paragraph.",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textAlignment": "left",
15+
"textColor": "default",
16+
},
17+
"type": "paragraph",
18+
},
19+
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "A warning block.",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textAlignment": "left",
15+
"textColor": "default",
16+
},
17+
"type": "paragraph",
18+
},
19+
]
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Some text before.",
8+
"type": "text",
9+
},
10+
],
11+
"id": "1",
12+
"props": {
13+
"backgroundColor": "default",
14+
"textAlignment": "left",
15+
"textColor": "default",
16+
},
17+
"type": "paragraph",
18+
},
19+
{
20+
"children": [],
21+
"content": [
22+
{
23+
"styles": {},
24+
"text": "raw block Some text after.",
25+
"type": "text",
26+
},
27+
],
28+
"id": "2",
29+
"props": {
30+
"backgroundColor": "default",
31+
"textAlignment": "left",
32+
"textColor": "default",
33+
},
34+
"type": "paragraph",
35+
},
36+
]
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Hello ",
8+
"type": "text",
9+
},
10+
{
11+
"styles": {
12+
"italic": true,
13+
},
14+
"text": "world",
15+
"type": "text",
16+
},
17+
{
18+
"styles": {},
19+
"text": "!",
20+
"type": "text",
21+
},
22+
],
23+
"id": "1",
24+
"props": {
25+
"backgroundColor": "default",
26+
"textAlignment": "left",
27+
"textColor": "default",
28+
},
29+
"type": "paragraph",
30+
},
31+
]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Line one
8+
line two.",
9+
"type": "text",
10+
},
11+
],
12+
"id": "1",
13+
"props": {
14+
"backgroundColor": "default",
15+
"textAlignment": "left",
16+
"textColor": "default",
17+
},
18+
"type": "paragraph",
19+
},
20+
]
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
[
2+
{
3+
"children": [],
4+
"content": [
5+
{
6+
"styles": {},
7+
"text": "Text with ",
8+
"type": "text",
9+
},
10+
{
11+
"styles": {
12+
"bold": true,
13+
},
14+
"text": "bold",
15+
"type": "text",
16+
},
17+
{
18+
"styles": {},
19+
"text": " and ",
20+
"type": "text",
21+
},
22+
{
23+
"content": [
24+
{
25+
"styles": {},
26+
"text": "link",
27+
"type": "text",
28+
},
29+
],
30+
"href": "https://example.com",
31+
"type": "link",
32+
},
33+
{
34+
"styles": {},
35+
"text": ".",
36+
"type": "text",
37+
},
38+
],
39+
"id": "1",
40+
"props": {
41+
"backgroundColor": "default",
42+
"textAlignment": "left",
43+
"textColor": "default",
44+
},
45+
"type": "paragraph",
46+
},
47+
]

0 commit comments

Comments
 (0)