Skip to content

Commit 337fa30

Browse files
authored
Experimental support for pasting rich content into markdown input box (#317604)
* Experimental support for pasting rich content into markdown input box * fix comments * Cleanup more cases
1 parent ffb0919 commit 337fa30

3 files changed

Lines changed: 404 additions & 2 deletions

File tree

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/*---------------------------------------------------------------------------------------------
2+
* Copyright (c) Microsoft Corporation. All rights reserved.
3+
* Licensed under the MIT License. See License.txt in the project root for license information.
4+
*--------------------------------------------------------------------------------------------*/
5+
6+
/**
 * Lightweight HTML-to-Markdown converter.
 *
 * Handles a small set of common inline and block elements so that content
 * pasted from web pages keeps its basic structure (headings, links, bold,
 * italic, code, lists) when inserted into a Markdown-aware surface such as
 * the chat input.
 *
 * The conversion is a sequence of regex passes over a single string. Text that
 * is already final (decoded code, link and image targets) is parked in a side
 * table and replaced by a placeholder, so that later passes can neither strip
 * "tags" out of it nor decode its entities a second time. Placeholders are
 * expanded as the very last step.
 */
const maxInputLength = 200_000;

/** Schemes that must never end up in a generated link target. */
const dangerousScheme = /^(?:javascript|vbscript|data):/i;

/** The named character references that get decoded; anything else is left as written. */
const namedEntities: Readonly<Record<string, string>> = {
	amp: '&',
	lt: '<',
	gt: '>',
	quot: '"',
	apos: '\'',
	nbsp: ' ',
};

/**
 * Converts an HTML fragment into Markdown.
 *
 * @param html The HTML fragment, e.g. the `text/html` flavour of a paste.
 * @returns The Markdown rendition, trimmed. Inputs longer than
 * `maxInputLength` are only tag-stripped (no Markdown formatting, no entity
 * decoding).
 */
export function convertHtmlToMarkdown(html: string): string {
	// Bail out on very large inputs to avoid regex backtracking cost
	if (html.length > maxInputLength) {
		return stripTags(html);
	}

	// Protected fragments, addressed by the index embedded in their placeholder
	const stash: string[] = [];

	// NUL delimits placeholders, so it must not occur in the input; then normalise line endings
	let md = html.replace(/\u0000/g, '').replace(/\r\n?/g, '\n');

	// --- block elements ---------------------------------------------------

	// Code blocks go first so that nothing below can rewrite their content
	md = md.replace(/<pre\b[^>]*>\s*<code\b[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner: string) => fencedCode(inner, stash));
	md = md.replace(/<pre\b[^>]*>([\s\S]*?)<\/pre>/gi, (_m, inner: string) => fencedCode(inner, stash));

	// Headings: a Markdown heading is a single line, so inner whitespace is collapsed
	md = md.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_m, level: string, inner: string) => {
		const title = inlineClean(inner, stash).replace(/\s+/g, ' ').trim();
		return `\n${'#'.repeat(Number(level))} ${title}\n`;
	});

	// Blockquote: keep paragraph boundaries as line breaks, prefix every line
	md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner: string) => {
		const text = inlineClean(inner.replace(/<\/(?:p|div)>/gi, '\n'), stash).trim();
		const quoted = text.split('\n').map(line => `> ${line.trim()}`.trimEnd());
		return `\n${quoted.join('\n')}\n`;
	});

	// Ordered list items - number them before stripping the <ol> wrapper
	md = md.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol>/gi, (_m, inner: string) => {
		let index = 0;
		const numbered = inner.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_li, item: string) => `${++index}. ${inlineClean(item, stash).trim()}\n`);
		return `\n${stripTags(numbered)}\n`;
	});

	// Unordered list items - `\b` keeps `<link>` from being mistaken for `<li>`
	md = md.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_m, inner: string) => `- ${inlineClean(inner, stash).trim()}\n`);
	md = md.replace(/<\/?ul\b[^>]*>/gi, '\n');

	// Paragraphs and divs
	md = md.replace(/<\/p>/gi, '\n\n');
	md = md.replace(/<p\b[^>]*>/gi, '');
	md = md.replace(/<\/div>/gi, '\n');
	md = md.replace(/<div\b[^>]*>/gi, '');

	// Line breaks (with or without attributes) and horizontal rules
	md = md.replace(/<br\b[^>]*>/gi, '\n');
	md = md.replace(/<hr\b[^>]*>/gi, '\n---\n');

	// --- inline elements --------------------------------------------------

	md = convertInlineElements(md, stash);

	// --- cleanup ----------------------------------------------------------

	// Strip whatever markup is left, then decode entities exactly once
	md = decodeEntities(stripTags(md));

	// Collapse runs of 3+ newlines into 2 (protected code is not affected)
	md = md.replace(/\n{3,}/g, '\n\n').trim();

	return restoreProtected(md, stash);
}

/**
 * Converts the supported inline elements (code, links, images, bold, italic,
 * strikethrough) to Markdown. Unknown tags and entities are left in place for
 * the caller to deal with.
 */
function convertInlineElements(html: string, stash: string[]): string {
	let result = html;

	// Code first: its content must not be interpreted as bold/italic/link markup
	result = result.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner: string) => inlineCode(inner, stash));

	// Links, with either quoting style for the href
	result = result.replace(
		/<a\b[^>]*?\shref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a>/gi,
		(_m, doubleQuoted: string | undefined, singleQuoted: string | undefined, text: string) =>
			sanitizeLink(doubleQuoted ?? singleQuoted ?? '', inlineClean(text, stash).trim(), stash)
	);

	// Images; quoted attribute values may contain `>`
	result = result.replace(/<img\b(?:[^>"']|"[^"]*"|'[^']*')*>/gi, tag => convertImage(tag, stash));

	result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => wrapInline('**', inlineClean(inner, stash)));
	result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => wrapInline('*', inlineClean(inner, stash)));
	result = result.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => wrapInline('~~', inlineClean(inner, stash)));

	return result;
}

/**
 * Reduces a fragment to inline Markdown: converts nested inline elements,
 * turns `<br>` into newlines and strips every other tag. Entities are left
 * encoded on purpose; they are decoded once by the final pass of
 * {@link convertHtmlToMarkdown}.
 */
function inlineClean(html: string, stash: string[]): string {
	const converted = convertInlineElements(html, stash);
	return stripTags(converted.replace(/<br\b[^>]*>/gi, '\n'));
}

/** Strip tags, normalise <br>, and decode entities inside a code block while preserving indentation. */
function cleanCodeBlock(html: string): string {
	const text = decodeEntities(stripTags(html.replace(/<br\b[^>]*>/gi, '\n')));
	// Trim only leading/trailing newlines, preserving indentation
	return text.replace(/^\n+|\n+$/g, '');
}

/** Builds a fenced code block whose fence is longer than any backtick run inside the code. */
function fencedCode(html: string, stash: string[]): string {
	const code = cleanCodeBlock(html);
	const fence = '`'.repeat(Math.max(3, longestBacktickRun(code) + 1));
	return `\n${fence}\n${protect(stash, code)}\n${fence}\n`;
}

/** Builds an inline code span, choosing a delimiter that cannot be closed by the code itself. */
function inlineCode(html: string, stash: string[]): string {
	const code = decodeEntities(stripTags(html));
	if (!code) {
		return '';
	}
	const delimiter = '`'.repeat(longestBacktickRun(code) + 1);
	// A span that starts or ends with a backtick needs padding to stay unambiguous
	const pad = code.startsWith('`') || code.endsWith('`') ? ' ' : '';
	return `${delimiter}${pad}${protect(stash, code)}${pad}${delimiter}`;
}

/** Length of the longest run of consecutive backticks in `text` (0 when there is none). */
function longestBacktickRun(text: string): number {
	let longest = 0;
	for (const run of text.match(/`+/g) ?? []) {
		longest = Math.max(longest, run.length);
	}
	return longest;
}

/**
 * Wraps `content` in an emphasis marker. Surrounding whitespace is moved
 * outside the markers (`**bold **` does not render as bold) and empty content
 * is returned unchanged instead of producing `****`.
 */
function wrapInline(marker: string, content: string): string {
	const core = content.trim();
	if (!core) {
		return content;
	}
	const leading = content.slice(0, content.length - content.trimStart().length);
	const trailing = content.slice(content.trimEnd().length);
	return `${leading}${marker}${core}${marker}${trailing}`;
}

/** Converts one `<img …>` tag; tags without a quoted `src` are dropped. */
function convertImage(tag: string, stash: string[]): string {
	const src = readAttribute(tag, 'src');
	if (src === undefined) {
		return '';
	}
	const alt = readAttribute(tag, 'alt') ?? '';
	return `![${alt}](${protect(stash, decodeEntities(src).trim())})`;
}

/** Reads a single- or double-quoted attribute value from a tag, or `undefined` if absent. */
function readAttribute(tag: string, name: 'src' | 'alt'): string | undefined {
	const match = new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i').exec(tag);
	return match ? (match[1] ?? match[2]) : undefined;
}

/**
 * Produce a markdown link, stripping dangerous schemes like `javascript:`.
 *
 * The href is entity-decoded before it is checked, and whitespace/control
 * characters are ignored for the check, so `javascript&#58;…` or
 * `java&#x09;script:…` cannot slip through. Links without a target, or with a
 * dangerous one, degrade to their text.
 */
function sanitizeLink(href: string, text: string, stash: string[]): string {
	// Tabs and newlines are ignored by URL parsers and would break the Markdown link
	const target = decodeEntities(href).trim().replace(/[\t\n\r]/g, '');
	const schemeProbe = target.replace(/[\u0000-\u0020\u007f]/g, '');
	if (!schemeProbe || dangerousScheme.test(schemeProbe)) {
		return text;
	}
	return `[${text}](${protect(stash, target)})`;
}

/** Removes everything that looks like a tag. */
function stripTags(html: string): string {
	return html.replace(/<[^>]+>/g, '');
}

/**
 * Decode the most common HTML entities, including numeric character references.
 *
 * All references are resolved in a single pass, so the output of one
 * replacement is never decoded again (`&amp;lt;` yields `&lt;`, not `<`).
 */
function decodeEntities(text: string): string {
	return text.replace(
		/&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|lt|gt|quot|apos|nbsp));/g,
		(match: string, hex: string | undefined, dec: string | undefined, name: string | undefined) => {
			if (hex !== undefined) {
				return safeFromCodePoint(parseInt(hex, 16));
			}
			if (dec !== undefined) {
				return safeFromCodePoint(parseInt(dec, 10));
			}
			return (name !== undefined ? namedEntities[name] : undefined) ?? match;
		}
	);
}

/**
 * Returns the character for a code point, or the empty string when the value
 * is not a usable scalar value: non-integers, NUL (reserved for placeholders),
 * surrogates (would produce malformed UTF-16) and anything above U+10FFFF.
 */
function safeFromCodePoint(code: number): string {
	const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
	if (!Number.isInteger(code) || code <= 0 || code > 0x10FFFF || isSurrogate) {
		return '';
	}
	return String.fromCodePoint(code);
}

/** Parks `fragment` in the stash and returns the placeholder that stands in for it. */
function protect(stash: string[], fragment: string): string {
	stash.push(fragment);
	return `\u0000${stash.length - 1}\u0000`;
}

/** Expands placeholders; fragments may themselves contain (older) placeholders. */
function restoreProtected(text: string, stash: readonly string[]): string {
	return text.replace(/\u0000(\d+)\u0000/g, (_m, index: string) => restoreProtected(stash[Number(index)] ?? '', stash));
}
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/*---------------------------------------------------------------------------------------------
2+
* Copyright (c) Microsoft Corporation. All rights reserved.
3+
* Licensed under the MIT License. See License.txt in the project root for license information.
4+
*--------------------------------------------------------------------------------------------*/
5+
import assert from 'assert';
6+
import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
7+
import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';
8+
9+
// Mocha suite for `convertHtmlToMarkdown`: one test per supported element,
// plus nesting, entity decoding and the large-input fallback.
suite('htmlToMarkdown', () => {
	ensureNoDisposablesAreLeakedInTestSuite();

	test('converts headings', () => {
		assert.strictEqual(convertHtmlToMarkdown('<h1>Title</h1>'), '# Title');
		assert.strictEqual(convertHtmlToMarkdown('<h2>Subtitle</h2>'), '## Subtitle');
		assert.strictEqual(convertHtmlToMarkdown('<h3>Section</h3>'), '### Section');
		assert.strictEqual(convertHtmlToMarkdown('<h4>Sub-section</h4>'), '#### Sub-section');
		assert.strictEqual(convertHtmlToMarkdown('<h5>Minor</h5>'), '##### Minor');
		assert.strictEqual(convertHtmlToMarkdown('<h6>Smallest</h6>'), '###### Smallest');
	});

	test('converts links', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="https://example.com">Example</a>'),
			'[Example](https://example.com)'
		);
	});

	test('strips dangerous schemes from links', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="javascript:alert(1)">click</a>'),
			'click'
		);
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="vbscript:run">run</a>'),
			'run'
		);
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="data:text/html,<h1>hi</h1>">data</a>'),
			'data'
		);
	});

	test('converts bold and italic', () => {
		assert.strictEqual(convertHtmlToMarkdown('<strong>bold</strong>'), '**bold**');
		assert.strictEqual(convertHtmlToMarkdown('<b>bold</b>'), '**bold**');
		assert.strictEqual(convertHtmlToMarkdown('<em>italic</em>'), '*italic*');
		assert.strictEqual(convertHtmlToMarkdown('<i>italic</i>'), '*italic*');
	});

	test('converts inline code', () => {
		assert.strictEqual(convertHtmlToMarkdown('<code>foo()</code>'), '`foo()`');
	});

	test('converts code blocks', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code>const x = 1;</code></pre>'),
			'```\nconst x = 1;\n```'
		);
	});

	test('converts syntax-highlighted code blocks by stripping inner tags', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code><span class="kw">const</span> x = <span class="num">1</span>;</code></pre>'),
			'```\nconst x = 1;\n```'
		);
	});

	test('preserves indentation in code blocks', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code>function foo() {\n return 1;\n}</code></pre>'),
			'```\nfunction foo() {\n return 1;\n}\n```'
		);
	});

	test('converts unordered lists', () => {
		const html = '<ul><li>one</li><li>two</li><li>three</li></ul>';
		assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three');
	});

	test('converts ordered lists to numbered items', () => {
		const html = '<ol><li>first</li><li>second</li></ol>';
		assert.strictEqual(convertHtmlToMarkdown(html), '1. first\n2. second');
	});

	test('converts line breaks', () => {
		assert.strictEqual(convertHtmlToMarkdown('hello<br>world'), 'hello\nworld');
		assert.strictEqual(convertHtmlToMarkdown('hello<br/>world'), 'hello\nworld');
	});

	test('converts horizontal rules', () => {
		assert.strictEqual(convertHtmlToMarkdown('above<hr>below'), 'above\n---\nbelow');
	});

	test('converts strikethrough', () => {
		assert.strictEqual(convertHtmlToMarkdown('<del>removed</del>'), '~~removed~~');
		assert.strictEqual(convertHtmlToMarkdown('<s>struck</s>'), '~~struck~~');
	});

	test('converts blockquotes', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<blockquote>quoted text</blockquote>'),
			'> quoted text'
		);
	});

	test('converts images', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<img src="https://example.com/img.png" alt="photo">'),
			'![photo](https://example.com/img.png)'
		);
	});

	test('decodes HTML entities', () => {
		assert.strictEqual(convertHtmlToMarkdown('&amp; &lt; &gt; &quot; &#39;'), '& < > " \'');
	});

	test('strips unknown tags', () => {
		assert.strictEqual(convertHtmlToMarkdown('<span class="x">hello</span>'), 'hello');
	});

	test('handles nested inline elements', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<strong><em>bold italic</em></strong>'),
			'***bold italic***'
		);
	});

	test('handles link with bold text inside', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="https://example.com"><strong>click here</strong></a>'),
			'[**click here**](https://example.com)'
		);
	});

	test('handles heading with link inside', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<h2><a href="https://example.com">Title</a></h2>'),
			'## [Title](https://example.com)'
		);
	});

	test('collapses excessive newlines', () => {
		const html = '<p>one</p><p></p><p></p><p>two</p>';
		const result = convertHtmlToMarkdown(html);
		assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines');
		assert.ok(result.includes('one'));
		assert.ok(result.includes('two'));
	});

	test('handles a realistic web page snippet', () => {
		const html = `
			<h1>Getting Started</h1>
			<p>Welcome to <strong>VS Code</strong>. Visit <a href="https://code.visualstudio.com">the website</a> for more info.</p>
			<ul>
				<li>Fast</li>
				<li>Extensible</li>
			</ul>
		`;
		const md = convertHtmlToMarkdown(html);
		assert.ok(md.includes('# Getting Started'));
		assert.ok(md.includes('**VS Code**'));
		assert.ok(md.includes('[the website](https://code.visualstudio.com)'));
		assert.ok(md.includes('- Fast'));
		assert.ok(md.includes('- Extensible'));
	});

	test('decodes numeric HTML entities', () => {
		assert.strictEqual(convertHtmlToMarkdown('&#60;tag&#62;'), '<tag>');
		assert.strictEqual(convertHtmlToMarkdown('&#x3C;tag&#x3E;'), '<tag>');
		assert.strictEqual(convertHtmlToMarkdown('&#8212;'), '—');
		assert.strictEqual(convertHtmlToMarkdown('&#x2014;'), '—');
	});

	test('falls back to tag-stripping for very large input', () => {
		const large = '<b>' + 'x'.repeat(200_001) + '</b>';
		const result = convertHtmlToMarkdown(large);
		// Should strip tags but NOT apply markdown bold formatting
		assert.ok(!result.includes('**'));
		assert.ok(!result.includes('<b>'));
	});
});

0 commit comments

Comments
 (0)