microsoft · lramos15 · May 21, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/src/vs/base/common/htmlToMarkdown.ts b/src/vs/base/common/htmlToMarkdown.ts
@@ -0,0 +1,169 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+
+/**
+ * Lightweight HTML-to-Markdown converter.
+ *
+ * Handles a small set of common inline and block elements so that content
+ * pasted from web pages keeps its basic structure (headings, links, bold,
+ * italic, code, lists) when inserted into a Markdown-aware surface such as
+ * the chat input.
+ */
+/** Inputs longer than this many UTF-16 code units skip Markdown conversion and only have their tags stripped. */
+const maxInputLength = 200_000;
+
+/**
+ * Convert an HTML fragment to Markdown.
+ *
+ * Block elements (headings, code blocks, blockquotes, lists, paragraphs) are
+ * rewritten first, then inline elements (links, images, bold, italic, code,
+ * strikethrough). Whatever tags remain are stripped and entities are decoded.
+ *
+ * @param html HTML source text, for example clipboard content.
+ * @returns The trimmed Markdown. Inputs longer than `maxInputLength` are not
+ * converted; they are returned with their tags removed.
+ */
+export function convertHtmlToMarkdown(html: string): string {
+	// Bail out on very large inputs to avoid regex backtracking cost
+	if (html.length > maxInputLength) {
+		return html.replace(/<[^>]+>/g, '');
+	}
+
+	// The helpers return text whose entities are already decoded. Splicing that
+	// text back verbatim would let the later passes mistake decoded content for
+	// markup (e.g. `&lt;div&gt;` inside a code sample would be stripped as a tag).
+	// Re-encoding the markup characters keeps it inert until the single decode at
+	// the end of this function turns it back into plain text.
+	const shield = (text: string): string => text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+	const inline = (fragment: string): string => shield(inlineClean(fragment));
+	const fence = (code: string): string => `\n\`\`\`\n${shield(cleanCodeBlock(code))}\n\`\`\`\n`;
+
+	// Normalise line endings
+	let md = html.replace(/\r\n?/g, '\n');
+
+	// --- block elements ---------------------------------------------------
+
+	// Headings, one level at a time from <h1> to <h6>
+	for (let level = 1; level <= 6; level++) {
+		const heading = new RegExp(`<h${level}[^>]*>([\\s\\S]*?)</h${level}>`, 'gi');
+		const marker = '#'.repeat(level);
+		md = md.replace(heading, (_m, inner: string) => `\n${marker} ${inline(inner)}\n`);
+	}
+
+	// Code blocks: <pre><code>…</code></pre>
+	md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner: string) => fence(inner));
+
+	// Standalone <pre> without <code>
+	md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, (_m, inner: string) => fence(inner));
+
+	// Blockquote: the `>` marker is shielded together with the text and comes
+	// back as a literal `>` in the final decode.
+	md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner: string) => {
+		const lines = inlineClean(inner).split('\n').map(l => shield(`> ${l.trim()}`));
+		return `\n${lines.join('\n')}\n`;
+	});
+
+	// Ordered list items — number them before stripping the <ol> wrapper
+	md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (_m, inner: string) => {
+		let index = 0;
+		const numbered = inner.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_liM, liInner: string) => `${++index}. ${inline(liInner).trim()}\n`);
+		return `\n${numbered.replace(/<[^>]+>/g, '')}\n`;
+	});
+
+	// Unordered list items - convert before stripping the list wrapper
+	md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, (_m, inner: string) => `- ${inline(inner).trim()}\n`);
+	md = md.replace(/<\/?ul[^>]*>/gi, '\n');
+
+	// Paragraphs and divs → double newline
+	md = md.replace(/<\/p>/gi, '\n\n');
+	md = md.replace(/<p[^>]*>/gi, '');
+	md = md.replace(/<\/div>/gi, '\n');
+	md = md.replace(/<div[^>]*>/gi, '');
+
+	// Line breaks
+	md = md.replace(/<br\s*\/?>/gi, '\n');
+
+	// Horizontal rules
+	md = md.replace(/<hr[^>]*\/?>/gi, '\n---\n');
+
+	// --- inline elements --------------------------------------------------
+
+	// Links - must come before we strip remaining tags. The href stays as
+	// written; it is decoded exactly once by the final decode below.
+	md = md.replace(/<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href: string, text: string) => sanitizeLink(href, inline(text).trim()));
+
+	// Images
+	md = md.replace(/<img\s[^>]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*\/?>/gi, '![$1]($2)');
+	md = md.replace(/<img\s[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '![$2]($1)');
+	md = md.replace(/<img\s[^>]*src="([^"]*)"[^>]*\/?>/gi, '![]($1)');
+
+	// Bold / strong
+	md = md.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `**${inline(inner)}**`);
+
+	// Italic / emphasis
+	md = md.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `*${inline(inner)}*`);
+
+	// Inline code: keep the content entity-encoded for now. Decoding it here
+	// would expose `<T>`-like text to the tag stripping below.
+	md = md.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner: string) => `\`${inner}\``);
+
+	// Strikethrough
+	md = md.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `~~${inline(inner)}~~`);
+
+	// --- cleanup ----------------------------------------------------------
+
+	// Strip any remaining HTML tags
+	md = md.replace(/<[^>]+>/g, '');
+
+	// Decode entities: both the ones still left from the input and the ones
+	// introduced by `shield`
+	md = decodeEntities(md);
+
+	// Collapse runs of 3+ newlines into 2
+	md = md.replace(/\n{3,}/g, '\n\n');
+
+	return md.trim();
+}
+
+/**
+ * Recursively convert inline markup for use inside an inline markdown construct.
+ *
+ * Links, bold, italic, inline code and strikethrough become their Markdown
+ * equivalents, `<br>` becomes a newline, every other tag is stripped, and
+ * entities are decoded.
+ *
+ * @param html HTML fragment, possibly containing nested inline elements.
+ * @returns Markdown text with entities decoded.
+ */
+function inlineClean(html: string): string {
+	// A nested call returns text that is already decoded. Re-encode its markup
+	// characters so the tag strip and the entity decode at the end of this call
+	// treat it as plain text instead of stripping or decoding it a second time.
+	const nested = (fragment: string): string => inlineClean(fragment).replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+
+	// Process nested inline elements first
+	let result = html;
+	result = result.replace(/<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href: string, text: string) => sanitizeLink(href, nested(text).trim()));
+	result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `**${nested(inner)}**`);
+	result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `*${nested(inner)}*`);
+	// Inline code stays entity-encoded until the final decode; decoding it here
+	// would let the tag strip below delete content such as `&lt;T&gt;`.
+	result = result.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner: string) => `\`${inner}\``);
+	result = result.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `~~${nested(inner)}~~`);
+	result = result.replace(/<br\s*\/?>/gi, '\n');
+	// Drop whatever markup is left, then decode exactly once
+	result = result.replace(/<[^>]+>/g, '');
+	return decodeEntities(result);
+}
+
+/**
+ * Reduce the HTML inside a code block to plain text.
+ *
+ * `<br>` becomes a newline, all other tags (such as syntax-highlighting
+ * `<span>`s) are removed, and entities are decoded. Only leading and trailing
+ * newlines are trimmed, so indentation is kept.
+ */
+function cleanCodeBlock(html: string): string {
+	const withoutMarkup = html
+		.replace(/<br\s*\/?>/gi, '\n')
+		.replace(/<[^>]+>/g, '');
+	return decodeEntities(withoutMarkup).replace(/^\n+|\n+$/g, '');
+}
+
+/**
+ * Produce a markdown link, dropping the target (and keeping only the text)
+ * when it uses a dangerous scheme such as `javascript:`.
+ *
+ * @param href Link target exactly as written in the HTML attribute, i.e. still
+ * entity-encoded. It is emitted unchanged; callers decode it afterwards.
+ * @param text Link text, already converted to Markdown.
+ * @returns `[text](href)`, or just `text` for a dangerous target.
+ */
+function sanitizeLink(href: string, text: string): string {
+	// The callers decode entities after this function returns, so an encoded
+	// scheme such as `javascript&#58;` would only turn dangerous later. Inspect
+	// the fully decoded form. Every successful decode shortens the string, so
+	// this loop reaches a fixed point.
+	let probe = href;
+	for (let previous = ''; previous !== probe;) {
+		previous = probe;
+		probe = decodeEntities(probe);
+	}
+
+	// URL parsers ignore leading whitespace/control characters and any
+	// tab or newline inside the URL, so `java\tscript:` is still a script URL.
+	probe = probe.replace(/^[\s\u0000-\u0020]+|[\t\n\r]+/g, '');
+
+	if (/^(javascript|vbscript|data):/i.test(probe)) {
+		return text;
+	}
+	return `[${text}](${href})`;
+}
+
+/** Replacement text for the named entities this converter understands. */
+const namedEntities = new Map<string, string>([
+	['amp', '&'],
+	['lt', '<'],
+	['gt', '>'],
+	['quot', '"'],
+	['apos', '\''],
+	['nbsp', ' '],
+]);
+
+/**
+ * Decode the most common HTML entities, including numeric character references.
+ *
+ * All entities are matched in a single pass, so the output of one replacement
+ * is never decoded again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
+ * Named entities other than the ones listed in `namedEntities` are left as-is.
+ *
+ * @param text Text that may contain entities.
+ * @returns The text with recognised entities replaced.
+ */
+function decodeEntities(text: string): string {
+	return text.replace(/&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|lt|gt|quot|apos|nbsp));/g, (match: string, hex: string | undefined, dec: string | undefined, name: string | undefined) => {
+		if (hex !== undefined) {
+			return safeFromCodePoint(parseInt(hex, 16));
+		}
+		if (dec !== undefined) {
+			return safeFromCodePoint(parseInt(dec, 10));
+		}
+		return (name !== undefined ? namedEntities.get(name) : undefined) ?? match;
+	});
+}
+
+/**
+ * Convert a code point to a string.
+ *
+ * @returns The character, or an empty string when `code` is not a Unicode
+ * scalar value (not an integer, out of range, or a lone surrogate).
+ */
+function safeFromCodePoint(code: number): string {
+	const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
+	if (!Number.isInteger(code) || code < 0 || code > 0x10FFFF || isSurrogate) {
+		return '';
+	}
+	// Cannot throw: the value is an integer within the valid range
+	return String.fromCodePoint(code);
+}
diff --git a/src/vs/base/test/common/htmlToMarkdown.test.ts b/src/vs/base/test/common/htmlToMarkdown.test.ts
@@ -0,0 +1,181 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+import assert from 'assert';
+import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
+import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';
+
+// Unit tests for convertHtmlToMarkdown: one test per supported element, plus
+// nesting, entity decoding, newline collapsing and the large-input fallback.
+suite('htmlToMarkdown', () => {
+	ensureNoDisposablesAreLeakedInTestSuite();
+
+	test('converts headings', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<h1>Title</h1>'), '# Title');
+		assert.strictEqual(convertHtmlToMarkdown('<h2>Subtitle</h2>'), '## Subtitle');
+		assert.strictEqual(convertHtmlToMarkdown('<h3>Section</h3>'), '### Section');
+		assert.strictEqual(convertHtmlToMarkdown('<h4>Sub-section</h4>'), '#### Sub-section');
+		assert.strictEqual(convertHtmlToMarkdown('<h5>Minor</h5>'), '##### Minor');
+		assert.strictEqual(convertHtmlToMarkdown('<h6>Smallest</h6>'), '###### Smallest');
+	});
+
+	test('converts links', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="https://example.com">Example</a>'),
+			'[Example](https://example.com)'
+		);
+	});
+
+	// For javascript:, vbscript: and data: targets only the link text is kept.
+	test('strips dangerous schemes from links', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="javascript:alert(1)">click</a>'),
+			'click'
+		);
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="vbscript:run">run</a>'),
+			'run'
+		);
+		// The href itself contains markup; the link must still be dropped.
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="data:text/html,<h1>hi</h1>">data</a>'),
+			'data'
+		);
+	});
+
+	test('converts bold and italic', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<strong>bold</strong>'), '**bold**');
+		assert.strictEqual(convertHtmlToMarkdown('<b>bold</b>'), '**bold**');
+		assert.strictEqual(convertHtmlToMarkdown('<em>italic</em>'), '*italic*');
+		assert.strictEqual(convertHtmlToMarkdown('<i>italic</i>'), '*italic*');
+	});
+
+	test('converts inline code', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<code>foo()</code>'), '`foo()`');
+	});
+
+	test('converts code blocks', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code>const x = 1;</code></pre>'),
+			'```\nconst x = 1;\n```'
+		);
+	});
+
+	test('converts syntax-highlighted code blocks by stripping inner tags', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code><span class="kw">const</span> x = <span class="num">1</span>;</code></pre>'),
+			'```\nconst x = 1;\n```'
+		);
+	});
+
+	test('preserves indentation in code blocks', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code>function foo() {\n  return 1;\n}</code></pre>'),
+			'```\nfunction foo() {\n  return 1;\n}\n```'
+		);
+	});
+
+	test('converts unordered lists', () => {
+		const html = '<ul><li>one</li><li>two</li><li>three</li></ul>';
+		assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three');
+	});
+
+	test('converts ordered lists to numbered items', () => {
+		const html = '<ol><li>first</li><li>second</li></ol>';
+		assert.strictEqual(convertHtmlToMarkdown(html), '1. first\n2. second');
+	});
+
+	// Both the bare and the self-closing form of <br> are covered.
+	test('converts line breaks', () => {
+		assert.strictEqual(convertHtmlToMarkdown('hello<br>world'), 'hello\nworld');
+		assert.strictEqual(convertHtmlToMarkdown('hello<br/>world'), 'hello\nworld');
+	});
+
+	test('converts horizontal rules', () => {
+		assert.strictEqual(convertHtmlToMarkdown('above<hr>below'), 'above\n---\nbelow');
+	});
+
+	test('converts strikethrough', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<del>removed</del>'), '~~removed~~');
+		assert.strictEqual(convertHtmlToMarkdown('<s>struck</s>'), '~~struck~~');
+	});
+
+	test('converts blockquotes', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<blockquote>quoted text</blockquote>'),
+			'> quoted text'
+		);
+	});
+
+	test('converts images', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<img src="https://example.com/img.png" alt="photo">'),
+			'![photo](https://example.com/img.png)'
+		);
+	});
+
+	test('decodes HTML entities', () => {
+		assert.strictEqual(convertHtmlToMarkdown('&amp; &lt; &gt; &quot; &#39;'), '& < > " \'');
+	});
+
+	test('strips unknown tags', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<span class="x">hello</span>'), 'hello');
+	});
+
+	// The `**` of the outer <strong> and the `*` of the inner <em> combine into `***`.
+	test('handles nested inline elements', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<strong><em>bold italic</em></strong>'),
+			'***bold italic***'
+		);
+	});
+
+	test('handles link with bold text inside', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="https://example.com"><strong>click here</strong></a>'),
+			'[**click here**](https://example.com)'
+		);
+	});
+
+	test('handles heading with link inside', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<h2><a href="https://example.com">Title</a></h2>'),
+			'## [Title](https://example.com)'
+		);
+	});
+
+	// Every closing </p> contributes two newlines, so the empty paragraphs
+	// produce a long run that has to be collapsed.
+	test('collapses excessive newlines', () => {
+		const html = '<p>one</p><p></p><p></p><p>two</p>';
+		const result = convertHtmlToMarkdown(html);
+		assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines');
+		assert.ok(result.includes('one'));
+		assert.ok(result.includes('two'));
+	});
+
+	// Checks for the expected fragments rather than the exact output, so the
+	// test does not depend on how the indentation of the snippet is rendered.
+	test('handles a realistic web page snippet', () => {
+		const html = `
+			<h1>Getting Started</h1>
+			<p>Welcome to <strong>VS Code</strong>. Visit <a href="https://code.visualstudio.com">the website</a> for more info.</p>
+			<ul>
+				<li>Fast</li>
+				<li>Extensible</li>
+			</ul>
+		`;
+		const md = convertHtmlToMarkdown(html);
+		assert.ok(md.includes('# Getting Started'));
+		assert.ok(md.includes('**VS Code**'));
+		assert.ok(md.includes('[the website](https://code.visualstudio.com)'));
+		assert.ok(md.includes('- Fast'));
+		assert.ok(md.includes('- Extensible'));
+	});
+
+	// Decimal and hexadecimal character references.
+	test('decodes numeric HTML entities', () => {
+		assert.strictEqual(convertHtmlToMarkdown('&#60;tag&#62;'), '<tag>');
+		assert.strictEqual(convertHtmlToMarkdown('&#x3C;tag&#x3E;'), '<tag>');
+		assert.strictEqual(convertHtmlToMarkdown('&#8212;'), '—');
+		assert.strictEqual(convertHtmlToMarkdown('&#x2014;'), '—');
+	});
+
+	test('falls back to tag-stripping for very large input', () => {
+		// The payload alone is longer than the 200_000 character limit.
+		const large = '<b>' + 'x'.repeat(200_001) + '</b>';
+		const result = convertHtmlToMarkdown(large);
+		// Should strip tags but NOT apply markdown bold formatting
+		assert.ok(!result.includes('**'));
+		assert.ok(!result.includes('<b>'));
+	});
+});