Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions src/vs/base/common/htmlToMarkdown.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/

/**
 * Lightweight HTML-to-Markdown converter.
 *
 * Handles a small set of common inline and block elements so that content
 * pasted from web pages keeps its basic structure (headings, links, bold,
 * italic, code, lists) when inserted into a Markdown-aware surface such as
 * the chat input.
 */

/**
 * Upper bound, in UTF-16 code units (`string.length`), on the input that
 * `convertHtmlToMarkdown` runs its regex-based conversion on. Longer inputs
 * take the cheap fallback path instead.
 */
const maxInputLength = 200_000;

/**
 * Converts an HTML fragment (for example clipboard content) to Markdown.
 *
 * The conversion is a fixed sequence of regex passes: block elements first
 * (headings, code blocks, blockquotes, lists, paragraphs), then inline
 * elements (links, images, emphasis, code, strikethrough), then a cleanup
 * that strips every remaining tag and decodes entities.
 *
 * Text produced by a pass is final: it has already been entity-decoded, so it
 * may legitimately contain `<`, `>` or `&` (think `Array<string>` inside a
 * code block). Such text is parked behind an opaque placeholder so that later
 * passes cannot mistake it for markup or decode it a second time, and it is
 * put back once the cleanup is done.
 *
 * @param html The HTML to convert.
 * @returns The Markdown text, trimmed. Inputs longer than `maxInputLength`
 * are not converted; they are only stripped of tags and entity-decoded.
 */
export function convertHtmlToMarkdown(html: string): string {
	// Bail out on very large inputs to avoid regex backtracking cost. Entity
	// decoding is a linear pass, so the fallback can still return readable text.
	if (html.length > maxInputLength) {
		return decodeEntities(html.replace(/<[^>]+>/g, ''));
	}

	// --- placeholder bookkeeping ------------------------------------------

	const parked: string[] = [];
	// The random nonce keeps input such as `&#0;` from forging a placeholder.
	const nonce = Math.random().toString(36).slice(2);
	const placeholderSource = `\\u0000${nonce}:(\\d+)\\u0000`;

	/** Stores finished markdown text and returns the placeholder that stands in for it. */
	function park(fragment: string): string {
		return `\u0000${nonce}:${parked.push(fragment) - 1}\u0000`;
	}

	/** Replaces placeholders with their text; parked text may itself contain older placeholders. */
	function unpark(text: string): string {
		return text.replace(new RegExp(placeholderSource, 'g'), (_m, index: string) => unpark(parked[Number(index)] ?? ''));
	}

	// NUL is not valid in HTML; removing it reserves the character for placeholders
	let md = html.split('\u0000').join('');

	// Normalise line endings
	md = md.replace(/\r\n?/g, '\n');

	// Comments and non-content elements would otherwise leak their bodies as text
	md = md.replace(/<!--[\s\S]*?-->/g, '');
	md = md.replace(/<(script|style)(\s[^>]*)?>[\s\S]*?<\/\1\s*>/gi, '');

	// --- block elements ---------------------------------------------------

	// Headings: an ATX heading is a single line, so inner whitespace is collapsed
	md = md.replace(/<h([1-6])(\s[^>]*)?>([\s\S]*?)<\/h\1>/gi, (_m, level: string, _attrs, inner: string) => {
		const title = inlineClean(inner).replace(/\s+/g, ' ').trim();
		return `\n${'#'.repeat(Number(level))} ${park(title)}\n`;
	});

	// Code blocks: <pre><code>…</code></pre>, then standalone <pre> without <code>
	const toFence = (_m: string, inner: string): string => `\n\`\`\`\n${park(cleanCodeBlock(inner))}\n\`\`\`\n`;
	md = md.replace(/<pre[^>]*>\s*<code[^>]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, toFence);
	md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, toFence);

	// Blockquote: nested placeholders are expanded first so that every line gets its marker
	md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner: string) => {
		const quoted = unpark(inlineClean(inner)).trim().split('\n').map(line => `> ${line.trim()}`).join('\n');
		return `\n${park(quoted)}\n`;
	});

	// Ordered list items — number them before stripping the <ol> wrapper.
	// `<li(\s…)?>` rather than `<li[^>]*>` so that `<link>` is not taken for an item.
	md = md.replace(/<ol[^>]*>([\s\S]*?)<\/ol>/gi, (_m, inner: string) => {
		let index = 0;
		const numbered = inner.replace(/<li(\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_li, _attrs, item: string) => `${++index}. ${park(inlineClean(item).trim())}\n`);
		return `\n${numbered.replace(/<[^>]+>/g, '')}\n`;
	});

	// Unordered list items - convert before stripping the list wrapper
	md = md.replace(/<li(\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_m, _attrs, inner: string) => `- ${park(inlineClean(inner).trim())}\n`);
	md = md.replace(/<\/?ul[^>]*>/gi, '\n');

	// Paragraphs and divs → double newline
	md = md.replace(/<\/p>/gi, '\n\n');
	md = md.replace(/<p[^>]*>/gi, '');
	md = md.replace(/<\/div>/gi, '\n');
	md = md.replace(/<div[^>]*>/gi, '');

	// Line breaks (with or without attributes)
	md = md.replace(/<br(\s[^>]*)?\/?>/gi, '\n');

	// Horizontal rules
	md = md.replace(/<hr[^>]*\/?>/gi, '\n---\n');

	// --- inline elements --------------------------------------------------

	// Links - must come before we strip remaining tags. The href is an attribute
	// value, so it is decoded before the scheme check; otherwise an encoded
	// scheme such as `javascript&#58;` would pass the check and be decoded later.
	md = md.replace(/<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href: string, text: string) => park(sanitizeLink(decodeEntities(href), inlineClean(text).trim())));

	// Images: attribute values stay raw here and are decoded once by the cleanup below
	md = md.replace(/<img\s[^>]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*\/?>/gi, '![$1]($2)');
	md = md.replace(/<img\s[^>]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '![$2]($1)');
	md = md.replace(/<img\s[^>]*src="([^"]*)"[^>]*\/?>/gi, '![]($1)');

	// Bold / strong
	md = md.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `**${park(inlineClean(inner))}**`);

	// Italic / emphasis
	md = md.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `*${park(inlineClean(inner))}*`);

	// Inline code: drop real tags first, then decode, so `&lt;b&gt;` survives as `<b>`
	md = md.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner: string) => `\`${park(decodeEntities(inner.replace(/<[^>]+>/g, '')))}\``);

	// Strikethrough
	md = md.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `~~${park(inlineClean(inner))}~~`);

	// --- cleanup ----------------------------------------------------------

	// Strip any remaining HTML tags
	md = md.replace(/<[^>]+>/g, '');

	// Decode common HTML entities in the text that no pass has touched
	md = decodeEntities(md);

	// Put the converted fragments back now that nothing can re-interpret them
	md = unpark(md);

	// Collapse runs of 3+ newlines into 2
	md = md.replace(/\n{3,}/g, '\n\n');

	return md.trim();
}

/**
 * Converts the inline markup of an HTML fragment (links, bold, italic, inline
 * code, strikethrough, `<br>`) to markdown and strips every other tag.
 *
 * The fragment is walked once, outermost element first. Text between the
 * recognised elements is tag-stripped and then entity-decoded exactly once;
 * element contents are handled by a recursive call. Converted text is never
 * scanned again, so decoded characters such as the `<string>` in
 * `<code>Array&lt;string&gt;</code>` are not mistaken for tags and `&amp;lt;`
 * is not decoded twice.
 *
 * @param html The HTML fragment to convert.
 * @returns The fragment as inline markdown with entities decoded.
 */
function inlineClean(html: string): string {
	// Tags must be removed before decoding, otherwise decoded `<`/`>` would be stripped as markup
	const toPlainText = (fragment: string): string =>
		decodeEntities(fragment.replace(/<br(\s[^>]*)?\/?>/gi, '\n').replace(/<[^>]+>/g, ''));

	// Group 1/2: link href and content. Group 3/5: tag name and content of the other inline elements.
	const inlineElement = /<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>|<(strong|b|em|i|code|del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\3>/gi;

	let result = '';
	let cursor = 0;
	let match: RegExpExecArray | null;
	while ((match = inlineElement.exec(html)) !== null) {
		result += toPlainText(html.slice(cursor, match.index));
		cursor = match.index + match[0].length;

		const tag: string | undefined = match[3];
		if (tag === undefined) {
			// The href is an attribute value: decode it before the scheme check
			result += sanitizeLink(decodeEntities(match[1]), inlineClean(match[2]).trim());
			continue;
		}

		const inner = match[5];
		switch (tag.toLowerCase()) {
			case 'strong':
			case 'b':
				result += `**${inlineClean(inner)}**`;
				break;
			case 'em':
			case 'i':
				result += `*${inlineClean(inner)}*`;
				break;
			case 'code':
				// Code is literal text: no nested markdown, only tag removal and decoding
				result += `\`${toPlainText(inner)}\``;
				break;
			default:
				// del, s, strike
				result += `~~${inlineClean(inner)}~~`;
				break;
		}
	}

	return result + toPlainText(html.slice(cursor));
}

/**
 * Reduces the HTML inside a code block to its plain text: `<br>` becomes a
 * newline, all other tags (such as syntax-highlighting spans) are removed,
 * and entities are decoded. Blank lines at the very start and end are
 * dropped; indentation inside the block is left untouched.
 */
function cleanCodeBlock(html: string): string {
	const withoutTags = html
		.replace(/<br\s*\/?>/gi, '\n')
		.replace(/<[^>]+>/g, '');
	// Only newlines are trimmed so that leading spaces on the first line survive
	return decodeEntities(withoutTags).replace(/^\n+|\n+$/g, '');
}

/**
 * Produce a markdown link, stripping dangerous schemes like `javascript:`.
 *
 * @param href The link target. Callers should pass it entity-decoded.
 * @param text The link text, already converted to markdown.
 * @returns `[text](href)`, or just `text` when the target uses a
 * `javascript:`, `vbscript:` or `data:` scheme.
 */
function sanitizeLink(href: string, text: string): string {
	// URL parsers ignore leading control characters and spaces as well as tabs
	// and newlines anywhere in the URL, so `java\tscript:` is still a script URL.
	const comparable = href.replace(/[\u0000-\u0020\u007f]/g, '');
	if (/^(javascript|vbscript|data):/i.test(comparable)) {
		return text;
	}

	// Whitespace, parentheses and angle brackets can end a markdown link
	// destination early, which would break the link and allow a second link to
	// be smuggled in through the href. Percent-encoding keeps the URL equivalent.
	const destination = href.trim().replace(/[\u0000-\u0020()<>]/g, char => `%${char.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0')}`);
	return `[${text}](${destination})`;
}

/**
 * Decode the most common HTML entities, including numeric character references.
 *
 * All references are decoded in a single pass, so the output of one
 * replacement is never decoded again: `&amp;lt;` yields the text `&lt;`, not
 * `<`. Unknown named references are left untouched.
 *
 * @param text Text that may contain character references.
 * @returns The text with `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`,
 * `&nbsp;` (as a regular space) and decimal/hexadecimal references decoded.
 */
function decodeEntities(text: string): string {
	return text.replace(
		/&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|(amp|lt|gt|quot|apos|nbsp));/g,
		(entity: string, hex: string | undefined, dec: string | undefined, name: string | undefined): string => {
			if (hex !== undefined) {
				return safeFromCodePoint(parseInt(hex, 16));
			}
			if (dec !== undefined) {
				return safeFromCodePoint(parseInt(dec, 10));
			}
			switch (name) {
				case 'amp': return '&';
				case 'lt': return '<';
				case 'gt': return '>';
				case 'quot': return '"';
				case 'apos': return '\'';
				case 'nbsp': return ' ';
				default: return entity;
			}
		}
	);
}

/**
 * Returns the character for a code point, or the empty string when the value
 * does not denote a character: non-integers, NUL, lone surrogates
 * (U+D800–U+DFFF) and anything above U+10FFFF.
 */
function safeFromCodePoint(code: number): string {
	const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
	if (!Number.isInteger(code) || code <= 0 || code > 0x10FFFF || isSurrogate) {
		return '';
	}
	return String.fromCodePoint(code);
}
181 changes: 181 additions & 0 deletions src/vs/base/test/common/htmlToMarkdown.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import assert from 'assert';
import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';

suite('htmlToMarkdown', () => {
	ensureNoDisposablesAreLeakedInTestSuite();

	/** Asserts that every `[html, markdown]` pair converts exactly as stated. */
	function assertConversions(cases: readonly (readonly [html: string, markdown: string])[]): void {
		for (const [html, markdown] of cases) {
			assert.strictEqual(convertHtmlToMarkdown(html), markdown, `input: ${html}`);
		}
	}

	test('converts headings', () => {
		assertConversions([
			['<h1>Title</h1>', '# Title'],
			['<h2>Subtitle</h2>', '## Subtitle'],
			['<h3>Section</h3>', '### Section'],
			['<h4>Sub-section</h4>', '#### Sub-section'],
			['<h5>Minor</h5>', '##### Minor'],
			['<h6>Smallest</h6>', '###### Smallest'],
		]);
	});

	test('converts links', () => {
		assertConversions([
			['<a href="https://example.com">Example</a>', '[Example](https://example.com)'],
		]);
	});

	test('strips dangerous schemes from links', () => {
		assertConversions([
			['<a href="javascript:alert(1)">click</a>', 'click'],
			['<a href="vbscript:run">run</a>', 'run'],
			['<a href="data:text/html,<h1>hi</h1>">data</a>', 'data'],
		]);
	});

	test('converts bold and italic', () => {
		assertConversions([
			['<strong>bold</strong>', '**bold**'],
			['<b>bold</b>', '**bold**'],
			['<em>italic</em>', '*italic*'],
			['<i>italic</i>', '*italic*'],
		]);
	});

	test('matches tag names case-insensitively', () => {
		assertConversions([
			['<STRONG>bold</STRONG>', '**bold**'],
			['<H1>Title</H1>', '# Title'],
		]);
	});

	test('converts inline code', () => {
		assertConversions([
			['<code>foo()</code>', '`foo()`'],
		]);
	});

	test('converts code blocks', () => {
		assertConversions([
			['<pre><code>const x = 1;</code></pre>', '```\nconst x = 1;\n```'],
		]);
	});

	test('converts standalone pre blocks', () => {
		assertConversions([
			['<pre>plain</pre>', '```\nplain\n```'],
		]);
	});

	test('converts syntax-highlighted code blocks by stripping inner tags', () => {
		assertConversions([
			['<pre><code><span class="kw">const</span> x = <span class="num">1</span>;</code></pre>', '```\nconst x = 1;\n```'],
		]);
	});

	test('preserves indentation in code blocks', () => {
		assertConversions([
			['<pre><code>function foo() {\n return 1;\n}</code></pre>', '```\nfunction foo() {\n return 1;\n}\n```'],
		]);
	});

	test('converts unordered lists', () => {
		assertConversions([
			['<ul><li>one</li><li>two</li><li>three</li></ul>', '- one\n- two\n- three'],
		]);
	});

	test('converts ordered lists to numbered items', () => {
		assertConversions([
			['<ol><li>first</li><li>second</li></ol>', '1. first\n2. second'],
		]);
	});

	test('converts line breaks', () => {
		assertConversions([
			['hello<br>world', 'hello\nworld'],
			['hello<br/>world', 'hello\nworld'],
		]);
	});

	test('normalises line endings', () => {
		assertConversions([
			['a\r\nb', 'a\nb'],
			['a\rb', 'a\nb'],
		]);
	});

	test('converts horizontal rules', () => {
		assertConversions([
			['above<hr>below', 'above\n---\nbelow'],
		]);
	});

	test('converts strikethrough', () => {
		assertConversions([
			['<del>removed</del>', '~~removed~~'],
			['<s>struck</s>', '~~struck~~'],
			['<strike>old</strike>', '~~old~~'],
		]);
	});

	test('converts blockquotes', () => {
		assertConversions([
			['<blockquote>quoted text</blockquote>', '> quoted text'],
		]);
	});

	test('converts images', () => {
		assertConversions([
			['<img src="https://example.com/img.png" alt="photo">', '![photo](https://example.com/img.png)'],
			['<img alt="photo" src="https://example.com/img.png">', '![photo](https://example.com/img.png)'],
			['<img src="https://example.com/img.png">', '![](https://example.com/img.png)'],
		]);
	});

	test('decodes HTML entities', () => {
		assertConversions([
			['&amp; &lt; &gt; &quot; &#39;', '& < > " \''],
			['a&nbsp;b', 'a b'],
		]);
	});

	test('leaves unknown entities untouched', () => {
		assertConversions([
			['&foo;', '&foo;'],
		]);
	});

	test('strips unknown tags', () => {
		assertConversions([
			['<span class="x">hello</span>', 'hello'],
		]);
	});

	test('handles nested inline elements', () => {
		assertConversions([
			['<strong><em>bold italic</em></strong>', '***bold italic***'],
		]);
	});

	test('handles link with bold text inside', () => {
		assertConversions([
			['<a href="https://example.com"><strong>click here</strong></a>', '[**click here**](https://example.com)'],
		]);
	});

	test('handles heading with link inside', () => {
		assertConversions([
			['<h2><a href="https://example.com">Title</a></h2>', '## [Title](https://example.com)'],
		]);
	});

	test('collapses excessive newlines', () => {
		const html = '<p>one</p><p></p><p></p><p>two</p>';
		const result = convertHtmlToMarkdown(html);
		assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines');
		assert.ok(result.includes('one'));
		assert.ok(result.includes('two'));
	});

	test('handles a realistic web page snippet', () => {
		const html = `
<h1>Getting Started</h1>
<p>Welcome to <strong>VS Code</strong>. Visit <a href="https://code.visualstudio.com">the website</a> for more info.</p>
<ul>
<li>Fast</li>
<li>Extensible</li>
</ul>
`;
		const md = convertHtmlToMarkdown(html);
		const expectedParts = [
			'# Getting Started',
			'**VS Code**',
			'[the website](https://code.visualstudio.com)',
			'- Fast',
			'- Extensible',
		];
		for (const part of expectedParts) {
			assert.ok(md.includes(part), `missing: ${part}`);
		}
	});

	test('decodes numeric HTML entities', () => {
		assertConversions([
			['&#60;tag&#62;', '<tag>'],
			['&#x3C;tag&#x3E;', '<tag>'],
			['&#8212;', '—'],
			['&#x2014;', '—'],
		]);
	});

	test('falls back to tag-stripping for very large input', () => {
		const large = '<b>' + 'x'.repeat(200_001) + '</b>';
		const result = convertHtmlToMarkdown(large);
		// Should strip tags but NOT apply markdown bold formatting
		assert.ok(!result.includes('**'));
		assert.ok(!result.includes('<b>'));
	});
});
Loading
Loading