From 317906daa252428f5134fb72a37886705dcd84bd Mon Sep 17 00:00:00 2001 From: Logan Ramos Date: Wed, 20 May 2026 15:35:42 -0400 Subject: [PATCH 1/3] Experimental support for pasting rich content into markdown input box --- src/vs/base/common/htmlToMarkdown.ts | 118 +++++++++++++++ .../base/test/common/htmlToMarkdown.test.ts | 137 ++++++++++++++++++ .../widget/input/editor/chatPasteProviders.ts | 55 ++++++- 3 files changed, 309 insertions(+), 1 deletion(-) create mode 100644 src/vs/base/common/htmlToMarkdown.ts create mode 100644 src/vs/base/test/common/htmlToMarkdown.test.ts diff --git a/src/vs/base/common/htmlToMarkdown.ts b/src/vs/base/common/htmlToMarkdown.ts new file mode 100644 index 0000000000000..2f84a90d25c6d --- /dev/null +++ b/src/vs/base/common/htmlToMarkdown.ts @@ -0,0 +1,118 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/** + * Lightweight HTML-to-Markdown converter. + * + * Handles a small set of common inline and block elements so that content + * pasted from web pages keeps its basic structure (headings, links, bold, + * italic, code, lists) when inserted into a Markdown-aware surface such as + * the chat input. 
+ */ +export function convertHtmlToMarkdown(html: string): string { + // Work on a mutable copy + let md = html; + + // Normalise line endings + md = md.replace(/\r\n?/g, '\n'); + + // --- block elements --------------------------------------------------- + + // Headings + md = md.replace(/]*>([\s\S]*?)<\/h1>/gi, (_m, inner) => `\n# ${inlineClean(inner)}\n`); + md = md.replace(/]*>([\s\S]*?)<\/h2>/gi, (_m, inner) => `\n## ${inlineClean(inner)}\n`); + md = md.replace(/]*>([\s\S]*?)<\/h3>/gi, (_m, inner) => `\n### ${inlineClean(inner)}\n`); + md = md.replace(/]*>([\s\S]*?)<\/h4>/gi, (_m, inner) => `\n#### ${inlineClean(inner)}\n`); + md = md.replace(/]*>([\s\S]*?)<\/h5>/gi, (_m, inner) => `\n##### ${inlineClean(inner)}\n`); + md = md.replace(/]*>([\s\S]*?)<\/h6>/gi, (_m, inner) => `\n###### ${inlineClean(inner)}\n`); + + // Code blocks:
+ md = md.replace(/]*>\s*]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${decodeEntities(inner).trim()}\n\`\`\`\n`); + + // Standalone
 without 
+	md = md.replace(/]*>([\s\S]*?)<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${decodeEntities(inner).trim()}\n\`\`\`\n`);
+
+	// Blockquote
+	md = md.replace(/]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner) => {
+		const lines = inlineClean(inner).split('\n').map(l => `> ${l.trim()}`);
+		return `\n${lines.join('\n')}\n`;
+	});
+
+	// List items - convert before stripping the list wrapper
+	md = md.replace(/]*>([\s\S]*?)<\/li>/gi, (_m, inner) => `- ${inlineClean(inner).trim()}\n`);
+	md = md.replace(/<\/?[ou]l[^>]*>/gi, '\n');
+
+	// Paragraphs and divs → double newline
+	md = md.replace(/<\/p>/gi, '\n\n');
+	md = md.replace(/]*>/gi, '');
+	md = md.replace(/<\/div>/gi, '\n');
+	md = md.replace(/]*>/gi, '');
+
+	// Line breaks
+	md = md.replace(//gi, '\n');
+
+	// Horizontal rules
+	md = md.replace(/]*\/?>/gi, '\n---\n');
+
+	// --- inline elements --------------------------------------------------
+
+	// Links - must come before we strip remaining tags
+	md = md.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => `[${inlineClean(text).trim()}](${href})`);
+
+	// Images
+	md = md.replace(/]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*\/?>/gi, '![$1]($2)');
+	md = md.replace(/]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '![$2]($1)');
+	md = md.replace(/]*src="([^"]*)"[^>]*\/?>/gi, '![]($1)');
+
+	// Bold / strong
+	md = md.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `**${inlineClean(inner)}**`);
+
+	// Italic / emphasis
+	md = md.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `*${inlineClean(inner)}*`);
+
+	// Inline code
+	md = md.replace(/]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner) => `\`${decodeEntities(inner)}\``);
+
+	// Strikethrough
+	md = md.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `~~${inlineClean(inner)}~~`);
+
+	// --- cleanup ----------------------------------------------------------
+
+	// Strip any remaining HTML tags
+	md = md.replace(/<[^>]+>/g, '');
+
+	// Decode common HTML entities
+	md = decodeEntities(md);
+
+	// Collapse runs of 3+ newlines into 2
+	md = md.replace(/\n{3,}/g, '\n\n');
+
+	return md.trim();
+}
+
+/**
+ * Convert an HTML fragment into inline Markdown.
+ *
+ * Links, bold, italic, inline code and strikethrough are rewritten to their
+ * Markdown forms, `<br>` becomes a newline, every remaining tag is dropped,
+ * and the supported HTML entities are decoded. The function calls itself on
+ * the content of each construct so that nested markup (for example bold text
+ * inside a link) is converted as well.
+ *
+ * @param html HTML fragment found inside a heading, list item, link, etc.
+ * @returns The fragment as inline Markdown text.
+ */
+function inlineClean(html: string): string {
+	let result = html;
+
+	// Links: the `\s` after `<a` keeps tags such as `<abbr>` or `<address>`
+	// from being mistaken for anchors. Only double-quoted `href` values match.
+	result = result.replace(/<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => `[${inlineClean(text).trim()}](${href})`);
+
+	// Emphasis: the back-reference `\1` forces the closing tag to match the
+	// opening one, and `(\s[^>]*)?>` stops `<b` from matching `<br>` or `<blockquote>`.
+	result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `**${inlineClean(inner)}**`);
+	result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `*${inlineClean(inner)}*`);
+
+	// Inline code: content is only entity-decoded, never converted to Markdown.
+	// NOTE(review): the decoded text still passes through the tag-stripping
+	// step below, so `<code>&lt;b&gt;</code>` ends up as an empty code span.
+	result = result.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner) => `\`${decodeEntities(inner)}\``);
+
+	result = result.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `~~${inlineClean(inner)}~~`);
+
+	// Line breaks (`<br>`, `<br/>`, `<br />`) become newlines.
+	result = result.replace(/<br\s*\/?>/gi, '\n');
+
+	// Whatever markup is left has no inline Markdown equivalent; drop the tags
+	// and keep their text content.
+	result = result.replace(/<[^>]+>/g, '');
+	return decodeEntities(result);
+}
+
+/**
+ * Decode the most common HTML entities: `&amp;`, `&lt;`, `&gt;`, `&quot;`,
+ * `&#39;` and `&nbsp;` (which becomes a regular space).
+ *
+ * Any other entity is left untouched.
+ *
+ * @param text Text that may contain HTML entities.
+ * @returns The text with the supported entities replaced by their characters.
+ */
+function decodeEntities(text: string): string {
+	return text
+		.replace(/&lt;/g, '<')
+		.replace(/&gt;/g, '>')
+		.replace(/&quot;/g, '"')
+		.replace(/&#39;/g, '\'')
+		.replace(/&nbsp;/g, ' ')
+		// `&amp;` is decoded last so that an escaped entity such as `&amp;lt;`
+		// yields the literal text `&lt;` instead of being decoded twice into `<`.
+		.replace(/&amp;/g, '&');
+}
diff --git a/src/vs/base/test/common/htmlToMarkdown.test.ts b/src/vs/base/test/common/htmlToMarkdown.test.ts
new file mode 100644
index 0000000000000..ff6083e571291
--- /dev/null
+++ b/src/vs/base/test/common/htmlToMarkdown.test.ts
@@ -0,0 +1,137 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+import assert from 'assert';
+import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
+import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';
+
+suite('htmlToMarkdown', () => {
+	ensureNoDisposablesAreLeakedInTestSuite();
+
+	test('converts headings', () => {
+		assert.strictEqual(convertHtmlToMarkdown('

Title

'), '# Title'); + assert.strictEqual(convertHtmlToMarkdown('

Subtitle

'), '## Subtitle'); + assert.strictEqual(convertHtmlToMarkdown('

Section

'), '### Section'); + assert.strictEqual(convertHtmlToMarkdown('

Sub-section

'), '#### Sub-section'); + assert.strictEqual(convertHtmlToMarkdown('
Minor
'), '##### Minor'); + assert.strictEqual(convertHtmlToMarkdown('
Smallest
'), '###### Smallest'); + }); + + test('converts links', () => { + assert.strictEqual( + convertHtmlToMarkdown('Example'), + '[Example](https://example.com)' + ); + }); + + test('converts bold and italic', () => { + assert.strictEqual(convertHtmlToMarkdown('bold'), '**bold**'); + assert.strictEqual(convertHtmlToMarkdown('bold'), '**bold**'); + assert.strictEqual(convertHtmlToMarkdown('italic'), '*italic*'); + assert.strictEqual(convertHtmlToMarkdown('italic'), '*italic*'); + }); + + test('converts inline code', () => { + assert.strictEqual(convertHtmlToMarkdown('foo()'), '`foo()`'); + }); + + test('converts code blocks', () => { + assert.strictEqual( + convertHtmlToMarkdown('
const x = 1;
'), + '```\nconst x = 1;\n```' + ); + }); + + test('converts unordered lists', () => { + const html = '
  • one
  • two
  • three
'; + assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three'); + }); + + test('converts ordered lists to bullet items', () => { + const html = '
  1. first
  2. second
'; + assert.strictEqual(convertHtmlToMarkdown(html), '- first\n- second'); + }); + + test('converts line breaks', () => { + assert.strictEqual(convertHtmlToMarkdown('hello
world'), 'hello\nworld'); + assert.strictEqual(convertHtmlToMarkdown('hello
world'), 'hello\nworld'); + }); + + test('converts horizontal rules', () => { + assert.strictEqual(convertHtmlToMarkdown('above
below'), 'above\n---\nbelow'); + }); + + test('converts strikethrough', () => { + assert.strictEqual(convertHtmlToMarkdown('removed'), '~~removed~~'); + assert.strictEqual(convertHtmlToMarkdown('struck'), '~~struck~~'); + }); + + test('converts blockquotes', () => { + assert.strictEqual( + convertHtmlToMarkdown('
quoted text
'), + '> quoted text' + ); + }); + + test('converts images', () => { + assert.strictEqual( + convertHtmlToMarkdown('photo'), + '![photo](https://example.com/img.png)' + ); + }); + + test('decodes HTML entities', () => { + assert.strictEqual(convertHtmlToMarkdown('& < > " ''), '& < > " \''); + }); + + test('strips unknown tags', () => { + assert.strictEqual(convertHtmlToMarkdown('hello'), 'hello'); + }); + + test('handles nested inline elements', () => { + assert.strictEqual( + convertHtmlToMarkdown('bold italic'), + '***bold italic***' + ); + }); + + test('handles link with bold text inside', () => { + assert.strictEqual( + convertHtmlToMarkdown('click here'), + '[**click here**](https://example.com)' + ); + }); + + test('handles heading with link inside', () => { + assert.strictEqual( + convertHtmlToMarkdown('

Title

'), + '## [Title](https://example.com)' + ); + }); + + test('collapses excessive newlines', () => { + const html = '

one

two

'; + const result = convertHtmlToMarkdown(html); + assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines'); + assert.ok(result.includes('one')); + assert.ok(result.includes('two')); + }); + + test('handles a realistic web page snippet', () => { + const html = ` +

Getting Started

+

Welcome to VS Code. Visit the website for more info.

+
    +
  • Fast
  • +
  • Extensible
  • +
+ `; + const md = convertHtmlToMarkdown(html); + assert.ok(md.includes('# Getting Started')); + assert.ok(md.includes('**VS Code**')); + assert.ok(md.includes('[the website](https://code.visualstudio.com)')); + assert.ok(md.includes('- Fast')); + assert.ok(md.includes('- Extensible')); + }); +}); diff --git a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts index be28fc3feedb6..341c9a03ced8e 100644 --- a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts +++ b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts @@ -5,6 +5,7 @@ import { CancellationToken } from '../../../../../../../base/common/cancellation.js'; import { Codicon } from '../../../../../../../base/common/codicons.js'; import { createStringDataTransferItem, IDataTransferItem, IReadonlyVSDataTransfer, VSDataTransfer } from '../../../../../../../base/common/dataTransfer.js'; +import { convertHtmlToMarkdown } from '../../../../../../../base/common/htmlToMarkdown.js'; import { alert } from '../../../../../../../base/browser/ui/aria/aria.js'; import { HierarchicalKind } from '../../../../../../../base/common/hierarchicalKind.js'; import { Disposable } from '../../../../../../../base/common/lifecycle.js'; @@ -15,7 +16,7 @@ import { basename, joinPath } from '../../../../../../../base/common/resources.j import { URI, UriComponents } from '../../../../../../../base/common/uri.js'; import { Position } from '../../../../../../../editor/common/core/position.js'; import { IRange } from '../../../../../../../editor/common/core/range.js'; -import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, SymbolKinds } from '../../../../../../../editor/common/languages.js'; +import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, DocumentPasteTriggerKind, 
SymbolKinds } from '../../../../../../../editor/common/languages.js'; import { ITextModel } from '../../../../../../../editor/common/model.js'; import { ILanguageFeaturesService } from '../../../../../../../editor/common/services/languageFeatures.js'; import { IModelService } from '../../../../../../../editor/common/services/model.js'; @@ -679,6 +680,57 @@ class PasteSymbolProvider implements DocumentPasteEditProvider { } } +class PasteHtmlProvider implements DocumentPasteEditProvider { + + public readonly kind = new HierarchicalKind('chat.paste.html'); + public readonly providedPasteEditKinds = [this.kind]; + + public readonly copyMimeTypes = []; + public readonly pasteMimeTypes = ['text/html']; + + async provideDocumentPasteEdits(model: ITextModel, _ranges: readonly IRange[], dataTransfer: IReadonlyVSDataTransfer, context: DocumentPasteContext, token: CancellationToken): Promise { + if (model.uri.scheme !== Schemas.vscodeChatInput) { + return; + } + + // Only activate on automatic paste, not explicit "Paste As" + if (context.triggerKind !== DocumentPasteTriggerKind.Automatic) { + return; + } + + const entry = dataTransfer.get('text/html'); + const htmlText = await entry?.asString(); + if (!htmlText || token.isCancellationRequested) { + return; + } + + // Skip if the HTML is trivially plain text (no meaningful tags) + if (!/<(a|strong|b|em|i|h[1-6]|code|pre|ul|ol|li|blockquote|del|s|strike|img)\b/i.test(htmlText)) { + return; + } + + const markdown = convertHtmlToMarkdown(htmlText); + + // If conversion produced nothing useful, fall back + if (!markdown) { + return; + } + + return { + dispose() { }, + edits: [{ + insertText: markdown, + title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'), + kind: this.kind, + yieldTo: [ + { kind: new HierarchicalKind('chat.attach.text') }, + { kind: new HierarchicalKind('chat.attach.image') }, + ], + }], + }; + } +} + export class ChatPasteProvidersFeature extends Disposable { constructor( @IInstantiationService 
instaService: IInstantiationService, @@ -694,6 +746,7 @@ export class ChatPasteProvidersFeature extends Disposable { this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(CopyAttachmentsProvider))); this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteImageProvider(chatWidgetService, extensionService, fileService, environmentService, logService))); this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteTextProvider(chatWidgetService, modelService))); + this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteHtmlProvider())); this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(PasteSymbolProvider))); this._register(languageFeaturesService.documentPasteEditProvider.register('*', instaService.createInstance(CopyTextProvider))); } From 71e9f7f3b8b15356bf80c239ba29080867927b42 Mon Sep 17 00:00:00 2001 From: Logan Ramos Date: Wed, 20 May 2026 17:14:03 -0400 Subject: [PATCH 2/3] fix comments --- src/vs/base/common/htmlToMarkdown.ts | 17 +++++++++++++++-- src/vs/base/test/common/htmlToMarkdown.test.ts | 14 ++++++++++++++ .../widget/input/editor/chatPasteProviders.ts | 7 ++++--- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/vs/base/common/htmlToMarkdown.ts b/src/vs/base/common/htmlToMarkdown.ts index 2f84a90d25c6d..bb5f6e1f990e6 100644 --- a/src/vs/base/common/htmlToMarkdown.ts +++ b/src/vs/base/common/htmlToMarkdown.ts @@ -29,10 +29,10 @@ export function convertHtmlToMarkdown(html: 
string): string { md = md.replace(/]*>([\s\S]*?)<\/h6>/gi, (_m, inner) => `\n###### ${inlineClean(inner)}\n`); // Code blocks:
- md = md.replace(/]*>\s*]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${decodeEntities(inner).trim()}\n\`\`\`\n`); + md = md.replace(/]*>\s*]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${cleanCodeBlock(inner)}\n\`\`\`\n`); // Standalone
 without 
-	md = md.replace(/]*>([\s\S]*?)<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${decodeEntities(inner).trim()}\n\`\`\`\n`);
+	md = md.replace(/]*>([\s\S]*?)<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${cleanCodeBlock(inner)}\n\`\`\`\n`);
 
 	// Blockquote
 	md = md.replace(/]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner) => {
@@ -106,6 +106,19 @@ function inlineClean(html: string): string {
 	return decodeEntities(result);
 }
 
+/** Strip tags, normalise 
, and decode entities inside a code block while preserving indentation. */ +function cleanCodeBlock(html: string): string { + let result = html; + // Normalise
to newlines + result = result.replace(//gi, '\n'); + // Strip all HTML tags (e.g. syntax-highlighting s) + result = result.replace(/<[^>]+>/g, ''); + result = decodeEntities(result); + // Trim only leading/trailing newlines, preserving indentation + result = result.replace(/^\n+|\n+$/g, ''); + return result; +} + /** Decode the most common HTML entities. */ function decodeEntities(text: string): string { return text diff --git a/src/vs/base/test/common/htmlToMarkdown.test.ts b/src/vs/base/test/common/htmlToMarkdown.test.ts index ff6083e571291..37d92cb6729c3 100644 --- a/src/vs/base/test/common/htmlToMarkdown.test.ts +++ b/src/vs/base/test/common/htmlToMarkdown.test.ts @@ -43,6 +43,20 @@ suite('htmlToMarkdown', () => { ); }); + test('converts syntax-highlighted code blocks by stripping inner tags', () => { + assert.strictEqual( + convertHtmlToMarkdown('
const x = 1;
'), + '```\nconst x = 1;\n```' + ); + }); + + test('preserves indentation in code blocks', () => { + assert.strictEqual( + convertHtmlToMarkdown('
function foo() {\n  return 1;\n}
'), + '```\nfunction foo() {\n return 1;\n}\n```' + ); + }); + test('converts unordered lists', () => { const html = '
  • one
  • two
  • three
'; assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three'); diff --git a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts index 341c9a03ced8e..b752465235a9e 100644 --- a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts +++ b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts @@ -686,7 +686,7 @@ class PasteHtmlProvider implements DocumentPasteEditProvider { public readonly providedPasteEditKinds = [this.kind]; public readonly copyMimeTypes = []; - public readonly pasteMimeTypes = ['text/html']; + public readonly pasteMimeTypes = [Mimes.html]; async provideDocumentPasteEdits(model: ITextModel, _ranges: readonly IRange[], dataTransfer: IReadonlyVSDataTransfer, context: DocumentPasteContext, token: CancellationToken): Promise { if (model.uri.scheme !== Schemas.vscodeChatInput) { @@ -698,14 +698,14 @@ class PasteHtmlProvider implements DocumentPasteEditProvider { return; } - const entry = dataTransfer.get('text/html'); + const entry = dataTransfer.get(Mimes.html); const htmlText = await entry?.asString(); if (!htmlText || token.isCancellationRequested) { return; } // Skip if the HTML is trivially plain text (no meaningful tags) - if (!/<(a|strong|b|em|i|h[1-6]|code|pre|ul|ol|li|blockquote|del|s|strike|img)\b/i.test(htmlText)) { + if (!/<(a|strong|b|em|i|h[1-6]|code|pre|ul|ol|li|blockquote|del|s|strike|img|hr)\b/i.test(htmlText)) { return; } @@ -722,6 +722,7 @@ class PasteHtmlProvider implements DocumentPasteEditProvider { insertText: markdown, title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'), kind: this.kind, + handledMimeType: Mimes.html, yieldTo: [ { kind: new HierarchicalKind('chat.attach.text') }, { kind: new HierarchicalKind('chat.attach.image') }, From 539a91ba65d78fd64169400d2d5ab515bfc7cbc0 Mon Sep 17 00:00:00 2001 From: Logan Ramos Date: Wed, 20 May 2026 17:30:27 
-0400 Subject: [PATCH 3/3] Cleanup more cases --- src/vs/base/common/htmlToMarkdown.ts | 50 ++++++++++++++++--- .../base/test/common/htmlToMarkdown.test.ts | 34 ++++++++++++- .../widget/input/editor/chatPasteProviders.ts | 28 +++++------ 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/src/vs/base/common/htmlToMarkdown.ts b/src/vs/base/common/htmlToMarkdown.ts index bb5f6e1f990e6..576873bc84448 100644 --- a/src/vs/base/common/htmlToMarkdown.ts +++ b/src/vs/base/common/htmlToMarkdown.ts @@ -11,7 +11,14 @@ * italic, code, lists) when inserted into a Markdown-aware surface such as * the chat input. */ +const maxInputLength = 200_000; + export function convertHtmlToMarkdown(html: string): string { + // Bail out on very large inputs to avoid regex backtracking cost + if (html.length > maxInputLength) { + return html.replace(/<[^>]+>/g, ''); + } + // Work on a mutable copy let md = html; @@ -40,9 +47,19 @@ export function convertHtmlToMarkdown(html: string): string { return `\n${lines.join('\n')}\n`; }); - // List items - convert before stripping the list wrapper + // Ordered list items — number them before stripping the
    wrapper + md = md.replace(/]*>([\s\S]*?)<\/ol>/gi, (_m, inner) => { + let index = 0; + const numbered = inner.replace(/]*>([\s\S]*?)<\/li>/gi, (_liM: string, liInner: string) => { + index++; + return `${index}. ${inlineClean(liInner).trim()}\n`; + }); + return `\n${numbered.replace(/<[^>]+>/g, '')}\n`; + }); + + // Unordered list items - convert before stripping the list wrapper md = md.replace(/]*>([\s\S]*?)<\/li>/gi, (_m, inner) => `- ${inlineClean(inner).trim()}\n`); - md = md.replace(/<\/?[ou]l[^>]*>/gi, '\n'); + md = md.replace(/<\/?ul[^>]*>/gi, '\n'); // Paragraphs and divs → double newline md = md.replace(/<\/p>/gi, '\n\n'); @@ -59,7 +76,7 @@ export function convertHtmlToMarkdown(html: string): string { // --- inline elements -------------------------------------------------- // Links - must come before we strip remaining tags - md = md.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => `[${inlineClean(text).trim()}](${href})`); + md = md.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => sanitizeLink(href, inlineClean(text).trim())); // Images md = md.replace(/]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*\/?>/gi, '![$1]($2)'); @@ -96,7 +113,7 @@ export function convertHtmlToMarkdown(html: string): string { function inlineClean(html: string): string { // Process nested inline elements first let result = html; - result = result.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => `[${inlineClean(text).trim()}](${href})`); + result = result.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => sanitizeLink(href, inlineClean(text).trim())); result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `**${inlineClean(inner)}**`); result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `*${inlineClean(inner)}*`); result = result.replace(/]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner) => `\`${decodeEntities(inner)}\``); 
@@ -119,7 +136,15 @@ function cleanCodeBlock(html: string): string { return result; } -/** Decode the most common HTML entities. */ +/** Produce a markdown link, stripping dangerous schemes like `javascript:`. */ +function sanitizeLink(href: string, text: string): string { + if (/^(javascript|vbscript|data):/i.test(href.trim())) { + return text; + } + return `[${text}](${href})`; +} + +/** Decode the most common HTML entities, including numeric character references. */ function decodeEntities(text: string): string { return text .replace(/&/g, '&') @@ -127,5 +152,18 @@ function decodeEntities(text: string): string { .replace(/>/g, '>') .replace(/"/g, '"') .replace(/'/g, '\'') - .replace(/ /g, ' '); + .replace(/ /g, ' ') + .replace(/&#x(?[0-9a-fA-F]+);/g, (...args) => safeFromCodePoint(parseInt(args.at(-1).hex, 16))) + .replace(/&#(?\d+);/g, (...args) => safeFromCodePoint(parseInt(args.at(-1).dec, 10))); +} + +function safeFromCodePoint(code: number): string { + if (code >= 0 && code <= 0x10FFFF) { + try { + return String.fromCodePoint(code); + } catch { + // invalid code point + } + } + return ''; } diff --git a/src/vs/base/test/common/htmlToMarkdown.test.ts b/src/vs/base/test/common/htmlToMarkdown.test.ts index 37d92cb6729c3..88a5adf30c38d 100644 --- a/src/vs/base/test/common/htmlToMarkdown.test.ts +++ b/src/vs/base/test/common/htmlToMarkdown.test.ts @@ -25,6 +25,21 @@ suite('htmlToMarkdown', () => { ); }); + test('strips dangerous schemes from links', () => { + assert.strictEqual( + convertHtmlToMarkdown('click'), + 'click' + ); + assert.strictEqual( + convertHtmlToMarkdown('run'), + 'run' + ); + assert.strictEqual( + convertHtmlToMarkdown('data'), + 'data' + ); + }); + test('converts bold and italic', () => { assert.strictEqual(convertHtmlToMarkdown('bold'), '**bold**'); assert.strictEqual(convertHtmlToMarkdown('bold'), '**bold**'); @@ -62,9 +77,9 @@ suite('htmlToMarkdown', () => { assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three'); }); - 
test('converts ordered lists to bullet items', () => { + test('converts ordered lists to numbered items', () => { const html = '
    1. first
    2. second
    '; - assert.strictEqual(convertHtmlToMarkdown(html), '- first\n- second'); + assert.strictEqual(convertHtmlToMarkdown(html), '1. first\n2. second'); }); test('converts line breaks', () => { @@ -148,4 +163,19 @@ suite('htmlToMarkdown', () => { assert.ok(md.includes('- Fast')); assert.ok(md.includes('- Extensible')); }); + + test('decodes numeric HTML entities', () => { + assert.strictEqual(convertHtmlToMarkdown('<tag>'), ''); + assert.strictEqual(convertHtmlToMarkdown('<tag>'), ''); + assert.strictEqual(convertHtmlToMarkdown('—'), '—'); + assert.strictEqual(convertHtmlToMarkdown('—'), '—'); + }); + + test('falls back to tag-stripping for very large input', () => { + const large = '' + 'x'.repeat(200_001) + ''; + const result = convertHtmlToMarkdown(large); + // Should strip tags but NOT apply markdown bold formatting + assert.ok(!result.includes('**')); + assert.ok(!result.includes('')); + }); }); diff --git a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts index b752465235a9e..450e51de4c29d 100644 --- a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts +++ b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts @@ -2,11 +2,11 @@ * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See License.txt in the project root for license information. 
*--------------------------------------------------------------------------------------------*/ +import { alert } from '../../../../../../../base/browser/ui/aria/aria.js'; import { CancellationToken } from '../../../../../../../base/common/cancellation.js'; import { Codicon } from '../../../../../../../base/common/codicons.js'; import { createStringDataTransferItem, IDataTransferItem, IReadonlyVSDataTransfer, VSDataTransfer } from '../../../../../../../base/common/dataTransfer.js'; import { convertHtmlToMarkdown } from '../../../../../../../base/common/htmlToMarkdown.js'; -import { alert } from '../../../../../../../base/browser/ui/aria/aria.js'; import { HierarchicalKind } from '../../../../../../../base/common/hierarchicalKind.js'; import { Disposable } from '../../../../../../../base/common/lifecycle.js'; import { revive } from '../../../../../../../base/common/marshalling.js'; @@ -693,7 +693,8 @@ class PasteHtmlProvider implements DocumentPasteEditProvider { return; } - // Only activate on automatic paste, not explicit "Paste As" + // Only activate on automatic paste — for explicit "Paste As" the user + // likely wants the raw text or an attachment, not a converted markdown form. if (context.triggerKind !== DocumentPasteTriggerKind.Automatic) { return; } @@ -716,19 +717,16 @@ class PasteHtmlProvider implements DocumentPasteEditProvider { return; } - return { - dispose() { }, - edits: [{ - insertText: markdown, - title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'), - kind: this.kind, - handledMimeType: Mimes.html, - yieldTo: [ - { kind: new HierarchicalKind('chat.attach.text') }, - { kind: new HierarchicalKind('chat.attach.image') }, - ], - }], - }; + return createEditSession({ + insertText: markdown, + title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'), + kind: this.kind, + handledMimeType: Mimes.html, + yieldTo: [ + { kind: new HierarchicalKind('chat.attach.text') }, + { kind: new HierarchicalKind('chat.attach.image') }, + ], + }); } }