diff --git a/src/vs/base/common/htmlToMarkdown.ts b/src/vs/base/common/htmlToMarkdown.ts new file mode 100644 index 0000000000000..576873bc84448 --- /dev/null +++ b/src/vs/base/common/htmlToMarkdown.ts @@ -0,0 +1,169 @@ +/*--------------------------------------------------------------------------------------------- + * Copyright (c) Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See License.txt in the project root for license information. + *--------------------------------------------------------------------------------------------*/ + +/** + * Lightweight HTML-to-Markdown converter. + * + * Handles a small set of common inline and block elements so that content + * pasted from web pages keeps its basic structure (headings, links, bold, + * italic, code, lists) when inserted into a Markdown-aware surface such as + * the chat input. + */ +const maxInputLength = 200_000; + +export function convertHtmlToMarkdown(html: string): string { + // Bail out on very large inputs to avoid regex backtracking cost + if (html.length > maxInputLength) { + return html.replace(/<[^>]+>/g, ''); + } + + // Work on a mutable copy + let md = html; + + // Normalise line endings + md = md.replace(/\r\n?/g, '\n'); + + // --- block elements --------------------------------------------------- + + // Headings + md = md.replace(/
…
+ md = md.replace(/]*>\s*]*>([\s\S]*?)<\/code>\s*<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${cleanCodeBlock(inner)}\n\`\`\`\n`);
+
+ // Standalone without
+ md = md.replace(/]*>([\s\S]*?)<\/pre>/gi, (_m, inner) => `\n\`\`\`\n${cleanCodeBlock(inner)}\n\`\`\`\n`);
+
+ // Blockquote
+ md = md.replace(/]*>([\s\S]*?)<\/blockquote>/gi, (_m, inner) => {
+ const lines = inlineClean(inner).split('\n').map(l => `> ${l.trim()}`);
+ return `\n${lines.join('\n')}\n`;
+ });
+
+ // Ordered list items — number them before stripping the wrapper
+ md = md.replace(/]*>([\s\S]*?)<\/ol>/gi, (_m, inner) => {
+ let index = 0;
+ const numbered = inner.replace(/- ]*>([\s\S]*?)<\/li>/gi, (_liM: string, liInner: string) => {
+ index++;
+ return `${index}. ${inlineClean(liInner).trim()}\n`;
+ });
+ return `\n${numbered.replace(/<[^>]+>/g, '')}\n`;
+ });
+
+ // Unordered list items - convert before stripping the list wrapper
+ md = md.replace(/
- ]*>([\s\S]*?)<\/li>/gi, (_m, inner) => `- ${inlineClean(inner).trim()}\n`);
+ md = md.replace(/<\/?ul[^>]*>/gi, '\n');
+
+ // Paragraphs and divs → double newline
+ md = md.replace(/<\/p>/gi, '\n\n');
+ md = md.replace(/
]*>/gi, '');
+ md = md.replace(/<\/div>/gi, '\n');
+ md = md.replace(/
]*>/gi, '');
+
+ // Line breaks
+ md = md.replace(/
/gi, '\n');
+
+ // Horizontal rules
+ md = md.replace(/
]*\/?>/gi, '\n---\n');
+
+ // --- inline elements --------------------------------------------------
+
+ // Links - must come before we strip remaining tags
+ md = md.replace(/]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => sanitizeLink(href, inlineClean(text).trim()));
+
+ // Images
+ md = md.replace(/
]*alt="([^"]*)"[^>]*src="([^"]*)"[^>]*\/?>/gi, '');
+ md = md.replace(/
]*src="([^"]*)"[^>]*alt="([^"]*)"[^>]*\/?>/gi, '');
+ md = md.replace(/
]*src="([^"]*)"[^>]*\/?>/gi, '');
+
+ // Bold / strong
+ md = md.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `**${inlineClean(inner)}**`);
+
+ // Italic / emphasis
+ md = md.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `*${inlineClean(inner)}*`);
+
+ // Inline code
+ md = md.replace(/]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner) => `\`${decodeEntities(inner)}\``);
+
+ // Strikethrough
+ md = md.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `~~${inlineClean(inner)}~~`);
+
+ // --- cleanup ----------------------------------------------------------
+
+ // Strip any remaining HTML tags
+ md = md.replace(/<[^>]+>/g, '');
+
+ // Decode common HTML entities
+ md = decodeEntities(md);
+
+ // Collapse runs of 3+ newlines into 2
+ md = md.replace(/\n{3,}/g, '\n\n');
+
+ return md.trim();
+}
+
+/**
+ * Convert the inline markup inside `html` to markdown and strip every other tag.
+ *
+ * Links, bold, italic and strikethrough recurse into their own content so that
+ * nested inline elements are converted as well. The content of `<code>` is only
+ * entity-decoded. `<br>` becomes a newline, any tag still left afterwards is
+ * removed, and the resulting string is entity-decoded.
+ *
+ * @param html HTML fragment found inside a block or inline element.
+ * @returns The fragment as inline markdown without HTML tags.
+ */
+function inlineClean(html: string): string {
+	let result = html;
+
+	// Nested inline elements are converted before the remaining tags are stripped.
+	// NOTE(review): the opening-tag part of the link pattern was restored from a truncated literal - confirm against the intended pattern.
+	result = result.replace(/<a\s[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_m, href, text) => sanitizeLink(href, inlineClean(text).trim()));
+	result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `**${inlineClean(inner)}**`);
+	result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `*${inlineClean(inner)}*`);
+	// Code content is not recursed into: markup inside backticks would be shown literally
+	result = result.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner) => `\`${decodeEntities(inner)}\``);
+	result = result.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner) => `~~${inlineClean(inner)}~~`);
+
+	// Line breaks, accepting <br>, <br/> and <br />
+	result = result.replace(/<br\s*\/?>/gi, '\n');
+
+	// Whatever tag is left has no inline markdown equivalent
+	result = result.replace(/<[^>]+>/g, '');
+	return decodeEntities(result);
+}
+
+/**
+ * Turn the inner HTML of a code block into plain text.
+ *
+ * `<br>` is normalised to a newline, all tags are stripped, entities are
+ * decoded and leading/trailing newlines are removed. Spaces and tabs are never
+ * trimmed, so indentation is preserved.
+ *
+ * @param html Inner HTML of a `<pre>` or `<pre><code>` element.
+ * @returns The code as plain text.
+ */
+function cleanCodeBlock(html: string): string {
+	let result = html;
+	// Normalise <br>, <br/> and <br /> to newlines
+	result = result.replace(/<br\s*\/?>/gi, '\n');
+	// Strip all HTML tags (e.g. syntax-highlighting <span>s)
+	result = result.replace(/<[^>]+>/g, '');
+	result = decodeEntities(result);
+	// Trim only leading/trailing newlines, preserving indentation
+	result = result.replace(/^\n+|\n+$/g, '');
+	return result;
+}
+
+/**
+ * Produce a markdown link, or only the link text when the target is unsafe.
+ *
+ * The `href` is checked in the form it has in the HTML source, i.e. before
+ * entity decoding, while the converted output is entity-decoded afterwards.
+ * A scheme that is spelled with a character reference (`javascript&#58;…`)
+ * would therefore only become visible after this check, so such targets are
+ * rejected as well.
+ *
+ * @param href Raw value of the `href` attribute.
+ * @param text Markdown to use as the link text.
+ * @returns `[text](href)`, or `text` alone when `href` uses the `javascript:`,
+ * `vbscript:` or `data:` scheme or contains `&` before its first `:`, `/`, `?` or `#`.
+ */
+function sanitizeLink(href: string, text: string): string {
+	// Whitespace and control characters are ignored for the check so they cannot disguise a scheme
+	const compact = href.replace(/[\u0000-\u0020\u007F]+/g, '');
+
+	// Everything before the first `:`, `/`, `?` or `#` is where a scheme could be located
+	const schemeEnd = compact.search(/[:/?#]/);
+	const schemeRegion = schemeEnd === -1 ? compact : compact.slice(0, schemeEnd);
+
+	// `&` in that region means part of the scheme (or its colon) is written as a character reference
+	const hasEncodedScheme = schemeRegion.includes('&');
+
+	if (hasEncodedScheme || /^(?:javascript|vbscript|data):/i.test(compact)) {
+		return text;
+	}
+	return `[${text}](${href})`;
+}
+
+/**
+ * Decode the most common HTML entities, including numeric character references.
+ *
+ * Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&nbsp;` and decimal / hexadecimal
+ * references such as `&#39;` or `&#x2014;`. Everything is decoded in a single
+ * pass, so an escaped reference like `&amp;lt;` yields the text `&lt;` instead
+ * of being decoded a second time into `<`.
+ *
+ * @param text Text that may contain entity references.
+ * @returns The text with the supported references replaced; unsupported
+ * references are left untouched.
+ */
+function decodeEntities(text: string): string {
+	return text.replace(/&(?:(amp|lt|gt|quot|nbsp)|#[xX]([0-9a-fA-F]+)|#(\d+));/g, (_match: string, name: string | undefined, hex: string | undefined, dec: string | undefined) => {
+		switch (name) {
+			case 'amp': return '&';
+			case 'lt': return '<';
+			case 'gt': return '>';
+			case 'quot': return '"';
+			// NOTE(review): mapped to a regular space, assuming that is what the original replacement was - confirm
+			case 'nbsp': return ' ';
+		}
+		if (hex !== undefined) {
+			return safeFromCodePoint(parseInt(hex, 16));
+		}
+		return safeFromCodePoint(parseInt(dec ?? '', 10));
+	});
+}
+
+/**
+ * Convert the value of a numeric character reference to its character.
+ *
+ * @param code Code point to convert.
+ * @returns The character for `code`, or an empty string when `code` is not an
+ * integer, lies outside `0..0x10FFFF`, or is a surrogate (`0xD800..0xDFFF`).
+ * Surrogates are rejected because on their own they produce an ill-formed string.
+ */
+function safeFromCodePoint(code: number): string {
+	const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
+	if (!Number.isInteger(code) || code < 0 || code > 0x10FFFF || isSurrogate) {
+		return '';
+	}
+	// Cannot throw: `code` is an integer inside the valid code point range
+	return String.fromCodePoint(code);
+}
diff --git a/src/vs/base/test/common/htmlToMarkdown.test.ts b/src/vs/base/test/common/htmlToMarkdown.test.ts
new file mode 100644
index 0000000000000..88a5adf30c38d
--- /dev/null
+++ b/src/vs/base/test/common/htmlToMarkdown.test.ts
@@ -0,0 +1,181 @@
+/*---------------------------------------------------------------------------------------------
+ * Copyright (c) Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+import assert from 'assert';
+import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
+import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';
+
+// Tests for convertHtmlToMarkdown: one test per supported element, followed by
+// nesting, cleanup and fallback behaviour.
+// NOTE(review): the HTML inputs were rebuilt to match the expected values - confirm against the intended fixtures.
+suite('htmlToMarkdown', () => {
+	ensureNoDisposablesAreLeakedInTestSuite();
+
+	test('converts headings', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<h1>Title</h1>'), '# Title');
+		assert.strictEqual(convertHtmlToMarkdown('<h2>Subtitle</h2>'), '## Subtitle');
+		assert.strictEqual(convertHtmlToMarkdown('<h3>Section</h3>'), '### Section');
+		assert.strictEqual(convertHtmlToMarkdown('<h4>Sub-section</h4>'), '#### Sub-section');
+		assert.strictEqual(convertHtmlToMarkdown('<h5>Minor</h5>'), '##### Minor');
+		assert.strictEqual(convertHtmlToMarkdown('<h6>Smallest</h6>'), '###### Smallest');
+	});
+
+	test('converts links', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="https://example.com">Example</a>'),
+			'[Example](https://example.com)'
+		);
+	});
+
+	test('strips dangerous schemes from links', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="javascript:alert(1)">click</a>'),
+			'click'
+		);
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="vbscript:run">run</a>'),
+			'run'
+		);
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="data:text/html,hi">data</a>'),
+			'data'
+		);
+	});
+
+	test('converts bold and italic', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<strong>bold</strong>'), '**bold**');
+		assert.strictEqual(convertHtmlToMarkdown('<b>bold</b>'), '**bold**');
+		assert.strictEqual(convertHtmlToMarkdown('<em>italic</em>'), '*italic*');
+		assert.strictEqual(convertHtmlToMarkdown('<i>italic</i>'), '*italic*');
+	});
+
+	test('converts inline code', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<code>foo()</code>'), '`foo()`');
+	});
+
+	test('converts code blocks', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code>const x = 1;</code></pre>'),
+			'```\nconst x = 1;\n```'
+		);
+	});
+
+	test('converts syntax-highlighted code blocks by stripping inner tags', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code><span class="keyword">const</span> x = <span class="number">1</span>;</code></pre>'),
+			'```\nconst x = 1;\n```'
+		);
+	});
+
+	test('preserves indentation in code blocks', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<pre><code>function foo() {\n    return 1;\n}</code></pre>'),
+			'```\nfunction foo() {\n    return 1;\n}\n```'
+		);
+	});
+
+	test('converts unordered lists', () => {
+		const html = '<ul><li>one</li><li>two</li><li>three</li></ul>';
+		assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three');
+	});
+
+	test('converts ordered lists to numbered items', () => {
+		const html = '<ol><li>first</li><li>second</li></ol>';
+		assert.strictEqual(convertHtmlToMarkdown(html), '1. first\n2. second');
+	});
+
+	test('converts line breaks', () => {
+		assert.strictEqual(convertHtmlToMarkdown('hello<br>world'), 'hello\nworld');
+		assert.strictEqual(convertHtmlToMarkdown('hello<br/>world'), 'hello\nworld');
+	});
+
+	test('converts horizontal rules', () => {
+		assert.strictEqual(convertHtmlToMarkdown('above<hr>below'), 'above\n---\nbelow');
+	});
+
+	test('converts strikethrough', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<del>removed</del>'), '~~removed~~');
+		assert.strictEqual(convertHtmlToMarkdown('<s>struck</s>'), '~~struck~~');
+	});
+
+	test('converts blockquotes', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<blockquote>quoted text</blockquote>'),
+			'> quoted text'
+		);
+	});
+
+	test('converts images', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<img src="https://example.com/img.png" alt="photo">'),
+			'![photo](https://example.com/img.png)'
+		);
+	});
+
+	test('decodes HTML entities', () => {
+		assert.strictEqual(convertHtmlToMarkdown('&amp; &lt; &gt; &quot; &#39;'), '& < > " \'');
+	});
+
+	test('strips unknown tags', () => {
+		assert.strictEqual(convertHtmlToMarkdown('<span class="x">hello</span>'), 'hello');
+	});
+
+	test('handles nested inline elements', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<strong><em>bold italic</em></strong>'),
+			'***bold italic***'
+		);
+	});
+
+	test('handles link with bold text inside', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<a href="https://example.com"><strong>click here</strong></a>'),
+			'[**click here**](https://example.com)'
+		);
+	});
+
+	test('handles heading with link inside', () => {
+		assert.strictEqual(
+			convertHtmlToMarkdown('<h2><a href="https://example.com">Title</a></h2>'),
+			'## [Title](https://example.com)'
+		);
+	});
+
+	test('collapses excessive newlines', () => {
+		const html = '<p>one</p><p></p><p></p><p>two</p>';
+		const result = convertHtmlToMarkdown(html);
+		assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines');
+		assert.ok(result.includes('one'));
+		assert.ok(result.includes('two'));
+	});
+
+	test('handles a realistic web page snippet', () => {
+		const html = `
+			<h1>Getting Started</h1>
+			<p>Welcome to <strong>VS Code</strong>. Visit <a href="https://code.visualstudio.com">the website</a> for more info.</p>
+			<ul>
+				<li>Fast</li>
+				<li>Extensible</li>
+			</ul>
+		`;
+		const md = convertHtmlToMarkdown(html);
+		assert.ok(md.includes('# Getting Started'));
+		assert.ok(md.includes('**VS Code**'));
+		assert.ok(md.includes('[the website](https://code.visualstudio.com)'));
+		assert.ok(md.includes('- Fast'));
+		assert.ok(md.includes('- Extensible'));
+	});
+
+	test('decodes numeric HTML entities', () => {
+		assert.strictEqual(convertHtmlToMarkdown('&#60;tag&#62;'), '<tag>');
+		assert.strictEqual(convertHtmlToMarkdown('&#x3C;tag&#x3E;'), '<tag>');
+		assert.strictEqual(convertHtmlToMarkdown('&#8212;'), '—');
+		assert.strictEqual(convertHtmlToMarkdown('&#x2014;'), '—');
+	});
+
+	test('falls back to tag-stripping for very large input', () => {
+		const large = '<b>' + 'x'.repeat(200_001) + '</b>';
+		const result = convertHtmlToMarkdown(large);
+		// Should strip tags but NOT apply markdown bold formatting
+		assert.ok(!result.includes('**'));
+		assert.ok(!result.includes('<b>'));
+	});
+});
diff --git a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts
index be28fc3feedb6..450e51de4c29d 100644
--- a/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts
+++ b/src/vs/workbench/contrib/chat/browser/widget/input/editor/chatPasteProviders.ts
@@ -2,10 +2,11 @@
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
+import { alert } from '../../../../../../../base/browser/ui/aria/aria.js';
import { CancellationToken } from '../../../../../../../base/common/cancellation.js';
import { Codicon } from '../../../../../../../base/common/codicons.js';
import { createStringDataTransferItem, IDataTransferItem, IReadonlyVSDataTransfer, VSDataTransfer } from '../../../../../../../base/common/dataTransfer.js';
-import { alert } from '../../../../../../../base/browser/ui/aria/aria.js';
+import { convertHtmlToMarkdown } from '../../../../../../../base/common/htmlToMarkdown.js';
import { HierarchicalKind } from '../../../../../../../base/common/hierarchicalKind.js';
import { Disposable } from '../../../../../../../base/common/lifecycle.js';
import { revive } from '../../../../../../../base/common/marshalling.js';
@@ -15,7 +16,7 @@ import { basename, joinPath } from '../../../../../../../base/common/resources.j
import { URI, UriComponents } from '../../../../../../../base/common/uri.js';
import { Position } from '../../../../../../../editor/common/core/position.js';
import { IRange } from '../../../../../../../editor/common/core/range.js';
-import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, SymbolKinds } from '../../../../../../../editor/common/languages.js';
+import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, DocumentPasteTriggerKind, SymbolKinds } from '../../../../../../../editor/common/languages.js';
import { ITextModel } from '../../../../../../../editor/common/model.js';
import { ILanguageFeaturesService } from '../../../../../../../editor/common/services/languageFeatures.js';
import { IModelService } from '../../../../../../../editor/common/services/model.js';
@@ -679,6 +680,56 @@ class PasteSymbolProvider implements DocumentPasteEditProvider {
}
}
+/**
+ * Paste provider for the chat input that turns HTML clipboard content into markdown.
+ *
+ * It reads the `Mimes.html` entry of the data transfer, converts it with
+ * `convertHtmlToMarkdown` and offers the result as a "Paste as Markdown" edit.
+ */
+class PasteHtmlProvider implements DocumentPasteEditProvider {
+
+	public readonly kind = new HierarchicalKind('chat.paste.html');
+	public readonly providedPasteEditKinds = [this.kind];
+
+	public readonly copyMimeTypes = [];
+	public readonly pasteMimeTypes = [Mimes.html];
+
+	/**
+	 * Build the markdown paste edit for an HTML paste.
+	 *
+	 * @returns The edit session, or `undefined` when the model is not a chat
+	 * input, the paste was not triggered automatically, there is no HTML text,
+	 * the request was cancelled, the HTML contains none of the convertible
+	 * tags, or the conversion result is empty.
+	 */
+	async provideDocumentPasteEdits(model: ITextModel, _ranges: readonly IRange[], dataTransfer: IReadonlyVSDataTransfer, context: DocumentPasteContext, token: CancellationToken): Promise<DocumentPasteEditsSession | undefined> {
+		if (model.uri.scheme !== Schemas.vscodeChatInput) {
+			return;
+		}
+
+		// Only activate on automatic paste — for explicit "Paste As" the user
+		// likely wants the raw text or an attachment, not a converted markdown form.
+		if (context.triggerKind !== DocumentPasteTriggerKind.Automatic) {
+			return;
+		}
+
+		const entry = dataTransfer.get(Mimes.html);
+		const htmlText = await entry?.asString();
+		if (!htmlText || token.isCancellationRequested) {
+			return;
+		}
+
+		// Skip if the HTML is trivially plain text (no meaningful tags)
+		if (!/<(a|strong|b|em|i|h[1-6]|code|pre|ul|ol|li|blockquote|del|s|strike|img|hr)\b/i.test(htmlText)) {
+			return;
+		}
+
+		const markdown = convertHtmlToMarkdown(htmlText);
+
+		// If conversion produced nothing useful, fall back
+		if (!markdown) {
+			return;
+		}
+
+		return createEditSession({
+			insertText: markdown,
+			title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'),
+			kind: this.kind,
+			handledMimeType: Mimes.html,
+			// Text and image attachment edits are listed as edits to yield to
+			yieldTo: [
+				{ kind: new HierarchicalKind('chat.attach.text') },
+				{ kind: new HierarchicalKind('chat.attach.image') },
+			],
+		});
+	}
+}
+
export class ChatPasteProvidersFeature extends Disposable {
constructor(
@IInstantiationService instaService: IInstantiationService,
@@ -694,6 +745,7 @@ export class ChatPasteProvidersFeature extends Disposable {
this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(CopyAttachmentsProvider)));
this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteImageProvider(chatWidgetService, extensionService, fileService, environmentService, logService)));
this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteTextProvider(chatWidgetService, modelService)));
+ this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteHtmlProvider()));
this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(PasteSymbolProvider)));
this._register(languageFeaturesService.documentPasteEditProvider.register('*', instaService.createInstance(CopyTextProvider)));
}