/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

/**
 * Lightweight HTML-to-Markdown converter.
 *
 * Handles a small set of common inline and block elements so that content
 * pasted from web pages keeps its basic structure (headings, links, bold,
 * italic, code, lists) when inserted into a Markdown-aware surface such as
 * the chat input.
 *
 * The converter is regex based and intentionally not a full HTML parser.
 * Character references are decoded exactly once, as the very last step before
 * whitespace cleanup. Decoding earlier would turn `&lt;String&gt;` inside a
 * code sample into `<String>`, which the final tag-stripping pass would then
 * delete.
 */

/** Inputs longer than this many characters only have their tags stripped. */
const maxInputLength = 200_000;

type AttributeName = 'href' | 'src' | 'alt';

/** Builds a matcher for `name="v"`, `name='v'` or `name=v` inside a tag's attribute text. */
function attributePattern(name: AttributeName): RegExp {
	return new RegExp(`(?:^|\\s)${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s"'>]+))`, 'i');
}

// Non-global on purpose: `exec` on these carries no `lastIndex` state between calls.
const attributePatterns: Readonly<Record<AttributeName, RegExp>> = {
	href: attributePattern('href'),
	src: attributePattern('src'),
	alt: attributePattern('alt'),
};

/** Named character references understood by {@link decodeEntities}. */
const namedEntities: Readonly<Record<string, string>> = {
	amp: '&',
	lt: '<',
	gt: '>',
	quot: '"',
	apos: '\'',
	nbsp: ' ',
};

/**
 * Converts an HTML fragment to Markdown.
 *
 * @param html The HTML fragment, typically the `text/html` clipboard payload.
 * @returns The Markdown text, trimmed, with runs of 3+ newlines collapsed to 2.
 * Inputs longer than {@link maxInputLength} are returned with tags stripped
 * and no other conversion (entities are left untouched in that case).
 */
export function convertHtmlToMarkdown(html: string): string {
	// Bail out on very large inputs to avoid regex backtracking cost
	if (html.length > maxInputLength) {
		return stripTags(html);
	}

	let md = html
		// Normalise line endings
		.replace(/\r\n?/g, '\n')
		// A non-breaking space carries no markup meaning; turning it into a plain
		// space up front lets the `.trim()` calls below treat it as whitespace.
		.replace(/&nbsp;/g, ' ');

	// --- block elements ---------------------------------------------------

	// Headings: <h1>…</h1> through <h6>…</h6>
	md = md.replace(/<h([1-6])(\s[^>]*)?>([\s\S]*?)<\/h\1>/gi,
		(_m, level: string, _attrs, inner: string) => `\n${'#'.repeat(Number(level))} ${inlineClean(inner)}\n`);

	// Code blocks: <pre><code>…</code></pre>
	md = md.replace(/<pre(\s[^>]*)?>\s*<code(\s[^>]*)?>([\s\S]*?)<\/code>\s*<\/pre>/gi,
		(_m, _preAttrs, _codeAttrs, inner: string) => fencedCodeBlock(inner));

	// Standalone <pre> without <code>
	md = md.replace(/<pre(\s[^>]*)?>([\s\S]*?)<\/pre>/gi,
		(_m, _attrs, inner: string) => fencedCodeBlock(inner));

	// Blockquote
	md = md.replace(/<blockquote(\s[^>]*)?>([\s\S]*?)<\/blockquote>/gi, (_m, _attrs, inner: string) => {
		const lines = inlineClean(inner).split('\n').map(l => `> ${l.trim()}`);
		return `\n${lines.join('\n')}\n`;
	});

	// Ordered list items - number them before stripping the <ol> wrapper
	md = md.replace(/<ol(\s[^>]*)?>([\s\S]*?)<\/ol>/gi, (_m, _attrs, inner: string) => {
		let index = 0;
		const numbered = inner.replace(/<li(\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_li, _liAttrs, liInner: string) => {
			index++;
			return `${index}. ${inlineClean(liInner).trim()}\n`;
		});
		return `\n${stripTags(numbered)}\n`;
	});

	// Unordered list items - convert before stripping the list wrapper
	md = md.replace(/<li(\s[^>]*)?>([\s\S]*?)<\/li>/gi, (_m, _attrs, inner: string) => `- ${inlineClean(inner).trim()}\n`);
	md = md.replace(/<\/?ul(\s[^>]*)?>/gi, '\n');

	// Paragraphs and divs -> newlines
	md = md.replace(/<\/p>/gi, '\n\n');
	md = md.replace(/<p(\s[^>]*)?>/gi, '');
	md = md.replace(/<\/div>/gi, '\n');
	md = md.replace(/<div(\s[^>]*)?>/gi, '');

	// Line breaks
	md = md.replace(/<br(\s[^>]*)?\/?>/gi, '\n');

	// Horizontal rules
	md = md.replace(/<hr(\s[^>]*)?\/?>/gi, '\n---\n');

	// --- inline elements --------------------------------------------------

	// Links - must come before we strip remaining tags
	md = convertAnchors(md);

	// Images - before bold/italic so an image nested in <b>/<i> is not stripped
	md = md.replace(/<img(\s[^>]*)?>/gi, (_m, attrs: string | undefined) => convertImage(attrs));

	// Bold, italic, inline code, strikethrough
	md = convertEmphasisAndCode(md);

	// --- cleanup ----------------------------------------------------------

	// Strip any remaining HTML tags. This must run BEFORE entities are decoded,
	// otherwise text such as `&lt;tag&gt;` would be decoded and then removed.
	md = stripTags(md);

	// Decode HTML entities - the only place in the pipeline where this happens
	md = decodeEntities(md);

	// Collapse runs of 3+ newlines into 2
	md = md.replace(/\n{3,}/g, '\n\n');

	return md.trim();
}

/** Removes every `<...>` sequence from the text. */
function stripTags(html: string): string {
	return html.replace(/<[^>]+>/g, '');
}

/** Wraps the cleaned content of a `<pre>` element in a fenced code block. */
function fencedCodeBlock(inner: string): string {
	return `\n\`\`\`\n${cleanCodeBlock(inner)}\n\`\`\`\n`;
}

/**
 * Converts nested inline elements and strips every other tag, for use inside
 * an inline markdown construct (heading, list item, link text, ...).
 * Entities are left encoded; the caller's final pass decodes them.
 */
function inlineClean(html: string): string {
	let result = convertAnchors(html);
	result = convertEmphasisAndCode(result);
	result = result.replace(/<br(\s[^>]*)?\/?>/gi, '\n');
	return stripTags(result);
}

/**
 * Converts `<a href="…">text</a>` to `[text](href)`. Anchors without an
 * `href` are returned untouched; their tags are removed by a later strip pass.
 */
function convertAnchors(html: string): string {
	return html.replace(/<a(\s[^>]*)?>([\s\S]*?)<\/a>/gi, (match: string, attrs: string | undefined, text: string) => {
		const href = readAttribute(attrs, 'href');
		return href === undefined ? match : sanitizeLink(href, inlineClean(text).trim());
	});
}

/**
 * Converts the attribute text of an `<img>` tag to `![alt](src)`.
 * Images without `src` produce nothing; images whose `src` uses a dangerous
 * scheme are reduced to their alt text.
 */
function convertImage(attrs: string | undefined): string {
	const src = readAttribute(attrs, 'src');
	if (src === undefined) {
		return '';
	}
	const alt = readAttribute(attrs, 'alt') ?? '';
	return isDangerousUrl(src) ? alt : `![${alt}](${src})`;
}

/** Converts bold, italic, inline code and strikethrough elements. */
function convertEmphasisAndCode(html: string): string {
	let result = html;
	// Bold / strong
	result = result.replace(/<(strong|b)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `**${inlineClean(inner)}**`);
	// Italic / emphasis
	result = result.replace(/<(em|i)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `*${inlineClean(inner)}*`);
	// Inline code - content stays entity-encoded until the final decode
	result = result.replace(/<code(\s[^>]*)?>([\s\S]*?)<\/code>/gi, (_m, _attrs, inner: string) => `\`${inner}\``);
	// Strikethrough
	result = result.replace(/<(del|s|strike)(\s[^>]*)?>([\s\S]*?)<\/\1>/gi, (_m, _tag, _attrs, inner: string) => `~~${inlineClean(inner)}~~`);
	return result;
}

/**
 * Strips tags (e.g. syntax-highlighting `<span>`s) and normalises `<br>` to
 * newlines inside a code block while preserving indentation. Entities are
 * left encoded; the caller's final pass decodes them.
 */
function cleanCodeBlock(html: string): string {
	const withNewlines = html.replace(/<br(\s[^>]*)?\/?>/gi, '\n');
	// Trim only leading/trailing newlines, preserving indentation
	return stripTags(withNewlines).replace(/^\n+|\n+$/g, '');
}

/**
 * Reads an attribute value from the raw attribute text of a tag.
 *
 * @returns The raw (still entity-encoded) value, or `undefined` when the
 * attribute is not present.
 */
function readAttribute(attrs: string | undefined, name: AttributeName): string | undefined {
	if (!attrs) {
		return undefined;
	}
	const match = attributePatterns[name].exec(attrs);
	if (!match) {
		return undefined;
	}
	return match[1] ?? match[2] ?? match[3];
}

/**
 * Whether the URL uses a scheme that must not end up in a markdown link.
 * The check runs on the decoded URL with control characters and whitespace
 * removed, so `&#106;avascript:` and `java&#9;script:` are recognised too.
 */
function isDangerousUrl(url: string): boolean {
	const normalized = decodeEntities(url).replace(/[\u0000-\u0020\u007F]+/g, '');
	return /^(javascript|vbscript|data):/i.test(normalized);
}

/** Produce a markdown link, stripping dangerous schemes like `javascript:`. */
function sanitizeLink(href: string, text: string): string {
	if (isDangerousUrl(href)) {
		return text;
	}
	return `[${text}](${href})`;
}

/**
 * Decode the most common HTML entities, including numeric character references.
 * Runs as a single pass so that the output of one replacement is never decoded
 * again (`&amp;lt;` becomes `&lt;`, not `<`).
 */
function decodeEntities(text: string): string {
	return text.replace(/&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|(amp|lt|gt|quot|apos|nbsp));/g,
		(match: string, hex?: string, dec?: string, named?: string) => {
			if (hex !== undefined) {
				return safeFromCodePoint(parseInt(hex, 16));
			}
			if (dec !== undefined) {
				return safeFromCodePoint(parseInt(dec, 10));
			}
			return named !== undefined ? namedEntities[named] ?? match : match;
		});
}

/**
 * Converts a numeric character reference to text. Code points that cannot be
 * represented as well-formed text (NUL, lone surrogates, values beyond
 * U+10FFFF) are dropped.
 */
function safeFromCodePoint(code: number): string {
	const isSurrogate = code >= 0xD800 && code <= 0xDFFF;
	if (!Number.isInteger(code) || code <= 0 || code > 0x10FFFF || isSurrogate) {
		return '';
	}
	return String.fromCodePoint(code);
}
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
import assert from 'assert';
import { convertHtmlToMarkdown } from '../../common/htmlToMarkdown.js';
import { ensureNoDisposablesAreLeakedInTestSuite } from './utils.js';

/**
 * Unit tests for `convertHtmlToMarkdown`. Each test feeds a small HTML
 * fragment to the converter and compares the result with the expected
 * Markdown text.
 */
suite('htmlToMarkdown', () => {
	ensureNoDisposablesAreLeakedInTestSuite();

	test('converts headings', () => {
		assert.strictEqual(convertHtmlToMarkdown('<h1>Title</h1>'), '# Title');
		assert.strictEqual(convertHtmlToMarkdown('<h2>Subtitle</h2>'), '## Subtitle');
		assert.strictEqual(convertHtmlToMarkdown('<h3>Section</h3>'), '### Section');
		assert.strictEqual(convertHtmlToMarkdown('<h4>Sub-section</h4>'), '#### Sub-section');
		assert.strictEqual(convertHtmlToMarkdown('<h5>Minor</h5>'), '##### Minor');
		assert.strictEqual(convertHtmlToMarkdown('<h6>Smallest</h6>'), '###### Smallest');
	});

	test('converts links', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="https://example.com">Example</a>'),
			'[Example](https://example.com)'
		);
	});

	test('strips dangerous schemes from links', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="javascript:alert(1)">click</a>'),
			'click'
		);
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="vbscript:run">run</a>'),
			'run'
		);
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="data:text/html,hello">data</a>'),
			'data'
		);
	});

	test('converts bold and italic', () => {
		assert.strictEqual(convertHtmlToMarkdown('<strong>bold</strong>'), '**bold**');
		assert.strictEqual(convertHtmlToMarkdown('<b>bold</b>'), '**bold**');
		assert.strictEqual(convertHtmlToMarkdown('<em>italic</em>'), '*italic*');
		assert.strictEqual(convertHtmlToMarkdown('<i>italic</i>'), '*italic*');
	});

	test('converts inline code', () => {
		assert.strictEqual(convertHtmlToMarkdown('<code>foo()</code>'), '`foo()`');
	});

	test('converts code blocks', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code>const x = 1;</code></pre>'),
			'```\nconst x = 1;\n```'
		);
	});

	test('converts syntax-highlighted code blocks by stripping inner tags', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code><span class="keyword">const</span> x = <span class="number">1</span>;</code></pre>'),
			'```\nconst x = 1;\n```'
		);
	});

	test('preserves indentation in code blocks', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<pre><code>function foo() {\n  return 1;\n}</code></pre>'),
			'```\nfunction foo() {\n  return 1;\n}\n```'
		);
	});

	test('converts unordered lists', () => {
		const html = '<ul><li>one</li><li>two</li><li>three</li></ul>';
		assert.strictEqual(convertHtmlToMarkdown(html), '- one\n- two\n- three');
	});

	test('converts ordered lists to numbered items', () => {
		const html = '<ol><li>first</li><li>second</li></ol>';
		assert.strictEqual(convertHtmlToMarkdown(html), '1. first\n2. second');
	});

	test('converts line breaks', () => {
		assert.strictEqual(convertHtmlToMarkdown('hello<br>world'), 'hello\nworld');
		assert.strictEqual(convertHtmlToMarkdown('hello<br/>world'), 'hello\nworld');
	});

	test('converts horizontal rules', () => {
		assert.strictEqual(convertHtmlToMarkdown('above<hr>below'), 'above\n---\nbelow');
	});

	test('converts strikethrough', () => {
		assert.strictEqual(convertHtmlToMarkdown('<del>removed</del>'), '~~removed~~');
		assert.strictEqual(convertHtmlToMarkdown('<s>struck</s>'), '~~struck~~');
	});

	test('converts blockquotes', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<blockquote>quoted text</blockquote>'),
			'> quoted text'
		);
	});

	test('converts images', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<img alt="photo" src="https://example.com/img.png">'),
			'![photo](https://example.com/img.png)'
		);
	});

	test('decodes HTML entities', () => {
		assert.strictEqual(convertHtmlToMarkdown('&amp; &lt; &gt; &quot; &#39;'), '& < > " \'');
	});

	test('strips unknown tags', () => {
		assert.strictEqual(convertHtmlToMarkdown('<span>hello</span>'), 'hello');
	});

	test('handles nested inline elements', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<strong><em>bold italic</em></strong>'),
			'***bold italic***'
		);
	});

	test('handles link with bold text inside', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<a href="https://example.com"><strong>click here</strong></a>'),
			'[**click here**](https://example.com)'
		);
	});

	test('handles heading with link inside', () => {
		assert.strictEqual(
			convertHtmlToMarkdown('<h2><a href="https://example.com">Title</a></h2>'),
			'## [Title](https://example.com)'
		);
	});

	test('collapses excessive newlines', () => {
		const html = '<p>one</p><p></p><p></p><p>two</p>';
		const result = convertHtmlToMarkdown(html);
		assert.ok(!result.includes('\n\n\n'), 'should not have 3+ consecutive newlines');
		assert.ok(result.includes('one'));
		assert.ok(result.includes('two'));
	});

	test('handles a realistic web page snippet', () => {
		const html = `
			<h1>Getting Started</h1>
			<p>Welcome to <strong>VS Code</strong>. Visit <a href="https://code.visualstudio.com">the website</a> for more info.</p>
			<ul>
				<li>Fast</li>
				<li>Extensible</li>
			</ul>
		`;
		const md = convertHtmlToMarkdown(html);
		assert.ok(md.includes('# Getting Started'));
		assert.ok(md.includes('**VS Code**'));
		assert.ok(md.includes('[the website](https://code.visualstudio.com)'));
		assert.ok(md.includes('- Fast'));
		assert.ok(md.includes('- Extensible'));
	});

	test('decodes numeric HTML entities', () => {
		assert.strictEqual(convertHtmlToMarkdown('&#60;tag&#62;'), '<tag>');
		assert.strictEqual(convertHtmlToMarkdown('&#x3C;tag&#x3E;'), '<tag>');
		assert.strictEqual(convertHtmlToMarkdown('&#8212;'), '—');
		assert.strictEqual(convertHtmlToMarkdown('&#x2014;'), '—');
	});

	test('falls back to tag-stripping for very large input', () => {
		const large = '<b>' + 'x'.repeat(200_001) + '</b>';
		const result = convertHtmlToMarkdown(large);
		// Should strip tags but NOT apply markdown bold formatting
		assert.ok(!result.includes('**'));
		assert.ok(!result.includes('<b>'));
	});
});
*--------------------------------------------------------------------------------------------*/ +import { alert } from '../../../../../../../base/browser/ui/aria/aria.js'; import { CancellationToken } from '../../../../../../../base/common/cancellation.js'; import { Codicon } from '../../../../../../../base/common/codicons.js'; import { createStringDataTransferItem, IDataTransferItem, IReadonlyVSDataTransfer, VSDataTransfer } from '../../../../../../../base/common/dataTransfer.js'; -import { alert } from '../../../../../../../base/browser/ui/aria/aria.js'; +import { convertHtmlToMarkdown } from '../../../../../../../base/common/htmlToMarkdown.js'; import { HierarchicalKind } from '../../../../../../../base/common/hierarchicalKind.js'; import { Disposable } from '../../../../../../../base/common/lifecycle.js'; import { revive } from '../../../../../../../base/common/marshalling.js'; @@ -15,7 +16,7 @@ import { basename, joinPath } from '../../../../../../../base/common/resources.j import { URI, UriComponents } from '../../../../../../../base/common/uri.js'; import { Position } from '../../../../../../../editor/common/core/position.js'; import { IRange } from '../../../../../../../editor/common/core/range.js'; -import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, SymbolKinds } from '../../../../../../../editor/common/languages.js'; +import { DocumentPasteContext, DocumentPasteEdit, DocumentPasteEditProvider, DocumentPasteEditsSession, DocumentPasteTriggerKind, SymbolKinds } from '../../../../../../../editor/common/languages.js'; import { ITextModel } from '../../../../../../../editor/common/model.js'; import { ILanguageFeaturesService } from '../../../../../../../editor/common/services/languageFeatures.js'; import { IModelService } from '../../../../../../../editor/common/services/model.js'; @@ -679,6 +680,56 @@ class PasteSymbolProvider implements DocumentPasteEditProvider { } } +class PasteHtmlProvider implements 
DocumentPasteEditProvider { + + public readonly kind = new HierarchicalKind('chat.paste.html'); + public readonly providedPasteEditKinds = [this.kind]; + + public readonly copyMimeTypes = []; + public readonly pasteMimeTypes = [Mimes.html]; + + async provideDocumentPasteEdits(model: ITextModel, _ranges: readonly IRange[], dataTransfer: IReadonlyVSDataTransfer, context: DocumentPasteContext, token: CancellationToken): Promise { + if (model.uri.scheme !== Schemas.vscodeChatInput) { + return; + } + + // Only activate on automatic paste — for explicit "Paste As" the user + // likely wants the raw text or an attachment, not a converted markdown form. + if (context.triggerKind !== DocumentPasteTriggerKind.Automatic) { + return; + } + + const entry = dataTransfer.get(Mimes.html); + const htmlText = await entry?.asString(); + if (!htmlText || token.isCancellationRequested) { + return; + } + + // Skip if the HTML is trivially plain text (no meaningful tags) + if (!/<(a|strong|b|em|i|h[1-6]|code|pre|ul|ol|li|blockquote|del|s|strike|img|hr)\b/i.test(htmlText)) { + return; + } + + const markdown = convertHtmlToMarkdown(htmlText); + + // If conversion produced nothing useful, fall back + if (!markdown) { + return; + } + + return createEditSession({ + insertText: markdown, + title: localize('pasteHtmlAsMarkdown', 'Paste as Markdown'), + kind: this.kind, + handledMimeType: Mimes.html, + yieldTo: [ + { kind: new HierarchicalKind('chat.attach.text') }, + { kind: new HierarchicalKind('chat.attach.image') }, + ], + }); + } +} + export class ChatPasteProvidersFeature extends Disposable { constructor( @IInstantiationService instaService: IInstantiationService, @@ -694,6 +745,7 @@ export class ChatPasteProvidersFeature extends Disposable { this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(CopyAttachmentsProvider))); 
this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteImageProvider(chatWidgetService, extensionService, fileService, environmentService, logService))); this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteTextProvider(chatWidgetService, modelService))); + this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, new PasteHtmlProvider())); this._register(languageFeaturesService.documentPasteEditProvider.register({ scheme: Schemas.vscodeChatInput, pattern: '*', hasAccessToAllModels: true }, instaService.createInstance(PasteSymbolProvider))); this._register(languageFeaturesService.documentPasteEditProvider.register('*', instaService.createInstance(CopyTextProvider))); }