Merge branch 'dr-fix-tool-validator' into 'main'

Růžička, David · Růžička, David · commit 7ca1d160d5a6 · 2026-04-30T15:46:29.000+02:00
feat: add html_description_policy to upstream MCP provider config

See merge request ai-adoption/mcp/mcp4openapi!8
diff --git a/profile-schema.json b/profile-schema.json
@@ -1847,6 +1847,11 @@
           "type": "number",
           "description": "Optional request timeout for upstream MCP calls."
         },
+        "html_description_policy": {
+          "type": "string",
+          "enum": ["allow", "strip", "drop"],
+          "description": "Controls how HTML tags in upstream tool descriptions and inputSchema are handled.\n- drop (default): tools containing HTML chars (<, >, backtick) are dropped\n- strip: HTML tags are stripped from descriptions/schema string values; tool is kept\n- allow: HTML checks are skipped entirely; tool passes through as-is"
+        },
         "validation_endpoint": {
           "type": "string",
           "description": "Optional endpoint to validate upstream credentials at session init (fail-fast)."
diff --git a/src/generated-schemas.ts b/src/generated-schemas.ts
@@ -359,6 +359,7 @@ export const upstreamMcpServerConfigSchema = z.object({
     tool_prefix: z.string().optional(),
     tools: upstreamMcpToolPolicySchema.optional(),
     timeout_ms: z.number().optional(),
+    html_description_policy: z.enum(['allow', 'strip', 'drop']).optional(),
     validation_endpoint: z.string().optional(),
     validation_method: z.union([z.literal("HEAD"), z.literal("GET")]).optional(),
     validation_timeout_ms: z.number().optional()
diff --git a/src/mcp/mcp-server.ts b/src/mcp/mcp-server.ts
@@ -1906,7 +1906,7 @@ export class MCPServer {
         );
       }
       const rawTools = result.tools;
-      const sanitized = sanitizeToolList(rawTools, this.logger);
+      const sanitized = sanitizeToolList(rawTools, this.logger, provider.html_description_policy ?? 'drop');
       const policyFiltered = applyProviderToolPolicy(sanitized.tools, provider.tools);
       // Cache sanitized+policy-filtered tool names for tools/call gate enforcement.
       // Tools dropped here (bad description/inputSchema) must not be callable via tools/call.
diff --git a/src/types/profile.ts b/src/types/profile.ts
@@ -96,6 +96,14 @@ export interface UpstreamMcpServerConfig {
   /** Optional request timeout for upstream MCP calls. */
   timeout_ms?: number;
 
+  /**
+   * Controls how HTML tags in upstream tool descriptions and inputSchema are handled.
+   * - drop (default): tools containing HTML chars (<, >, backtick) are dropped
+   * - strip: HTML tags are stripped from descriptions/schema values; tool is kept
+   * - allow: HTML checks are skipped entirely; tool passes through as-is
+   */
+  html_description_policy?: 'allow' | 'strip' | 'drop';
+
   /** Optional endpoint to validate upstream credentials at session init (fail-fast). */
   validation_endpoint?: string;
   /** HTTP method for validation probe. Default: 'HEAD'. */
diff --git a/src/upstream/upstream-tool-sanitizer.test.ts b/src/upstream/upstream-tool-sanitizer.test.ts
@@ -417,6 +417,92 @@ describe('sanitizeToolList', () => {
   });
 });
 
+describe('sanitizeToolList — html_description_policy', () => {
+  let logger: Logger;
+
+  beforeEach(() => {
+    logger = { debug: vi.fn(), info: vi.fn(), warn: vi.fn(), error: vi.fn() }; // fresh mocks, so warn-call assertions are per test
+  });
+
+  describe('drop (default)', () => { // policy argument omitted: the 'drop' default applies
+    it('drops tool with HTML in description', () => {
+      const input = makeTool('t', 'Creates <b>bold</b> text');
+      const { tools, dropped } = sanitizeToolList([input], logger);
+      expect(tools).toHaveLength(0);
+      expect(dropped[0].reason).toBe('forbidden characters in description');
+    });
+
+    it('drops tool with HTML in inputSchema string value', () => {
+      const input: Tool = { name: 't', inputSchema: { type: 'object', properties: { x: { type: 'string', description: '<b>bad</b>' } } } };
+      const { tools, dropped } = sanitizeToolList([input], logger);
+      expect(tools).toHaveLength(0);
+      expect(dropped[0].reason).toBe('forbidden characters in input schema');
+    });
+  });
+
+  describe('strip', () => {
+    it('strips HTML tags from description and keeps tool', () => {
+      const input = makeTool('t', 'Creates <b>bold</b> issue <br/> in project');
+      const { tools, dropped } = sanitizeToolList([input], logger, 'strip');
+      expect(tools).toHaveLength(1);
+      expect(tools[0].description).toBe('Creates bold issue  in project'); // removing <br/> leaves two adjacent spaces
+      expect(dropped).toHaveLength(0);
+      expect(logger.warn).not.toHaveBeenCalled();
+    });
+
+    it('strips HTML tags from inputSchema string values', () => {
+      const input: Tool = {
+        name: 't',
+        description: 'plain',
+        inputSchema: { type: 'object', properties: { x: { type: 'string', description: 'Use <code>format</code> param' } } },
+      };
+      const { tools } = sanitizeToolList([input], logger, 'strip');
+      expect(tools).toHaveLength(1);
+      const strippedSchema = tools[0].inputSchema as any;
+      expect(strippedSchema.properties.x.description).toBe('Use format param');
+    });
+
+    it('does not mutate original tool object', () => {
+      const input = makeTool('t', 'Has <b>HTML</b>');
+      sanitizeToolList([input], logger, 'strip');
+      expect(input.description).toBe('Has <b>HTML</b>');
+    });
+
+    it('still drops tool with invalid name regardless of strip policy', () => {
+      const input = makeTool('bad name!', 'Has <b>HTML</b>');
+      const { tools, dropped } = sanitizeToolList([input], logger, 'strip');
+      expect(tools).toHaveLength(0);
+      expect(dropped[0].reason).toBe('invalid characters in tool name');
+    });
+  });
+
+  describe('allow', () => {
+    it('passes tool with HTML description through unchanged', () => {
+      const input = makeTool('t', 'Creates <b>bold</b> text');
+      const { tools, dropped } = sanitizeToolList([input], logger, 'allow');
+      expect(tools).toHaveLength(1);
+      expect(tools[0].description).toBe('Creates <b>bold</b> text');
+      expect(dropped).toHaveLength(0);
+      expect(logger.warn).not.toHaveBeenCalled();
+    });
+
+    it('passes tool with HTML in inputSchema through unchanged', () => {
+      const htmlSchema = { type: 'object', properties: { x: { type: 'string', description: '<b>bad</b>' } } };
+      const input: Tool = { name: 't', inputSchema: htmlSchema };
+      const { tools } = sanitizeToolList([input], logger, 'allow');
+      expect(tools).toHaveLength(1);
+      expect((tools[0].inputSchema as any).properties.x.description).toBe('<b>bad</b>');
+    });
+
+    it('still drops tool with invalid name regardless of allow policy', () => {
+      const input = makeTool('bad name!', 'Has <b>HTML</b>');
+      const { tools, dropped } = sanitizeToolList([input], logger, 'allow');
+      expect(tools).toHaveLength(0);
+      expect(dropped[0].reason).toBe('invalid characters in tool name');
+    });
+  });
+});
+
 describe('applyProviderToolPolicy', () => {
   const tools = [makeTool('alpha'), makeTool('beta'), makeTool('gamma')];
 
diff --git a/src/upstream/upstream-tool-sanitizer.ts b/src/upstream/upstream-tool-sanitizer.ts
@@ -18,6 +18,8 @@ import { sanitizeLogMessage } from '../core/logger.js';
 import type { Logger } from '../core/logger.js';
 import type { UpstreamMcpToolPolicy } from '../types/profile.js';
 
+export type HtmlDescriptionPolicy = 'allow' | 'strip' | 'drop'; // same three values as UpstreamMcpServerConfig.html_description_policy; sanitizeToolList defaults to 'drop'
+
 export interface SanitizationResult {
   tools: Tool[];
   dropped: { name: string; reason: string }[];
@@ -26,6 +28,38 @@ export interface SanitizationResult {
 // Data-driven constraints
 const TOOL_NAME_PATTERN = /^[a-zA-Z0-9_-]+$/;
 const DESCRIPTION_FORBIDDEN_CHARS = /[<>`]/;
+const HTML_TAG_PATTERN = /<[^>]*>/g; // a '<', any run of characters other than '>', then '>'; global flag so every occurrence matches
+
+const MAX_EXCERPT_CONTEXT = 40; // characters kept on each side of the first forbidden character in a log excerpt
+
+/** Returns up to MAX_EXCERPT_CONTEXT characters on each side of the first forbidden character, or '' if there is none. */
+function firstForbiddenExcerpt(text: string): string {
+  const hit = text.search(DESCRIPTION_FORBIDDEN_CHARS);
+  if (hit < 0) return '';
+  const from = hit > MAX_EXCERPT_CONTEXT ? hit - MAX_EXCERPT_CONTEXT : 0;
+  const to = Math.min(hit + MAX_EXCERPT_CONTEXT + 1, text.length);
+  // An ellipsis marks each side on which the excerpt was cut short.
+  return `${from > 0 ? '…' : ''}${text.slice(from, to)}${to < text.length ? '…' : ''}`;
+}
+
+function stripHtmlTags(text: string): string { // deletes each span from a '<' to the next '>'; unpaired '<' / '>' and backticks are kept
+  return text.split(HTML_TAG_PATTERN).join('');
+}
+
+/** Returns a copy of `value` (arrays and objects rebuilt) with stripHtmlTags applied to every string value; object keys are copied as-is. */
+function stripHtmlFromSchema(value: unknown, depth = 0): unknown {
+  // NOTE(review): beyond depth 10 the subtree is returned untouched, so HTML nested deeper is not stripped.
+  if (depth > 10) return value;
+  if (typeof value === 'string') return stripHtmlTags(value);
+  if (Array.isArray(value)) return value.map(v => stripHtmlFromSchema(v, depth + 1));
+  if (typeof value === 'object' && value !== null) {
+    // Object.fromEntries defines own data properties, so an own "__proto__" key from upstream JSON
+    // is copied as data; assigning it with `result[k] = v` would invoke the prototype setter instead.
+    const entries = Object.entries(value as Record<string, unknown>);
+    return Object.fromEntries(entries.map(([k, v]) => [k, stripHtmlFromSchema(v, depth + 1)]));
+  }
+  return value;
+}
 
 /**
  * Recursively scan a JSON Schema object for forbidden characters in both keys and string values.
@@ -64,17 +98,18 @@ const truncateName = (name: string): string =>
  *   1. Name length <= 255
  *   2. Name matches [a-zA-Z0-9_-]
  *   3. Description length <= 2048 (if present)
- *   4. Description contains no <, >, or backtick (if present)
- *   5. inputSchema contains no forbidden characters in any key or string value
- *      (recursive scan to depth 10; schemas exceeding the depth limit are dropped)
+ *   4. HTML policy (html_description_policy):
+ *      - drop (default): tools with <, >, or backtick in description/inputSchema are dropped
+ *      - strip: HTML tags stripped from description and inputSchema string values; tool kept
+ *      - allow: HTML checks skipped entirely; tool passes through as-is
  *
  * Offending tools are dropped and logged. Safe tools pass through unchanged.
  */
-export function sanitizeToolList(tools: Tool[], logger?: Logger): SanitizationResult {
+export function sanitizeToolList(tools: Tool[], logger?: Logger, htmlPolicy: HtmlDescriptionPolicy = 'drop'): SanitizationResult {
   const safe: Tool[] = [];
   const dropped: { name: string; reason: string }[] = [];
 
-  for (const tool of tools) {
+  for (let tool of tools) {
     // Guard: upstream may return null or non-object entries (e.g. null items in tools array)
     if (tool === null || typeof tool !== 'object') {
       const safeName = sanitizeLogMessage(truncateName(String(tool)));
@@ -85,6 +120,7 @@ export function sanitizeToolList(tools: Tool[], logger?: Logger): SanitizationRe
     }
 
     let reason: string | undefined;
+    let excerpt: string | undefined;
 
     // Runtime type guards: upstream may return non-string fields despite SDK types
     if (typeof tool.name !== 'string') {
@@ -97,20 +133,38 @@ export function sanitizeToolList(tools: Tool[], logger?: Logger): SanitizationRe
       reason = 'malformed tool definition: description is not a string';
     } else if (tool.description && tool.description.length > MAX_DESCRIPTION_LENGTH) {
       reason = 'tool description too long';
-    } else if (tool.description && DESCRIPTION_FORBIDDEN_CHARS.test(tool.description)) {
-      reason = 'forbidden characters in description';
     } else if (tool.inputSchema !== undefined && (typeof tool.inputSchema !== 'object' || tool.inputSchema === null || Array.isArray(tool.inputSchema))) {
       reason = 'malformed tool definition: inputSchema is not an object';
-    } else if (tool.inputSchema && schemaContainsForbiddenChars(tool.inputSchema)) {
-      reason = 'forbidden characters in input schema';
+    } else if (htmlPolicy === 'drop') {
+      if (tool.description && DESCRIPTION_FORBIDDEN_CHARS.test(tool.description)) {
+        reason = 'forbidden characters in description';
+        excerpt = firstForbiddenExcerpt(tool.description);
+      } else if (tool.inputSchema && schemaContainsForbiddenChars(tool.inputSchema)) {
+        reason = 'forbidden characters in input schema';
+        const schemaStr = JSON.stringify(tool.inputSchema);
+        excerpt = firstForbiddenExcerpt(schemaStr);
+      }
+    } else if (htmlPolicy === 'strip') {
+      if (tool.description) {
+        tool = { ...tool, description: stripHtmlTags(tool.description) };
+      }
+      if (tool.inputSchema) {
+        tool = { ...tool, inputSchema: stripHtmlFromSchema(tool.inputSchema) as Tool['inputSchema'] };
+      }
     }
+    // htmlPolicy === 'allow': skip all HTML checks, pass tool through unchanged
 
     if (reason !== undefined) {
       // Coerce non-string names to string for safe logging
       const nameStr = typeof tool.name === 'string' ? tool.name : String(tool.name);
       const safeName = sanitizeLogMessage(truncateName(nameStr));
+      const safeExcerpt = excerpt ? sanitizeLogMessage(excerpt) : undefined;
       dropped.push({ name: safeName, reason });
-      logger?.warn('Dropped upstream tool due to sanitization failure', { name: safeName, reason });
+      logger?.warn('Dropped upstream tool due to sanitization failure', {
+        name: safeName,
+        reason,
+        ...(safeExcerpt !== undefined && { excerpt: safeExcerpt }),
+      });
     } else {
       safe.push(tool);
     }

Original file line number	Diff line number	Diff line change
`@@ -1906,7 +1906,7 @@ export class MCPServer {`
`1906`	`1906`	`);`
`1907`	`1907`	`}`
`1908`	`1908`	`const rawTools = result.tools;`
`1909`		`- const sanitized = sanitizeToolList(rawTools, this.logger);`
	`1909`	`+ const sanitized = sanitizeToolList(rawTools, this.logger, provider.html_description_policy ?? 'drop');`
`1910`	`1910`	`const policyFiltered = applyProviderToolPolicy(sanitized.tools, provider.tools);`
`1911`	`1911`	`// Cache sanitized+policy-filtered tool names for tools/call gate enforcement.`
`1912`	`1912`	`// Tools dropped here (bad description/inputSchema) must not be callable via tools/call.`