Merge pull request #169 from WolframResearch/bugfix/ensure-valid-regex-in-tool-schemas

rhennigan · web-flow · commit 74a3fe7b9edd · 2026-04-23T13:22:01.000-04:00
Bugfix: Ensure valid regex patterns in tool schemas
diff --git a/.cspell.json b/.cspell.json
@@ -1,10 +1,13 @@
 {
     "words": [
         "Agentic",
+        "alnum",
         "BUILDKIT",
         "buildx",
         "CICD",
+        "cntrl",
         "Collatz",
+        "Dotall",
         "dpkg",
         "Embedder",
         "ENTITLEMENTID",
@@ -28,7 +31,9 @@
         "opencode",
         "pacletreadonly",
         "patt",
+        "PCRE",
         "prebuild",
+        "punct",
         "pwfile",
         "reranking",
         "sandboxed",
@@ -53,6 +58,7 @@
         "WOLFRAMINIT",
         "wolframresearch",
         "worktrees",
+        "xdigit",
         "zoomable"
     ],
     "ignoreRegExpList": [
diff --git a/AGENTS.md b/AGENTS.md
@@ -54,6 +54,7 @@ See [building.md](docs/building.md) for detailed instructions.
   - `StartMCPServer.wl`: Implementation for starting MCP servers
   - `ValidateAgentToolsPacletExtension.wl`: Validation of `"AgentTools"` [paclet extensions](docs/paclet-extensions.md)
   - `UIResources.wl`: [MCP Apps](docs/mcp-apps.md) UI resource registry, client capability detection, and shared cloud notebook deployment helper
+  - `Utilities.wl`: General-purpose helpers — LLMKit subscription checks, Chatbook version verification, and `toJSRegex` for converting ICU/PCRE patterns to ECMA 262 (used when sanitizing tool schema `"pattern"` fields)
   - `YAML.wl`: YAML import/export helpers (`importYAML`, `importYAMLString`, `exportYAML`, `exportYAMLString`) used by YAML-based MCP clients (e.g. Goose)
   - `Tools/`: Contains several files defining predefined MCP tools used by default servers. If tool schemas are modified, we need to rebuild agent skills.
   - `Prompts/`: Contains files defining predefined [MCP prompts](docs/mcp-prompts.md) used by default servers
diff --git a/Kernel/CommonSymbols.wl b/Kernel/CommonSymbols.wl
@@ -57,6 +57,7 @@ BeginPackage[ "Wolfram`AgentTools`Common`" ];
 `throwFailure;
 `throwInternalFailure;
 `throwTop;
+`toJSRegex;
 `validateMCPServerObjectData;
 `writeRawJSONFile;
 `writeWXFFile;
diff --git a/Kernel/StartMCPServer.wl b/Kernel/StartMCPServer.wl
@@ -317,7 +317,7 @@ createMCPToolData[ mcpName_String, tool: HoldPattern[ _LLMTool ] ] := Enclose[
 
         data = ConfirmBy[ tool[ "Data" ], AssociationQ, "Data" ];
         description = safeString @ ConfirmBy[ tool[ "Description" ], StringQ, "Description" ];
-        inputSchema = ConfirmBy[ tool[ "JSONSchema" ], AssociationQ, "InputSchema" ];
+        inputSchema = ConfirmBy[ toolSchema @ tool, AssociationQ, "InputSchema" ];
 
         title = Lookup[ data, "DisplayName", Missing[ ] ];
         If[ StringQ @ title, title = safeString @ title ];
@@ -337,6 +337,40 @@ createMCPToolData[ mcpName_String, tool: HoldPattern[ _LLMTool ] ] := Enclose[
 
 createMCPToolData // endDefinition;
 
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*toolSchema*)
+toolSchema // beginDefinition;
+
+toolSchema[ tool: HoldPattern[ _LLMTool ] ] := Enclose[
+    ReplaceAll[
+        ReplaceAll[
+            tool[ "JSONSchema" ],
+            (* Make sure regex patterns are valid in JavaScript *)
+            {
+                (* The vast majority of patterns will just be the one that matches anything,
+                   since it's the pattern produced by the basic "String" Interpreter.
+                   We can safely drop it, since it's redundant. *)
+                as: KeyValuePattern[ "pattern" -> "(?ms).*" ] :>
+                    RuleCondition @ KeyDrop[ as, "pattern" ],
+
+                (* For other patterns produced via `Interpreter[Restricted["String", pattern]]`,
+                   we attempt to convert to JS-compatible format. *)
+                as: KeyValuePattern[ "pattern" -> regex_String ] :>
+                    RuleCondition @ <|
+                        as,
+                        "pattern" -> ConfirmBy[ toJSRegex @ regex, StringQ, "ToJSRegex" ]
+                    |>
+            }
+        ],
+        (* Make sure strings in schemas do not contain private-use characters *)
+        s_String :> RuleCondition @ safeString @ s
+    ],
+    throwInternalFailure
+];
+
+toolSchema // endDefinition;
+
 (* ::**************************************************************************************************************:: *)
 (* ::Subsubsection::Closed:: *)
 (*toolWarmup*)
diff --git a/Kernel/Utilities.wl b/Kernel/Utilities.wl
@@ -113,6 +113,229 @@ chatbookVersionCheck0[ other_ ] := Enclose[
 
 chatbookVersionCheck0 // endDefinition;
 
+(* ::**************************************************************************************************************:: *)
+(* ::Section::Closed:: *)
+(*Regular Expressions*)
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsection::Closed:: *)
+(*toJSRegex*)
+
+(* Convert an ICU/PCRE-flavored regex (as produced by StringPattern`PatternConvert) into a
+   best-effort ECMA 262 pattern suitable for JSON Schema "pattern" fields consumed by
+   JavaScript-based validators.
+
+   Intentional non-goals and accepted limitations:
+
+   - No attempt to simulate multiline `^`/`$`: if the original had `m` and contained raw `^`/`$`
+     (from `StartOfLine`/`EndOfLine`), the converted pattern treats them as start/end-of-string in JS.
+     Schema patterns rarely use line anchors, and any workaround (`(?:^|(?<=\n))`) bloats output and risks
+     validator compatibility issues.
+
+   - PCRE-only constructs that pass through untouched from user-supplied `RegularExpression[...]` bodies
+     (atomic groups `(?>...)`, possessive quantifiers `*+`, named groups `(?P<x>...)`) are left alone.
+     Those are user escape hatches; if a user puts PCRE-only syntax in a schema pattern, they own the compatibility.
+
+   - `u`-flag is not available to us - we are producing bare pattern strings for JSON Schema consumers who may or may
+     not set it.
+*)
+
+toJSRegex // beginDefinition;
+
+toJSRegex[ regex_String ] := Enclose[
+    Module[ { body, hadDotAll },
+        { body, hadDotAll } = ConfirmMatch[ extractLeadingRegexFlags @ regex, { _String, True|False }, "Extract" ];
+
+        body = ConfirmBy[ stripInnerRegexModifiers @ body, StringQ, "StripInnerRegexModifiers" ];
+        body = ConfirmBy[ convertPOSIXClasses @ body     , StringQ, "ConvertPOSIXClasses"      ];
+        body = ConfirmBy[ convertPCREAnchors @ body      , StringQ, "ConvertPCREAnchors"       ];
+        body = ConfirmBy[ convertUnicodeEscapes @ body   , StringQ, "ConvertUnicodeEscapes"    ];
+
+        If[ hadDotAll,
+            ConfirmBy[ convertDotAllDots @ body, StringQ, "ConvertDotAllDots" ],
+            body
+        ]
+    ],
+    throwInternalFailure
+];
+
+toJSRegex // endDefinition;
+
+(* TODO: When creating an MCP server, we could attempt this conversion and issue a warning message if there are any
+   unhandled patterns. This would only be a warning since many schema validators will still accept the pattern as-is. *)
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*extractLeadingRegexFlags*)
+(* Strip a leading "(?flags)" group and return { body, hadDotAll }. *)
+extractLeadingRegexFlags // beginDefinition;
+
+extractLeadingRegexFlags[ s_String ] :=
+    Module[ { match },
+        match = StringCases[
+            s,
+            StartOfString ~~ "(?" ~~ flags: (LetterCharacter..) ~~ ")" ~~ rest___ :>
+                { flags, rest },
+            1
+        ];
+        If[ match === { },
+            { s, False },
+            { match[[ 1, 2 ]], StringContainsQ[ match[[ 1, 1 ]], "s" ] }
+        ]
+    ];
+
+extractLeadingRegexFlags // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*stripInnerRegexModifiers*)
+(* Strip the scope-less inline modifier prefixes that PatternConvert inserts at the start of a
+   "(?:...)" wrapper around RegularExpression[] contents. Matching only the "(?:(?-...)" wrapper
+   form avoids silently altering mid-pattern modifiers in user-supplied regexes. *)
+stripInnerRegexModifiers // beginDefinition;
+
+stripInnerRegexModifiers[ s_String ] := StringReplace[
+    s,
+    {
+        "(?:(?-m-s)" -> "(?:",
+        "(?:(?-s-m)" -> "(?:",
+        "(?:(?-ms)"  -> "(?:",
+        "(?:(?-sm)"  -> "(?:",
+        "(?:(?-s)"   -> "(?:",
+        "(?:(?-m)"   -> "(?:"
+    }
+];
+
+stripInnerRegexModifiers // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*convertPOSIXClasses*)
+(* Map POSIX character class tokens like "[:alpha:]" to JS-compatible bodies. By replacing only
+   the inner token (not the outer brackets), "[[:alpha:]]" becomes "[a-zA-Z]" and nested forms
+   like "[[:alpha:][:digit:]]" become "[a-zA-Z0-9]". *)
+convertPOSIXClasses // beginDefinition;
+
+convertPOSIXClasses[ s_String ] := StringReplace[
+    s,
+    {
+        "[:alpha:]"  -> "a-zA-Z",
+        "[:digit:]"  -> "0-9",
+        "[:alnum:]"  -> "a-zA-Z0-9",
+        "[:upper:]"  -> "A-Z",
+        "[:lower:]"  -> "a-z",
+        "[:xdigit:]" -> "0-9a-fA-F",
+        "[:space:]"  -> "\\s",
+        "[:blank:]"  -> " \\t",
+        "[:cntrl:]"  -> "\\x00-\\x1F\\x7F",
+        "[:print:]"  -> "\\x20-\\x7E",
+        "[:graph:]"  -> "\\x21-\\x7E",
+        "[:punct:]"  -> "!-/:-@[-`{-~"
+    }
+];
+
+convertPOSIXClasses // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*convertPCREAnchors*)
+(* "\A" and "\z"/"\Z" are PCRE start/end-of-string anchors with no JS equivalent. JS "^"/"$"
+   mean start/end-of-string when the regex has no "m" flag - which is our target since we
+   strip all flags for JSON Schema output. *)
+convertPCREAnchors // beginDefinition;
+
+convertPCREAnchors[ s_String ] := StringReplace[
+    s,
+    { "\\A" -> "^", "\\z" -> "$", "\\Z" -> "$" }
+];
+
+convertPCREAnchors // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*convertUnicodeEscapes*)
+(* Convert "\x{HEX}" to the narrowest JS-valid form. "\xNN" and "\uNNNN" work without the u
+   flag; supplementary code points (> U+FFFF) are emitted as UTF-16 surrogate pairs so the
+   output stays valid without requiring the JS "u" flag. *)
+convertUnicodeEscapes // beginDefinition;
+
+convertUnicodeEscapes[ s_String ] := StringReplace[
+    s,
+    "\\x{" ~~ hex: (HexadecimalCharacter..) ~~ "}" :> convertHexEscape @ hex
+];
+
+convertUnicodeEscapes // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*convertHexEscape*)
+convertHexEscape // beginDefinition;
+
+(* Branch on the parsed code point, not the hex payload length. Leading zeros are valid in
+   \x{...} (e.g. "\x{0000A0}" is U+00A0), so classifying by string length would mis-route
+   zero-padded BMP escapes into the surrogate-pair path and fail the supplementary-range
+   assert. *)
+convertHexEscape[ hex_String ] := With[ { cp = FromDigits[ hex, 16 ] },
+    Which[
+        cp <= 16^^FF  , "\\x" <> ToUpperCase @ IntegerString[ cp, 16, 2 ],
+        cp <= 16^^FFFF, "\\u" <> ToUpperCase @ IntegerString[ cp, 16, 4 ],
+        True          , supplementaryToSurrogatePair @ hex
+    ]
+];
+
+convertHexEscape // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*supplementaryToSurrogatePair*)
+(* Encode a supplementary-plane code point (U+10000..U+10FFFF) as a UTF-16 surrogate pair of
+   "\uXXXX" escapes. JS regexes match a supplementary character via its surrogate pair even
+   without the "u" flag, so this keeps output valid for JSON Schema validators that do not set
+   it. *)
+supplementaryToSurrogatePair // beginDefinition;
+
+supplementaryToSurrogatePair[ hex_String ] := Enclose[
+    Module[ { cp, offset, hi, lo },
+        cp = FromDigits[ hex, 16 ];
+        ConfirmAssert[ 16^^10000 <= cp <= 16^^10FFFF, "SupplementaryRange" ];
+        offset = cp - 16^^10000;
+        hi = 16^^D800 + BitShiftRight[ offset, 10 ];
+        lo = 16^^DC00 + BitAnd[ offset, 16^^3FF ];
+        "\\u" <> ToUpperCase @ IntegerString[ hi, 16, 4 ] <>
+            "\\u" <> ToUpperCase @ IntegerString[ lo, 16, 4 ]
+    ],
+    throwInternalFailure
+];
+
+supplementaryToSurrogatePair // endDefinition;
+
+(* ::**************************************************************************************************************:: *)
+(* ::Subsubsection::Closed:: *)
+(*convertDotAllDots*)
+(* Replace unescaped "." outside character classes with "[\s\S]" to preserve dotall semantics
+   of the stripped outer "(?s)" flag. Walks the string once tracking escape and class state;
+   leaves "\.", "[.]", and dots inside "[...]" untouched. *)
+convertDotAllDots // beginDefinition;
+
+convertDotAllDots[ s_String ] :=
+    Module[ { inClass = False, escaped = False },
+        StringJoin @ Map[
+            Function[ c,
+                Which[
+                    escaped     , escaped = False; c,
+                    c === "\\"  , escaped = True; c,
+                    inClass     , If[ c === "]", inClass = False ]; c,
+                    c === "["   , inClass = True; c,
+                    c === "."   , "[\\s\\S]",
+                    True        , c
+                ]
+            ],
+            Characters @ s
+        ]
+    ];
+
+convertDotAllDots // endDefinition;
+
 (* ::**************************************************************************************************************:: *)
 (* ::Section::Closed:: *)
 (*Package Footer*)
diff --git a/TODO.md b/TODO.md
@@ -13,6 +13,10 @@ Consolidated list of TODO/FIXME items from the codebase.
   - Sets environment variables with lists of tool names to include/exclude for easy customization
 - [ ] Add `"DisplayName"` property to MCP servers
   - When installing, uses the display name as the config key, but keeps the `MCP_SERVER_NAME` environment variable as the canonical name
+- [ ] Warn on unhandled regex patterns when creating an MCP server
+  - Attempt `toJSRegex` conversion on tool schema regex patterns and issue a warning for any constructs that pass through unchanged (likely JS-incompatible)
+  - Only a warning since many schema validators will still accept the pattern as-is
+  - Source: `Kernel/Utilities.wl`
 
 ## MCP Protocol Support
 
diff --git a/Tests/StartMCPServer.wlt b/Tests/StartMCPServer.wlt
diff --git a/Tests/Utilities.wlt b/Tests/Utilities.wlt
diff --git a/docs/tools.md b/docs/tools.md