Skip to content

Commit 74a3fe7

Browse files
authored
Merge pull request #169 from WolframResearch/bugfix/ensure-valid-regex-in-tool-schemas
Bugfix: Ensure valid regex patterns in tool schemas
2 parents 85a8e6b + 6177fca commit 74a3fe7

9 files changed

Lines changed: 819 additions & 9 deletions

File tree

.cspell.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
{
22
"words": [
33
"Agentic",
4+
"alnum",
45
"BUILDKIT",
56
"buildx",
67
"CICD",
8+
"cntrl",
79
"Collatz",
10+
"Dotall",
811
"dpkg",
912
"Embedder",
1013
"ENTITLEMENTID",
@@ -28,7 +31,9 @@
2831
"opencode",
2932
"pacletreadonly",
3033
"patt",
34+
"PCRE",
3135
"prebuild",
36+
"punct",
3237
"pwfile",
3338
"reranking",
3439
"sandboxed",
@@ -53,6 +58,7 @@
5358
"WOLFRAMINIT",
5459
"wolframresearch",
5560
"worktrees",
61+
"xdigit",
5662
"zoomable"
5763
],
5864
"ignoreRegExpList": [

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ See [building.md](docs/building.md) for detailed instructions.
5454
- `StartMCPServer.wl`: Implementation for starting MCP servers
5555
- `ValidateAgentToolsPacletExtension.wl`: Validation of `"AgentTools"` [paclet extensions](docs/paclet-extensions.md)
5656
- `UIResources.wl`: [MCP Apps](docs/mcp-apps.md) UI resource registry, client capability detection, and shared cloud notebook deployment helper
57+
- `Utilities.wl`: General-purpose helpers — LLMKit subscription checks, Chatbook version verification, and `toJSRegex` for converting ICU/PCRE patterns to ECMA 262 (used when sanitizing tool schema `"pattern"` fields)
5758
- `YAML.wl`: YAML import/export helpers (`importYAML`, `importYAMLString`, `exportYAML`, `exportYAMLString`) used by YAML-based MCP clients (e.g. Goose)
5859
- `Tools/`: Contains several files defining predefined MCP tools used by default servers. If tool schemas are modified, we need to rebuild agent skills.
5960
- `Prompts/`: Contains files defining predefined [MCP prompts](docs/mcp-prompts.md) used by default servers

Kernel/CommonSymbols.wl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ BeginPackage[ "Wolfram`AgentTools`Common`" ];
5757
`throwFailure;
5858
`throwInternalFailure;
5959
`throwTop;
60+
`toJSRegex;
6061
`validateMCPServerObjectData;
6162
`writeRawJSONFile;
6263
`writeWXFFile;

Kernel/StartMCPServer.wl

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ createMCPToolData[ mcpName_String, tool: HoldPattern[ _LLMTool ] ] := Enclose[
317317

318318
data = ConfirmBy[ tool[ "Data" ], AssociationQ, "Data" ];
319319
description = safeString @ ConfirmBy[ tool[ "Description" ], StringQ, "Description" ];
320-
inputSchema = ConfirmBy[ tool[ "JSONSchema" ], AssociationQ, "InputSchema" ];
320+
inputSchema = ConfirmBy[ toolSchema @ tool, AssociationQ, "InputSchema" ];
321321

322322
title = Lookup[ data, "DisplayName", Missing[ ] ];
323323
If[ StringQ @ title, title = safeString @ title ];
@@ -337,6 +337,40 @@ createMCPToolData[ mcpName_String, tool: HoldPattern[ _LLMTool ] ] := Enclose[
337337

338338
createMCPToolData // endDefinition;
339339

340+
(* ::**************************************************************************************************************:: *)
341+
(* ::Subsubsection::Closed:: *)
342+
(*toolSchema*)
343+
toolSchema // beginDefinition;
344+
345+
(* Returns the tool's "JSONSchema" with its regex "pattern" entries made JavaScript-friendly and
   every string passed through safeString. Any Confirm failure is routed to throwInternalFailure. *)
toolSchema[ tool: HoldPattern[ _LLMTool ] ] := Enclose[
    Module[ { rawSchema, patternRules, jsSafeSchema },

        rawSchema = tool[ "JSONSchema" ];

        (* Make sure regex patterns are valid in JavaScript *)
        patternRules = {
            (* Nearly every pattern is the match-anything one emitted by the basic "String"
               Interpreter. It is redundant, so the key is simply removed. *)
            as: KeyValuePattern[ "pattern" -> "(?ms).*" ] :>
                RuleCondition @ KeyDrop[ as, "pattern" ],

            (* Other patterns, produced via `Interpreter[Restricted["String", pattern]]`,
               are converted to a JS-compatible form on a best-effort basis. *)
            as: KeyValuePattern[ "pattern" -> regex_String ] :>
                RuleCondition @ <|
                    as,
                    "pattern" -> ConfirmBy[ toJSRegex @ regex, StringQ, "ToJSRegex" ]
                |>
        };

        jsSafeSchema = ReplaceAll[ rawSchema, patternRules ];

        (* Make sure strings in schemas do not contain private-use characters *)
        ReplaceAll[ jsSafeSchema, s_String :> RuleCondition @ safeString @ s ]
    ],
    throwInternalFailure
];
371+
372+
toolSchema // endDefinition;
373+
340374
(* ::**************************************************************************************************************:: *)
341375
(* ::Subsubsection::Closed:: *)
342376
(*toolWarmup*)

Kernel/Utilities.wl

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,229 @@ chatbookVersionCheck0[ other_ ] := Enclose[
113113

114114
chatbookVersionCheck0 // endDefinition;
115115

116+
(* ::**************************************************************************************************************:: *)
117+
(* ::Section::Closed:: *)
118+
(*Regular Expressions*)
119+
120+
(* ::**************************************************************************************************************:: *)
121+
(* ::Subsection::Closed:: *)
122+
(*toJSRegex*)
123+
124+
(* Convert an ICU/PCRE-flavored regex (as produced by StringPattern`PatternConvert) into a
125+
best-effort ECMA 262 pattern suitable for JSON Schema "pattern" fields consumed by
126+
JavaScript-based validators.
127+
128+
Intentional non-goals and accepted limitations:
129+
130+
- No attempt to simulate multiline `^`/`$`: if the original had `m` and contained raw `^`/`$`
131+
(from `StartOfLine`/`EndOfLine`), the converted pattern treats them as start/end-of-string in JS.
132+
Schema patterns rarely use line anchors, and any workaround (`(?:^|(?<=\n))`) bloats output and risks
133+
validator compatibility issues.
134+
135+
- PCRE-only constructs that pass through untouched from user-supplied `RegularExpression[...]` bodies
136+
(atomic groups `(?>...)`, possessive quantifiers `*+`, named groups `(?P<x>...)`) are left alone.
137+
Those are user escape hatches; if a user puts PCRE-only syntax in a schema pattern, they own the compatibility.
138+
139+
- `u`-flag is not available to us - we are producing bare pattern strings for JSON Schema consumers who may or may
140+
not set it.
141+
*)
142+
143+
toJSRegex // beginDefinition;
144+
145+
(* Runs the conversion pipeline stage by stage. Each stage must yield a string; a stage that
   does not is reported through throwInternalFailure with the stage's tag. *)
toJSRegex[ regex_String ] := Enclose[
    Module[ { stripped, dotAllQ, unwrapped, posixFree, anchored, unescaped },

        (* Split off the leading "(?flags)" group and remember whether it requested dotall *)
        { stripped, dotAllQ } =
            ConfirmMatch[ extractLeadingRegexFlags @ regex, { _String, True|False }, "Extract" ];

        unwrapped  = ConfirmBy[ stripInnerRegexModifiers @ stripped , StringQ, "StripInnerRegexModifiers" ];
        posixFree  = ConfirmBy[ convertPOSIXClasses @ unwrapped     , StringQ, "ConvertPOSIXClasses"      ];
        anchored   = ConfirmBy[ convertPCREAnchors @ posixFree      , StringQ, "ConvertPCREAnchors"       ];
        unescaped  = ConfirmBy[ convertUnicodeEscapes @ anchored    , StringQ, "ConvertUnicodeEscapes"    ];

        (* Dots are only rewritten when the stripped leading flags contained "s" *)
        If[ ! dotAllQ,
            unescaped,
            ConfirmBy[ convertDotAllDots @ unescaped, StringQ, "ConvertDotAllDots" ]
        ]
    ],
    throwInternalFailure
];
161+
162+
toJSRegex // endDefinition;
163+
164+
(* TODO: When creating an MCP server, we could attempt this conversion and issue a warning message if there are any
165+
unhandled patterns. This would only be a warning since many schema validators will still accept the pattern as-is. *)
166+
167+
(* ::**************************************************************************************************************:: *)
168+
(* ::Subsubsection::Closed:: *)
169+
(*extractLeadingRegexFlags*)
170+
(* Strip a leading "(?flags)" group and return { body, hadDotAll }. *)
171+
extractLeadingRegexFlags // beginDefinition;
172+
173+
(* Looks for a "(?letters)" group at the very start of the string.
   - Found:     returns { text after the group, True if the letters include "s" }.
   - Not found: returns { s, False }.
   Only the "s" flag is reported; any other letters in the group are discarded with it. *)
extractLeadingRegexFlags[ s_String ] :=
    Replace[
        StringCases[
            s,
            StartOfString ~~ "(?" ~~ letters: (LetterCharacter..) ~~ ")" ~~ tail___ :>
                { letters, tail },
            1
        ],
        {
            { { letters_String, tail_String }, ___ } :> { tail, StringContainsQ[ letters, "s" ] },
            _                                        :> { s, False }
        }
    ];
186+
187+
extractLeadingRegexFlags // endDefinition;
188+
189+
(* ::**************************************************************************************************************:: *)
190+
(* ::Subsubsection::Closed:: *)
191+
(*stripInnerRegexModifiers*)
192+
(* Strip the scope-less inline modifier prefixes that PatternConvert inserts at the start of a
193+
"(?:...)" wrapper around RegularExpression[] contents. Matching only the "(?:(?-...)" wrapper
194+
form avoids silently altering mid-pattern modifiers in user-supplied regexes. *)
195+
stripInnerRegexModifiers // beginDefinition;
196+
197+
(* Every listed "(?:(?-...)" wrapper prefix collapses to a plain "(?:" opener.
   Only these exact spellings are recognized; anything else is left as written. *)
stripInnerRegexModifiers[ s_String ] :=
    With[
        {
            wrapperPrefixes = {
                "(?:(?-m-s)",
                "(?:(?-s-m)",
                "(?:(?-ms)",
                "(?:(?-sm)",
                "(?:(?-s)",
                "(?:(?-m)"
            }
        },
        StringReplace[ s, Thread[ wrapperPrefixes -> "(?:" ] ]
    ];
208+
209+
stripInnerRegexModifiers // endDefinition;
210+
211+
(* ::**************************************************************************************************************:: *)
212+
(* ::Subsubsection::Closed:: *)
213+
(*convertPOSIXClasses*)
214+
(* Map POSIX character class tokens like "[:alpha:]" to JS-compatible bodies. By replacing only
215+
the inner token (not the outer brackets), "[[:alpha:]]" becomes "[a-zA-Z]" and nested forms
216+
like "[[:alpha:][:digit:]]" become "[a-zA-Z0-9]". *)
217+
convertPOSIXClasses // beginDefinition;
218+
219+
(* Substitutes each recognized "[:name:]" token with an ASCII class body. The surrounding
   brackets of the enclosing character class are not part of the match and stay in place. *)
convertPOSIXClasses[ s_String ] :=
    With[
        {
            classRules = {
                (* letters and digits *)
                "[:alpha:]"  -> "a-zA-Z",
                "[:digit:]"  -> "0-9",
                "[:alnum:]"  -> "a-zA-Z0-9",
                "[:upper:]"  -> "A-Z",
                "[:lower:]"  -> "a-z",
                "[:xdigit:]" -> "0-9a-fA-F",
                (* whitespace *)
                "[:space:]"  -> "\\s",
                "[:blank:]"  -> " \\t",
                (* control, printable and punctuation ranges *)
                "[:cntrl:]"  -> "\\x00-\\x1F\\x7F",
                "[:print:]"  -> "\\x20-\\x7E",
                "[:graph:]"  -> "\\x21-\\x7E",
                "[:punct:]"  -> "!-/:-@[-`{-~"
            }
        },
        StringReplace[ s, classRules ]
    ];
236+
237+
convertPOSIXClasses // endDefinition;
238+
239+
(* ::**************************************************************************************************************:: *)
240+
(* ::Subsubsection::Closed:: *)
241+
(*convertPCREAnchors*)
242+
(* "\A" and "\z"/"\Z" are PCRE start/end-of-string anchors with no JS equivalent. JS "^"/"$"
243+
mean start/end-of-string when the regex has no "m" flag - which is our target since we
244+
strip all flags for JSON Schema output. *)
245+
convertPCREAnchors // beginDefinition;
246+
247+
(* Rewrites the PCRE anchors "\A", "\z" and "\Z" as "^" / "$".

   StringReplace scans left to right and tries the rules in order at each position, so the
   leading identity rule consumes every escaped backslash (two backslash characters) before an
   anchor rule can see it. Without it, a literal backslash followed by "A", "z" or "Z" would have
   its second backslash mistaken for the start of an anchor and be corrupted into "\^" or "\$".

   Note: "\Z" is mapped to a plain "$", which is an approximation. *)
convertPCREAnchors[ s_String ] := StringReplace[
    s,
    {
        "\\\\" -> "\\\\",
        "\\A"  -> "^",
        "\\z"  -> "$",
        "\\Z"  -> "$"
    }
];
251+
252+
convertPCREAnchors // endDefinition;
253+
254+
(* ::**************************************************************************************************************:: *)
255+
(* ::Subsubsection::Closed:: *)
256+
(*convertUnicodeEscapes*)
257+
(* Convert "\x{HEX}" to the narrowest JS-valid form. "\xNN" and "\uNNNN" work without the u
258+
flag; supplementary code points (> U+FFFF) are emitted as UTF-16 surrogate pairs so the
259+
output stays valid without requiring the JS "u" flag. *)
260+
convertUnicodeEscapes // beginDefinition;
261+
262+
(* Rewrites each "\x{HEX}" escape via convertHexEscape.

   The leading identity rule consumes escaped backslashes (two backslash characters) first.
   StringReplace tries the rules in order at each position and resumes after the matched text,
   so a literal backslash that merely happens to precede "x{...}" is left exactly as written
   instead of being treated as the start of a code-point escape. *)
convertUnicodeEscapes[ s_String ] := StringReplace[
    s,
    {
        "\\\\" -> "\\\\",
        "\\x{" ~~ hex: (HexadecimalCharacter..) ~~ "}" :> convertHexEscape @ hex
    }
];
266+
267+
convertUnicodeEscapes // endDefinition;
268+
269+
(* ::**************************************************************************************************************:: *)
270+
(* ::Subsubsection::Closed:: *)
271+
(*convertHexEscape*)
272+
convertHexEscape // beginDefinition;
273+
274+
(* Chooses the output form from the numeric code point rather than from the number of hex
   digits. Leading zeros are valid in \x{...} (e.g. "\x{0000A0}" is U+00A0), so a length-based
   choice would send zero-padded BMP escapes to the surrogate-pair path, where they would fail
   the supplementary-range assert.

   - up to U+00FF: "\x" followed by two uppercase hex digits
   - up to U+FFFF: "\u" followed by four uppercase hex digits
   - otherwise:    delegated to supplementaryToSurrogatePair with the original hex string *)
convertHexEscape[ hex_String ] :=
    Module[ { codePoint, paddedHex },
        codePoint = FromDigits[ hex, 16 ];
        paddedHex = Function[ width, ToUpperCase @ IntegerString[ codePoint, 16, width ] ];
        If[ codePoint <= 16^^FF,
            "\\x" <> paddedHex[ 2 ],
            If[ codePoint <= 16^^FFFF,
                "\\u" <> paddedHex[ 4 ],
                supplementaryToSurrogatePair @ hex
            ]
        ]
    ];
285+
286+
convertHexEscape // endDefinition;
287+
288+
(* ::**************************************************************************************************************:: *)
289+
(* ::Subsubsection::Closed:: *)
290+
(*supplementaryToSurrogatePair*)
291+
(* Encode a supplementary-plane code point (U+10000..U+10FFFF) as a UTF-16 surrogate pair of
292+
"\uXXXX" escapes. JS regexes match a supplementary character via its surrogate pair even
293+
without the "u" flag, so this keeps output valid for JSON Schema validators that do not set
294+
it. *)
295+
supplementaryToSurrogatePair // beginDefinition;
296+
297+
(* Parses the hex string, asserts that it lies in U+10000..U+10FFFF, and returns the two
   "\uXXXX" escapes (high surrogate first) as one string. A failed assert is routed to
   throwInternalFailure. *)
supplementaryToSurrogatePair[ hex_String ] := Enclose[
    Module[ { codePoint, rebased, surrogates },
        codePoint = FromDigits[ hex, 16 ];
        ConfirmAssert[ 16^^10000 <= codePoint <= 16^^10FFFF, "SupplementaryRange" ];

        (* Distance above U+10000: the bits above the low ten select the high surrogate,
           the low ten bits select the low surrogate. *)
        rebased = codePoint - 16^^10000;
        surrogates = {
            16^^D800 + BitShiftRight[ rebased, 10 ],
            16^^DC00 + BitAnd[ rebased, 16^^3FF ]
        };

        StringJoin @ Map[
            Function[ unit, "\\u" <> ToUpperCase @ IntegerString[ unit, 16, 4 ] ],
            surrogates
        ]
    ],
    throwInternalFailure
];
309+
310+
supplementaryToSurrogatePair // endDefinition;
311+
312+
(* ::**************************************************************************************************************:: *)
313+
(* ::Subsubsection::Closed:: *)
314+
(*convertDotAllDots*)
315+
(* Replace unescaped "." outside character classes with "[\s\S]" to preserve dotall semantics
316+
of the stripped outer "(?s)" flag. Walks the string once tracking escape and class state;
317+
leaves "\.", "[.]", and dots inside "[...]" untouched. *)
318+
convertDotAllDots // beginDefinition;
319+
320+
(* Single left-to-right pass over the characters with two pieces of state:
   afterBackslash - the previous character was an unconsumed backslash
   insideClass    - the scan is currently between "[" and the next unescaped "]"
   The checks run in a fixed priority: pending escape, backslash, inside-class, "[", ".".
   Only a "." reached with both flags off is rewritten to "[\s\S]". *)
convertDotAllDots[ s_String ] :=
    Module[ { insideClass = False, afterBackslash = False, rewriteChar },
        rewriteChar = Function[ ch,
            If[ afterBackslash,
                (* Character following a backslash: emit verbatim and clear the escape *)
                (afterBackslash = False; ch),
                If[ ch === "\\",
                    (afterBackslash = True; ch),
                    If[ insideClass,
                        (* Inside a class only "]" matters: it closes the class *)
                        (If[ ch === "]", insideClass = False ]; ch),
                        Switch[ ch,
                            "[", (insideClass = True; ch),
                            ".", "[\\s\\S]",
                            _  , ch
                        ]
                    ]
                ]
            ]
        ];
        StringJoin[ rewriteChar /@ Characters @ s ]
    ];
336+
337+
convertDotAllDots // endDefinition;
338+
116339
(* ::**************************************************************************************************************:: *)
117340
(* ::Section::Closed:: *)
118341
(*Package Footer*)

TODO.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Consolidated list of TODO/FIXME items from the codebase.
1313
- Sets environment variables with lists of tool names to include/exclude for easy customization
1414
- [ ] Add `"DisplayName"` property to MCP servers
1515
- When installing, uses the display name as the config key, but keeps the `MCP_SERVER_NAME` environment variable as the canonical name
16+
- [ ] Warn on unhandled regex patterns when creating an MCP server
17+
- Attempt `toJSRegex` conversion on tool schema regex patterns and issue a warning for any PCRE-only constructs (e.g. atomic groups, possessive quantifiers, named groups) that are left unconverted and are therefore likely JS-incompatible
18+
- Only a warning since many schema validators will still accept the pattern as-is
19+
- Source: `Kernel/Utilities.wl`
1620

1721
## MCP Protocol Support
1822

0 commit comments

Comments
 (0)