@@ -113,6 +113,229 @@ chatbookVersionCheck0[ other_ ] := Enclose[
113113
114114chatbookVersionCheck0 // endDefinition ;
115115
116+ (* ::**************************************************************************************************************:: *)
117+ (* ::Section::Closed:: *)
118+ (*Regular Expressions*)
119+
120+ (* ::**************************************************************************************************************:: *)
121+ (* ::Subsection::Closed:: *)
122+ (*toJSRegex*)
123+
(* Convert an ICU/PCRE-flavored regex (as produced by StringPattern`PatternConvert) into a
   best-effort ECMA-262 pattern suitable for JSON Schema "pattern" fields consumed by
   JavaScript-based validators.

   Intentional non-goals and accepted limitations:

   - No attempt is made to simulate multiline `^`/`$`: if the original had `m` and contained raw
     `^`/`$` (from `StartOfLine`/`EndOfLine`), the converted pattern treats them as
     start/end-of-string in JS. Schema patterns rarely use line anchors, and any workaround
     (`(?:^|(?<=\n))`) bloats the output and risks validator compatibility issues.

   - PCRE-only constructs that pass through untouched from user-supplied `RegularExpression[...]`
     bodies (atomic groups `(?>...)`, possessive quantifiers `*+`, named groups `(?P<x>...)`) are
     left alone. Those are user escape hatches; if a user puts PCRE-only syntax in a schema
     pattern, they own the compatibility.

   - The `u` flag is not available to us: we produce bare pattern strings for JSON Schema
     consumers who may or may not set it.
*)
142+
toJSRegex // beginDefinition;

(* Runs the conversion passes in a fixed order. The leading flag group is split off first; then
   inline modifier prefixes, POSIX classes, PCRE anchors and \x{...} escapes are rewritten in
   turn. Dots are expanded last, and only when the leading flag group contained "s". A pass that
   does not return a string is reported through throwInternalFailure. *)
toJSRegex[ regex_String ] := Enclose[
    Module[ { pattern, dotAll },
        { pattern, dotAll } = ConfirmMatch[
            extractLeadingRegexFlags @ regex,
            { _String, True|False },
            "Extract"
        ];

        pattern = ConfirmBy[ stripInnerRegexModifiers @ pattern, StringQ, "StripInnerRegexModifiers" ];
        pattern = ConfirmBy[ convertPOSIXClasses @ pattern     , StringQ, "ConvertPOSIXClasses"      ];
        pattern = ConfirmBy[ convertPCREAnchors @ pattern      , StringQ, "ConvertPCREAnchors"       ];
        pattern = ConfirmBy[ convertUnicodeEscapes @ pattern   , StringQ, "ConvertUnicodeEscapes"    ];

        (* Without a leading "s" flag the pattern is already final. *)
        If[ ! dotAll,
            pattern,
            ConfirmBy[ convertDotAllDots @ pattern, StringQ, "ConvertDotAllDots" ]
        ]
    ],
    throwInternalFailure
];

toJSRegex // endDefinition;
163+
164+ (* TODO: When creating an MCP server, we could attempt this conversion and issue a warning message if there are any
165+ unhandled patterns. This would only be a warning since many schema validators will still accept the pattern as-is. *)
166+
167+ (* ::**************************************************************************************************************:: *)
168+ (* ::Subsubsection::Closed:: *)
169+ (*extractLeadingRegexFlags*)
(* Split a leading "(?flags)" group (letters only) off the pattern text.
   Returns { body, hadDotAll }: body is the text after the group, and hadDotAll is True when the
   flag letters include "s". When there is no such leading group the input is returned unchanged
   together with False. *)
extractLeadingRegexFlags // beginDefinition;

extractLeadingRegexFlags[ s_String ] :=
    Replace[
        StringCases[
            s,
            StartOfString ~~ "(?" ~~ flags: (LetterCharacter..) ~~ ")" ~~ rest___ :> { flags, rest },
            1
        ],
        {
            (* A leading flag group was found: keep the remainder and report dotall. *)
            { { letters_, remainder_ }, ___ } :> { remainder, StringContainsQ[ letters, "s" ] },
            (* No leading flag group. *)
            _ :> { s, False }
        }
    ];

extractLeadingRegexFlags // endDefinition;
188+
189+ (* ::**************************************************************************************************************:: *)
190+ (* ::Subsubsection::Closed:: *)
191+ (*stripInnerRegexModifiers*)
(* Remove the scope-less "(?-...)" modifier that sits directly after the opening "(?:" of a
   non-capturing wrapper. Only the six listed spellings of the "-m"/"-s" modifier are removed, and
   only in that wrapper position; a modifier that appears anywhere else in the pattern is left
   exactly as written. *)
stripInnerRegexModifiers // beginDefinition;

stripInnerRegexModifiers[ s_String ] :=
    StringReplace[
        s,
        "(?:" ~~ ( "(?-m-s)" | "(?-s-m)" | "(?-ms)" | "(?-sm)" | "(?-s)" | "(?-m)" ) -> "(?:"
    ];

stripInnerRegexModifiers // endDefinition;
210+
211+ (* ::**************************************************************************************************************:: *)
212+ (* ::Subsubsection::Closed:: *)
213+ (*convertPOSIXClasses*)
(* Map POSIX character class tokens like "[:alpha:]" to JS-compatible class bodies. Only the
   inner token is replaced (not the surrounding brackets), so "[[:alpha:]]" becomes "[a-zA-Z]"
   and combined forms like "[[:alpha:][:digit:]]" become "[a-zA-Z0-9]".

   "[:word:]" and "[:ascii:]" are PCRE extensions to the POSIX set; they are converted as well,
   because JS would otherwise read the token as a list of literal characters.

   Negated tokens such as "[:^alpha:]" are not converted: a negation cannot be expressed by
   substituting the inner token alone. *)
convertPOSIXClasses // beginDefinition;

convertPOSIXClasses[ s_String ] := StringReplace[
    s,
    {
        "[:alpha:]"  -> "a-zA-Z",
        "[:digit:]"  -> "0-9",
        "[:alnum:]"  -> "a-zA-Z0-9",
        "[:upper:]"  -> "A-Z",
        "[:lower:]"  -> "a-z",
        "[:xdigit:]" -> "0-9a-fA-F",
        "[:space:]"  -> "\\s",
        "[:blank:]"  -> " \\t",
        "[:cntrl:]"  -> "\\x00-\\x1F\\x7F",
        "[:print:]"  -> "\\x20-\\x7E",
        "[:graph:]"  -> "\\x21-\\x7E",
        "[:punct:]"  -> "!-/:-@[-`{-~",
        "[:word:]"   -> "\\w",
        "[:ascii:]"  -> "\\x00-\\x7F"
    }
];

convertPOSIXClasses // endDefinition;
238+
239+ (* ::**************************************************************************************************************:: *)
240+ (* ::Subsubsection::Closed:: *)
241+ (*convertPCREAnchors*)
(* "\A" and "\z"/"\Z" are PCRE start/end-of-string anchors with no JS equivalent. JS "^"/"$"
   mean start/end-of-string when the regex has no "m" flag, which is our target since all flags
   are stripped for JSON Schema output.

   StringReplace tries the rules in order at each position, so the first rule consumes an escaped
   backslash pair and emits it unchanged. Without it, the regex text for "literal backslash
   followed by A" would have its second backslash paired with the "A" and be rewritten to an
   escaped caret, corrupting the pattern. *)
convertPCREAnchors // beginDefinition;

convertPCREAnchors[ s_String ] := StringReplace[
    s,
    {
        "\\\\" -> "\\\\",
        "\\A"  -> "^",
        "\\z"  -> "$",
        "\\Z"  -> "$"
    }
];

convertPCREAnchors // endDefinition;
253+
254+ (* ::**************************************************************************************************************:: *)
255+ (* ::Subsubsection::Closed:: *)
256+ (*convertUnicodeEscapes*)
(* Convert each "\x{HEX}" escape to the narrowest JS-valid form by handing the hex payload to
   convertHexEscape. "\xNN" and "\uNNNN" work without the JS "u" flag; supplementary code points
   (above U+FFFF) are emitted as UTF-16 surrogate pairs so the output stays valid without it.

   StringReplace tries the rules in order at each position, so the first rule consumes an escaped
   backslash pair and emits it unchanged. This keeps the regex text for "literal backslash
   followed by x{41}" from being mistaken for a code point escape. *)
convertUnicodeEscapes // beginDefinition;

convertUnicodeEscapes[ s_String ] := StringReplace[
    s,
    {
        "\\\\" -> "\\\\",
        "\\x{" ~~ hex: (HexadecimalCharacter..) ~~ "}" :> convertHexEscape @ hex
    }
];

convertUnicodeEscapes // endDefinition;
268+
269+ (* ::**************************************************************************************************************:: *)
270+ (* ::Subsubsection::Closed:: *)
271+ (*convertHexEscape*)
convertHexEscape // beginDefinition;

(* Turn the hex payload of a "\x{...}" escape into a JS escape.
   The branch is chosen from the numeric code point, not from the number of hex digits, because
   zero-padded payloads such as "0000A0" (U+00A0) are legal and must still take the short "\xNN"
   form. Values up to FF give "\xNN", values up to FFFF give "\uNNNN", and anything larger is
   handed (as the original hex text) to supplementaryToSurrogatePair. *)
convertHexEscape[ hex_String ] :=
    Module[ { codePoint, padded },
        codePoint = FromDigits[ hex, 16 ];

        (* Upper-case hex rendering of the code point, zero-padded to the given width. *)
        padded = Function[ width, ToUpperCase @ IntegerString[ codePoint, 16, width ] ];

        Which[
            codePoint > 16^^FFFF, supplementaryToSurrogatePair @ hex,
            codePoint > 16^^FF  , "\\u" <> padded[ 4 ],
            True                , "\\x" <> padded[ 2 ]
        ]
    ];

convertHexEscape // endDefinition;
287+
288+ (* ::**************************************************************************************************************:: *)
289+ (* ::Subsubsection::Closed:: *)
290+ (*supplementaryToSurrogatePair*)
(* Encode a supplementary-plane code point (U+10000..U+10FFFF), given as hex text, as two
   consecutive "\uXXXX" escapes forming its UTF-16 surrogate pair. A JS regex matches a
   supplementary character through its surrogate pair even without the "u" flag, so the output
   stays usable for JSON Schema validators that do not set it. A code point outside the
   supplementary range fails the assert and is reported through throwInternalFailure. *)
supplementaryToSurrogatePair // beginDefinition;

supplementaryToSurrogatePair[ hex_String ] := Enclose[
    Module[ { codePoint, delta, unitEscape },
        codePoint = FromDigits[ hex, 16 ];
        ConfirmAssert[ 16^^10000 <= codePoint <= 16^^10FFFF, "SupplementaryRange" ];

        (* 20-bit offset into the supplementary planes. *)
        delta = codePoint - 16^^10000;

        (* Render one UTF-16 code unit as an upper-case, four-digit "\u" escape. *)
        unitEscape = Function[ unit, "\\u" <> ToUpperCase @ IntegerString[ unit, 16, 4 ] ];

        StringJoin[
            unitEscape[ 16^^D800 + BitShiftRight[ delta, 10 ] ], (* high surrogate: top 10 bits *)
            unitEscape[ 16^^DC00 + BitAnd[ delta, 16^^3FF ] ]    (* low surrogate: bottom 10 bits *)
        ]
    ],
    throwInternalFailure
];

supplementaryToSurrogatePair // endDefinition;
311+
312+ (* ::**************************************************************************************************************:: *)
313+ (* ::Subsubsection::Closed:: *)
314+ (*convertDotAllDots*)
(* Replace every unescaped "." that lies outside a character class with "[\s\S]", preserving the
   dotall meaning of the stripped leading "(?s)" flag. The string is walked once, character by
   character, with two pieces of state: whether the previous character was an unconsumed
   backslash, and whether the walk is currently inside "[...]". Escaped dots and dots inside a
   class are emitted unchanged. *)
convertDotAllDots // beginDefinition;

convertDotAllDots[ s_String ] :=
    Module[ { pendingEscape = False, withinClass = False },
        StringJoin @ Map[
            Function[ ch,
                (* Cases are ordered by priority; the first matching state/character wins. *)
                Switch[ { pendingEscape, withinClass, ch },
                    { True, _, _ }   , pendingEscape = False; ch, (* character after a backslash *)
                    { _, _, "\\" }   , pendingEscape = True; ch,  (* backslash starts an escape *)
                    { _, True, "]" } , withinClass = False; ch,   (* end of character class *)
                    { _, True, _ }   , ch,                        (* anything else inside a class *)
                    { _, _, "[" }    , withinClass = True; ch,    (* start of character class *)
                    { _, _, "." }    , "[\\s\\S]",                (* bare dot: match any character *)
                    _                , ch
                ]
            ],
            Characters @ s
        ]
    ];

convertDotAllDots // endDefinition;
338+
116339(* ::**************************************************************************************************************:: *)
117340(* ::Section::Closed:: *)
118341(*Package Footer*)
0 commit comments