@@ -18,7 +18,7 @@ public class ParserSettings
1818{
// Characters registered through AddCustomSelectorChars.
private readonly List<char> _customSelectorChars = new();

// Custom operator characters; exposed to the assembly through CustomOperatorChars.
private readonly List<char> _customOperatorChars = new();

// Selected selector character filter; the initial value is the alphanumeric allowlist.
// NOTE(review): presumably the backing field of SelectorCharFilter - the accessor bodies are not visible here.
private SelectorFilterType _selectorCharFilter = SelectorFilterType.Alphanumeric;

// Standard allowlist: digits, lower and upper case ASCII letters, plus '_' and '-'.
private const string StandardAllowlist = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-";
2424
@@ -40,17 +40,21 @@ public class ParserSettings
internal List<char> CustomOperatorChars
{
    // Hands out the backing list itself (not a copy).
    get { return _customOperatorChars; }
}
4141
4242 /// <summary>
43- /// When <see cref="FilterType.Allowlist "/> (default) is set, an allowlist of selector characters is used.
43+ /// When <see cref="SelectorFilterType.Alphanumeric "/> (default) is set, an allowlist of selector characters is used.
4444 /// The allowlist contains alphanumeric characters (upper and lower case), plus '_' and '-'.
4545 /// On top, any custom selector characters added with <see cref="AddCustomSelectorChars"/> are included.
4646 /// <para/>
47- /// When <see cref="FilterType.Blocklist"/>, all Unicode characters are allowed in a selector,
48- /// except control characters (ASCII 0-31 and 127). Excluded control characters can be added back
49- /// using <see cref="AddCustomSelectorChars"/>.
47+ /// When <see cref="SelectorFilterType.VisualUnicodeChars"/> is set, all Unicode characters are allowed in a selector,
48+ /// except 68 non-visual characters: Control Characters (U+0000–U+001F, U+007F), Format Characters (Category: Cf),
49+ /// Directional Formatting (Category: Cf), Invisible Separator, Common Combining Marks (Category: Mn),
50+ /// Whitespace Characters (non-glyph spacing).<br/>
51+ /// Excluded characters can be added back using <see cref="AddCustomSelectorChars"/>.
52+ /// <para/>
53+ /// {}[]()\.? are characters with special functions that are never allowed.
5054 /// <para/>
5155 /// Changing this setting clears any custom operator characters added with <see cref="AddCustomOperatorChars"/>.
5256 /// </summary>
53- public FilterType SelectorCharFilter
57+ public SelectorFilterType SelectorCharFilter
5458 {
5559 get
5660 {
@@ -68,7 +72,7 @@ public FilterType SelectorCharFilter
6872 /// This can be an allowlist, which contains explicitly allowed characters,
6973 /// or a blocklist, when all Unicode characters are allowed, except those from the blocklist.
7074 /// </summary>
internal CharSet GetSelectorChars()
{
    // Only the alphanumeric filter is backed by an allowlist;
    // any other filter value results in a blocklist.
    if (SelectorCharFilter == SelectorFilterType.Alphanumeric)
    {
        return CreateAllowlist();
    }

    return CreateBlocklist();
}
7276
7377 private CharSet CreateBlocklist ( )
7478 {
@@ -79,8 +83,7 @@ private CharSet CreateBlocklist()
7983 chars . AddRange ( SelectorDelimitingChars . AsSpan ( ) ) ;
8084 chars . AddRange ( OperatorChars . AsSpan ( ) ) ; // no overlaps
8185 chars . AddRange ( _customOperatorChars ) ; // no overlaps
82- // Hard to visualize and debug, disallow by default - can be added back as custom selector chars
83- chars . AddRange ( ControlChars ( ) ) ;
86+ chars . AddRange ( NonVisualUnicodeCharacters . AsSpan ( ) ) ;
8487
8588 // Remove characters used as custom selector chars from the blocklist
8689 foreach ( var c in _customSelectorChars ) chars . Remove ( c ) ;
@@ -209,13 +212,90 @@ private CharSet CreateAllowlist()
209212 ] ;
210213
/// <summary>
/// The 68 non-visual Unicode characters that are put on the selector blocklist:
/// control characters, format characters, directional formatting characters, the invisible separator,
/// common combining marks and non-glyph whitespace characters.
/// Individual characters can be taken off the blocklist again with <see cref="AddCustomSelectorChars"/>.
/// <para/>
/// The field is <see langword="readonly"/>, so the shared array cannot be replaced by another instance.
/// The array elements themselves remain writable and must be treated as read-only by convention.
/// </summary>
internal static readonly char[] NonVisualUnicodeCharacters =
[
    // Control Characters (U+0000–U+001F, U+007F)
    '\u0000', // NULL – string terminator
    '\u0001', // START OF HEADING – protocol control
    '\u0002', // START OF TEXT – protocol control
    '\u0003', // END OF TEXT – protocol control
    '\u0004', // END OF TRANSMISSION – protocol control
    '\u0005', // ENQUIRY – request for response
    '\u0006', // ACKNOWLEDGE – positive response
    '\u0007', // BELL – triggers alert
    '\u0008', // BACKSPACE – moves cursor back
    '\u0009', // CHARACTER TABULATION – horizontal tab
    '\u000A', // LINE FEED – line break
    '\u000B', // LINE TABULATION – vertical tab
    '\u000C', // FORM FEED – page break
    '\u000D', // CARRIAGE RETURN – return to line start
    '\u000E', // SHIFT OUT – alternate character set
    '\u000F', // SHIFT IN – return to standard set
    '\u0010', // DATA LINK ESCAPE – protocol framing
    '\u0011', // DEVICE CONTROL 1 – device-specific
    '\u0012', // DEVICE CONTROL 2 – device-specific
    '\u0013', // DEVICE CONTROL 3 – device-specific
    '\u0014', // DEVICE CONTROL 4 – device-specific
    '\u0015', // NEGATIVE ACKNOWLEDGE – error signal
    '\u0016', // SYNCHRONOUS IDLE – timing control
    '\u0017', // END OF TRANSMISSION BLOCK – block end
    '\u0018', // CANCEL – cancel transmission
    '\u0019', // END OF MEDIUM – physical medium end
    '\u001A', // SUBSTITUTE – invalid character
    '\u001B', // ESCAPE – escape sequence initiator
    '\u001C', // FILE SEPARATOR – data structuring
    '\u001D', // GROUP SEPARATOR – data structuring
    '\u001E', // RECORD SEPARATOR – data structuring
    '\u001F', // UNIT SEPARATOR – data structuring
    '\u007F', // DELETE – erase character

    // Format Characters (Category: Cf)
    '\u200B', // ZERO WIDTH SPACE – invisible space
    '\u200C', // ZERO WIDTH NON-JOINER – prevents ligature
    '\u200D', // ZERO WIDTH JOINER – forces ligature
    '\u2060', // WORD JOINER – prevents line break
    '\uFEFF', // ZERO WIDTH NO-BREAK SPACE – BOM or NBSP

    // Directional Formatting (Category: Cf)
    '\u202A', // LEFT-TO-RIGHT EMBEDDING – sets LTR context
    '\u202B', // RIGHT-TO-LEFT EMBEDDING – sets RTL context
    '\u202C', // POP DIRECTIONAL FORMATTING – ends override
    '\u202D', // LEFT-TO-RIGHT OVERRIDE – forces LTR rendering
    '\u202E', // RIGHT-TO-LEFT OVERRIDE – forces RTL rendering
    '\u2066', // LEFT-TO-RIGHT ISOLATE – isolates LTR segment
    '\u2067', // RIGHT-TO-LEFT ISOLATE – isolates RTL segment
    '\u2068', // FIRST STRONG ISOLATE – isolates with inferred direction
    '\u2069', // POP DIRECTIONAL ISOLATE – ends isolate

    // Invisible Separator
    '\u2063', // INVISIBLE SEPARATOR – semantic boundary marker

    // Common Combining Marks (Category: Mn)
    '\u0300', // COMBINING GRAVE ACCENT – diacritic (invisible alone)
    '\u0301', // COMBINING ACUTE ACCENT – diacritic (invisible alone)
    '\u0302', // COMBINING CIRCUMFLEX ACCENT – diacritic (invisible alone)
    '\u0308', // COMBINING DIAERESIS – diacritic (invisible alone)

    // Whitespace Characters (non-glyph spacing)
    '\u00A0', // NO-BREAK SPACE – non-breaking space
    '\u1680', // OGHAM SPACE MARK – special spacing
    '\u2000', // EN QUAD – fixed-width space
    '\u2001', // EM QUAD – fixed-width space
    '\u2002', // EN SPACE – fixed-width space
    '\u2003', // EM SPACE – fixed-width space
    '\u2004', // THREE-PER-EM SPACE – narrow space
    '\u2005', // FOUR-PER-EM SPACE – narrow space
    '\u2006', // SIX-PER-EM SPACE – narrow space
    '\u2007', // FIGURE SPACE – aligns digits
    '\u2008', // PUNCTUATION SPACE – aligns punctuation
    '\u2009', // THIN SPACE – narrow space
    '\u200A', // HAIR SPACE – ultra-thin space
    '\u202F', // NARROW NO-BREAK SPACE – narrow NBSP
    '\u205F', // MEDIUM MATHEMATICAL SPACE – math spacing
    '\u3000'  // IDEOGRAPHIC SPACE – full-width CJK space
];
219299
220300 /// <summary>
221301 /// Add a list of allowable selector characters on top of the default selector characters.
@@ -225,26 +305,23 @@ internal static IEnumerable<char> ControlChars()
225305 /// On top, any custom selector characters added with <see cref="AddCustomSelectorChars"/> are included.
226306 /// <para/>
227307 /// When <see cref="SelectorCharFilter"/> is <see cref="SelectorFilterType.VisualUnicodeChars"/>, all Unicode characters are allowed in a selector,
228- /// except control characters (ASCII 0-31 and 127). Excluded control characters can be added back
229- /// using <see cref="AddCustomSelectorChars"/>.
308+ /// except 68 non-visual characters. Excluded characters can be added back using <see cref="AddCustomSelectorChars"/>.
230309 /// <para/>
231310 /// Operator chars and selector chars must be different.
232311 /// </summary>
public void AddCustomSelectorChars(IList<char> characters)
{
    // Fail with a descriptive exception instead of a NullReferenceException from the loop below.
    if (characters is null)
    {
        throw new ArgumentNullException(nameof(characters));
    }

    // Characters are processed in order: when one of them is rejected, the characters
    // accepted before it have already been recorded.
    foreach (var c in characters)
    {
        // Selector delimiters, the escape character and (custom) operator characters
        // can never be used as selector characters.
        if (SelectorDelimitingChars.Contains(c) || c == CharLiteralEscapeChar
            || OperatorChars.Contains(c) || CustomOperatorChars.Contains(c))
        {
            throw new ArgumentException($"Cannot add '{c}' as a custom selector character. It is disallowed or in use as an operator character.");
        }

        // Already recorded: adding it again would only create a duplicate entry.
        if (_customSelectorChars.Contains(c))
        {
            continue;
        }

        // Non-visual characters are recorded for every filter type, because the blocklist
        // contains them unless they are listed as custom selector characters.
        var isNonVisual = NonVisualUnicodeCharacters.Contains(c);

        // Any other character is only recorded for the alphanumeric allowlist,
        // and only when the standard allowlist does not already contain it.
        var extendsAllowlist = SelectorCharFilter == SelectorFilterType.Alphanumeric
                               && !StandardAllowlist.Contains(c);

        if (isNonVisual || extendsAllowlist)
        {
            _customSelectorChars.Add(c);
        }
    }
}
250327
0 commit comments