1414from itertools import chain
1515from typing import AbstractSet , Any , Optional , Union
1616
17+ from elementpath .helpers import LazyPattern
1718from .codepoints import RegexError
1819from .unicode_subsets import UnicodeSubset , lazy_subset , unicode_subset , unicode_category
1920
@@ -87,6 +88,29 @@ def w_shortcut() -> UnicodeSubset:
8788 '\\ W' : w_shortcut ,
8889}
8990
# Matches the escapes that may appear inside a character class: single-char
# escapes (\n, \r, \t, \-, \d, \w, ...) and the Unicode category/block escapes
# \p{...} / \P{...}. The whole alternation is one capturing group, so that
# splitting on it is expected to keep the escapes as separate items.
# NOTE(review): assumes LazyPattern.split() behaves like re.Pattern.split().
re_char_set = LazyPattern(r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')


def get_charset_parts(charset: str) -> list[str]:
    """
    Split the body of a character class into parts, keeping ranges whole.

    The charset is split on the escapes matched by `re_char_set` and the
    resulting tokens are rejoined where a range would otherwise be cut:

      * a token ending with '-' (other than the escaped hyphen '\\-') is held
        back and prefixed to the next token, e.g. 'a-' + '\\]' -> 'a-\\]';
      * a token starting with '-' is appended to the previous part, if there
        is one, e.g. '\\t' + '-z' -> '\\t-z'.

    :param charset: the character class body to be split.
    :return: the list of non-empty parts, in their original order.
    """
    parts: list[str] = []
    pending = ''  # token ending with '-', waiting for the range upper bound

    for token in re_char_set.split(charset):
        if not token:
            # Empty strings are produced by split() between adjacent escapes
            # and at the charset boundaries.
            continue

        if token.endswith('-') and token != '\\-':
            # Possibly the start of a range: wait for the next token. The
            # escaped hyphen is a literal character, not a range operator.
            pending = token
        elif token.startswith('-') and parts:
            # Range whose lower bound is the previous part.
            parts[-1] += pending + token
            pending = ''
        else:
            parts.append(pending + token)
            pending = ''

    # A trailing token ending with '-' has nothing to join with: keep it as is.
    if pending:
        parts.append(pending)

    return parts

113+
90114
91115class CharacterClass (MutableSet [int ]):
92116 """
@@ -96,7 +120,7 @@ class CharacterClass(MutableSet[int]):
96120 :param xsd_version: the reference XSD version for syntax variants. Defaults to '1.0'.
97121 TODO: implement __ior__, __iand__, __ixor__ operators for a full mutable set class.
98122 """
99- _re_char_set = re .compile (r'(?<!.-)( \\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})' )
123+ _re_char_set = re .compile (r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})' )
100124 _re_unicode_ref = re .compile (r'\\([pP]){([\w-]+)}' )
101125
102126 __slots__ = 'xsd_version' , 'positive' , 'negative'
@@ -172,7 +196,7 @@ def add(self, charset: Union[int, str]) -> None:
172196 if isinstance (charset , int ):
173197 charset = chr (charset )
174198
175- for part in self . _re_char_set . split (charset ):
199+ for part in get_charset_parts (charset ):
176200 if part in CHARACTER_ESCAPES :
177201 value = CHARACTER_ESCAPES [part ]
178202 if isinstance (value , str ):
@@ -204,7 +228,7 @@ def discard(self, charset: Union[int, str]) -> None:
204228 if isinstance (charset , int ):
205229 charset = chr (charset )
206230
207- for part in self . _re_char_set . split (charset ):
231+ for part in get_charset_parts (charset ):
208232 if part in CHARACTER_ESCAPES :
209233 value = CHARACTER_ESCAPES [part ]
210234 if isinstance (value , str ):
0 commit comments