Fix issue #100

brunato · brunato · commit 08bce2e5a77e · 2026-06-07T22:27:38.000+02:00
- Mix re.split with a merge of the parts when a part ends or starts with a hyphen
diff --git a/elementpath/helpers.py b/elementpath/helpers.py
@@ -112,6 +112,13 @@ def search(self, string: str) -> re.Match[str] | None:
             self._compiled = re.compile(self._pattern, self._flags)
             return self._compiled.search(string)
 
+    def split(self, string: str) -> list[str]:  # split `string` by the lazily compiled pattern
+        try:
+            return self._compiled.split(string)
+        except AttributeError:  # presumably `_compiled` is unset before first use -- TODO confirm
+            self._compiled = re.compile(self._pattern, self._flags)  # compile once and cache
+            return self._compiled.split(string)
+
 
 class Patterns:
     """
diff --git a/elementpath/regex/character_classes.py b/elementpath/regex/character_classes.py
@@ -14,6 +14,7 @@
 from itertools import chain
 from typing import AbstractSet, Any, Optional, Union
 
+from elementpath.helpers import LazyPattern
 from .codepoints import RegexError
 from .unicode_subsets import UnicodeSubset, lazy_subset, unicode_subset, unicode_category
 
@@ -87,6 +88,29 @@ def w_shortcut() -> UnicodeSubset:
     '\\W': w_shortcut,
 }
 
+re_char_set = LazyPattern(r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
+
+
+def get_charset_parts(charset: str) -> list[str]:
+    """Split a charset on escape sequences, rejoining pieces cut at a hyphen."""
+    chunks: list[str] = []
+    pending = ''  # a fragment ending with '-' that waits for the next piece
+    for token in filter(None, re_char_set.split(charset)):
+        if token != '\\-' and token.endswith('-'):
+            pending = token
+            continue
+        if chunks and token.startswith('-'):
+            # The token begins with a hyphen: glue it to the previous chunk.
+            chunks[-1] += pending + token
+        else:
+            chunks.append(pending + token)
+        pending = ''
+
+    # A trailing fragment ending with '-' has nothing to join: keep it as is.
+    if pending:
+        chunks.append(pending)
+    return chunks
+
 
 class CharacterClass(MutableSet[int]):
     """
@@ -96,7 +120,7 @@ class CharacterClass(MutableSet[int]):
     :param xsd_version: the reference XSD version for syntax variants. Defaults to '1.0'.
     TODO: implement __ior__, __iand__, __ixor__ operators for a full mutable set class.
     """
-    _re_char_set = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
+    _re_char_set = re.compile(r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
     _re_unicode_ref = re.compile(r'\\([pP]){([\w-]+)}')
 
     __slots__ = 'xsd_version', 'positive', 'negative'
@@ -172,7 +196,7 @@ def add(self, charset: Union[int, str]) -> None:
         if isinstance(charset, int):
             charset = chr(charset)
 
-        for part in self._re_char_set.split(charset):
+        for part in get_charset_parts(charset):
             if part in CHARACTER_ESCAPES:
                 value = CHARACTER_ESCAPES[part]
                 if isinstance(value, str):
@@ -204,7 +228,7 @@ def discard(self, charset: Union[int, str]) -> None:
         if isinstance(charset, int):
             charset = chr(charset)
 
-        for part in self._re_char_set.split(charset):
+        for part in get_charset_parts(charset):
             if part in CHARACTER_ESCAPES:
                 value = CHARACTER_ESCAPES[part]
                 if isinstance(value, str):
diff --git a/tests/test_regex.py b/tests/test_regex.py
@@ -799,6 +799,10 @@ def test_issue_079(self):
         self.assertIsNone(pattern.search('first\tsecond\tthird'))
         self.assertEqual(pattern.search('first second third').group(0), 'first second third')
 
+    def test_issue_100(self):  # '\s' beside an escaped hyphen must not stay untranslated
+        for pattern in (r"[\s\-',]{1,255}", r"[\-\s',]{1,255}"):
+            self.assertNotIn('\\s', translate_pattern(pattern))
+
     def test_dot_wildcard(self):
         regex = translate_pattern('.+', anchors=False)
         self.assertEqual(regex, '^([^\\r\\n]+)$(?!\\n\\Z)')