Skip to content

Commit 08bce2e

Browse files
committed
Fix issue #100
- Combine re.split with a merge of the parts when a part ends or starts with a hyphen
1 parent 8887f26 commit 08bce2e

3 files changed

Lines changed: 38 additions & 3 deletions

File tree

elementpath/helpers.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@ def search(self, string: str) -> re.Match[str] | None:
112112
self._compiled = re.compile(self._pattern, self._flags)
113113
return self._compiled.search(string)
114114

115+
def split(self, string: str) -> list[str]:
    """
    Split a string using the pattern, compiling the pattern on first use.

    :param string: the string to be split.
    :return: the list produced by the compiled pattern's split() method.
    """
    try:
        splitter = self._compiled.split
    except AttributeError:
        # The compiled pattern is not available yet: build it from the stored
        # pattern and flags, and keep it on the instance for later calls.
        self._compiled = re.compile(self._pattern, self._flags)
        splitter = self._compiled.split
    return splitter(string)
121+
115122

116123
class Patterns:
117124
"""

elementpath/regex/character_classes.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from itertools import chain
1515
from typing import AbstractSet, Any, Optional, Union
1616

17+
from elementpath.helpers import LazyPattern
1718
from .codepoints import RegexError
1819
from .unicode_subsets import UnicodeSubset, lazy_subset, unicode_subset, unicode_category
1920

@@ -87,6 +88,29 @@ def w_shortcut() -> UnicodeSubset:
8788
'\\W': w_shortcut,
8889
}
8990

91+
# Matches either a single-character escape (a backslash followed by one of the
# listed characters) or a \p{...} / \P{...} reference. The whole pattern is one
# capturing group, so splitting with it also returns the matched escapes, not
# only the text found between them.
re_char_set = LazyPattern(r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
92+
93+
94+
def get_charset_parts(charset: str) -> list[str]:
    """
    Break a character-set string into the parts to be processed one by one.

    The string is split with `re_char_set` and the empty fragments are dropped.
    A fragment that ends with a hyphen is held back and prepended to the next
    fragment; a fragment that starts with a hyphen is appended to the part
    emitted before it, if any. An escaped hyphen (a backslash followed by a
    hyphen) does not count as a fragment ending with a hyphen.

    :param charset: the source string of the character set.
    :return: the non-empty parts, in order of appearance.
    """
    chunks: list[str] = []
    pending = ''  # fragment with a trailing hyphen, waiting for its right-hand side

    for token in re_char_set.split(charset):
        if not token:
            continue

        if token != '\\-' and token.endswith('-'):
            # Trailing hyphen: keep the fragment until the next one arrives.
            pending = token
            continue

        if chunks and token.startswith('-'):
            # Leading hyphen: extend the part that was emitted last.
            chunks[-1] += pending + token
        else:
            chunks.append(pending + token)
        pending = ''

    # A fragment still held back after the last token is emitted as it is.
    if pending:
        chunks.append(pending)

    return chunks
113+
90114

91115
class CharacterClass(MutableSet[int]):
92116
"""
@@ -96,7 +120,7 @@ class CharacterClass(MutableSet[int]):
96120
:param xsd_version: the reference XSD version for syntax variants. Defaults to '1.0'.
97121
TODO: implement __ior__, __iand__, __ixor__ operators for a full mutable set class.
98122
"""
99-
_re_char_set = re.compile(r'(?<!.-)(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
123+
_re_char_set = re.compile(r'(\\[nrt|.\-^?*+{}()\]sSdDiIcCwW]|\\[pP]{[a-zA-Z\-0-9]+})')
100124
_re_unicode_ref = re.compile(r'\\([pP]){([\w-]+)}')
101125

102126
__slots__ = 'xsd_version', 'positive', 'negative'
@@ -172,7 +196,7 @@ def add(self, charset: Union[int, str]) -> None:
172196
if isinstance(charset, int):
173197
charset = chr(charset)
174198

175-
for part in self._re_char_set.split(charset):
199+
for part in get_charset_parts(charset):
176200
if part in CHARACTER_ESCAPES:
177201
value = CHARACTER_ESCAPES[part]
178202
if isinstance(value, str):
@@ -204,7 +228,7 @@ def discard(self, charset: Union[int, str]) -> None:
204228
if isinstance(charset, int):
205229
charset = chr(charset)
206230

207-
for part in self._re_char_set.split(charset):
231+
for part in get_charset_parts(charset):
208232
if part in CHARACTER_ESCAPES:
209233
value = CHARACTER_ESCAPES[part]
210234
if isinstance(value, str):

tests/test_regex.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,10 @@ def test_issue_079(self):
799799
self.assertIsNone(pattern.search('first\tsecond\tthird'))
800800
self.assertEqual(pattern.search('first second third').group(0), 'first second third')
801801

802+
def test_issue_100(self):
    # Regression check for issue #100: the translated pattern must not contain
    # a literal '\s', whether the escaped hyphen follows or precedes the '\s'
    # shortcut inside the character class.
    for source in (r"[\s\-',]{1,255}", r"[\-\s',]{1,255}"):
        self.assertNotIn('\\s', translate_pattern(source))
805+
802806
def test_dot_wildcard(self):
803807
regex = translate_pattern('.+', anchors=False)
804808
self.assertEqual(regex, '^([^\\r\\n]+)$(?!\\n\\Z)')

0 commit comments

Comments
 (0)