Skip to content

Commit f40f163

Browse files
committed
memoize GlobPathMatcher to remove exponential cliff
The wildcard matcher previously made ~280k recursive calls on a pattern like root[**][**][**][**][**]['x'] against a 20-deep target, because the ** backtracking re-explored the same overlapping (pi, ti) states over and over. Add a per-call memo dict keyed on (pi, ti) to both _match_segments and _could_match_descendant, so each state is computed at most once and the number of distinct states evaluated is bounded by O(len(pattern) * len(target)) (a ** state still loops over the remaining target positions, but every recursive call it makes is now a memo hit or a first-time computation). The same input now uses ~940 calls. Also collapse match_or_is_descendant from an O(len(target)) loop over every prefix into a single _match_segments pass with allow_extra_target=True, which is equivalent to 'pattern matches some prefix of target'. Switch sentinel comparisons from == to is (the wildcards are module-level singletons, so this saves the _WildcardToken.__eq__ call on the hot path). Add two regression tests asserting bounded call counts so this can't silently regress.
1 parent 277c89c commit f40f163

2 files changed

Lines changed: 102 additions & 48 deletions

File tree

deepdiff/path.py

Lines changed: 59 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -389,89 +389,100 @@ def __init__(self, pattern_path):
389389

390390
def match(self, path_string):
391391
"""Return True if *path_string* matches this pattern exactly."""
392-
elements = _path_to_elements(path_string, root_element=('root', GETATTR))
393-
target = elements[1:]
394-
return self._match_segments(self._pattern, target, 0, 0)
392+
target = _path_to_elements(path_string, root_element=('root', GETATTR))[1:]
393+
return self._match_segments(target, 0, 0, {}, allow_extra_target=False)
395394

396395
def match_or_is_ancestor(self, path_string):
397396
"""Return True if *path_string* matches OR is an ancestor of a potential match.
398397
399398
This is needed for ``include_paths``: we must not prune a path that
400399
could lead to a matching descendant.
401400
"""
402-
elements = _path_to_elements(path_string, root_element=('root', GETATTR))
403-
target = elements[1:]
404-
return (self._match_segments(self._pattern, target, 0, 0) or
405-
self._could_match_descendant(self._pattern, target, 0, 0))
401+
target = _path_to_elements(path_string, root_element=('root', GETATTR))[1:]
402+
memo = {}
403+
return (self._match_segments(target, 0, 0, memo, allow_extra_target=False)
404+
or self._could_match_descendant(target, 0, 0, {}))
406405

407406
def match_or_is_descendant(self, path_string):
408407
"""Return True if *path_string* matches OR is a descendant of a matching path.
409408
410-
This checks whether the pattern matches any prefix of *path_string*,
411-
meaning the path is "inside" a matched subtree.
409+
Equivalent to: the pattern matches some prefix of *path_string*.
412410
"""
413-
elements = _path_to_elements(path_string, root_element=('root', GETATTR))
414-
target = elements[1:]
415-
# Check exact match first
416-
if self._match_segments(self._pattern, target, 0, 0):
417-
return True
418-
# Check if any prefix of target matches (making this path a descendant)
419-
for length in range(len(target)):
420-
if self._match_segments(self._pattern, target[:length], 0, 0):
421-
return True
422-
return False
423-
424-
@staticmethod
425-
def _match_segments(pattern, target, pi, ti):
426-
"""Recursive segment matcher with backtracking for ``**``."""
427-
while pi < len(pattern) and ti < len(target):
428-
pat_elem = pattern[pi][0]
411+
target = _path_to_elements(path_string, root_element=('root', GETATTR))[1:]
412+
return self._match_segments(target, 0, 0, {}, allow_extra_target=True)
413+
414+
def _match_segments(self, target, pi, ti, memo, allow_extra_target):
415+
"""Recursive segment matcher with backtracking for ``**``.
429416
430-
if pat_elem == MULTI_WILDCARD:
417+
``memo`` is a per-top-level-call dict keyed by ``(pi, ti)`` so each
418+
state is computed at most once — turns the worst case from
419+
exponential to ``O(len(pattern) * len(target))``.
420+
"""
421+
key = (pi, ti)
422+
if key in memo:
423+
return memo[key]
424+
pattern = self._pattern
425+
target_len = len(target)
426+
pattern_len = len(pattern)
427+
428+
while pi < pattern_len and ti < target_len:
429+
pat_elem = pattern[pi][0]
430+
if pat_elem is MULTI_WILDCARD:
431431
# ** matches zero or more segments — try every suffix
432-
for k in range(ti, len(target) + 1):
433-
if GlobPathMatcher._match_segments(pattern, target, pi + 1, k):
432+
for k in range(ti, target_len + 1):
433+
if self._match_segments(target, pi + 1, k, memo, allow_extra_target):
434+
memo[key] = True
434435
return True
436+
memo[key] = False
435437
return False
436-
elif pat_elem == SINGLE_WILDCARD:
437-
# * matches exactly one segment regardless of value/action
438+
elif pat_elem is SINGLE_WILDCARD:
438439
pi += 1
439440
ti += 1
440441
else:
441-
tgt_elem = target[ti][0]
442-
if pat_elem != tgt_elem:
442+
if pat_elem != target[ti][0]:
443+
memo[key] = False
443444
return False
444445
pi += 1
445446
ti += 1
446447

447448
# Consume any trailing ** (they can match zero segments)
448-
while pi < len(pattern) and pattern[pi][0] == MULTI_WILDCARD:
449+
while pi < pattern_len and pattern[pi][0] is MULTI_WILDCARD:
449450
pi += 1
450451

451-
return pi == len(pattern) and ti == len(target)
452+
if allow_extra_target:
453+
result = pi == pattern_len
454+
else:
455+
result = pi == pattern_len and ti == target_len
456+
memo[key] = result
457+
return result
452458

453-
@staticmethod
454-
def _could_match_descendant(pattern, target, pi, ti):
459+
def _could_match_descendant(self, target, pi, ti, memo):
455460
"""Check if *target* is a prefix that could lead to a match deeper down."""
461+
key = (pi, ti)
462+
if key in memo:
463+
return memo[key]
464+
pattern = self._pattern
456465
if ti == len(target):
457-
# Target exhausted — it's an ancestor if pattern has remaining segments
458-
return pi < len(pattern)
459-
466+
result = pi < len(pattern)
467+
memo[key] = result
468+
return result
460469
if pi >= len(pattern):
470+
memo[key] = False
461471
return False
462472

463473
pat_elem = pattern[pi][0]
464-
465-
if pat_elem == MULTI_WILDCARD:
466-
return (GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti) or
467-
GlobPathMatcher._could_match_descendant(pattern, target, pi, ti + 1))
468-
elif pat_elem == SINGLE_WILDCARD:
469-
return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1)
474+
if pat_elem is MULTI_WILDCARD:
475+
result = (self._could_match_descendant(target, pi + 1, ti, memo)
476+
or self._could_match_descendant(target, pi, ti + 1, memo))
477+
elif pat_elem is SINGLE_WILDCARD:
478+
result = self._could_match_descendant(target, pi + 1, ti + 1, memo)
470479
else:
471-
tgt_elem = target[ti][0]
472-
if pat_elem != tgt_elem:
480+
if pat_elem != target[ti][0]:
481+
memo[key] = False
473482
return False
474-
return GlobPathMatcher._could_match_descendant(pattern, target, pi + 1, ti + 1)
483+
result = self._could_match_descendant(target, pi + 1, ti + 1, memo)
484+
memo[key] = result
485+
return result
475486

476487

477488
def compile_glob_paths(paths):

tests/test_glob_paths.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -717,3 +717,46 @@ def test_mixed_exact_include_and_glob_include(self):
717717
assert "root['b']['x']" in changed
718718
# root['b']['y'] is NOT covered by either
719719
assert "root['b']['y']" not in changed
720+
721+
722+
class TestGlobMatcherPerformance:
723+
"""Guards against exponential blowup in the glob matcher.
724+
725+
Memoization caps the recursion at O(len(pattern) * len(target)) states.
726+
Without it, patterns with multiple ``**`` and long targets exploded into
727+
hundreds of thousands of recursive calls.
728+
"""
729+
730+
def _instrument(self, monkeypatch):
731+
counts = {'ms': 0, 'cmd': 0}
732+
orig_ms = GlobPathMatcher._match_segments
733+
orig_cmd = GlobPathMatcher._could_match_descendant
734+
735+
def wrap_ms(self, *a, **k):
736+
counts['ms'] += 1
737+
return orig_ms(self, *a, **k)
738+
739+
def wrap_cmd(self, *a, **k):
740+
counts['cmd'] += 1
741+
return orig_cmd(self, *a, **k)
742+
743+
monkeypatch.setattr(GlobPathMatcher, '_match_segments', wrap_ms)
744+
monkeypatch.setattr(GlobPathMatcher, '_could_match_descendant', wrap_cmd)
745+
return counts
746+
747+
def test_match_or_is_descendant_bounded(self, monkeypatch):
748+
# Pre-memoization this exact case made ~280k recursive calls.
749+
counts = self._instrument(monkeypatch)
750+
m = GlobPathMatcher('root' + '[**]' * 5 + "['x']")
751+
target = 'root' + ''.join(f'[{i}]' for i in range(20))
752+
m.match_or_is_descendant(target)
753+
assert counts['ms'] < 5000, f"_match_segments call count regressed: {counts['ms']}"
754+
755+
def test_match_or_is_ancestor_bounded(self, monkeypatch):
756+
counts = self._instrument(monkeypatch)
757+
m = GlobPathMatcher('root' + '[**]' * 8)
758+
target = 'root' + ''.join(f'[{i}]' for i in range(40)) + "['extra']"
759+
m.match_or_is_ancestor(target)
760+
assert counts['ms'] + counts['cmd'] < 5000, (
761+
f"call count regressed: ms={counts['ms']}, cmd={counts['cmd']}"
762+
)

0 commit comments

Comments
 (0)