Skip to content

Commit 8a607be

Browse files
committed
Implemented the cache replacement.
Changed:
- Replaced the homegrown linked-list LFU implementation in deepdiff/lfucache.py with a small DistanceCache wrapper over the native cachebox.LRUCache.
- Kept the LFUCache = DistanceCache alias and the DummyLFU compatibility name so internal imports keep working.
- Updated the cache hot paths in deepdiff/diff.py to do a single get() and compare against not_found, instead of a membership test (`key in cache`) followed by get() — i.e. one lookup instead of two.
- Moved cachebox>=5.2,<6 into the core dependencies in pyproject.toml, since DeepDiff now imports it unconditionally.
- Updated tests/test_lfucache.py to validate the new bounded distance-cache behavior instead of LFU frequency internals.

Benchmark results from the same 1,000,000-operation local microbenchmark:
- Old homegrown LFUCache: 1.901302s
- Direct cachebox.LFUCache: 5.846142s
- Direct cachebox.LRUCache: 0.537102s
- New DistanceCache wrapper: 1.153068s

So I used cachebox.LRUCache, not cachebox.LFUCache, because cachebox’s LFU policy is slower for this workload.
1 parent b697d65 commit 8a607be

4 files changed

Lines changed: 65 additions & 222 deletions

File tree

deepdiff/diff.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
from enum import Enum
1414
from copy import deepcopy
1515
from math import isclose as is_close
16-
from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol, Literal
16+
from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol, Literal, cast
1717
from collections.abc import Mapping, Iterable, Sequence
1818
from collections import defaultdict
1919
from inspect import getmembers
2020
from itertools import zip_longest
2121
from functools import lru_cache
22-
from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent,
22+
from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent, not_found,
2323
IndexedHash, unprocessed, add_to_frozen_set, basic_types,
2424
convert_item_or_items_into_set_else_none, get_type,
2525
convert_item_or_items_into_compiled_regexes_else_none,
@@ -1210,9 +1210,12 @@ def _get_rough_distance_of_hashed_objs(
12101210
_distance = cache_key = None
12111211
if self._stats[DISTANCE_CACHE_ENABLED]:
12121212
cache_key = self._get_distance_cache_key(added_hash, removed_hash)
1213-
if cache_key in self._distance_cache:
1213+
cached_distance = self._distance_cache.get(cache_key)
1214+
if cached_distance is not_found:
1215+
_distance = None
1216+
else:
12141217
self._stats[DISTANCE_CACHE_HIT_COUNT] += 1
1215-
_distance = self._distance_cache.get(cache_key)
1218+
_distance = cast(float, cached_distance)
12161219
if _distance is None:
12171220
# We can only cache the rough distance and not the actual diff result for reuse.
12181221
# The reason is that we have modified the parameters explicitly so they are different and can't
@@ -1254,8 +1257,11 @@ def _get_most_in_common_pairs_in_iterables(
12541257
cache_key = None
12551258
if self._stats[DISTANCE_CACHE_ENABLED]:
12561259
cache_key = combine_hashes_lists(items=[hashes_added, hashes_removed], prefix='pairs_cache')
1257-
if cache_key in self._distance_cache:
1258-
return self._distance_cache.get(cache_key).copy()
1260+
cached_pairs = self._distance_cache.get(cache_key)
1261+
if cached_pairs is not_found:
1262+
cached_pairs = None
1263+
else:
1264+
return cast(dict, cached_pairs).copy()
12591265

12601266
# A dictionary of hashes to distances and each distance to an ordered set of hashes.
12611267
# It tells us about the distance of each object from other objects.
@@ -1296,6 +1302,7 @@ def defaultdict_orderedset():
12961302
if _distance is None:
12971303
_distance = self._get_rough_distance_of_hashed_objs(
12981304
added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type)
1305+
_distance = cast(float, _distance)
12991306
# Left for future debugging
13001307
# print(f'{Fore.RED}distance of {added_hash_obj.item} and {removed_hash_obj.item}: {_distance}{Style.RESET_ALL}')
13011308
# Discard potential pairs that are too far.

deepdiff/lfucache.py

Lines changed: 23 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -1,208 +1,39 @@
1-
"""
2-
LFU cache Written by Shane Wang
3-
https://medium.com/@epicshane/a-python-implementation-of-lfu-least-frequently-used-cache-with-o-1-time-complexity-e16b34a3c49b
4-
https://github.com/luxigner/lfu_cache
5-
Modified by Sep Dehpour
6-
"""
71
from collections import defaultdict
8-
from threading import Lock
9-
from statistics import mean
10-
from deepdiff.helper import not_found, dict_, SetOrdered
2+
from cachebox import LRUCache
3+
from deepdiff.helper import SetOrdered, not_found
114

125

13-
class CacheNode:
14-
def __init__(self, key, report_type, value, freq_node, pre, nxt):
15-
self.key = key
16-
if report_type:
17-
self.content = defaultdict(SetOrdered)
18-
self.content[report_type].add(value)
19-
else:
20-
self.content = value
21-
self.freq_node = freq_node
22-
self.pre = pre # previous CacheNode
23-
self.nxt = nxt # next CacheNode
24-
25-
def free_myself(self):
26-
if self.freq_node.cache_head == self.freq_node.cache_tail: # type: ignore
27-
self.freq_node.cache_head = self.freq_node.cache_tail = None # type: ignore
28-
elif self.freq_node.cache_head == self: # type: ignore
29-
self.nxt.pre = None # type: ignore
30-
self.freq_node.cache_head = self.nxt # type: ignore
31-
elif self.freq_node.cache_tail == self: # type: ignore
32-
self.pre.nxt = None # type: ignore
33-
self.freq_node.cache_tail = self.pre # type: ignore
34-
else:
35-
self.pre.nxt = self.nxt # type: ignore
36-
self.nxt.pre = self.pre # type: ignore
37-
38-
self.pre = None
39-
self.nxt = None
40-
self.freq_node = None
41-
42-
43-
class FreqNode:
44-
def __init__(self, freq, pre, nxt):
45-
self.freq = freq
46-
self.pre = pre # previous FreqNode
47-
self.nxt = nxt # next FreqNode
48-
self.cache_head = None # CacheNode head under this FreqNode
49-
self.cache_tail = None # CacheNode tail under this FreqNode
50-
51-
def count_caches(self):
52-
if self.cache_head is None and self.cache_tail is None:
53-
return 0
54-
elif self.cache_head == self.cache_tail:
55-
return 1
56-
else:
57-
return '2+'
58-
59-
def remove(self):
60-
if self.pre is not None:
61-
self.pre.nxt = self.nxt
62-
if self.nxt is not None:
63-
self.nxt.pre = self.pre
64-
65-
pre = self.pre
66-
nxt = self.nxt
67-
self.pre = self.nxt = self.cache_head = self.cache_tail = None
68-
69-
return (pre, nxt)
70-
71-
def pop_head_cache(self):
72-
if self.cache_head is None and self.cache_tail is None:
73-
return None
74-
elif self.cache_head == self.cache_tail:
75-
cache_head = self.cache_head
76-
self.cache_head = self.cache_tail = None
77-
return cache_head
78-
else:
79-
cache_head = self.cache_head
80-
self.cache_head.nxt.pre = None # type: ignore
81-
self.cache_head = self.cache_head.nxt # type: ignore
82-
return cache_head
83-
84-
def append_cache_to_tail(self, cache_node):
85-
cache_node.freq_node = self
86-
87-
if self.cache_head is None and self.cache_tail is None:
88-
self.cache_head = self.cache_tail = cache_node
89-
else:
90-
cache_node.pre = self.cache_tail
91-
cache_node.nxt = None
92-
self.cache_tail.nxt = cache_node # type: ignore
93-
self.cache_tail = cache_node
94-
95-
def insert_after_me(self, freq_node):
96-
freq_node.pre = self
97-
freq_node.nxt = self.nxt
98-
99-
if self.nxt is not None:
100-
self.nxt.pre = freq_node
101-
102-
self.nxt = freq_node
103-
104-
def insert_before_me(self, freq_node):
105-
if self.pre is not None:
106-
self.pre.nxt = freq_node
107-
108-
freq_node.pre = self.pre
109-
freq_node.nxt = self
110-
self.pre = freq_node
6+
class DistanceCache:
7+
"""
8+
Native bounded cache used by DeepDiff's distance calculations.
1119
112-
113-
class LFUCache:
10+
DeepDiff historically used a pure Python LFU cache here. The distance-cache
11+
hot path benefits more from cachebox's native mapping operations than from
12+
preserving LFU eviction semantics.
13+
"""
11414

11515
def __init__(self, capacity):
116-
self.cache = dict_() # {key: cache_node}
11716
if capacity <= 0:
118-
raise ValueError('Capacity of LFUCache needs to be positive.') # pragma: no cover.
119-
self.capacity = capacity
120-
self.freq_link_head = None
121-
self.lock = Lock()
17+
raise ValueError('Capacity of DistanceCache needs to be positive.') # pragma: no cover.
18+
self.cache = LRUCache(capacity)
12219

12320
def get(self, key):
124-
with self.lock:
125-
if key in self.cache:
126-
cache_node = self.cache[key]
127-
freq_node = cache_node.freq_node
128-
content = cache_node.content
129-
130-
self.move_forward(cache_node, freq_node)
131-
132-
return content
133-
else:
134-
return not_found
21+
return self.cache.get(key, not_found)
13522

13623
def set(self, key, report_type=None, value=None):
137-
with self.lock:
138-
if key in self.cache:
139-
cache_node = self.cache[key]
140-
if report_type:
141-
cache_node.content[report_type].add(value)
142-
else:
143-
cache_node.content = value
144-
else:
145-
if len(self.cache) >= self.capacity:
146-
self.dump_cache()
147-
148-
self.create_cache_node(key, report_type, value)
24+
if report_type:
25+
content = self.cache.get(key, None)
26+
if content is None:
27+
content = defaultdict(SetOrdered)
28+
content[report_type].add(value)
29+
value = content
30+
self.cache.insert(key, value)
14931

15032
def __contains__(self, key):
15133
return key in self.cache
15234

153-
def move_forward(self, cache_node, freq_node):
154-
if freq_node.nxt is None or freq_node.nxt.freq != freq_node.freq + 1:
155-
target_freq_node = FreqNode(freq_node.freq + 1, None, None)
156-
target_empty = True
157-
else:
158-
target_freq_node = freq_node.nxt
159-
target_empty = False
160-
161-
cache_node.free_myself()
162-
target_freq_node.append_cache_to_tail(cache_node)
163-
164-
if target_empty:
165-
freq_node.insert_after_me(target_freq_node)
166-
167-
if freq_node.count_caches() == 0:
168-
if self.freq_link_head == freq_node:
169-
self.freq_link_head = target_freq_node
170-
171-
freq_node.remove()
17235

173-
def dump_cache(self):
174-
head_freq_node = self.freq_link_head
175-
self.cache.pop(head_freq_node.cache_head.key) # type: ignore
176-
head_freq_node.pop_head_cache() # type: ignore
177-
178-
if head_freq_node.count_caches() == 0: # type: ignore
179-
self.freq_link_head = head_freq_node.nxt # type: ignore
180-
head_freq_node.remove() # type: ignore
181-
182-
def create_cache_node(self, key, report_type, value):
183-
cache_node = CacheNode(
184-
key=key, report_type=report_type,
185-
value=value, freq_node=None, pre=None, nxt=None)
186-
self.cache[key] = cache_node
187-
188-
if self.freq_link_head is None or self.freq_link_head.freq != 0:
189-
new_freq_node = FreqNode(0, None, None)
190-
new_freq_node.append_cache_to_tail(cache_node)
191-
192-
if self.freq_link_head is not None:
193-
self.freq_link_head.insert_before_me(new_freq_node)
194-
195-
self.freq_link_head = new_freq_node
196-
else:
197-
self.freq_link_head.append_cache_to_tail(cache_node)
198-
199-
def get_sorted_cache_keys(self):
200-
result = [(i, freq.freq_node.freq) for i, freq in self.cache.items()]
201-
result.sort(key=lambda x: -x[1])
202-
return result
203-
204-
def get_average_frequency(self):
205-
return mean(freq.freq_node.freq for freq in self.cache.values())
36+
LFUCache = DistanceCache
20637

20738

20839
class DummyLFU:
@@ -211,7 +42,9 @@ def __init__(self, *args, **kwargs):
21142
pass
21243

21344
set = __init__
214-
get = __init__
45+
46+
def get(self, *args, **kwargs):
47+
return not_found
21548

21649
def __contains__(self, key):
21750
return False

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ build-backend = "flit_core.buildapi"
66
name = "deepdiff"
77
version = "9.0.0"
88
dependencies = [
9+
"cachebox>=5.2,<6",
910
"orderly-set>=5.5.0,<6",
1011
]
1112
requires-python = ">=3.10"

tests/test_lfucache.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,35 @@
11
import random
2-
import pytest
32
import concurrent.futures
4-
from deepdiff.lfucache import LFUCache
5-
6-
7-
class TestLFUcache:
8-
9-
@pytest.mark.parametrize("items, size, expected_results, expected_freq", [
10-
(['a', 'a', 'b', 'a', 'c', 'b', 'd'], 3, [('b', 2), ('c', 1), ('d', 1)], '1.333'),
11-
(['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b'], 3, [('b', 3), ('d', 1), ('e', 1)], '1.666'),
12-
(['a', 'a', 'b', 'a', 'c', 'b', 'd', 'e', 'c', 'b', 'b', 'c', 'd', 'b'], 3, [('b', 5), ('c', 3), ('d', 2)], '3.333'),
13-
])
14-
def test_lfu(self, items, size, expected_results, expected_freq, benchmark):
15-
benchmark(self._test_lfu, items, size, expected_results, expected_freq)
16-
17-
def _test_lfu(self, items, size, expected_results, expected_freq):
18-
lfucache = LFUCache(size)
19-
for item in items:
20-
lfucache.set(item, value='{}_cached'.format(item))
21-
for item in items:
22-
lfucache.get(item)
23-
results = lfucache.get_sorted_cache_keys()
24-
assert expected_results == results
25-
freq = lfucache.get_average_frequency()
26-
assert expected_freq == str(freq)[:5]
3+
from deepdiff.helper import not_found
4+
from deepdiff.lfucache import DistanceCache
5+
6+
7+
class TestDistanceCache:
8+
9+
def test_lru_cache(self, benchmark):
10+
benchmark(self._test_lru_cache)
11+
12+
def _test_lru_cache(self):
13+
cache = DistanceCache(2)
14+
cache.set('a', value='a_cached')
15+
cache.set('b', value='b_cached')
16+
assert 'a' in cache
17+
assert cache.get('a') == 'a_cached'
18+
cache.set('c', value='c_cached')
19+
assert cache.get('a') == 'a_cached'
20+
assert cache.get('b') is not_found
21+
assert cache.get('c') == 'c_cached'
22+
assert cache.get('missing') is not_found
23+
24+
def test_report_type_values_are_accumulated(self):
25+
cache = DistanceCache(2)
26+
cache.set('a', report_type='values_changed', value='root[0]')
27+
cache.set('a', report_type='values_changed', value='root[1]')
28+
assert cache.get('a') == {'values_changed': {'root[0]', 'root[1]'}}
2729

2830
def test_get_multithreading(self):
2931
keys = 'aaaaaaaaaaaaaaaaaaaaaaaaaaabbc'
30-
lfucache = LFUCache(2)
32+
cache = DistanceCache(2)
3133

3234
def _do_set(cache, key):
3335
cache.set(key, value='{}_cached'.format(key))
@@ -45,6 +47,6 @@ def _random_func(cache, key):
4547
return random.choice([_do_get, _do_get, _do_set])(cache, key)
4648

4749
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
48-
futures = (executor.submit(_random_func, lfucache, key) for key in _key_gen())
50+
futures = (executor.submit(_random_func, cache, key) for key in _key_gen())
4951
for future in concurrent.futures.as_completed(futures):
5052
future.result()

0 commit comments

Comments
 (0)