Skip to content

Commit 6d750d4

Browse files
committed
Two bugs combined to make ignore_order=True miss int-vs-float type changes:
1. Hash cache collision (deephash.py): The shared self.hashes dict used raw objects as keys. Since Python treats 1 == 1.0 and hash(1) == hash(1.0), looking up 1.0 returned the cached hash for 1, silently hiding the type difference. 2. Distance over-counting (distance.py): _get_item_length counted both old_type and old_value in type_changes entries, double-counting a single operation. This inflated distances, causing the pairing algorithm to reject items that should have been matched. Fixes: - deepdiff/deephash.py: Added _make_hash_key() / _make_hash_key_for_lookup() that wrap numeric objects as (type(obj), obj) tuples when ignore_numeric_type_changes=False. Applied at cache lookup, cache store, and all public accessor methods (__getitem__, __contains__, get, get_key). Added _unwrap_hash_key() for iteration methods so the public API still shows raw objects. - deepdiff/distance.py: Skip old_type and old_value keys in _get_item_length — they're metadata about prior state, not additional operations. Also added ignore_numeric_type_changes to DistanceProtocol and threaded it through get_key calls. - Test updates: Updated expected distance values in test_distance.py to reflect the corrected distance formula.
1 parent 9f0a56c commit 6d750d4

5 files changed

Lines changed: 146 additions & 20 deletions

File tree

deepdiff/deephash.py

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pathlib import Path
1010
from enum import Enum
1111
import re
12-
from deepdiff.helper import (strings, numbers, times, unprocessed, not_hashed, add_to_frozen_set,
12+
from deepdiff.helper import (strings, numbers, only_numbers, times, unprocessed, not_hashed, add_to_frozen_set,
1313
convert_item_or_items_into_set_else_none, get_doc, ipranges,
1414
convert_item_or_items_into_compiled_regexes_else_none,
1515
get_id, type_is_subclass_of_type_group, type_in_type_group,
@@ -276,10 +276,13 @@ def __init__(self,
276276
sha1hex: Callable[[Union[str, bytes]], str] = sha1hex
277277

278278
def __getitem__(self, obj: Any, extract_index: Optional[int] = 0) -> Any:
279-
return self._getitem(self.hashes, obj, extract_index=extract_index, use_enum_value=self.use_enum_value)
279+
return self._getitem(self.hashes, obj, extract_index=extract_index,
280+
use_enum_value=self.use_enum_value,
281+
ignore_numeric_type_changes=self.ignore_numeric_type_changes)
280282

281283
@staticmethod
282-
def _getitem(hashes: Dict[Any, Any], obj: Any, extract_index: Optional[int] = 0, use_enum_value: bool = False) -> Any:
284+
def _getitem(hashes: Dict[Any, Any], obj: Any, extract_index: Optional[int] = 0,
285+
use_enum_value: bool = False, ignore_numeric_type_changes: bool = False) -> Any:
283286
"""
284287
extract_index is zero for hash and 1 for count and None to get them both.
285288
To keep it backward compatible, we only get the hash by default so it is set to zero by default.
@@ -292,6 +295,7 @@ def _getitem(hashes: Dict[Any, Any], obj: Any, extract_index: Optional[int] = 0,
292295
key = BoolObj.FALSE
293296
elif use_enum_value and isinstance(obj, Enum):
294297
key = obj.value
298+
key = DeepHash._make_hash_key_for_lookup(key, ignore_numeric_type_changes=ignore_numeric_type_changes)
295299

296300
result_n_count: Tuple[Any, int] = (None, 0) # type: ignore
297301

@@ -310,9 +314,10 @@ def _getitem(hashes: Dict[Any, Any], obj: Any, extract_index: Optional[int] = 0,
310314
return result_n_count if extract_index is None else result_n_count[extract_index]
311315

312316
def __contains__(self, obj: Any) -> bool:
317+
key = self._make_hash_key(obj)
313318
result = False
314319
try:
315-
result = obj in self.hashes
320+
result = key in self.hashes
316321
except (TypeError, KeyError):
317322
result = False
318323
if not result:
@@ -325,21 +330,32 @@ def get(self, key: Any, default: Any = None, extract_index: Optional[int] = 0) -
325330
It can extract the hash for a given key that is already calculated when extract_index=0
326331
or the count of items that went to building the object whenextract_index=1.
327332
"""
328-
return self.get_key(self.hashes, key, default=default, extract_index=extract_index)
333+
return self.get_key(self.hashes, key, default=default, extract_index=extract_index,
334+
ignore_numeric_type_changes=self.ignore_numeric_type_changes)
329335

330336
@staticmethod
331-
def get_key(hashes: Dict[Any, Any], key: Any, default: Any = None, extract_index: Optional[int] = 0, use_enum_value: bool = False) -> Any:
337+
def get_key(hashes: Dict[Any, Any], key: Any, default: Any = None, extract_index: Optional[int] = 0,
338+
use_enum_value: bool = False, ignore_numeric_type_changes: bool = False) -> Any:
332339
"""
333340
get_key method for the hashes dictionary.
334341
It can extract the hash for a given key that is already calculated when extract_index=0
335342
or the count of items that went to building the object whenextract_index=1.
336343
"""
337344
try:
338-
result = DeepHash._getitem(hashes, key, extract_index=extract_index, use_enum_value=use_enum_value)
345+
result = DeepHash._getitem(hashes, key, extract_index=extract_index,
346+
use_enum_value=use_enum_value,
347+
ignore_numeric_type_changes=ignore_numeric_type_changes)
339348
except KeyError:
340349
result = default
341350
return result
342351

352+
@staticmethod
353+
def _unwrap_hash_key(key: Any) -> Any:
354+
"""Unwrap a (type, value) hash key back to the original value for public API."""
355+
if isinstance(key, tuple) and len(key) == 2 and isinstance(key[0], type) and isinstance(key[1], only_numbers):
356+
return key[1]
357+
return key
358+
343359
def _get_objects_to_hashes_dict(self, extract_index: Optional[int] = 0) -> Dict[Any, Any]:
344360
"""
345361
A dictionary containing only the objects to hashes,
@@ -348,6 +364,7 @@ def _get_objects_to_hashes_dict(self, extract_index: Optional[int] = 0) -> Dict[
348364
"""
349365
result = dict_()
350366
for key, value in self.hashes.items():
367+
key = self._unwrap_hash_key(key)
351368
if key is UNPROCESSED_KEY:
352369
result[key] = value
353370
else:
@@ -377,13 +394,13 @@ def __bool__(self) -> bool:
377394
return bool(self.hashes)
378395

379396
def keys(self) -> Any:
380-
return self.hashes.keys()
397+
return [self._unwrap_hash_key(k) for k in self.hashes.keys()]
381398

382399
def values(self) -> Generator[Any, None, None]:
383400
return (i[0] for i in self.hashes.values()) # Just grab the item and not its count
384401

385402
def items(self) -> Generator[Tuple[Any, Any], None, None]:
386-
return ((i, v[0]) for i, v in self.hashes.items())
403+
return ((self._unwrap_hash_key(i), v[0]) for i, v in self.hashes.items())
387404

388405
def _prep_obj(self, obj: Any, parent: str, parents_ids: frozenset = EMPTY_FROZENSET, is_namedtuple: bool = False, is_pydantic_object: bool = False) -> HashTuple:
389406
"""prepping objects"""
@@ -555,6 +572,26 @@ def _prep_tuple(self, obj: tuple, parent: str, parents_ids: frozenset) -> HashTu
555572
result, counts = self._prep_obj(obj, parent, parents_ids=parents_ids, is_namedtuple=True)
556573
return result, counts
557574

575+
def _make_hash_key(self, obj: Any) -> Any:
576+
"""
577+
Create a key for the hashes dict that distinguishes numeric types.
578+
579+
In Python, 1 == 1.0 and hash(1) == hash(1.0), so int and float values
580+
collide as dict keys. When ignore_numeric_type_changes is False, we wrap
581+
numeric objects as (type, value) tuples so that each type gets its own
582+
cache entry and its own hash.
583+
"""
584+
if not self.ignore_numeric_type_changes and isinstance(obj, only_numbers):
585+
return (type(obj), obj)
586+
return obj
587+
588+
@staticmethod
589+
def _make_hash_key_for_lookup(obj: Any, ignore_numeric_type_changes: bool = False) -> Any:
590+
"""Static version of _make_hash_key for use in static accessor methods."""
591+
if not ignore_numeric_type_changes and isinstance(obj, only_numbers):
592+
return (type(obj), obj)
593+
return obj
594+
558595
def _hash(self, obj: Any, parent: str, parents_ids: frozenset = EMPTY_FROZENSET) -> HashTuple:
559596
"""The main hash method"""
560597
counts = 1
@@ -573,8 +610,9 @@ def _hash(self, obj: Any, parent: str, parents_ids: frozenset = EMPTY_FROZENSET)
573610
obj = obj.value
574611
else:
575612
result = not_hashed
613+
hash_key = self._make_hash_key(obj)
576614
try:
577-
result, counts = self.hashes[obj]
615+
result, counts = self.hashes[hash_key]
578616
except (TypeError, KeyError):
579617
pass
580618
else:
@@ -662,7 +700,7 @@ def gen():
662700
# The hashes will be later used for comparing the objects.
663701
# Object to hash when possible otherwise ObjectID to hash
664702
try:
665-
self.hashes[obj] = (result, counts)
703+
self.hashes[hash_key] = (result, counts)
666704
except TypeError:
667705
obj_id = get_id(obj)
668706
self.hashes[obj_id] = (result, counts)

deepdiff/distance.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
class DistanceProtocol(DeepDiffProtocol, Protocol):
1515
hashes: dict
1616
deephash_parameters: dict
17+
ignore_numeric_type_changes: bool
1718
iterable_compare_func: Optional[Callable]
1819
math_epsilon: Optional[float]
1920
cutoff_distance_for_pairs: float
@@ -86,10 +87,12 @@ def __get_item_rough_length(self: "DistanceProtocol", item, parent='root'):
8687
"""
8788
if not hasattr(self, 'hashes'):
8889
raise RuntimeError(DISTANCE_CALCS_NEEDS_CACHE)
89-
length = DeepHash.get_key(self.hashes, key=item, default=None, extract_index=1)
90+
length = DeepHash.get_key(self.hashes, key=item, default=None, extract_index=1,
91+
ignore_numeric_type_changes=self.ignore_numeric_type_changes)
9092
if length is None:
9193
self.__calculate_item_deephash(item)
92-
length = DeepHash.get_key(self.hashes, key=item, default=None, extract_index=1)
94+
length = DeepHash.get_key(self.hashes, key=item, default=None, extract_index=1,
95+
ignore_numeric_type_changes=self.ignore_numeric_type_changes)
9396
return length
9497

9598
def __calculate_item_deephash(self: "DistanceProtocol", item: Any) -> None:
@@ -184,8 +187,10 @@ def _get_item_length(item, parents_ids=frozenset([])):
184187
new_subitem[path_] = new_indexes_to_items
185188
subitem = new_subitem
186189

187-
# internal keys such as _numpy_paths should not count towards the distance
188-
if isinstance(key, strings) and (key.startswith('_') or key == 'deep_distance' or key == 'new_path'):
190+
# internal keys such as _numpy_paths should not count towards the distance.
191+
# old_type and old_value are metadata about the previous state, not additional operations.
192+
if isinstance(key, strings) and (key.startswith('_') or key == 'deep_distance' or key == 'new_path'
193+
or key == 'old_type' or key == 'old_value'):
189194
continue
190195

191196
item_id = id(subitem)

tests/test_distance.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class TestDeepDistance:
4848
}
4949
}
5050
},
51-
3
51+
2
5252
),
5353
])
5454
def test_item_length(self, diff, expected_length):
@@ -165,7 +165,7 @@ def test_get_distance_works_event_when_ignore_order_and_different_hasher(self):
165165
diff = DeepDiff(t1, t2, ignore_order=True, get_deep_distance=True,
166166
cache_size=100, hasher=sha256hex)
167167
dist = diff['deep_distance']
168-
assert str(dist)[:4] == '0.44'
168+
assert str(dist)[:4] == '0.33'
169169

170170
def test_get_distance_does_not_care_about_the_size_of_string(self):
171171
t1 = ["a", "b"]

tests/test_hash.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -250,14 +250,39 @@ def test_prep_bool_vs_num2(self):
250250
}
251251
assert DeepHashPrep(item1)[item1] == DeepHashPrep(item2)[item2]
252252

253-
def test_prep_str(self):
253+
def test_prep_str1(self):
254254
obj = "a"
255+
obj2 = b"a"
255256
expected_result = {obj: prep_str(obj)}
256257
result = DeepHashPrep(obj, ignore_string_type_changes=True)
257258
assert expected_result == result
258-
expected_result = {obj: prep_str(obj, ignore_string_type_changes=False)}
259+
assert result[obj] == 'a'
260+
expected_result2 = {obj: prep_str(obj, ignore_string_type_changes=False)}
259261
result = DeepHashPrep(obj, ignore_string_type_changes=False)
260-
assert expected_result == result
262+
assert expected_result2 == result
263+
assert result[obj] == 'str:a'
264+
265+
expected_result3 = {obj2: prep_str(obj2, ignore_string_type_changes=True)}
266+
result = DeepHashPrep(obj2, ignore_string_type_changes=True)
267+
assert expected_result3 != result
268+
assert result[obj2] == 'a'
269+
result = DeepHashPrep(obj2, ignore_string_type_changes=False)
270+
assert {b'a': 'bytes:a'} == result
271+
assert result[obj2] == 'bytes:a'
272+
273+
def test_prep_number1(self):
274+
obj_int = 1
275+
obj_float = 1.0
276+
# By default, type is included in the prep
277+
result_int = DeepHashPrep(obj_int)
278+
result_float = DeepHashPrep(obj_float)
279+
assert result_int[obj_int] == 'int:1'
280+
assert result_float[obj_float] == 'float:1.0'
281+
# When ignoring numeric type changes, both get the same "number:" prefix
282+
result_int2 = DeepHashPrep(obj_int, ignore_numeric_type_changes=True)
283+
result_float2 = DeepHashPrep(obj_float, ignore_numeric_type_changes=True)
284+
assert result_int2[obj_int] == result_float2[obj_float]
285+
assert result_int2[obj_int] == 'number:1.000000000000'
261286

262287
def test_dictionary_key_type_change(self):
263288
obj1 = {"b": 10}

tests/test_ignore_order.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1396,3 +1396,61 @@ def test_error_messages_when_ignore_order(self, mock_logger):
13961396
assert {} == result
13971397

13981398
assert not mock_logger.error.called
1399+
1400+
1401+
1402+
class TestIgnoreOrderNumericTypeChange:
1403+
"""Regression tests for GitHub issue #485.
1404+
1405+
When ignore_order=True, numerically-equal values of different numeric types
1406+
(e.g. int 1 vs float 1.0) must still be reported as type_changes.
1407+
1408+
Root cause: Python's hash equality (hash(1) == hash(1.0)) and value equality
1409+
(1 == 1.0) caused both items to land in the same DeepHash bucket, so they
1410+
were treated as identical and silently dropped from the diff result.
1411+
"""
1412+
1413+
def test_int_vs_float_in_list_of_dicts(self):
1414+
"""Core regression: type change inside a dict nested in a list."""
1415+
result = DeepDiff([{"a": 1}], [{"a": 1.0}], ignore_order=True)
1416+
assert "type_changes" in result, (
1417+
"Expected type_changes between int 1 and float 1.0, got: %s" % result
1418+
)
1419+
assert result["type_changes"]["root[0]['a']"]["old_type"] is int
1420+
assert result["type_changes"]["root[0]['a']"]["new_type"] is float
1421+
1422+
def test_ignore_numeric_type_changes_suppresses_report(self):
1423+
"""When ignore_numeric_type_changes=True the type change must be hidden."""
1424+
result = DeepDiff(
1425+
[{"a": 1}], [{"a": 1.0}],
1426+
ignore_order=True,
1427+
ignore_numeric_type_changes=True,
1428+
)
1429+
assert result == {}, (
1430+
"With ignore_numeric_type_changes=True there should be no diff, got: %s" % result
1431+
)
1432+
1433+
def test_value_change_still_detected(self):
1434+
"""Ordinary value differences must still be detected."""
1435+
result = DeepDiff([1], [2], ignore_order=True)
1436+
assert result != {}, "Expected a diff between [1] and [2]"
1437+
1438+
def test_reorder_no_false_positive(self):
1439+
"""A simple reorder of identical values must not trigger type_changes."""
1440+
result = DeepDiff([1, 2, 3], [3, 2, 1], ignore_order=True)
1441+
assert result == {}, "Reordering identical ints must not produce a diff"
1442+
1443+
def test_mixed_list_one_type_change(self):
1444+
"""Only the item with a type change should appear in the diff."""
1445+
result = DeepDiff(
1446+
[{"a": 1}, {"b": 2}],
1447+
[{"b": 2}, {"a": 1.0}],
1448+
ignore_order=True,
1449+
)
1450+
assert "type_changes" in result
1451+
assert "root[0]['a']" in result["type_changes"]
1452+
1453+
def test_ignore_order_false_unchanged(self):
1454+
"""The ignore_order=False path must continue to work as before."""
1455+
result = DeepDiff([{"a": 1}], [{"a": 1.0}], ignore_order=False)
1456+
assert "type_changes" in result

0 commit comments

Comments
 (0)