Skip to content

Commit 3749457

Browse files
authored
[BugFix] fix multimodal hasher hash collision risk when ndarray shape or dtype differs (#7185)
NumPy's `ndarray.tobytes()` serializes only the raw element bytes; it does not encode shape or dtype metadata. As a result, arrays with identical raw bytes but different shapes (e.g. (6,4) vs (4,6)) or different dtypes (e.g. a float32 vs uint8 reinterpretation of the same memory) produce the same SHA-256 digest, leading to silent cache collisions in ProcessorCacheManager / EncoderCacheManager / PrefixCacheManager. Prepend a "{shape}|{dtype}|" header to the byte payload before hashing so that shape and dtype participate in the digest. Add test cases for shape and dtype sensitivity.
1 parent fbc3aa9 commit 3749457

2 files changed

Lines changed: 20 additions & 2 deletions

File tree

fastdeploy/multimodal/hasher.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,9 @@ class MultimodalHasher:
2525
@classmethod
def hash_features(cls, obj: object) -> str:
    """Return the SHA-256 hex digest of a feature object.

    Args:
        obj: The object to hash. ``np.ndarray`` instances are hashed from
            their shape, dtype and raw element bytes; any other object is
            hashed from its pickle serialization.

    Returns:
        The hexadecimal SHA-256 digest string.
    """
    if isinstance(obj, np.ndarray):
        # Encode shape and dtype into the hash to avoid collisions between
        # arrays that share the same raw bytes but differ in layout, e.g.
        # a (6,4) vs (4,6) array, or a float32 vs uint8 reinterpretation
        # of the same memory.
        header = f"{obj.shape}|{obj.dtype}|".encode()
        digest = hashlib.sha256(header)
        # Feeding the payload through update() yields the same digest as
        # hashing ``header + obj.tobytes()`` but avoids building a second
        # full-size copy of the array bytes for the concatenation.
        digest.update(obj.tobytes())
        return digest.hexdigest()
    return hashlib.sha256(pickle.dumps(obj)).hexdigest()

tests/multimodal/test_hasher.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,23 @@ def test_hash_features_ndarray(self):
2626
"""Test hash features with numpy ndarray"""
2727
arr = np.random.randint(low=0, high=255, size=(28, 28), dtype=np.uint8)
2828
arr_hash = MultimodalHasher.hash_features(arr)
29-
target_hash = hashlib.sha256((arr.tobytes())).hexdigest()
29+
header = f"{arr.shape}|{arr.dtype}|".encode()
30+
target_hash = hashlib.sha256(header + arr.tobytes()).hexdigest()
3031
assert arr_hash == target_hash, f"Ndarray hash mismatch: {arr_hash} != {target_hash}"
3132

33+
def test_hash_features_ndarray_shape_sensitivity(self):
    """Identical raw bytes viewed under two different shapes must hash differently"""
    # Both reshapes are views over the same 24 float32 values, so only the
    # shape distinguishes the two inputs.
    flat = np.arange(24, dtype=np.float32)
    hash_6x4 = MultimodalHasher.hash_features(flat.reshape(6, 4))
    hash_4x6 = MultimodalHasher.hash_features(flat.reshape(4, 6))
    assert hash_6x4 != hash_4x6
39+
40+
def test_hash_features_ndarray_dtype_sensitivity(self):
    """Arrays with same shape and same raw bytes but different dtypes must produce different hashes"""
    # float32 and int32 have the same itemsize, so all-zero arrays of the
    # same shape serialize to identical bytes and only the dtype tells
    # them apart. (float32 vs float64 would differ in byte length and
    # therefore hash differently even without any dtype information.)
    a = np.zeros((4, 4), dtype=np.float32)
    b = np.zeros((4, 4), dtype=np.int32)
    assert a.tobytes() == b.tobytes()
    assert MultimodalHasher.hash_features(a) != MultimodalHasher.hash_features(b)
45+
3246
def test_hash_features_object(self):
3347
"""Test hash features with unsupported object type"""
3448
obj = {"key": "value"}

0 commit comments

Comments
 (0)