[https://nvbugs/6226933][fix] canonicalize multimodal cache-key serialization to prevent hash collisions (#14800)

venkywonka · web-flow · commit cd38dfb2e101 · 2026-06-02T11:00:33.000-07:00
Signed-off-by: venkywonka <23023424+venkywonka@users.noreply.github.com>
diff --git a/tensorrt_llm/inputs/multimodal.py b/tensorrt_llm/inputs/multimodal.py
@@ -18,6 +18,10 @@
 default_hasher = blake3
 _INT32_MAX = 2**31 - 1
 
+# Versioned tag prefixed to every content hash so the canonical, self-describing
+# serialization scheme can evolve without silently reusing stale cache keys.
+_HASH_SCHEME_TAG = b"trtllm.mm.hash.v1"
+
 
 def strip_mm_data_for_generation(mm_data: Dict[str, Any]) -> None:
     """Clear `mm_data` in place, retaining only `mrope_config.mrope_position_deltas`.
@@ -666,21 +670,10 @@ class MultimodalServerConfig():
 
 def _update_hash(hasher, item: object) -> None:
     """Hash the content of a multimodal item into the provided hasher."""
+    hasher.update(_HASH_SCHEME_TAG)
     if isinstance(item, BaseModalityData):
         item.update_hash(hasher)
         return
-    if isinstance(item, torch.Tensor):
-        item = item.detach().cpu().contiguous()
-        hasher.update(serialize_item(item))
-        return
-    if isinstance(item, list):
-        for element in item:
-            hasher.update(b"<frame>")
-            if isinstance(element, torch.Tensor):
-                element = element.detach().cpu().contiguous()
-            hasher.update(serialize_item(element))
-        return
-
     hasher.update(serialize_item(item))
 
 
@@ -711,7 +704,6 @@ def apply_mm_hashes(
 
     def _hash_item(item):
         """Hash only the content of a multimodal item (no UUID)."""
-        # TODO: possible hash collision w/ this simplified version (vllm/PR/17378)
         hasher = hash_lib()
         _update_hash(hasher, item)
         return hasher.hexdigest()
diff --git a/tensorrt_llm/inputs/multimodal_data.py b/tensorrt_llm/inputs/multimodal_data.py
@@ -1,13 +1,23 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import struct
 from dataclasses import dataclass
 from typing import Any, Protocol
 
 import numpy as np
 import torch
 from PIL import Image
 
+# Video metadata fields that participate in the cache-key hash. These describe
+# how frames were sampled and therefore change the model-visible content.
+_VIDEO_HASH_METADATA_FIELDS = (
+    "frames_indices",
+    "fps",
+    "duration",
+    "total_num_frames",
+)
+
 
 class ContentHasher(Protocol):
     """Hash object that accepts bytes."""
@@ -16,30 +26,93 @@ def update(self, data: bytes) -> None:
         """Update the hash with raw bytes."""
 
 
+def _u8(value: int) -> bytes:
+    """Encode an unsigned 8-bit integer."""
+    return value.to_bytes(1, "big", signed=False)
+
+
+def _u32(value: int) -> bytes:
+    """Encode an unsigned 32-bit big-endian integer."""
+    return value.to_bytes(4, "big", signed=False)
+
+
+def _u64(value: int) -> bytes:
+    """Encode an unsigned 64-bit big-endian integer."""
+    return value.to_bytes(8, "big", signed=False)
+
+
+def _len_prefixed(payload: bytes) -> bytes:
+    """Encode a byte payload prefixed with its u64 length."""
+    return _u64(len(payload)) + payload
+
+
 def serialize_item(obj: object) -> bytes:
-    """Serialize a supported multimodal hash leaf to bytes."""
+    """Serialize a supported multimodal hash leaf to bytes.
+
+    The encoding is canonical and self-describing: every value is
+    `[1-byte type tag][typed metadata][length-prefixed payload]` with all
+    multi-byte integers big-endian. This prevents cache-key hash collisions
+    between distinct values that happen to share a raw byte payload (for
+    example transposed image dimensions or reshaped arrays).
+    """
     if isinstance(obj, str):
-        return obj.encode("utf-8")
+        return _u8(0x01) + _len_prefixed(obj.encode("utf-8"))
     if isinstance(obj, bytes):
-        return obj
-    if isinstance(obj, (int, float)):
-        return np.array(obj).tobytes()
+        return _u8(0x02) + _len_prefixed(obj)
+    # bool must be checked before int: bool is a subclass of int.
+    if isinstance(obj, bool):
+        return _u8(0x05) + _u8(1 if obj else 0)
+    if isinstance(obj, int):
+        nbytes = (obj.bit_length() + 8) // 8  # +1 sign bit, then ceil-divide.
+        return _u8(0x03) + _u8(nbytes) + obj.to_bytes(nbytes, "big", signed=True)
+    if isinstance(obj, float):
+        return _u8(0x04) + struct.pack(">d", obj)
 
     if isinstance(obj, Image.Image):
-        return np.array(obj.convert("RGBA")).tobytes()
-    if isinstance(obj, torch.Tensor):
-        return obj.numpy().tobytes()
-    if isinstance(obj, np.ndarray):
-        return obj.tobytes()
+        width, height = obj.size
+        payload = np.array(obj.convert("RGBA")).tobytes()
+        return (
+            _u8(0x10)
+            + _len_prefixed(obj.mode.encode("utf-8"))
+            + _u32(width)
+            + _u32(height)
+            + _len_prefixed(payload)
+        )
+    if isinstance(obj, (torch.Tensor, np.ndarray)):
+        # The container (torch.Tensor vs np.ndarray) is not part of the content
+        # identity -- only dtype, shape, and raw bytes are. Normalize both to a
+        # contiguous NumPy array so identical content hashes identically.
+        if isinstance(obj, torch.Tensor):
+            obj = obj.detach().cpu().contiguous().numpy()
+        array = np.ascontiguousarray(obj)
+        parts = [
+            _u8(0x11),
+            _len_prefixed(array.dtype.str.encode("utf-8")),
+            _u8(array.ndim),
+        ]
+        parts.extend(_u64(dim) for dim in array.shape)
+        parts.append(_len_prefixed(array.tobytes()))
+        return b"".join(parts)
     if isinstance(obj, (tuple, list)):
-        container_tag = b"T" if isinstance(obj, tuple) else b"L"
-        parts = [container_tag, len(obj).to_bytes(8, "big", signed=False)]
-        for item in obj:
-            payload = serialize_item(item)
-            parts.append(len(payload).to_bytes(8, "big", signed=False))
-            parts.append(payload)
+        # Ordered sequence; the container (tuple vs list) is not part of the
+        # content identity.
+        parts = [_u8(0x20), _u64(len(obj))]
+        parts.extend(serialize_item(item) for item in obj)
+        return b"".join(parts)
+    if isinstance(obj, dict):
+        parts = [_u8(0x22), _u64(len(obj))]
+        for key in sorted(obj):
+            parts.append(serialize_item(key))
+            parts.append(serialize_item(obj[key]))
         return b"".join(parts)
 
+    if isinstance(obj, np.generic):
+        # numpy scalar (e.g. np.int64 / np.float32 / np.bool_): normalize to the
+        # equivalent Python scalar and recurse, so numpy-typed values hash
+        # identically to their Python counterparts. In numpy 2.x these are not
+        # subclasses of Python int/float/bool, so they bypass the checks above.
+        return serialize_item(obj.item())
+
     raise ValueError(f"Unsupported object type: {type(obj)}")
 
 
@@ -65,11 +138,8 @@ def __post_init__(self) -> None:
             self.sample_rate = int(self.sample_rate)
 
     def update_hash(self, hasher: ContentHasher) -> None:
-        samples = self.samples
-        if isinstance(samples, torch.Tensor):
-            samples = samples.detach().cpu().contiguous()
         hasher.update(b"<audio>")
-        hasher.update(serialize_item((samples, self.sample_rate)))
+        hasher.update(serialize_item((self.samples, self.sample_rate)))
 
 
 @dataclass
@@ -97,12 +167,12 @@ def __post_init__(self) -> None:
             raise TypeError("metadata must be a dictionary")
 
     def update_hash(self, hasher: ContentHasher) -> None:
+        hasher.update(b"<video>")
+        # Sampling metadata is part of the model-visible cache identity.
+        meta = {k: self.metadata[k] for k in _VIDEO_HASH_METADATA_FIELDS if k in self.metadata}
+        hasher.update(serialize_item(meta))
         for frame in self.frames:
             hasher.update(b"<frame>")
-            if isinstance(frame, torch.Tensor):
-                frame = frame.detach().cpu().contiguous()
             hasher.update(serialize_item(frame))
-        # Extend this to include metadata if fields such as sampled frame
-        # indices become part of the model-visible cache identity.
         if self.audio is not None:
             self.audio.update_hash(hasher)
diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py