[KVCache][BugFix] fix storage prefetch nodes inserted at wrong radix tree position

kevincheng2 · kevincheng2 · commit 5043d01af7a3 · 2026-05-11T13:58:41.000+08:00
## Motivation

三级 KV Cache（Device → Host → Storage）预拉取完成后，第二次 match_prefix
仍然只命中 device 层的 block，storage 预拉取的 host block 无法被找到。

根本原因：`prepare_prefetch_metadata` 调用 `radix_tree.insert` 时未传
`start_node`。以复现场景为例（前 22 个 block 已在 device 层命中，其后 8 个
block 在 storage 中命中）：8 个新 LOADING_FROM_STORAGE 节点被错误地挂在
radix tree 的 root 节点下（以 storage hash h22 作为 root 直接子节点），而非
接在已有 22 节点链末尾（node[21] 的子节点）。`find_prefix` 遍历到 node[21] 时，
node[21].children 中不存在 h22，立即停止，始终只返回 22 个节点。

同批次还修复了几个关联问题：
- `_match_storage` 只探测 "key" kind，Mooncake LRU 可能单独驱逐 "value"
  导致虚假命中，改为同时探测 key + value，两者都存在才算命中
- partial write 时部分 key 写成功、部分失败，改为自动 rollback 已写入的
  key，防止 _match_storage 发现半写 block
- `prepare_prefetch_metadata` 中只注册真正是 LOADING_FROM_STORAGE 状态的
  节点进 prefetch_node_map，避免 insert 复用已有 HOST/DEVICE 节点时触发
  spurious "unexpected status" 警告

## Modifications

- `cache_manager.py`
  - `match_prefix`: 传 `start_node=matched_nodes[-1]` 给 `prepare_prefetch_metadata`
  - `prepare_prefetch_metadata`: 新增 `start_node` 参数，透传给 `_radix_tree.insert`
  - `prepare_prefetch_metadata`: 只注册 LOADING_FROM_STORAGE 节点进 prefetch_node_map
  - `_match_storage`: 同时探测 key + value 两个 kind，均存在才视为命中
- `storage/base.py`: 新增 `batch_exists` / `batch_delete` 默认实现
- `storage/mooncake/connector.py`: Mooncake 实现 `batch_exists` / `batch_delete`
- `storage/staging_manager.py`: partial write 自动 rollback
- `transfer_manager.py`: prefetch/backup 失败时输出诊断日志
- `tests/cache_manager/v1/test_cache_manager.py`: 添加回归测试 `TestPreparePrefixtMetadataStartNode`（类名拼写与代码中的实际定义一致，运行命令需使用同样的拼写）

## Usage or Command

```bash
# 运行回归测试
source .venv/py310/bin/activate
PYTHONPATH=. python -m pytest tests/cache_manager/v1/test_cache_manager.py::TestPreparePrefixtMetadataStartNode -v
```
diff --git a/fastdeploy/cache_manager/v1/cache_manager.py b/fastdeploy/cache_manager/v1/cache_manager.py
@@ -531,7 +531,8 @@ def match_prefix(
                 # Step 2: Match Storage (if enabled and not skipped)
                 if not skip_storage and self._storage_scheduler and remaining_hashes:
                     storage_matches = self._match_storage(remaining_hashes)
-                    result.storage_nodes = self.prepare_prefetch_metadata(storage_matches)
+                    start_node = matched_nodes[-1] if matched_nodes else None
+                    result.storage_nodes = self.prepare_prefetch_metadata(storage_matches, start_node=start_node)
 
                 # Step 3: Increment ref count for matched blocks(only scheduling phase)
                 if skip_storage:
@@ -562,11 +563,13 @@ def _match_storage(self, hash_values: List[str]) -> List[str]:
         consecutive prefix of hashes that are all present (prefix semantics
         are required because a cache miss in the middle breaks prefetch continuity).
 
-        Uses rank=0 key as a probe: if rank 0 has the block, all ranks
-        are assumed to have it (all ranks write storage synchronously).
+        Probes both rank=0 "key" and "value" kinds: a block is considered present
+        only when both exist.  This avoids false positives from partial writes where
+        only one kind was stored, and prevents LRU asymmetry (probing only "key"
+        would keep it hot while "value" gets evicted by Mooncake).
 
         Storage key format (see cache_utils.storage_key_for_block):
-            "{hash_value}_0_key"
+            "{hash_value}_0_key"  /  "{hash_value}_0_value"
 
         Args:
             hash_values: List of block hash values to check, in prefix order.
@@ -584,21 +587,27 @@ def _match_storage(self, hash_values: List[str]) -> List[str]:
                 logger.warning("_match_storage: storage scheduler disconnected, skipping storage match")
                 return []
 
-            # Build probe keys using rank=0 (same format as storage_key_for_block)
-            probe_keys = [storage_key_for_block(h, 0, "key") for h in hash_values]
+            # Probe both key and value kinds for rank=0.
+            # Interleaved: [h0_key, h0_value, h1_key, h1_value, ...]
+            probe_keys = []
+            for h in hash_values:
+                probe_keys.append(storage_key_for_block(h, 0, "key"))
+                probe_keys.append(storage_key_for_block(h, 0, "value"))
 
-            # batch_exists returns a bool list aligned with probe_keys
             exist_flags = self._storage_scheduler.batch_exists(probe_keys)
 
-            # Return only the leading consecutive hit run
+            # A block is present only when both key and value exist.
             matched = []
-            for h, exists in zip(hash_values, exist_flags):
-                if not exists:
+            for i, h in enumerate(hash_values):
+                key_ok = exist_flags[i * 2]
+                val_ok = exist_flags[i * 2 + 1]
+                if not (key_ok and val_ok):
                     break
                 matched.append(h)
 
             logger.debug(
-                f"[CacheManager] _match_storage: probing {len(probe_keys)} keys, matched hashes: {len(matched)}"
+                f"[CacheManager] _match_storage: probing {len(hash_values)} blocks "
+                f"({len(probe_keys)} keys), matched={len(matched)}"
             )
             return matched
         except Exception:
@@ -1001,6 +1010,7 @@ def drain_pending_prefetches(self) -> List[PendingPrefetch]:
     def prepare_prefetch_metadata(
         self,
         storage_hashes: List[str],
+        start_node: Optional["BlockNode"] = None,
     ) -> Optional[List["BlockNode"]]:
         """
         Prepare metadata for storage prefetch operation.
@@ -1010,6 +1020,10 @@ def prepare_prefetch_metadata(
 
         Args:
             storage_hashes: List of storage hash values to prefetch
+            start_node: Node to start insertion from in the radix tree.
+                        Pass the last matched node from find_prefix so that the
+                        new LOADING_FROM_STORAGE nodes extend the existing prefix
+                        chain. None (no matched prefix) inserts from the root.
 
         Returns:
             List of BlockNode objects if successful, None or empty list otherwise.
@@ -1032,17 +1046,24 @@ def prepare_prefetch_metadata(
 
                 blocks = list(zip(storage_hashes, host_block_ids))
                 prefetch_nodes, wasted_block_ids = self._radix_tree.insert(
-                    blocks=blocks, cache_status=CacheStatus.LOADING_FROM_STORAGE
+                    blocks=blocks, cache_status=CacheStatus.LOADING_FROM_STORAGE, start_node=start_node
                 )
                 # Release any blocks that were wasted due to node reuse
                 if wasted_block_ids:
                     self._host_pool.release(wasted_block_ids)
 
-                # Register nodes in prefetch_node_map for fast status update on done
+                # Register only truly new LOADING_FROM_STORAGE nodes.
+                # insert() reuses existing nodes without updating their status, so nodes
+                # that were already HOST/DEVICE must be excluded — they don't need a
+                # storage transfer and would trigger a spurious "unexpected status" warning
+                # in update_storage_blocks_to_host.
+                actual_prefetch_nodes = []
                 for node in prefetch_nodes:
-                    self._prefetch_node_map[node.block_id] = node
+                    if node.cache_status == CacheStatus.LOADING_FROM_STORAGE:
+                        self._prefetch_node_map[node.block_id] = node
+                        actual_prefetch_nodes.append(node)
 
-                return prefetch_nodes
+                return actual_prefetch_nodes
         except Exception as e:
             logger.error(f"prepare_prefetch_metadata error: {e}, {str(traceback.format_exc())}")
             return []
diff --git a/fastdeploy/cache_manager/v1/storage/base.py b/fastdeploy/cache_manager/v1/storage/base.py
@@ -295,6 +295,20 @@ def is_connected(self) -> bool:
         """Check if connected to storage."""
         return self._connected
 
+    def batch_exists(self, keys: List[str]) -> List[bool]:
+        """
+        Batch check key existence. Backends that support it should override.
+        Default returns False for all keys (conservative: assume missing).
+        """
+        return [False] * len(keys)
+
+    def batch_delete(self, keys: List[str]) -> List[bool]:
+        """
+        Delete multiple keys. Backends can override for efficiency.
+        Default falls back to calling delete() per key.
+        """
+        return [self.delete(k) for k in keys]
+
     def get_stats(self) -> Dict[str, Any]:
         """Get connector statistics."""
         return {
diff --git a/fastdeploy/cache_manager/v1/storage/mooncake/connector.py b/fastdeploy/cache_manager/v1/storage/mooncake/connector.py
@@ -635,6 +635,15 @@ def batch_set(
 
         return final_results
 
+    def batch_exists(self, keys: List[str]) -> List[bool]:
+        """Batch check key existence."""
+        if not self._connected or self._base._store is None:
+            return [False] * len(keys)
+        if not keys:
+            return []
+        results, _ = self._base._batch_exists(keys)
+        return [r == 1 for r in results]
+
     # ------------------------------------------------------------------
     # Delete / clear
     # ------------------------------------------------------------------
@@ -661,6 +670,21 @@ def delete(self, key: str, timeout: int = 5) -> bool:
         self.logger.error(f"delete({key!r}) timed out after {timeout}s")
         return False
 
+    def batch_delete(self, keys: List[str]) -> List[bool]:
+        """
+        Delete multiple keys from the store (single attempt, no retry).
+
+        Used for cleaning up partial writes where some kinds succeeded
+        and others failed. Returns per-key success flags.
+        """
+        if not self._connected or self._base._store is None:
+            return [False] * len(keys)
+        results = []
+        for key in keys:
+            rc = self._base._store.remove(key)
+            results.append(rc == 0)
+        return results
+
     def clear(self) -> int:
         """
         Remove all objects from the store.
diff --git a/fastdeploy/cache_manager/v1/storage/staging_manager.py b/fastdeploy/cache_manager/v1/storage/staging_manager.py
@@ -298,9 +298,22 @@ def batch_set_block(
 
             results = self._connector.batch_set(flat_keys, flat_ptrs, flat_sizes)
 
+            # Track which keys succeeded per block for partial-write cleanup.
+            block_ok_keys: Dict[int, List[str]] = {}
             for flat_idx, ok in enumerate(results):
-                if not ok:
-                    block_success[flat_index[flat_idx]] = False
+                bi = flat_index[flat_idx]
+                if ok:
+                    block_ok_keys.setdefault(bi, []).append(flat_keys[flat_idx])
+                else:
+                    block_success[bi] = False
+
+            # Rollback: if a block failed but some of its keys were written,
+            # delete those keys so the block appears fully absent in storage.
+            # This prevents _match_storage from finding a half-written block.
+            keys_to_rollback = [key for bi, keys in block_ok_keys.items() if not block_success[bi] for key in keys]
+            if keys_to_rollback:
+                logger.warning(f"[StagingManager] partial write on {len(keys_to_rollback)} key(s), rolling back")
+                self._connector.batch_delete(keys_to_rollback)
 
         return block_success
 
diff --git a/fastdeploy/cache_manager/v1/transfer_manager.py b/fastdeploy/cache_manager/v1/transfer_manager.py
@@ -879,7 +879,58 @@ def prefetch_from_storage(
             return [False] * len(hash_list)
 
         keys_per_kind, host_ptrs_per_kind = self._build_storage_io_args(hash_list)
-        return self._staging_manager.batch_get_block(keys_per_kind, host_ptrs_per_kind, cpu_block_list)
+        results = self._staging_manager.batch_get_block(keys_per_kind, host_ptrs_per_kind, cpu_block_list)
+
+        failed_indices = [i for i, ok in enumerate(results) if not ok]
+        if failed_indices and self._storage_connector is not None:
+            # For each failed block, check which storage keys are actually missing.
+            # keys_per_kind maps kind -> [key_for_block_0, key_for_block_1, ...]
+            probe_keys = []
+            probe_labels = []
+            for i in failed_indices:
+                for kind, keys in keys_per_kind.items():
+                    probe_keys.append(keys[i])
+                    probe_labels.append((i, cpu_block_list[i], hash_list[i], kind))
+
+            try:
+                exist_flags = self._storage_connector.batch_exists(probe_keys)
+
+                # Aggregate per-block which kinds are missing and which exist.
+                # block_idx -> {"cpu_bid", "hash", "missing": [kinds], "existing": [kinds]}
+                block_diag: Dict[int, Dict] = {}
+                for (bi, cpu_bid, h, kind), ok in zip(probe_labels, exist_flags):
+                    if bi not in block_diag:
+                        block_diag[bi] = {"cpu_bid": cpu_bid, "hash": h, "missing": [], "existing": []}
+                    if ok:
+                        block_diag[bi]["existing"].append(kind)
+                    else:
+                        block_diag[bi]["missing"].append(kind)
+
+                # Blocks with at least one missing kind
+                partial_missing = {bi: v for bi, v in block_diag.items() if v["missing"]}
+                # Blocks where all kinds exist (pure transfer error)
+                pure_transfer_err = {bi: v for bi, v in block_diag.items() if not v["missing"]}
+
+                if partial_missing:
+                    detail = [
+                        f"cpu_block={v['cpu_bid']} hash={v['hash'][:16]}.. "
+                        f"missing_kinds={v['missing']} existing_kinds={v['existing']}"
+                        for v in partial_missing.values()
+                    ]
+                    logger.warning(
+                        f"[TransferManager] prefetch_from_storage: {len(partial_missing)} block(s) have missing keys — "
+                        + "; ".join(detail)
+                    )
+                if pure_transfer_err:
+                    detail = [f"cpu_block={v['cpu_bid']} hash={v['hash'][:16]}.." for v in pure_transfer_err.values()]
+                    logger.warning(
+                        f"[TransferManager] prefetch_from_storage: {len(pure_transfer_err)} block(s) keys exist but transfer failed — "
+                        + ", ".join(detail)
+                    )
+            except Exception as e:
+                logger.warning(f"[TransferManager] prefetch_from_storage: failed to probe missing keys: {e}")
+
+        return results
 
     def backup_to_storage(
         self,
@@ -924,4 +975,13 @@ def backup_to_storage(
             return [False] * len(cpu_block_list)
 
         keys_per_kind, host_ptrs_per_kind = self._build_storage_io_args(hash_list)
-        return self._staging_manager.batch_set_block(keys_per_kind, host_ptrs_per_kind, cpu_block_list)
+        results = self._staging_manager.batch_set_block(keys_per_kind, host_ptrs_per_kind, cpu_block_list)
+
+        failed = [(cpu_block_list[i], hash_list[i]) for i, ok in enumerate(results) if not ok]
+        if failed:
+            logger.warning(
+                f"[TransferManager] backup_to_storage: {len(failed)}/{len(cpu_block_list)} block(s) failed — "
+                + ", ".join(f"cpu_block={cb} hash={h[:16]}.." for cb, h in failed)
+            )
+
+        return results
diff --git a/tests/cache_manager/v1/test_cache_manager.py b/tests/cache_manager/v1/test_cache_manager.py
@@ -918,5 +918,65 @@ def test_issue_returns_none_when_host_cache_disabled(self):
         self.assertEqual(cm.get_pending_backup_count(), 0)
 
 
+class TestPreparePrefixtMetadataStartNode(unittest.TestCase):
+    """Regression test for the start_node bug in prepare_prefetch_metadata.
+
+    Before the fix, prepare_prefetch_metadata called radix_tree.insert without
+    start_node, which inserted LOADING_FROM_STORAGE nodes as children of root
+    (using storage hashes h22..h29 at depth 1) instead of as extensions of the
+    existing device prefix chain (at depth 22..29). As a result, a subsequent
+    find_prefix on the full hash list would traverse root → h0 → ... → h21,
+    then fail to find h22 as a child of node(21), and stop at 22 nodes — never
+    reaching the HOST nodes even after update_storage_blocks_to_host.
+    """
+
+    def test_find_prefix_finds_host_blocks_after_prefetch(self):
+        """After prepare_prefetch_metadata + update_storage_blocks_to_host,
+        find_prefix must return all 30 nodes (22 DEVICE + 8 HOST)."""
+        from fastdeploy.cache_manager.v1.metadata import CacheStatus
+
+        cm = create_cache_manager(total_block_num=50, num_cpu_blocks=20)
+        rt = cm._radix_tree
+
+        # Build 30 hashes: 22 for device, 8 for storage
+        all_hashes = [f"h{i}" for i in range(30)]
+        device_hashes = all_hashes[:22]
+        storage_hashes = all_hashes[22:]
+
+        # Insert 22 device blocks into the radix tree
+        device_block_ids = cm._device_pool.allocate(22)
+        self.assertIsNotNone(device_block_ids)
+        device_nodes, _ = rt.insert(
+            blocks=list(zip(device_hashes, device_block_ids)),
+            cache_status=CacheStatus.DEVICE,
+        )
+        self.assertEqual(len(device_nodes), 22)
+
+        # The last device node is the correct start_node for the storage insertion
+        last_device_node = device_nodes[-1]
+
+        # prepare_prefetch_metadata should attach storage nodes AFTER the last device node
+        storage_nodes = cm.prepare_prefetch_metadata(storage_hashes, start_node=last_device_node)
+        self.assertEqual(len(storage_nodes), 8)
+        for node in storage_nodes:
+            self.assertEqual(node.cache_status, CacheStatus.LOADING_FROM_STORAGE)
+
+        # Simulate prefetch completion: transition LOADING_FROM_STORAGE → HOST
+        storage_block_ids = [n.block_id for n in storage_nodes]
+        for node in storage_nodes:
+            cm._prefetch_node_map[node.block_id] = node
+        cm.update_storage_blocks_to_host(storage_block_ids)
+        for node in storage_nodes:
+            self.assertEqual(node.cache_status, CacheStatus.HOST)
+
+        # Now find_prefix on all 30 hashes must return 30 nodes
+        found = rt.find_prefix(all_hashes)
+        self.assertEqual(len(found), 30, f"Expected 30 nodes, got {len(found)}")
+        device_found = [n for n in found if n.is_on_device()]
+        host_found = [n for n in found if n.is_on_host()]
+        self.assertEqual(len(device_found), 22)
+        self.assertEqual(len(host_found), 8)
+
+
 if __name__ == "__main__":
     unittest.main()