@@ -1063,9 +1063,12 @@ def _allocate_decode_and_extend():
10631063 self .cache_manager .num_cpu_blocks > 0
10641064 or self .config .cache_config .kvcache_storage_backend
10651065 ):
1066- if not self .cache_manager . can_allocate_gpu_blocks (
1066+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
10671067 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
10681068 // self .config .cache_config .block_size
1069+ )
1070+ if not self .cache_manager .can_allocate_gpu_blocks (
1071+ can_schedule_block_num_threshold
10691072 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
10701073 break
10711074 success = self .get_prefix_cached_blocks (request )
@@ -1124,6 +1127,7 @@ def _allocate_decode_and_extend():
11241127 self .req_dict [request .request_id ] = allocated_position
11251128 llm_logger .debug (f"req_id:{ request .request_id } allocate pos end" )
11261129 else :
1130+ # Warning: calling _free_blocks before update_cache_blocks may leak storage blocks
11271131 if self .config .cache_config .enable_prefix_caching :
11281132 self ._free_blocks (request )
11291133 break
@@ -1139,9 +1143,12 @@ def _allocate_decode_and_extend():
11391143 self .cache_manager .num_cpu_blocks > 0
11401144 or self .config .cache_config .kvcache_storage_backend
11411145 ):
1142- if not self .cache_manager . can_allocate_gpu_blocks (
1146+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
11431147 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
11441148 // self .config .cache_config .block_size
1149+ )
1150+ if not self .cache_manager .can_allocate_gpu_blocks (
1151+ can_schedule_block_num_threshold
11451152 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
11461153 break
11471154 success = self .get_prefix_cached_blocks (request )
@@ -1186,6 +1193,7 @@ def _allocate_decode_and_extend():
11861193 )
11871194 request .status = RequestStatus .RUNNING_PREFILL
11881195 else :
1196+ # Warning: calling _free_blocks before update_cache_blocks may leak storage blocks
11891197 if self .config .cache_config .enable_prefix_caching :
11901198 self ._free_blocks (request )
11911199 break
0 commit comments