@@ -1070,9 +1070,12 @@ def _allocate_decode_and_extend():
10701070 self .cache_manager .num_cpu_blocks > 0
10711071 or self .config .cache_config .kvcache_storage_backend
10721072 ):
1073- if not self .cache_manager . can_allocate_gpu_blocks (
1073+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
10741074 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
10751075 // self .config .cache_config .block_size
1076+ )
1077+ if not self .cache_manager .can_allocate_gpu_blocks (
1078+ can_schedule_block_num_threshold
10761079 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
10771080 break
10781081 success = self .get_prefix_cached_blocks (request )
@@ -1134,6 +1137,7 @@ def _allocate_decode_and_extend():
11341137 self .req_dict [request .request_id ] = allocated_position
11351138 llm_logger .debug (f"req_id:{ request .request_id } allocate pos end" )
11361139 else :
1140+ # Warning: calling _free_blocks before update_cache_blocks may leak storage blocks
11371141 if self .config .cache_config .enable_prefix_caching :
11381142 self ._free_blocks (request )
11391143 break
@@ -1150,9 +1154,12 @@ def _allocate_decode_and_extend():
11501154 self .cache_manager .num_cpu_blocks > 0
11511155 or self .config .cache_config .kvcache_storage_backend
11521156 ):
1153- if not self .cache_manager . can_allocate_gpu_blocks (
1157+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
11541158 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
11551159 // self .config .cache_config .block_size
1160+ )
1161+ if not self .cache_manager .can_allocate_gpu_blocks (
1162+ can_schedule_block_num_threshold
11561163 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
11571164 break
11581165 success = self .get_prefix_cached_blocks (request )
@@ -1196,6 +1203,7 @@ def _allocate_decode_and_extend():
11961203 )
11971204 request .status = RequestStatus .RUNNING_PREFILL
11981205 else :
1206+ # Warning: calling _free_blocks before update_cache_blocks may leak storage blocks
11991207 if self .config .cache_config .enable_prefix_caching :
12001208 self ._free_blocks (request )
12011209 break
0 commit comments