@@ -1070,9 +1070,12 @@ def _allocate_decode_and_extend():
10701070 self .cache_manager .num_cpu_blocks > 0
10711071 or self .config .cache_config .kvcache_storage_backend
10721072 ):
1073- if not self .cache_manager . can_allocate_gpu_blocks (
1073+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
10741074 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
10751075 // self .config .cache_config .block_size
1076+ )
1077+ if not self .cache_manager .can_allocate_gpu_blocks (
1078+ can_schedule_block_num_threshold
10761079 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
10771080 break
10781081 success = self .get_prefix_cached_blocks (request )
@@ -1150,9 +1153,12 @@ def _allocate_decode_and_extend():
11501153 self .cache_manager .num_cpu_blocks > 0
11511154 or self .config .cache_config .kvcache_storage_backend
11521155 ):
1153- if not self .cache_manager . can_allocate_gpu_blocks (
1156+ can_schedule_block_num_threshold = self ._get_can_schedule_prefill_threshold_block (
11541157 (request .need_prefill_tokens + self .config .cache_config .block_size - 1 )
11551158 // self .config .cache_config .block_size
1159+ )
1160+ if not self .cache_manager .can_allocate_gpu_blocks (
1161+ can_schedule_block_num_threshold
11561162 ): # to prevent block allocation for matching in hierarchical cache and cause dead lock
11571163 break
11581164 success = self .get_prefix_cached_blocks (request )
@@ -1196,6 +1202,7 @@ def _allocate_decode_and_extend():
11961202 )
11971203 request .status = RequestStatus .RUNNING_PREFILL
11981204 else :
1205+ # Warning: calling _free_blocks before update_cache_blocks may cause a storage block leak
11991206 if self .config .cache_config .enable_prefix_caching :
12001207 self ._free_blocks (request )
12011208 break
0 commit comments