@@ -117,8 +117,8 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
117117 preempted_req = self .running .pop ()
118118 preempted_req .status = RequestStatus .PREEMPTED
119119 preempted_req .num_computed_tokens = 0
120- preempted_req .prefill_block_num = 0
121120 self ._free_blocks (preempted_req )
121+ preempted_req .prefill_block_num = None
122122 self .to_be_rescheduled_request_id_set .add (preempted_req .request_id )
123123 preempted_reqs .append (preempted_req )
124124 scheduled_reqs .append (self ._prepare_preempt_task (preempted_req ))
@@ -305,6 +305,7 @@ def schedule(self):
305305 if self .config .cache_config .enable_prefix_caching :
306306 success = self .get_prefix_cached_blocks (request )
307307 if not success :
308+ self ._free_blocks (request )
308309 break
309310
310311 num_new_tokens = self ._get_num_new_tokens (request , token_budget )
@@ -327,23 +328,33 @@ def schedule(self):
327328 self .stop_flags [allocated_position ] = False
328329 self .req_dict [request .request_id ] = allocated_position
329330 else :
331+ if self .config .cache_config .enable_prefix_caching :
332+ self ._free_blocks (request )
330333 break
331334 elif request .status == RequestStatus .PREEMPTED :
332335 request .need_prefill_tokens = (
333336 request .num_total_tokens
334337 ) # Before preempted task rescheduled, preempted task has been sent to engine, no more tokens are output, here num_total_tokens should be static and correct
338+ if self .config .cache_config .enable_prefix_caching :
339+ success = self .get_prefix_cached_blocks (request )
340+ if not success :
341+ self ._free_blocks (request )
342+ break
335343 num_new_tokens = self ._get_num_new_tokens (request , token_budget )
336344 num_new_block = self .get_new_block_nums (request , num_new_tokens )
337345 # Allocate blocks to prefill
338346 if self .cache_manager .can_allocate_gpu_blocks (num_new_block ):
339- request .block_tables .extend (self .cache_manager .allocate_gpu_blocks (num_new_block ))
347+ if not request .get ("skip_allocate" , False ):
348+ request .block_tables .extend (self .cache_manager .allocate_gpu_blocks (num_new_block ))
340349 self .waiting .popleft ()
341350 self .running .append (request )
342351 scheduled_reqs .append (self ._prepare_prefill_task (request , num_new_tokens ))
343352 token_budget -= num_new_tokens
344353 request .num_computed_tokens += num_new_tokens
345354 request .status = RequestStatus .RUNNING
346355 else :
356+ if self .config .cache_config .enable_prefix_caching :
357+ self ._free_blocks (request )
347358 break
348359 else :
349360 llm_logger .error ("Unknown request status type" )
@@ -399,7 +410,7 @@ def get_prefix_cached_blocks(self, request: Request):
399410 main_process_metrics .prefix_cpu_cache_token_num .inc (request .cpu_cache_token_num )
400411
401412 if matched_token_num == request .prompt_token_ids_len :
402- request .num_computed_tokens = matched_token_num - 1
413+ request .num_computed_tokens = matched_token_num - self . config . cache_config . block_size
403414 request .skip_allocate = True
404415 else :
405416 request .num_computed_tokens = matched_token_num
@@ -417,8 +428,15 @@ def add_request(self, request: Request) -> None:
417428 def _free_blocks (self , request : Request ):
418429 if self .config .cache_config .enable_prefix_caching :
419430 # TODO(chengyanfu): support cache output blocks for prefix caching
420- self .cache_manager .release_block_ids_async (request )
421- self .cache_manager .recycle_gpu_blocks (request .block_tables [request .prefill_block_num :])
431+ if request .get ("prefill_block_num" , None ) is None :
432+ leaf_node = self .cache_manager .req_leaf_map [request .request_id ]
433+ self .cache_manager .decrease_request_share_count (request .request_id )
434+ self .cache_manager .free_nodes_directly (leaf_node )
435+ self .cache_manager .recycle_gpu_blocks (request .block_tables [request .cache_info [0 ]:])
436+
437+ else :
438+ self .cache_manager .release_block_ids_async (request )
439+ self .cache_manager .recycle_gpu_blocks (request .block_tables [request .prefill_block_num :])
422440 else :
423441 self .cache_manager .recycle_gpu_blocks (request .block_tables )
424442 request .block_tables = []
0 commit comments