@@ -229,6 +229,8 @@ def get_new_block_nums(self, request: Request, num_new_tokens: int):
229229
230230 if self .config .speculative_config .method is not None :
231231 block_num = min (block_num + 1 , self .config .cache_config .max_block_num_per_seq )
232+ else :
233+ block_num = min (block_num , self .config .cache_config .max_block_num_per_seq )
232234 return block_num
233235
234236 def _prepare_prefill_task (self , request , new_token_num ):
@@ -926,7 +928,7 @@ def _allocate_decode_and_extend():
926928 )
927929 # Allocate blocks to prefill
928930 if self .cache_manager .can_allocate_gpu_blocks (can_schedule_block_num_threshold ):
929- if not request . get ( "skip_allocate" , False ) :
931+ if num_new_block > 0 :
930932 extra_gpu_block_ids = self .cache_manager .allocate_gpu_blocks (
931933 num_new_block , request .request_id
932934 )
@@ -985,7 +987,7 @@ def _allocate_decode_and_extend():
985987 )
986988 # Allocate blocks to prefill
987989 if self .cache_manager .can_allocate_gpu_blocks (can_schedule_block_num_threshold ):
988- if not request . get ( "skip_allocate" , False ) :
990+ if num_new_block > 0 :
989991 extra_gpu_block_ids = self .cache_manager .allocate_gpu_blocks (
990992 num_new_block , request .request_id
991993 )
@@ -1166,19 +1168,16 @@ def get_prefix_cached_blocks(self, request: Request):
11661168
11671169 request .cache_info = [matched_block_num , no_cache_block_num ]
11681170 request .block_tables = common_block_ids
1169- request .skip_allocate = False
11701171 request .num_cached_tokens = matched_token_num
11711172 if self .config .cache_config .disable_chunked_mm_input :
11721173 if matched_token_num == request .need_prefill_tokens :
11731174 matched_token_num = matched_token_num - self .config .cache_config .block_size
1174- request .skip_allocate = True
11751175 request .num_computed_tokens = self .revert_chunked_mm_input (
11761176 request .multimodal_inputs , matched_token_num
11771177 )
11781178 else :
11791179 if matched_token_num == request .need_prefill_tokens :
11801180 request .num_computed_tokens = matched_token_num - self .config .cache_config .block_size
1181- request .skip_allocate = True
11821181 else :
11831182 request .num_computed_tokens = matched_token_num
11841183
0 commit comments