Skip to content

Commit f6ccb6a

Browse files
Merge branch 'release/2.2' into release/2.2
2 parents 3f1f0ed + c7c1627 commit f6ccb6a

4 files changed

Lines changed: 27 additions & 2 deletions

File tree

fastdeploy/engine/common_engine.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,8 +527,14 @@ def _fetch_request():
527527
self.cfg.max_prefill_batch,
528528
)
529529

530+
if self.cfg.model_config.enable_mm:
531+
self.resource_manager.check_and_free_block_tables()
532+
available_blocks = self.resource_manager.available_block_num()
533+
else:
534+
available_blocks = self.cfg.cache_config.max_block_num_per_seq
535+
530536
tasks = self.scheduler.get_requests(
531-
available_blocks=self.cfg.cache_config.max_block_num_per_seq,
537+
available_blocks=available_blocks,
532538
block_size=self.cfg.cache_config.block_size,
533539
reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
534540
max_num_batched_tokens=self.cfg.max_model_len,

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,15 @@ def schedule(self):
348348
if request.status == RequestStatus.WAITING:
349349
# Enable prefix caching
350350
if self.config.cache_config.enable_prefix_caching:
351+
if (
352+
self.config.cache_config.enable_hierarchical_cache
353+
and self.cache_manager.num_cpu_blocks > 0
354+
):
355+
if not self.cache_manager.can_allocate_gpu_blocks(
356+
(request.need_prefill_tokens + self.config.cache_config.block_size - 1)
357+
// self.config.cache_config.block_size
358+
): # prevent block allocation during hierarchical-cache matching from causing a deadlock
359+
break
351360
success = self.get_prefix_cached_blocks(request)
352361
if not success:
353362
self._free_blocks(request)
@@ -387,6 +396,15 @@ def schedule(self):
387396
request.num_total_tokens
388397
) # A preempted task was already sent to the engine before being rescheduled and outputs no more tokens, so num_total_tokens should be static and correct here
389398
if self.config.cache_config.enable_prefix_caching:
399+
if (
400+
self.config.cache_config.enable_hierarchical_cache
401+
and self.cache_manager.num_cpu_blocks > 0
402+
):
403+
if not self.cache_manager.can_allocate_gpu_blocks(
404+
(request.need_prefill_tokens + self.config.cache_config.block_size - 1)
405+
// self.config.cache_config.block_size
406+
): # prevent block allocation during hierarchical-cache matching from causing a deadlock
407+
break
390408
success = self.get_prefix_cached_blocks(request)
391409
if not success:
392410
self._free_blocks(request)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ tqdm
1010
pynvml
1111
uvicorn==0.29.0
1212
fastapi
13-
paddleformers==0.2.1
13+
paddleformers>=0.2.3
1414
redis
1515
etcd3
1616
httpx

tests/cov_pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ addopts =
2222
--ignore=tests/operators/test_flash_mask_attn.py
2323
--ignore=tests/operators/test_w4afp8_gemm.py
2424
--ignore=tests/operators/test_tree_mask.py
25+
--ignore=tests/model_loader/test_common_model.py

0 commit comments

Comments
 (0)