Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion tensorrt_llm/runtime/kv_cache_manager_v2/_storage_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,17 @@ def __init__(
gpu_quota = config.cache_tiers[GPU_LEVEL].quota
gpu_granularity = CacheLevelManager.cache_tier_granularity(CacheTier.GPU_MEM, gpu_quota)

# Synthesize an extra constraint from typical_batch so that windowed pool
# groups (window_size < tokens_per_block) don't collapse to min_slots = 1
# and deadlock the scheduler at concurrency > 1.
# KVCacheDesc(capacity=tokens_per_block, history_length=tokens_per_block - 1)
# yields non_stale = 1 per request in every pool group, so adding one such
# descriptor per entry in typical_batch.kv_caches floors min_slots at the
# concurrency level.
effective_constraints = list(constraints or [])
if typical_batch is not None and typical_batch.kv_caches:
one_decode = KVCacheDesc(capacity=tokens_per_block, history_length=tokens_per_block - 1)
effective_constraints.append(BatchDesc([one_decode] * len(typical_batch.kv_caches)))
self._min_slots = self._compute_min_slots_from_constraints(
constraints or [], tokens_per_block, swa_scratch_reuse
effective_constraints, tokens_per_block, swa_scratch_reuse
)

# Compute init_ratio from typical_batch, constraints, or fallback.
Expand Down
Loading