3636 generate_scheduler_kv_cache_config ,
3737 get_kv_cache_configs ,
3838 get_kv_cache_size_tokens ,
39- get_max_concurrency_for_kv_cache_config ,
39+ get_max_concurrency_for_kv_cache_configs ,
4040 get_request_block_hasher ,
4141 init_none_hash ,
4242)
@@ -103,11 +103,11 @@ def __init__(
103103 self .available_gpu_memory_for_kv_cache = - 1
104104
105105 # Setup KV Caches and update CacheConfig after profiling.
106- num_gpu_blocks , num_cpu_blocks , kv_cache_config = self ._initialize_kv_caches (aphrodite_config )
106+ num_gpu_blocks , num_cpu_blocks , kv_cache_config , max_concurrency = self ._initialize_kv_caches (aphrodite_config )
107107 self .kv_cache_config = kv_cache_config
108108
109109 # Cache KV cache properties for synchronous access
110- self ._max_concurrency = get_max_concurrency_for_kv_cache_config ( aphrodite_config , kv_cache_config )
110+ self ._max_concurrency = max_concurrency
111111 self ._kv_cache_size_tokens = get_kv_cache_size_tokens (aphrodite_config , kv_cache_config )
112112
113113 aphrodite_config .cache_config .num_gpu_blocks = num_gpu_blocks
@@ -206,7 +206,7 @@ def __init__(
206206
207207 self .step_fn = self .step if self .batch_queue is None else self .step_with_batch_queue
208208
209- def _initialize_kv_caches (self , aphrodite_config : AphroditeConfig ) -> tuple [int , int , KVCacheConfig ]:
209+ def _initialize_kv_caches (self , aphrodite_config : AphroditeConfig ) -> tuple [int , int , KVCacheConfig , float ]:
210210 start = time .time ()
211211
212212 # Get all kv cache needed by the model
@@ -231,6 +231,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
231231 assert len (kv_cache_specs ) == len (available_gpu_memory )
232232
233233 kv_cache_configs = get_kv_cache_configs (aphrodite_config , kv_cache_specs , available_gpu_memory )
234+ max_concurrency = get_max_concurrency_for_kv_cache_configs (aphrodite_config , kv_cache_configs )
234235 scheduler_kv_cache_config = generate_scheduler_kv_cache_config (kv_cache_configs )
235236 num_gpu_blocks = scheduler_kv_cache_config .num_blocks
236237 num_cpu_blocks = 0
@@ -252,7 +253,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
252253 "metrics, set APHRODITE_REQUEST_LEVEL_METRICS=0." ,
253254 scope = "global" ,
254255 )
255- return num_gpu_blocks , num_cpu_blocks , scheduler_kv_cache_config
256+ return num_gpu_blocks , num_cpu_blocks , scheduler_kv_cache_config , max_concurrency
256257
257258 def get_supported_tasks (self ) -> tuple [SupportedTask , ...]:
258259 return self .modeling .supported_tasks
0 commit comments