fix: compute engine max_concurrency from worker KV cache configs to match runtime reporting (#1622)

lucyknada · web-flow · commit a63d017844d2 · 2026-02-18T01:25:13.000+04:30
diff --git a/aphrodite/v1/core/kv_cache_utils.py b/aphrodite/v1/core/kv_cache_utils.py
@@ -737,6 +737,25 @@ def get_max_concurrency_for_kv_cache_config(aphrodite_config: AphroditeConfig, k
     return max_concurrency
 
 
+def get_max_concurrency_for_kv_cache_configs(
+    aphrodite_config: AphroditeConfig,
+    kv_cache_configs: Sequence[KVCacheConfig],
+) -> float:
+    """
+    Get the maximum concurrency that is simultaneously feasible on all workers.
+
+    Returns 0 for attention-free models where no worker has KV cache groups.
+    """
+    max_concurrencies = [
+        get_max_concurrency_for_kv_cache_config(aphrodite_config, cfg)
+        for cfg in kv_cache_configs
+        if len(cfg.kv_cache_groups) > 0  # configs without KV cache groups are skipped
+    ]
+    if not max_concurrencies:  # empty input, or every config was skipped above
+        return 0.0
+    return min(max_concurrencies)  # smallest per-config value is the one every worker can meet
+
+
 def get_kv_cache_size_tokens(aphrodite_config: AphroditeConfig, kv_cache_config: KVCacheConfig) -> int:
     """
     Get the total number of tokens that can be stored in the KV cache.
diff --git a/aphrodite/v1/engine/core.py b/aphrodite/v1/engine/core.py
@@ -36,7 +36,7 @@
     generate_scheduler_kv_cache_config,
     get_kv_cache_configs,
     get_kv_cache_size_tokens,
-    get_max_concurrency_for_kv_cache_config,
+    get_max_concurrency_for_kv_cache_configs,
     get_request_block_hasher,
     init_none_hash,
 )
@@ -103,11 +103,11 @@ def __init__(
         self.available_gpu_memory_for_kv_cache = -1
 
         # Setup KV Caches and update CacheConfig after profiling.
-        num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(aphrodite_config)
+        num_gpu_blocks, num_cpu_blocks, kv_cache_config, max_concurrency = self._initialize_kv_caches(aphrodite_config)
         self.kv_cache_config = kv_cache_config
 
         # Cache KV cache properties for synchronous access
-        self._max_concurrency = get_max_concurrency_for_kv_cache_config(aphrodite_config, kv_cache_config)
+        self._max_concurrency = max_concurrency
         self._kv_cache_size_tokens = get_kv_cache_size_tokens(aphrodite_config, kv_cache_config)
 
         aphrodite_config.cache_config.num_gpu_blocks = num_gpu_blocks
@@ -206,7 +206,7 @@ def __init__(
 
         self.step_fn = self.step if self.batch_queue is None else self.step_with_batch_queue
 
-    def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int, int, KVCacheConfig]:
+    def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int, int, KVCacheConfig, float]:
         start = time.time()
 
         # Get all kv cache needed by the model
@@ -231,6 +231,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
         assert len(kv_cache_specs) == len(available_gpu_memory)
 
         kv_cache_configs = get_kv_cache_configs(aphrodite_config, kv_cache_specs, available_gpu_memory)
+        max_concurrency = get_max_concurrency_for_kv_cache_configs(aphrodite_config, kv_cache_configs)
         scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
         num_gpu_blocks = scheduler_kv_cache_config.num_blocks
         num_cpu_blocks = 0
@@ -252,7 +253,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
                 "metrics, set APHRODITE_REQUEST_LEVEL_METRICS=0.",
                 scope="global",
             )
-        return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
+        return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config, max_concurrency
 
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.modeling.supported_tasks
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
@@ -23,6 +23,7 @@
     get_kv_cache_configs,
     get_kv_cache_size_tokens,
     get_max_concurrency_for_kv_cache_config,
+    get_max_concurrency_for_kv_cache_configs,
     get_request_block_hasher,
     hash_block_tokens,
     init_none_hash,
@@ -1156,6 +1157,75 @@ def test_get_max_concurrency_for_kv_cache_config():
     assert max_concurrency_hybrid_model == 3
 
 
+def test_get_max_concurrency_for_kv_cache_configs():
+    model_id = "Qwen/Qwen1.5-7B"
+    max_model_len = 16384
+    model_config = ModelConfig(
+        model_id,
+        runner="generate",
+        dtype="float16",
+        max_model_len=max_model_len,
+    )
+    scheduler_config = SchedulerConfig(max_num_batched_tokens=1024, enable_chunked_prefill=True)
+    aphrodite_config = AphroditeConfig(
+        model_config=model_config,
+        scheduler_config=scheduler_config,
+    )
+
+    full_attention_spec = FullAttentionSpec(
+        block_size=16,
+        num_kv_heads=32,
+        head_size=128,
+        dtype=torch.float16,
+    )
+
+    low_concurrency_cfg = KVCacheConfig(
+        num_blocks=1024,  # 1024 blocks x block_size 16 = 16384 tokens, equal to max_model_len
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)], full_attention_spec),
+        ],
+        kv_bytes_per_block=1,
+    )
+    high_concurrency_cfg = KVCacheConfig(
+        num_blocks=1024 * 3,  # three times the blocks of low_concurrency_cfg
+        kv_cache_tensors=[],
+        kv_cache_groups=[
+            KVCacheGroupSpec([f"layer_{i}" for i in range(32)], full_attention_spec),
+        ],
+        kv_bytes_per_block=1,
+    )
+
+    max_concurrency = get_max_concurrency_for_kv_cache_configs(
+        aphrodite_config,
+        [high_concurrency_cfg, low_concurrency_cfg],  # larger config first, smaller second
+    )
+    assert max_concurrency == 1  # min over the configs; presumably low_concurrency_cfg is the limiting one
+
+
+def test_get_max_concurrency_for_kv_cache_configs_attention_free_model():
+    model_config = ModelConfig(
+        "Qwen/Qwen1.5-7B",
+        runner="generate",
+        dtype="float16",
+        max_model_len=16384,
+    )
+    scheduler_config = SchedulerConfig(max_num_batched_tokens=1024, enable_chunked_prefill=True)
+    aphrodite_config = AphroditeConfig(
+        model_config=model_config,
+        scheduler_config=scheduler_config,
+    )
+
+    attention_free_cfg = KVCacheConfig(
+        num_blocks=1,
+        kv_cache_tensors=[],
+        kv_cache_groups=[],  # empty, so the helper's group filter drops this config
+        kv_bytes_per_block=0,
+    )
+
+    assert get_max_concurrency_for_kv_cache_configs(aphrodite_config, [attention_free_cfg]) == 0
+
+
 def test_get_kv_cache_size_tokens():
     """Test get_kv_cache_size_tokens function."""
     model_id = "Qwen/Qwen1.5-7B"