Skip to content

Commit a63d017

Browse files
authored
fix: compute engine max_concurrency from worker KV cache configs to match runtime reporting (#1622)
1 parent f0f652a commit a63d017

3 files changed

Lines changed: 95 additions & 5 deletions

File tree

aphrodite/v1/core/kv_cache_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,25 @@ def get_max_concurrency_for_kv_cache_config(aphrodite_config: AphroditeConfig, k
737737
return max_concurrency
738738

739739

740+
def get_max_concurrency_for_kv_cache_configs(
    aphrodite_config: AphroditeConfig,
    kv_cache_configs: Sequence[KVCacheConfig],
) -> float:
    """
    Return the concurrency that every worker can sustain at the same time.

    Each config that has at least one KV cache group is evaluated with
    ``get_max_concurrency_for_kv_cache_config`` and the smallest value is
    returned. Configs without KV cache groups are skipped.

    Returns 0 for attention-free models where no worker has KV cache groups.
    """
    # Lazily evaluate one value per worker config that actually has KV cache groups.
    per_worker_concurrency = (
        get_max_concurrency_for_kv_cache_config(aphrodite_config, worker_cfg)
        for worker_cfg in kv_cache_configs
        if worker_cfg.kv_cache_groups
    )
    # The most constrained worker bounds the result; nothing to evaluate means 0.0.
    return min(per_worker_concurrency, default=0.0)
757+
758+
740759
def get_kv_cache_size_tokens(aphrodite_config: AphroditeConfig, kv_cache_config: KVCacheConfig) -> int:
741760
"""
742761
Get the total number of tokens that can be stored in the KV cache.

aphrodite/v1/engine/core.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
generate_scheduler_kv_cache_config,
3737
get_kv_cache_configs,
3838
get_kv_cache_size_tokens,
39-
get_max_concurrency_for_kv_cache_config,
39+
get_max_concurrency_for_kv_cache_configs,
4040
get_request_block_hasher,
4141
init_none_hash,
4242
)
@@ -103,11 +103,11 @@ def __init__(
103103
self.available_gpu_memory_for_kv_cache = -1
104104

105105
# Setup KV Caches and update CacheConfig after profiling.
106-
num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(aphrodite_config)
106+
num_gpu_blocks, num_cpu_blocks, kv_cache_config, max_concurrency = self._initialize_kv_caches(aphrodite_config)
107107
self.kv_cache_config = kv_cache_config
108108

109109
# Cache KV cache properties for synchronous access
110-
self._max_concurrency = get_max_concurrency_for_kv_cache_config(aphrodite_config, kv_cache_config)
110+
self._max_concurrency = max_concurrency
111111
self._kv_cache_size_tokens = get_kv_cache_size_tokens(aphrodite_config, kv_cache_config)
112112

113113
aphrodite_config.cache_config.num_gpu_blocks = num_gpu_blocks
@@ -206,7 +206,7 @@ def __init__(
206206

207207
self.step_fn = self.step if self.batch_queue is None else self.step_with_batch_queue
208208

209-
def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int, int, KVCacheConfig]:
209+
def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int, int, KVCacheConfig, float]:
210210
start = time.time()
211211

212212
# Get all kv cache needed by the model
@@ -231,6 +231,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
231231
assert len(kv_cache_specs) == len(available_gpu_memory)
232232

233233
kv_cache_configs = get_kv_cache_configs(aphrodite_config, kv_cache_specs, available_gpu_memory)
234+
max_concurrency = get_max_concurrency_for_kv_cache_configs(aphrodite_config, kv_cache_configs)
234235
scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
235236
num_gpu_blocks = scheduler_kv_cache_config.num_blocks
236237
num_cpu_blocks = 0
@@ -252,7 +253,7 @@ def _initialize_kv_caches(self, aphrodite_config: AphroditeConfig) -> tuple[int,
252253
"metrics, set APHRODITE_REQUEST_LEVEL_METRICS=0.",
253254
scope="global",
254255
)
255-
return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
256+
return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config, max_concurrency
256257

257258
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
258259
return self.modeling.supported_tasks

tests/v1/core/test_kv_cache_utils.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
get_kv_cache_configs,
2424
get_kv_cache_size_tokens,
2525
get_max_concurrency_for_kv_cache_config,
26+
get_max_concurrency_for_kv_cache_configs,
2627
get_request_block_hasher,
2728
hash_block_tokens,
2829
init_none_hash,
@@ -1156,6 +1157,75 @@ def test_get_max_concurrency_for_kv_cache_config():
11561157
assert max_concurrency_hybrid_model == 3
11571158

11581159

1160+
def test_get_max_concurrency_for_kv_cache_configs():
    """
    Check the multi-config helper against two configs that differ only in num_blocks.

    The configs (1024 and 3072 blocks) are passed larger-first; the expected
    result is 1.
    """
    context_len = 16384
    aphrodite_config = AphroditeConfig(
        model_config=ModelConfig(
            "Qwen/Qwen1.5-7B",
            runner="generate",
            dtype="float16",
            max_model_len=context_len,
        ),
        scheduler_config=SchedulerConfig(max_num_batched_tokens=1024, enable_chunked_prefill=True),
    )

    attn_spec = FullAttentionSpec(
        block_size=16,
        num_kv_heads=32,
        head_size=128,
        dtype=torch.float16,
    )

    def build_cfg(block_count):
        # Single full-attention group spanning 32 layers; only num_blocks varies.
        return KVCacheConfig(
            num_blocks=block_count,
            kv_cache_tensors=[],
            kv_cache_groups=[
                KVCacheGroupSpec([f"layer_{i}" for i in range(32)], attn_spec),
            ],
            kv_bytes_per_block=1,
        )

    small_cfg = build_cfg(1024)
    large_cfg = build_cfg(1024 * 3)

    result = get_max_concurrency_for_kv_cache_configs(aphrodite_config, [large_cfg, small_cfg])
    assert result == 1
1204+
1205+
1206+
def test_get_max_concurrency_for_kv_cache_configs_attention_free_model():
    """Check that a single config with no KV cache groups yields a result of 0."""
    aphrodite_config = AphroditeConfig(
        model_config=ModelConfig(
            "Qwen/Qwen1.5-7B",
            runner="generate",
            dtype="float16",
            max_model_len=16384,
        ),
        scheduler_config=SchedulerConfig(max_num_batched_tokens=1024, enable_chunked_prefill=True),
    )

    # No KV cache groups at all: this is the attention-free case under test.
    no_groups_cfg = KVCacheConfig(
        num_blocks=1,
        kv_cache_tensors=[],
        kv_cache_groups=[],
        kv_bytes_per_block=0,
    )

    result = get_max_concurrency_for_kv_cache_configs(aphrodite_config, [no_groups_cfg])
    assert result == 0
1227+
1228+
11591229
def test_get_kv_cache_size_tokens():
11601230
"""Test get_kv_cache_size_tokens function."""
11611231
model_id = "Qwen/Qwen1.5-7B"

0 commit comments

Comments
 (0)