Skip to content

Commit 448f0f0

Browse files
authored
[cli] add CLI args for kv cache offloading (#1588)
Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent 0f9b28a commit 448f0f0

4 files changed

Lines changed: 117 additions & 1 deletion

File tree

aphrodite/config/aphrodite.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,42 @@ def with_hf_config(
274274

275275
return replace(self, model_config=model_config)
276276

277+
def _post_init_kv_transfer_config(self) -> None:
    """Update KVTransferConfig based on top-level configs in AphroditeConfig.

    Right now, this function reads the offloading settings from
    CacheConfig and configures the KVTransferConfig accordingly. It is a
    no-op when ``cache_config.kv_offloading_backend`` is None.

    All validation happens before anything is mutated, so an invalid
    configuration never leaves a half-populated KVTransferConfig behind.

    Raises:
        ValueError: If ``kv_offloading_backend`` is set but
            ``kv_offloading_size`` is missing or not positive, or if the
            backend is not one of the supported values.
    """
    kv_offloading_backend = self.cache_config.kv_offloading_backend
    if kv_offloading_backend is None:
        return

    kv_offloading_size = self.cache_config.kv_offloading_size
    if kv_offloading_size is None:
        raise ValueError("You must set kv_offloading_size when kv_offloading_backend is set.")
    if kv_offloading_size <= 0:
        raise ValueError(f"kv_offloading_size must be a positive number of GiB, got {kv_offloading_size}.")
    if kv_offloading_backend not in ("native", "lmcache"):
        raise ValueError(
            f"Unsupported kv_offloading_backend: {kv_offloading_backend!r}. Expected 'native' or 'lmcache'."
        )

    # If no KVTransferConfig is provided, create a default one.
    if self.kv_transfer_config is None:
        self.kv_transfer_config = KVTransferConfig()
    kv_transfer_config = self.kv_transfer_config

    # kv_offloading_size is the total budget; it is split evenly across
    # every (tensor-parallel x pipeline-parallel) rank.
    num_kv_ranks = self.parallel_config.tensor_parallel_size * self.parallel_config.pipeline_parallel_size

    if kv_offloading_backend == "native":
        kv_transfer_config.kv_connector = "OffloadingConnector"
        # The native connector takes its budget in bytes (size is in GiB).
        kv_bytes_per_rank = kv_offloading_size * (1 << 30) / num_kv_ranks

        # NOTE: the actual calculation for num_cpu_blocks should be
        # done after the model's KV cache is initialized
        # Merge into the existing extra config so user-supplied keys survive.
        kv_transfer_config.kv_connector_extra_config.update(
            {"kv_bytes_per_rank": kv_bytes_per_rank, "num_cpu_blocks": 0}
        )
    else:  # "lmcache"
        kv_transfer_config.kv_connector = "LMCacheConnectorV1"
        # LMCache takes its per-rank budget in the same unit as the input (GiB).
        kv_gb_per_rank = kv_offloading_size / num_kv_ranks
        # Unlike the native backend, this replaces any existing extra config.
        kv_transfer_config.kv_connector_extra_config = {
            "lmcache.local_cpu": True,
            "lmcache.max_local_cpu_size": kv_gb_per_rank,
        }

    # This is the same for all backends
    kv_transfer_config.kv_role = "kv_both"
312+
277313
def __post_init__(self):
278314
"""Verify configs are valid & consistent with each other."""
279315

@@ -613,6 +649,9 @@ def has_blocked_weights():
613649
if "-quant_fp8" not in custom_ops:
614650
custom_ops.append("+quant_fp8")
615651

652+
# Handle the KV connector configs
653+
self._post_init_kv_transfer_config()
654+
616655
def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list:
617656
# remove the sizes that not multiple of tp_size when
618657
# enable sequence parallelism

aphrodite/config/cache.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
CacheDType = Literal["auto", "bfloat16", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
2222
MambaDType = Literal["auto", "float32"]
2323
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
24+
KVOffloadingBackend = Literal["native", "lmcache"]
2425

2526

2627
@config
@@ -125,6 +126,17 @@ class CacheConfig:
125126
gpu_memory_utilization. Note that kv_cache_memory_bytes
126127
(when not-None) ignores gpu_memory_utilization"""
127128

129+
kv_offloading_size: float | None = None
130+
"""Size of the KV cache offloading buffer in GiB. When TP or PP > 1, this is
131+
the total buffer size summed across all TP and PP ranks. By default, this is set
132+
to None, which means no KV offloading is enabled. When set with
133+
kv_offloading_backend, Aphrodite will enable KV cache offloading to CPU"""
134+
135+
kv_offloading_backend: KVOffloadingBackend | None = None
136+
"""The backend to use for KV cache offloading. Supported backends include
137+
'native' (Aphrodite native CPU offloading) and 'lmcache'. This option must be used
138+
together with kv_offloading_size."""
139+
128140
def compute_hash(self) -> str:
129141
"""
130142
WARNING: Whenever a new field is added to this config,

aphrodite/engine/args_tools.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
StructuredOutputsConfig,
4141
get_attr_docs,
4242
)
43-
from aphrodite.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo
43+
from aphrodite.config.cache import BlockSize, CacheDType, KVOffloadingBackend, MambaDType, PrefixCachingHashAlgo
4444
from aphrodite.config.device import Device
4545
from aphrodite.config.model import (
4646
ConvertOption,
@@ -484,6 +484,9 @@ class EngineArgs:
484484

485485
kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill
486486

487+
kv_offloading_size: float | None = CacheConfig.kv_offloading_size
488+
kv_offloading_backend: KVOffloadingBackend | None = CacheConfig.kv_offloading_backend
489+
487490
single_user_mode: bool = SchedulerConfig.single_user_mode
488491

489492
# Token Throttling
@@ -761,6 +764,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
761764
cache_group.add_argument("--mamba-cache-dtype", **cache_kwargs["mamba_cache_dtype"])
762765
cache_group.add_argument("--mamba-ssm-cache-dtype", **cache_kwargs["mamba_ssm_cache_dtype"])
763766
cache_group.add_argument("--mamba-block-size", **cache_kwargs["mamba_block_size"])
767+
cache_group.add_argument("--kv-offloading-size", **cache_kwargs["kv_offloading_size"])
768+
cache_group.add_argument("--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"])
764769

765770
# Multimodal related configs
766771
multimodal_kwargs = get_kwargs(MultiModalConfig)
@@ -1187,6 +1192,8 @@ def create_engine_config(
11871192
mamba_cache_dtype=self.mamba_cache_dtype,
11881193
mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype,
11891194
mamba_block_size=self.mamba_block_size,
1195+
kv_offloading_size=self.kv_offloading_size,
1196+
kv_offloading_backend=self.kv_offloading_backend,
11901197
)
11911198

11921199
ray_runtime_env = None
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""Tests for KV cache offloading configuration."""
2+
3+
import pytest
4+
5+
from aphrodite.config import AphroditeConfig, CacheConfig, KVTransferConfig, ParallelConfig
6+
7+
pytestmark = pytest.mark.cpu_test
8+
9+
10+
@pytest.mark.parametrize(
    "kv_offloading_backend,kv_offloading_size,tp,pp,expected_backend,expected_bytes",
    [
        # Single rank: the whole budget goes to that rank.
        ("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)),
        # Four ranks (TP=2, PP=2): 8.0 GiB / 4 = 2.0 GiB per rank, in bytes.
        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4),
        # LMCache expects its per-rank size in GiB rather than bytes.
        ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
        # Four ranks (TP=2, PP=2): 8.0 GiB / 4 = 2.0 GiB per rank.
        ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
        # Offloading disabled: no KV transfer config should be created.
        (None, None, 1, 1, None, None),
    ],
)
def test_kv_connector(kv_offloading_backend, kv_offloading_size, tp, pp, expected_backend, expected_bytes):
    """Check that CacheConfig offloading settings are turned into a KVTransferConfig.

    For every enabled case a KVTransferConfig carrying one pre-existing extra
    key is passed in, so the test can tell whether the backend merges into the
    existing extra config (native) or replaces it (lmcache).
    """
    offloading_enabled = expected_backend is not None

    initial_transfer_config = None
    if offloading_enabled:
        initial_transfer_config = KVTransferConfig(kv_connector_extra_config={"existing_key": "existing_value"})

    config = AphroditeConfig(
        cache_config=CacheConfig(
            kv_offloading_backend=kv_offloading_backend,
            kv_offloading_size=kv_offloading_size,
        ),
        kv_transfer_config=initial_transfer_config,
        parallel_config=ParallelConfig(tensor_parallel_size=tp, pipeline_parallel_size=pp),
    )
    resolved = config.kv_transfer_config

    # With offloading disabled, the config must be left without a KV transfer config.
    if not offloading_enabled:
        assert resolved is None
        return

    extra_config = resolved.kv_connector_extra_config

    # Connector name and role are checked for every enabled backend.
    assert resolved.kv_connector == expected_backend
    assert resolved.kv_role == "kv_both"

    if kv_offloading_backend == "lmcache":
        assert extra_config["lmcache.local_cpu"] is True
        assert extra_config["lmcache.max_local_cpu_size"] == expected_bytes
        # Existing config should be replaced
        assert "existing_key" not in extra_config
    elif kv_offloading_backend == "native":
        assert extra_config["kv_bytes_per_rank"] == expected_bytes
        assert extra_config["num_cpu_blocks"] == 0
        # Existing config should be preserved
        assert extra_config["existing_key"] == "existing_value"

0 commit comments

Comments
 (0)