Skip to content

Commit 6b19cf8

Browse files
authored
fix: GPU memory leaks in engine shutdown for rocm (#1642)
* fix: GPU memory leaks in engine shutdown for rocm

Signed-off-by: AlpinDale <alpindale@gmail.com>

* fix: rocm dockerfile

Signed-off-by: AlpinDale <alpindale@gmail.com>

---------

Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent f80985d commit 6b19cf8

6 files changed

Lines changed: 418 additions & 30 deletions

File tree

aphrodite/utils/func_utils.py

Lines changed: 25 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -45,16 +45,14 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
4545

4646

4747
@lru_cache
48-
def supports_kw(
48+
def _supports_kw(
4949
callable: Callable[..., object],
5050
kw_name: str,
5151
*,
5252
requires_kw_only: bool = False,
5353
allow_var_kwargs: bool = True,
5454
) -> bool:
55-
"""Check if a keyword is a valid kwarg for a callable; if requires_kw_only
56-
disallows kwargs names that can also be positional arguments.
57-
"""
55+
"""Internal cached implementation of supports_kw."""
5856
params = inspect.signature(callable).parameters
5957
if not params:
6058
return False
@@ -92,6 +90,29 @@ def supports_kw(
9290
return False
9391

9492

93+
def supports_kw(
94+
callable: Callable[..., object],
95+
kw_name: str,
96+
*,
97+
requires_kw_only: bool = False,
98+
allow_var_kwargs: bool = True,
99+
) -> bool:
100+
"""Check if a keyword is a valid kwarg for a callable; if requires_kw_only
101+
disallows kwargs names that can also be positional arguments.
102+
"""
103+
# Unwrap bound methods so that the lru_cache key is the underlying
104+
# function, not the instance. Caching bound methods pins the object
105+
# (and all its GPU tensors) for the lifetime of the cache.
106+
if hasattr(callable, "__func__"):
107+
callable = callable.__func__
108+
return _supports_kw(
109+
callable,
110+
kw_name,
111+
requires_kw_only=requires_kw_only,
112+
allow_var_kwargs=allow_var_kwargs,
113+
)
114+
115+
95116
def get_allowed_kwarg_only_overrides(
96117
callable: Callable[..., object],
97118
overrides: Mapping[str, object] | None,

aphrodite/v1/engine/core.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
import gc
34
import os
45
import queue
56
import signal
@@ -524,6 +525,12 @@ def shutdown(self):
524525
if self.scheduler:
525526
self.scheduler.shutdown()
526527

528+
# Undo the gc.freeze() from __init__ so that the objects allocated
529+
# during engine startup (model weights, KV caches, etc.) become
530+
# visible to the garbage collector again. Without this, deleting
531+
# the engine in-process (e.g. unit tests) leaks GPU memory.
532+
gc.unfreeze()
533+
527534
def profile(self, is_start: bool = True, profile_prefix: str | None = None):
528535
self.model_executor.profile(is_start, profile_prefix)
529536

aphrodite/v1/worker/gpu_model_runner.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5468,6 +5468,20 @@ def _freeze_gc():
54685468
gc.unfreeze()
54695469
gc.collect()
54705470

5471+
def shutdown(self) -> None:
5472+
"""Release GPU tensors (model weights, KV caches, workspace) so that
5473+
memory is reclaimable when running in the same process."""
5474+
from aphrodite.model_executor.layers.rotary_embedding import _ROPE_DICT
5475+
from aphrodite.v1.worker.workspace import reset_workspace_manager
5476+
5477+
# Calls torch.accelerator.synchronize()
5478+
self._cleanup_profiling_kv_cache()
5479+
self.compilation_config.static_forward_context.clear()
5480+
self.model = None # type: ignore[assignment]
5481+
_ROPE_DICT.clear()
5482+
5483+
reset_workspace_manager()
5484+
54715485
def _cleanup_profiling_kv_cache(self) -> None:
54725486
torch.accelerator.synchronize()
54735487
if hasattr(self, "kv_caches") and self.kv_caches:

aphrodite/v1/worker/gpu_worker.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,11 @@ def shutdown(self) -> None:
957957
if weight_transfer_engine := getattr(self, "weight_transfer_engine", None):
958958
weight_transfer_engine.shutdown()
959959

960+
# Release GPU resources held by the model runner so that memory
961+
# can be reclaimed when running in-process
962+
if model_runner := getattr(self, "model_runner", None):
963+
model_runner.shutdown()
964+
960965
def elastic_ep_execute(self, execute_method: str, *args, **kwargs):
961966
return self.elastic_ep_executor.execute(execute_method, *args, **kwargs)
962967

0 commit comments

Comments
 (0)