fix: GPU memory leaks in engine shutdown for rocm (#1642)

AlpinDale · web-flow · commit 6b19cf84b115 · 2026-04-27T11:13:11.000+04:30
* fix: GPU memory leaks in engine shutdown for rocm

Signed-off-by: AlpinDale <alpindale@gmail.com>

* fix: rocm dockerfile

Signed-off-by: AlpinDale <alpindale@gmail.com>

---------

Signed-off-by: AlpinDale <alpindale@gmail.com>
diff --git a/aphrodite/utils/func_utils.py b/aphrodite/utils/func_utils.py
@@ -45,16 +45,14 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
 
 
 @lru_cache
-def supports_kw(
+def _supports_kw(
     callable: Callable[..., object],
     kw_name: str,
     *,
     requires_kw_only: bool = False,
     allow_var_kwargs: bool = True,
 ) -> bool:
-    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
-    disallows kwargs names that can also be positional arguments.
-    """
+    """Internal cached implementation of supports_kw."""
     params = inspect.signature(callable).parameters
     if not params:
         return False
@@ -92,6 +90,29 @@ def supports_kw(
     return False
 
 
+def supports_kw(
+    callable: Callable[..., object],
+    kw_name: str,
+    *,
+    requires_kw_only: bool = False,
+    allow_var_kwargs: bool = True,
+) -> bool:
+    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
+    disallows kwargs names that can also be positional arguments.
+    """
+    # Unwrap bound methods so that the lru_cache key is the underlying
+    # function, not the instance. Caching bound methods pins the object
+    # (and all its GPU tensors) for the lifetime of the cache.
+    if hasattr(callable, "__func__"):
+        callable = callable.__func__
+    return _supports_kw(
+        callable,
+        kw_name,
+        requires_kw_only=requires_kw_only,
+        allow_var_kwargs=allow_var_kwargs,
+    )
+
+
 def get_allowed_kwarg_only_overrides(
     callable: Callable[..., object],
     overrides: Mapping[str, object] | None,
diff --git a/aphrodite/v1/engine/core.py b/aphrodite/v1/engine/core.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
 import os
 import queue
 import signal
@@ -524,6 +525,12 @@ def shutdown(self):
         if self.scheduler:
             self.scheduler.shutdown()
 
+        # Undo the gc.freeze() from __init__ so that the objects allocated
+        # during engine startup (model weights, KV caches, etc.) become
+        # visible to the garbage collector again. Without this, deleting
+        # the engine in-process (e.g. unit tests) leaks GPU memory.
+        gc.unfreeze()
+
     def profile(self, is_start: bool = True, profile_prefix: str | None = None):
         self.model_executor.profile(is_start, profile_prefix)
 
diff --git a/aphrodite/v1/worker/gpu_model_runner.py b/aphrodite/v1/worker/gpu_model_runner.py
@@ -5468,6 +5468,20 @@ def _freeze_gc():
                 gc.unfreeze()
                 gc.collect()
 
+    def shutdown(self) -> None:
+        """Release GPU tensors (model weights, KV caches, workspace) so that
+        memory is reclaimable when running in the same process."""
+        from aphrodite.model_executor.layers.rotary_embedding import _ROPE_DICT
+        from aphrodite.v1.worker.workspace import reset_workspace_manager
+
+        # Calls torch.accelerator.synchronize()
+        self._cleanup_profiling_kv_cache()
+        self.compilation_config.static_forward_context.clear()
+        self.model = None  # type: ignore[assignment]
+        _ROPE_DICT.clear()
+
+        reset_workspace_manager()
+
     def _cleanup_profiling_kv_cache(self) -> None:
         torch.accelerator.synchronize()
         if hasattr(self, "kv_caches") and self.kv_caches:
diff --git a/aphrodite/v1/worker/gpu_worker.py b/aphrodite/v1/worker/gpu_worker.py
@@ -957,6 +957,11 @@ def shutdown(self) -> None:
         if weight_transfer_engine := getattr(self, "weight_transfer_engine", None):
             weight_transfer_engine.shutdown()
 
+        # Release GPU resources held by the model runner so that memory
+        # can be reclaimed when running in-process
+        if model_runner := getattr(self, "model_runner", None):
+            model_runner.shutdown()
+
     def elastic_ep_execute(self, execute_method: str, *args, **kwargs):
         return self.elastic_ep_executor.execute(execute_method, *args, **kwargs)
 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
diff --git a/docker/context/base-wheels/.keep b/docker/context/base-wheels/.keep