Commit c6ca671

fix(pt): recognize AOTInductor-wrapped CUDA OOM in AutoBatchSize (#5418)
When running `dp --pt-expt test` (or any path that goes through `deepmd.pt_expt.infer.deep_eval`) against a `.pt2` AOTInductor package, `AutoBatchSize` doubles the batch on every success. For models with a large `sel` the exploration eventually saturates GPU memory, and the CUDA caching allocator raises the usual `CUDA out of memory` from inside the AOTInductor runtime. AOTInductor then rewraps that error as a generic

    RuntimeError: run_func_(...) API call failed at .../aoti_runner/model_container_runner.cpp, line 144

The original "CUDA out of memory" text is printed only to stderr, so the old `is_oom_error` -- which keyed on a short list of substrings in `e.args[0]` -- never matched. `execute()` therefore did not shrink the batch; the exception propagated and the run crashed on a GPU that was otherwise completely idle (as confirmed by monitoring `nvidia-smi --query-compute-apps`, which showed dp itself as the sole consumer holding tens of GiB just before the failure).

Widen `is_oom_error` to:

* walk the exception chain via `__cause__` / `__context__`, so that a future PyTorch preserving the original OOM text is handled for free;
* keep matching the four plain CUDA OOM markers on every message in the chain;
* additionally treat the AOTInductor wrapper signature (`run_func_(` plus `model_container_runner`) as an OOM candidate.

If the AOTInductor wrapper ever hides a non-OOM failure, the batch shrinker will halve down to 1 and then raise `OutOfMemoryError`, so the fallback is bounded -- non-OOM bugs still surface with a clear terminal error rather than being silently retried forever.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Bug Fixes**
  * Improved out-of-memory detection to catch more CUDA memory exhaustion scenarios, including wrapped/instrumented failures; now reliably clears CUDA cached memory when OOM conditions are identified to reduce cascading failures.

* **Tests**
  * Added unit tests validating OOM detection across varied exception shapes and confirming CUDA cache clearing is invoked for detected OOMs.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent d14233e commit c6ca671

2 files changed

Lines changed: 81 additions & 14 deletions


deepmd/pt/utils/auto_batch_size.py

Lines changed: 48 additions & 14 deletions
@@ -49,20 +49,54 @@ def is_oom_error(self, e: Exception) -> bool:
         e : Exception
             Exception
         """
-        # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error,
-        # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924
-        # (the meaningless error message should be considered as a bug in cusolver)
-        if (
-            isinstance(e, RuntimeError)
-            and (
-                "CUDA out of memory." in e.args[0]
-                or "CUDA driver error: out of memory" in e.args[0]
-                or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0]
-                # https://github.com/deepmodeling/deepmd-kit/issues/4594
-                or "CUDA error: out of memory" in e.args[0]
-            )
-        ) or isinstance(e, torch.cuda.OutOfMemoryError):
-            # Release all unoccupied cached memory
+        if isinstance(e, torch.cuda.OutOfMemoryError):
             torch.cuda.empty_cache()
             return True
+
+        if not isinstance(e, RuntimeError):
+            return False
+
+        # Gather messages from the exception itself and its chain. AOTInductor
+        # (.pt2) sometimes strips the underlying OOM message when rewrapping,
+        # but not always; checking ``__cause__`` / ``__context__`` catches the
+        # remaining cases when the original error is preserved.
+        msgs: list[str] = []
+        cur: BaseException | None = e
+        seen: set[int] = set()
+        while cur is not None and id(cur) not in seen:
+            seen.add(id(cur))
+            if cur.args:
+                first = cur.args[0]
+                if isinstance(first, str):
+                    msgs.append(first)
+            cur = cur.__cause__ or cur.__context__
+
+        # Several sources treat CUSOLVER_STATUS_INTERNAL_ERROR as an OOM, e.g.
+        # https://github.com/JuliaGPU/CUDA.jl/issues/1924
+        # https://github.com/deepmodeling/deepmd-kit/issues/4594
+        plain_oom_markers = (
+            "CUDA out of memory.",
+            "CUDA driver error: out of memory",
+            "CUDA error: out of memory",
+            "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR",
+        )
+        if any(m in msg for msg in msgs for m in plain_oom_markers):
+            torch.cuda.empty_cache()
+            return True
+
+        # AOTInductor (.pt2) wraps the underlying CUDA OOM as a generic
+        # ``run_func_(...) API call failed at .../model_container_runner.cpp``.
+        # The original "CUDA out of memory" text is printed to stderr only and
+        # is absent from the Python-level RuntimeError, so we match on the
+        # wrapper signature. If the root cause turns out to be something
+        # other than OOM, ``execute()`` will keep shrinking the batch and
+        # eventually raise ``OutOfMemoryError`` at batch size 1, which is a
+        # clean failure rather than an uncaught exception.
+        aoti_wrapped = any(
+            "run_func_(" in msg and "model_container_runner" in msg for msg in msgs
+        )
+        if aoti_wrapped:
+            torch.cuda.empty_cache()
+            return True
+
         return False
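
For illustration, a minimal sketch (not part of the commit) of how the widened check above is expected to classify the exception shapes described in the commit message. It assumes deepmd-kit with this change installed; the wrapped-error path is made up, and `AutoBatchSize(256, 2.0)` mirrors the constructor call used in the tests below.

```python
# Sketch only: exercise the widened is_oom_error with three exception shapes.
# No GPU is needed -- torch.cuda.empty_cache() is a no-op when CUDA has not
# been initialized.
import torch

from deepmd.pt.utils.auto_batch_size import AutoBatchSize

auto_batch_size = AutoBatchSize(256, 2.0)

# 1. Plain allocator OOM raised by PyTorch itself.
plain = torch.cuda.OutOfMemoryError("CUDA out of memory.")

# 2. AOTInductor (.pt2) rewrap: only the wrapper signature survives in the
#    Python-level RuntimeError (the path below is illustrative).
wrapped = RuntimeError(
    "run_func_(...) API call failed at "
    "/some/build/aoti_runner/model_container_runner.cpp, line 144"
)

# 3. A bare RuntimeError whose __cause__ still carries the original OOM text.
chained = RuntimeError()
chained.__cause__ = RuntimeError("CUDA driver error: out of memory")

for exc in (plain, wrapped, chained):
    print(type(exc).__name__, auto_batch_size.is_oom_error(exc))  # expect True for all
```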

source/tests/pt/test_auto_batch_size.py

Lines changed: 33 additions & 0 deletions
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import unittest
+from unittest import (
+    mock,
+)
 
 import numpy as np
 
@@ -9,6 +12,36 @@
 
 
 class TestAutoBatchSize(unittest.TestCase):
+    @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
+    def test_is_oom_error_cuda_message(self, empty_cache) -> None:
+        auto_batch_size = AutoBatchSize(256, 2.0)
+
+        self.assertTrue(
+            auto_batch_size.is_oom_error(RuntimeError("CUDA out of memory."))
+        )
+        empty_cache.assert_called_once()
+
+    @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
+    def test_is_oom_error_empty_runtime_error_from_cuda_oom(self, empty_cache) -> None:
+        auto_batch_size = AutoBatchSize(256, 2.0)
+        cause = RuntimeError("CUDA driver error: out of memory")
+        error = RuntimeError()
+        error.__cause__ = cause
+
+        self.assertTrue(auto_batch_size.is_oom_error(error))
+        empty_cache.assert_called_once()
+
+    @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
+    def test_is_oom_error_aoti_wrapper(self, empty_cache) -> None:
+        auto_batch_size = AutoBatchSize(256, 2.0)
+        error = RuntimeError(
+            "run_func_(...) API call failed at "
+            "/tmp/torchinductor/model_container_runner.cpp"
+        )
+
+        self.assertTrue(auto_batch_size.is_oom_error(error))
+        empty_cache.assert_called_once()
+
     def test_execute_all(self) -> None:
         dd0 = np.zeros((10000, 2, 1, 3, 4))
         dd1 = np.ones((10000, 2, 1, 3, 4))
