Skip to content

Commit 7ccb11d

Browse files
committed
fix(pt): Treat cuBLAS allocation failures as PyTorch OOM during auto batch sizing
PyTorch inference can raise a `CUBLAS_STATUS_ALLOC_FAILED` error after an oversized batch attempt, especially during cuBLAS handle creation (`cublasCreate(handle)`). Previously this was treated as a generic RuntimeError, so auto batch sizing stopped after the first batch size reduction instead of continuing to shrink the inference batch. Add this cuBLAS allocation failure to the PyTorch auto-batch OOM markers and cover it with a unit test, allowing auto batch sizing to continue retrying with smaller batch sizes.
1 parent 57f870f commit 7ccb11d

2 files changed

Lines changed: 15 additions & 0 deletions

File tree

deepmd/pt/utils/auto_batch_size.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def is_oom_error(self, e: Exception) -> bool:
7878
"CUDA out of memory.",
7979
"CUDA driver error: out of memory",
8080
"CUDA error: out of memory",
81+
"CUBLAS_STATUS_ALLOC_FAILED",
8182
"cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR",
8283
)
8384
if any(m in msg for msg in msgs for m in plain_oom_markers):

source/tests/pt/test_auto_batch_size.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ def test_is_oom_error_cuda_message(self, empty_cache) -> None:
2121
)
2222
empty_cache.assert_called_once()
2323

24+
@mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
25+
def test_is_oom_error_cublas_alloc_failed(self, empty_cache) -> None:
26+
auto_batch_size = AutoBatchSize(256, 2.0)
27+
28+
self.assertTrue(
29+
auto_batch_size.is_oom_error(
30+
RuntimeError(
31+
"CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling "
32+
"`cublasCreate(handle)`"
33+
)
34+
)
35+
)
36+
empty_cache.assert_called_once()
37+
2438
@mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
2539
def test_is_oom_error_empty_runtime_error_from_cuda_oom(self, empty_cache) -> None:
2640
auto_batch_size = AutoBatchSize(256, 2.0)

0 commit comments

Comments
 (0)