diff --git a/deepmd/pt/utils/auto_batch_size.py b/deepmd/pt/utils/auto_batch_size.py
index 306d722fad..b79e7fb142 100644
--- a/deepmd/pt/utils/auto_batch_size.py
+++ b/deepmd/pt/utils/auto_batch_size.py
@@ -78,6 +78,7 @@ def is_oom_error(self, e: Exception) -> bool:
             "CUDA out of memory.",
             "CUDA driver error: out of memory",
             "CUDA error: out of memory",
+            "CUBLAS_STATUS_ALLOC_FAILED",
             "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR",
         )
         if any(m in msg for msg in msgs for m in plain_oom_markers):
diff --git a/source/tests/pt/test_auto_batch_size.py b/source/tests/pt/test_auto_batch_size.py
index e7bb69b62e..75bd55534c 100644
--- a/source/tests/pt/test_auto_batch_size.py
+++ b/source/tests/pt/test_auto_batch_size.py
@@ -21,6 +21,20 @@ def test_is_oom_error_cuda_message(self, empty_cache) -> None:
         )
         empty_cache.assert_called_once()
 
+    @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
+    def test_is_oom_error_cublas_alloc_failed(self, empty_cache) -> None:
+        auto_batch_size = AutoBatchSize(256, 2.0)
+
+        self.assertTrue(
+            auto_batch_size.is_oom_error(
+                RuntimeError(
+                    "CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling "
+                    "`cublasCreate(handle)`"
+                )
+            )
+        )
+        empty_cache.assert_called_once()
+
     @mock.patch("deepmd.pt.utils.auto_batch_size.torch.cuda.empty_cache")
     def test_is_oom_error_empty_runtime_error_from_cuda_oom(self, empty_cache) -> None:
         auto_batch_size = AutoBatchSize(256, 2.0)