From 4c1301884a7b46c272d824cc430539feb1bb84c1 Mon Sep 17 00:00:00 2001 From: caic99 Date: Mon, 15 Sep 2025 13:42:13 +0000 Subject: [PATCH 1/2] perf: fix cuda-aware mpi in v3 --- source/op/pt/comm.cc | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 71a2b0e118..94ce0558ec 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -86,7 +86,7 @@ class Border : public torch::autograd::Function { #ifdef USE_MPI int mpi_init = 0; MPI_Initialized(&mpi_init); - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; int world_size = 0; @@ -99,17 +99,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { recv_g1_tensor = torch::empty_like(g1).to(torch::kCPU); recv_g1_tensor.copy_(g1); @@ -193,9 +185,6 @@ class Border : public torch::autograd::Function { static torch::autograd::variable_list backward_t( torch::autograd::AutogradContext* ctx, torch::autograd::variable_list grad_output) { -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif torch::autograd::variable_list saved_variables = ctx->get_saved_variables(); torch::Tensor sendlist_tensor = saved_variables[0]; @@ -212,7 +201,7 @@ class Border : public torch::autograd::Function { int mpi_init = 0; MPI_Initialized(&mpi_init); int world_size = 0; - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; if (mpi_init) { @@ -224,17 +213,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU); d_local_g1_tensor.copy_(grad_output[0]); @@ -329,9 +310,6 @@ class Border : public torch::autograd::Function { recv_g1_tensor.slice(0, 0, nrecv)); } } -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (cuda_aware == 0) { From 8f46f9f24211ddbd42309e3de3c806e8de0a818d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 03:23:52 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/op/pt/comm.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 94ce0558ec..97466a4833 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -185,7 +185,6 @@ class Border : public torch::autograd::Function { static torch::autograd::variable_list backward_t( torch::autograd::AutogradContext* ctx, torch::autograd::variable_list grad_output) { - torch::autograd::variable_list saved_variables = ctx->get_saved_variables(); torch::Tensor sendlist_tensor = saved_variables[0]; torch::Tensor sendproc_tensor = saved_variables[1];