diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 71a2b0e118..97466a4833 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -86,7 +86,7 @@ class Border : public torch::autograd::Function { #ifdef USE_MPI int mpi_init = 0; MPI_Initialized(&mpi_init); - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; int world_size = 0; @@ -99,17 +99,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { recv_g1_tensor = torch::empty_like(g1).to(torch::kCPU); recv_g1_tensor.copy_(g1); @@ -193,10 +185,6 @@ class Border : public torch::autograd::Function { static torch::autograd::variable_list backward_t( torch::autograd::AutogradContext* ctx, torch::autograd::variable_list grad_output) { -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif - torch::autograd::variable_list saved_variables = ctx->get_saved_variables(); torch::Tensor sendlist_tensor = saved_variables[0]; torch::Tensor sendproc_tensor = saved_variables[1]; @@ -212,7 +200,7 @@ class Border : public torch::autograd::Function { int mpi_init = 0; MPI_Initialized(&mpi_init); int world_size = 0; - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; if (mpi_init) { @@ -224,17 +212,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU); d_local_g1_tensor.copy_(grad_output[0]); @@ -329,9 +309,6 @@ class Border : public torch::autograd::Function { recv_g1_tensor.slice(0, 0, nrecv)); } } -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (cuda_aware == 0) {