Added CUDA implementation for the back-propagation

steremma · steremma · commit 7a4044d171d0 · 2018-06-30T01:38:07.000+02:00
diff --git a/tmva/tmva/inc/TMVA/DNN/CNN/MaxPoolLayer.h b/tmva/tmva/inc/TMVA/DNN/CNN/MaxPoolLayer.h
@@ -170,8 +170,13 @@ auto TMaxPoolLayer<Architecture_t>::Backward(std::vector<Matrix_t> &gradients_ba
                                              std::vector<Matrix_t> & /*inp1*/, std::vector<Matrix_t> &
                                              /*inp2*/) -> void
 {
-   Architecture_t::MaxPoolLayerBackward(gradients_backward, this->GetActivationGradients(), indexMatrix,
-                                        this->GetBatchSize(), this->GetDepth(), this->GetNLocalViews());
+   for (size_t i = 0; i < this->GetBatchSize(); i++) {
+      Architecture_t::MaxPoolLayerBackward(gradients_backward[i], this->GetActivationGradients()[i],
+                                           this->GetIndexMatrix()[i],
+                                           this->GetInputHeight(), this->GetInputWidth(),
+                                           this->GetFilterHeight(), this->GetFilterWidth(),
+                                           this->GetStrideRows(), this->GetStrideCols(), this->GetNLocalViews());
+   }
 }
 
 //______________________________________________________________________________
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh b/tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh
@@ -203,6 +203,13 @@ __device__ void ReduceSum(AFloat *result, AFloat * sdata)
    __syncthreads();
 }
 
+template<typename AFloat>
+__device__ AFloat max(AFloat x, AFloat y)
+{
+    // Ties resolve to x: y is returned only when it is strictly greater.
+    return (x < y) ? y : x;
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 /// \brief Calculate the dimension of an output volume, given the sliding parameters
 ///        and the input shape.
@@ -901,6 +908,64 @@ __global__ void Downsample(AFloat * output, AFloat * indexMatrix, const AFloat *
 
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Back-propagate the gradients through a max-pooling layer (one thread per input element).
+///
+/// \param[out] activationGradientsBackward The gradients to be written. One gradient for each neuron at the layer's input.
+/// \param[in] activationGradients The gradients coming from the next layer. One gradient for each receptive field.
+/// \param[in] indexMatrix Winning indices. One index for each receptive field.
+/// \param[in] depth The depth of the input tensor.
+/// \param[in] imgHeight The height of the input tensor.
+/// \param[in] imgWidth The width of the input tensor.
+/// \param[in] fltHeight Height of the filter.
+/// \param[in] fltWidth Width of the filter.
+/// \param[in] strideRows Stride size along the height (row) dimension.
+/// \param[in] strideCols Stride size along the width (column) dimension.
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename AFloat>
+__global__ void MaxPoolBackward(AFloat * activationGradientsBackward,
+                                const AFloat * activationGradients,
+                                const AFloat * indexMatrix,
+                                int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth,
+                                int strideRows, int strideCols)
+{
+   int slice = blockDim.y * blockIdx.y + threadIdx.y;   // row of the output (backward gradients) matrix.
+   int j = blockDim.x * blockIdx.x + threadIdx.x;       // column of the output (backward gradients) matrix.
+
+   if (slice >= depth || j >= imgHeight * imgWidth) return;
+
+   int height = calculateDimension(imgHeight, fltHeight, 0, strideRows);
+   int width = calculateDimension(imgWidth, fltWidth, 0, strideCols);
+
+   // Which gradientsBackward element should this thread write to?
+   int backRow = j % imgHeight;
+   int backCol = j / imgHeight;
+   int backPos = backCol + backRow * imgWidth;   // flat position; compared against indexMatrix entries.
+
+   // Range of output rows/cols whose window [k * stride, k * stride + flt - 1] contains this element.
+   int nextRowMin = floor((backRow - fltHeight) / (AFloat)strideRows) + 1;
+   int nextColMin = floor((backCol - fltWidth) / (AFloat)strideCols) + 1;
+   int nextRowMax = backRow / strideRows;   // also valid when the stride exceeds the filter size.
+   int nextColMax = backCol / strideCols;
+   AFloat grad = 0;
+
+   // Iterate over all output elements that were the outcome of receptive fields I was part of.
+   for (int row = nextRowMin; row <= nextRowMax; row++) {
+      for (int col = nextColMin; col <= nextColMax; col++) {
+
+         if (row >= height || col >= width || col < 0 || row < 0) continue;
+
+         int outputIndex = (row * width + col) * depth + slice;
+
+         // Was I the winning index within this receptive field?
+         if (indexMatrix[outputIndex] == backPos) {
+            grad += activationGradients[outputIndex];
+         }
+      }
+   }
+   activationGradientsBackward[backPos * depth + slice] = grad;
+}
+
 } // namespace Cuda
 } // namespace DNN
 } // namespace TMVA
diff --git a/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu b/tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu
@@ -340,14 +340,28 @@ void TCuda<AFloat>::Downsample(TCudaMatrix<AFloat> &A,
 
 //____________________________________________________________________________
 template<typename AFloat>
-void TCuda<AFloat>::MaxPoolLayerBackward(std::vector<TCudaMatrix<AFloat>> & activationGradientsBackward,
-                                         const std::vector<TCudaMatrix<AFloat>> & activationGradients,
-                                         const std::vector<TCudaMatrix<AFloat>> & indexMatrix,
-                                         size_t batchSize,
-                                         size_t depth,
+void TCuda<AFloat>::MaxPoolLayerBackward(TCudaMatrix<AFloat> & activationGradientsBackward,
+                                         const TCudaMatrix<AFloat> & activationGradients,
+                                         const TCudaMatrix<AFloat> & indexMatrix,
+                                         size_t imgHeight,
+                                         size_t imgWidth,
+                                         size_t fltHeight,
+                                         size_t fltWidth,
+                                         size_t strideRows,
+                                         size_t strideCols,
                                          size_t nLocalViews)
 {
+   size_t depth = activationGradientsBackward.GetNrows();   // the kernel's depth is the row count of the output matrix
 
+   dim3 blockDims = TDevice::BlockDims2D();   // threads per block, as provided by TDevice
+   dim3 gridDims  = TDevice::GridDims2D(activationGradientsBackward);   // grid derived from the output matrix (presumably one thread per element -- confirm in TDevice)
+   cudaStream_t s = activationGradientsBackward.GetComputeStream();   // the launch is issued on the output matrix's stream
+
+   ::TMVA::DNN::Cuda::MaxPoolBackward<<<gridDims, blockDims, 0, s>>>(activationGradientsBackward.GetDataPointer(),
+                                                                     activationGradients.GetDataPointer(),
+                                                                     indexMatrix.GetDataPointer(),
+                                                                     depth, imgHeight, imgWidth, fltHeight, fltWidth,
+                                                                     strideRows, strideCols);   // size_t values convert implicitly to the kernel's int parameters; nLocalViews is not forwarded
 }
 
 //____________________________________________________________________________
diff --git a/tmva/tmva/test/DNN/CNN/CMakeLists.txt b/tmva/tmva/test/DNN/CNN/CMakeLists.txt
@@ -16,10 +16,6 @@ set(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES})
 CUDA_ADD_EXECUTABLE(testIm2ColCuda TestIm2ColCuda.cxx)
 target_link_libraries(testIm2ColCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
 ROOT_ADD_TEST(TMVA-DNN-CNN-Im2ColCuda COMMAND testIm2ColCuda)
-    
-CUDA_ADD_EXECUTABLE(testDownsampleCuda TestDownsampleCuda.cxx)
-target_link_libraries(testDownsampleCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
-ROOT_ADD_TEST(TMVA-DNN-CNN-DownsampleCuda COMMAND testDownsampleCuda)
 
 CUDA_ADD_EXECUTABLE(testPoolingLayerCuda TestPoolingLayerCuda.cxx)
 target_link_libraries(testPoolingLayerCuda ${Libraries} ${DNN_CUDA_LIBRARIES})