Skip to content

Commit 7a4044d

Browse files
committed
Added CUDA implementation for the back-propagation
1 parent 54af45e commit 7a4044d

4 files changed

Lines changed: 91 additions & 11 deletions

File tree

tmva/tmva/inc/TMVA/DNN/CNN/MaxPoolLayer.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,13 @@ auto TMaxPoolLayer<Architecture_t>::Backward(std::vector<Matrix_t> &gradients_ba
170170
std::vector<Matrix_t> & /*inp1*/, std::vector<Matrix_t> &
171171
/*inp2*/) -> void
172172
{
173-
Architecture_t::MaxPoolLayerBackward(gradients_backward, this->GetActivationGradients(), indexMatrix,
174-
this->GetBatchSize(), this->GetDepth(), this->GetNLocalViews());
173+
for (size_t i = 0; i < this->GetBatchSize(); i++) {
174+
Architecture_t::MaxPoolLayerBackward(gradients_backward[i], this->GetActivationGradients()[i],
175+
this->GetIndexMatrix()[i],
176+
this->GetInputHeight(), this->GetInputWidth(),
177+
this->GetFilterHeight(), this->GetFilterWidth(),
178+
this->GetStrideRows(), this->GetStrideCols(), this->GetNLocalViews());
179+
}
175180
}
176181

177182
//______________________________________________________________________________

tmva/tmva/src/DNN/Architectures/Cuda/Kernels.cuh

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,13 @@ __device__ void ReduceSum(AFloat *result, AFloat * sdata)
203203
__syncthreads();
204204
}
205205

206+
template<typename AFloat>
207+
__device__ AFloat max(AFloat x, AFloat y)
208+
{
209+
if (x < y) return y;
210+
return x;
211+
}
212+
206213
////////////////////////////////////////////////////////////////////////////////////
207214
/// \brief Calculate the dimension of an output volume, given the sliding parameters
208215
/// and the input shape.
@@ -901,6 +908,64 @@ __global__ void Downsample(AFloat * output, AFloat * indexMatrix, const AFloat *
901908

902909
}
903910

911+
/////////////////////////////////////////////////////////////////////////////////////////////////
912+
/// \brief Back-propagate the gradients through a max-pooling layer.
913+
///
914+
/// \param[out] gradientsBackward The gradients to be written. One gradient for each neuron at the layers's input.
915+
/// \param[in] gradients The gradients coming from the next layer. One gradient for each receptive field.
916+
/// \param[in] indexMatrix Winning indices. One index for each receptive field.
917+
/// \param[in] depth The depth of the input tensor.
918+
/// \param[in] imgHeight The height of the input tensor.
919+
/// \param[in] imgWidth The output of the input tensor
920+
/// \param[in] fltHeight Height of the filter.
921+
/// \param[in] fltWidth Width of the filter.
922+
/// \param[in] strideRows stride size in the horizontal dimension.
923+
/// \param[in] strideCols stride size in the vertical dimension.
924+
/////////////////////////////////////////////////////////////////////////////////////////////////
925+
template<typename AFloat>
926+
__global__ void MaxPoolBackward(AFloat * activationGradientsBackward,
927+
const AFloat * activationGradients,
928+
const AFloat * indexMatrix,
929+
int depth, int imgHeight, int imgWidth, int fltHeight, int fltWidth,
930+
int strideRows, int strideCols)
931+
{
932+
int slice = blockDim.y * blockIdx.y + threadIdx.y; // row of the gradientsBackward matrix.
933+
int j = blockDim.x * blockIdx.x + threadIdx.x; // column of the gradientsBackward matrix.
934+
935+
if (slice >= depth || j >= imgHeight * imgWidth) return;
936+
937+
int height = calculateDimension(imgHeight, fltHeight, 0, strideRows);
938+
int width = calculateDimension(imgWidth, fltWidth, 0, strideCols);
939+
940+
// Which gradientsBackward element should this thread write to?
941+
int backRow = j % imgHeight;
942+
int backCol = j / imgHeight;
943+
int backIndex = (backCol + backRow * imgWidth) * depth + slice;
944+
945+
// Which gradient and indexMatrix elements should this thread read?
946+
int nextRowMin = floor((backRow - fltHeight) / (AFloat)strideRows) + 1;
947+
int nextColMin = floor((backCol - fltWidth) / (AFloat)strideCols) + 1;
948+
949+
int outputIndex = 0;
950+
AFloat grad = 0;
951+
952+
// Iterate over all output elements that were the outcome of receptive fields I was part of.
953+
for (int row = nextRowMin; row <= nextRowMin + fltHeight - strideRows; row++) {
954+
for (int col = nextColMin; col <= nextColMin + fltWidth - strideCols; col++) {
955+
956+
if (row >= height || col >= width || col < 0 || row < 0) continue;
957+
958+
outputIndex = (row * width + col) * depth + slice;
959+
960+
// Was I the winning index within this receptive field?
961+
if (indexMatrix[outputIndex] == backCol + backRow * imgWidth) {
962+
grad += activationGradients[outputIndex];
963+
}
964+
}
965+
}
966+
activationGradientsBackward[(backCol + backRow * imgWidth) * depth + slice] = grad;
967+
}
968+
904969
} // namespace Cuda
905970
} // namespace DNN
906971
} // namespace TMVA

tmva/tmva/src/DNN/Architectures/Cuda/Propagation.cu

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -340,14 +340,28 @@ void TCuda<AFloat>::Downsample(TCudaMatrix<AFloat> &A,
340340

341341
//____________________________________________________________________________
342342
template<typename AFloat>
343-
void TCuda<AFloat>::MaxPoolLayerBackward(std::vector<TCudaMatrix<AFloat>> & activationGradientsBackward,
344-
const std::vector<TCudaMatrix<AFloat>> & activationGradients,
345-
const std::vector<TCudaMatrix<AFloat>> & indexMatrix,
346-
size_t batchSize,
347-
size_t depth,
343+
void TCuda<AFloat>::MaxPoolLayerBackward(TCudaMatrix<AFloat> & activationGradientsBackward,
344+
const TCudaMatrix<AFloat> & activationGradients,
345+
const TCudaMatrix<AFloat> & indexMatrix,
346+
size_t imgHeight,
347+
size_t imgWidth,
348+
size_t fltHeight,
349+
size_t fltWidth,
350+
size_t strideRows,
351+
size_t strideCols,
348352
size_t nLocalViews)
349353
{
354+
size_t depth = activationGradientsBackward.GetNrows();
350355

356+
dim3 blockDims = TDevice::BlockDims2D();
357+
dim3 gridDims = TDevice::GridDims2D(activationGradientsBackward);
358+
cudaStream_t s = activationGradientsBackward.GetComputeStream();
359+
360+
::TMVA::DNN::Cuda::MaxPoolBackward<<<gridDims, blockDims, 0, s>>>(activationGradientsBackward.GetDataPointer(),
361+
activationGradients.GetDataPointer(),
362+
indexMatrix.GetDataPointer(),
363+
depth, imgHeight, imgWidth, fltHeight, fltWidth,
364+
strideRows, strideCols);
351365
}
352366

353367
//____________________________________________________________________________

tmva/tmva/test/DNN/CNN/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@ set(DNN_CUDA_LIBRARIES dnn_cuda ${CUDA_CUBLAS_LIBRARIES})
1616
CUDA_ADD_EXECUTABLE(testIm2ColCuda TestIm2ColCuda.cxx)
1717
target_link_libraries(testIm2ColCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
1818
ROOT_ADD_TEST(TMVA-DNN-CNN-Im2ColCuda COMMAND testIm2ColCuda)
19-
20-
CUDA_ADD_EXECUTABLE(testDownsampleCuda TestDownsampleCuda.cxx)
21-
target_link_libraries(testDownsampleCuda ${Libraries} ${DNN_CUDA_LIBRARIES})
22-
ROOT_ADD_TEST(TMVA-DNN-CNN-DownsampleCuda COMMAND testDownsampleCuda)
2319

2420
CUDA_ADD_EXECUTABLE(testPoolingLayerCuda TestPoolingLayerCuda.cxx)
2521
target_link_libraries(testPoolingLayerCuda ${Libraries} ${DNN_CUDA_LIBRARIES})

0 commit comments

Comments
 (0)