@@ -78,131 +78,6 @@ struct ConvolutionKernel2DGlobalMemory
7878 }
7979};
8080
81- /* *
82- * @brief ConvolutionKernel2DSharedMemory struct. The kernel for 2D Convolutional Filter, uses
83- tiling method. Tiles of matrix are kept in the shared memory. Block
84- dimensions are equal to tile dimensions.
85- */
86- struct ConvolutionKernel2DSharedMemory
87- {
88- // ! \tparam TAcc Accelerator type
89- // ! \tparam TElem The input-matrix and filter-matrix element type
90- // ! \param acc Accelerator
91- // ! \param input Input matrix
92- // ! \param output Output matrix
93- // ! \param matrixWidth Input matrix width
94- // ! \param matrixHeight Input matrix height
95- // ! \param filter Filter-matrix
96- // ! \param filterWidth Filter-matrix width
97- // ! \param intputWidthAllocated Input-matrix width allocated (possibly larger than normal width due to paddding
98- // ! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than normal width due to paddding
99-
100- template <typename TAcc, typename TElem>
101- ALPAKA_FN_ACC auto operator ()(
102- TAcc const & acc,
103- TElem const * const input,
104- TElem* output,
105- int32_t const matrixWidth,
106- int32_t const matrixHeight,
107- TElem const * const filter,
108- int32_t const filterWidth,
109- int32_t const intputWidthAllocated,
110- int32_t const filterWidthAllocated) const -> void
111- {
112- auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
113- // Get extents(dimensions)
114- auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc);
115- auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
116- // Get indexes
117- auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
118- auto const blockThreadIdx1D = alpaka::mapIdx<1u >(blockThreadIdx, blockThreadExtent)[0u ];
119- // Get elements from 2-element arrays
120- auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
121- auto const [blockThreadY, blockThreadX] = blockThreadIdx;
122- auto const [gridBlockExtentY, gridBlockExtentX] = gridBlockExtent;
123- // Allocate shared memory
124- auto * const sharedN = alpaka::getDynSharedMem<TElem>(acc);
125- // Fill shared memory of device so that tile items are accessed from shared memory
126- if (row < matrixHeight && col < matrixWidth && blockThreadIdx1D < blockThreadExtent.prod ())
127- {
128- sharedN[blockThreadIdx1D] = input[row * intputWidthAllocated + col];
129- }
130- else if (blockThreadIdx1D < blockThreadExtent.prod ())
131- {
132- sharedN[blockThreadIdx1D] = 0 .0f ;
133- }
134-
135- // Wait for the block fills the shared memory with the tile of the main matrix
136- alpaka::syncBlockThreads (acc);
137-
138- if (col < matrixWidth && row < matrixHeight)
139- {
140- TElem pValue{0 .0f };
141- for (int32_t fRow = 0 ; fRow < filterWidth; fRow ++)
142- {
143- for (int32_t fCol = 0 ; fCol < filterWidth; fCol ++)
144- {
145- // Position of input matrix element to be multiplied with the corresponding element at the filter.
146- // The position is with respect to tile(block)
147- auto const exactRowBlock = static_cast <int32_t >(blockThreadY) - filterWidth / 2 + fRow ;
148- auto const exactColBlock = static_cast <int32_t >(blockThreadX) - filterWidth / 2 + fCol ;
149- if (exactColBlock >= 0 && exactColBlock < blockThreadExtentX && exactRowBlock >= 0
150- && exactRowBlock < blockThreadExtentY)
151- {
152- // The element is inside the tile. Get the element from the shared memory
153- pValue += filter[fRow * filterWidthAllocated + fCol ]
154- * sharedN[exactRowBlock * blockThreadExtentX + exactColBlock];
155- }
156- else
157- { // The element is not in the tile(block)
158- // Position of input matrix element to be multiplied with the corresponding element at the
159- // filter. The position is with respect to the input matrix
160- auto const exactRow = static_cast <int32_t >(row) - filterWidth / 2 + fRow ;
161- auto const exactCol = static_cast <int32_t >(col) - filterWidth / 2 + fCol ;
162- if (exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
163- {
164- // get the item from the global memory, use padded width!
165- pValue += filter[fRow * filterWidthAllocated + fCol ]
166- * input[exactRow * intputWidthAllocated + exactCol];
167- }
168- }
169- }
170- }
171- output[row * matrixWidth + col] = pValue;
172- } // if
173- }
174- };
175-
176- // The specialisation used for calculation of dynamic shared memory size
177- namespace alpaka ::trait
178- {
179- // ! The trait for getting the size of the block shared dynamic memory for a kernel.
180- template <typename TAcc>
181- struct BlockSharedMemDynSizeBytes <ConvolutionKernel2DSharedMemory, TAcc>
182- {
183- // ! \tparam TVec type for extent array
184- // ! \tparam TElem element type of the matrix
185- // ! \return The size of the shared memory allocated for a block.
186- template <typename TVec, typename TElem>
187- ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes (
188- ConvolutionKernel2DSharedMemory const & /* matMulKernel */ ,
189- TVec const & blockThreadExtent, // dimensions of thread per block
190- TVec const & threadElemExtent, // dimensions of elements per thread
191- TElem const * const , // input Matrix
192- TElem*, // output array
193- int32_t const , // matrixWidth
194- int32_t const , // matrixHeight
195- TElem const * const , // filter
196- int32_t const , // filter width
197- int32_t const , // allocated input width
198- int32_t const ) // allocated filter width
199- {
200- // Reserve the buffer, buffers size is the number of elements in a block (tile)
201- return static_cast <std::size_t >(blockThreadExtent.prod () * threadElemExtent.prod ()) * sizeof (TElem);
202- }
203- };
204- } // namespace alpaka::trait
205-
20681auto FuzzyEqual (float a, float b) -> bool
20782{
20883 return std::fabs (a - b) < std::numeric_limits<float >::epsilon () * 1000 .0f ;
@@ -301,9 +176,7 @@ auto example(TAccTag const&) -> int
301176 return static_cast <Idx>(rowPitchFilter / sizeof (DataType));
302177 }();
303178
304- // Construct kernel object, choose one of the kernels provided: ConvolutionKernel2DGlobalMemory or
305- // ConvolutionKernel2DSharedMemory
306- ConvolutionKernel2DSharedMemory convolutionKernel2D;
179+ ConvolutionKernel2DGlobalMemory convolutionKernel2D;
307180
308181 alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones ()};
309182
0 commit comments