Skip to content

Commit fb2ecda

Browse files
fix broken example convolution 2D
Remove kernel with shared memory. - The kernel was creating shared memory with the size of the global mesh. - Shared memory was not fully initialized if more than one thread block was used. - The global 2D convolution kernel was not used and was therefore dead code. Switch to the global convolution kernel.
1 parent b62a91a commit fb2ecda

1 file changed

Lines changed: 1 addition & 128 deletions

File tree

example/convolution2D/src/convolution2D.cpp

Lines changed: 1 addition & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -78,131 +78,6 @@ struct ConvolutionKernel2DGlobalMemory
7878
}
7979
};
8080

81-
/**
82-
* @brief ConvolutionKernel2DSharedMemory struct. The kernel for 2D Convolutional Filter, uses
83-
tiling method. Tiles of matrix are kept in the shared memory. Block
84-
dimensions are equal to tile dimensions.
85-
*/
86-
struct ConvolutionKernel2DSharedMemory
87-
{
88-
//! \tparam TAcc Accelerator type
89-
//! \tparam TElem The input-matrix and filter-matrix element type
90-
//! \param acc Accelerator
91-
//! \param input Input matrix
92-
//! \param output Output matrix
93-
//! \param matrixWidth Input matrix width
94-
//! \param matrixHeight Input matrix height
95-
//! \param filter Filter-matrix
96-
//! \param filterWidth Filter-matrix width
97-
//! \param intputWidthAllocated Input-matrix width allocated (possibly larger than normal width due to padding)
98-
//! \param filterWidthAllocated Filter-matrix width allocated (possibly larger than normal width due to padding)
99-
100-
template<typename TAcc, typename TElem>
101-
ALPAKA_FN_ACC auto operator()(
102-
TAcc const& acc,
103-
TElem const* const input,
104-
TElem* output,
105-
int32_t const matrixWidth,
106-
int32_t const matrixHeight,
107-
TElem const* const filter,
108-
int32_t const filterWidth,
109-
int32_t const intputWidthAllocated,
110-
int32_t const filterWidthAllocated) const -> void
111-
{
112-
// Grid-wide thread index; the first component is used as the row, the second as the column
auto const [row, col] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
113-
// Get extents (dimensions)
114-
// NOTE(review): gridBlockExtent and the gridBlockExtentY/gridBlockExtentX bindings derived from it
// are never read in this kernel.
auto const gridBlockExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc);
115-
auto const blockThreadExtent = alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc);
116-
// Get indexes
117-
auto const blockThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);
118-
// Linearized index of this thread inside its block; used as the slot in the shared-memory tile
auto const blockThreadIdx1D = alpaka::mapIdx<1u>(blockThreadIdx, blockThreadExtent)[0u];
119-
// Get elements from 2-element arrays
120-
auto const [blockThreadExtentY, blockThreadExtentX] = blockThreadExtent;
121-
auto const [blockThreadY, blockThreadX] = blockThreadIdx;
122-
auto const [gridBlockExtentY, gridBlockExtentX] = gridBlockExtent;
123-
// Get the dynamic shared memory buffer that holds the tile
124-
auto* const sharedN = alpaka::getDynSharedMem<TElem>(acc);
125-
// Fill the shared memory so that tile items can be read from it: every thread stores exactly one
// element at its linearized in-block index. Threads that lie outside the matrix store zero.
126-
if(row < matrixHeight && col < matrixWidth && blockThreadIdx1D < blockThreadExtent.prod())
127-
{
128-
// Read from the input using the allocated (padded) row width
sharedN[blockThreadIdx1D] = input[row * intputWidthAllocated + col];
129-
}
130-
else if(blockThreadIdx1D < blockThreadExtent.prod())
131-
{
132-
sharedN[blockThreadIdx1D] = 0.0f;
133-
}
134-
135-
// Wait until the block has filled the shared memory with its tile of the main matrix
136-
alpaka::syncBlockThreads(acc);
137-
138-
// Only threads that map to an element of the matrix compute and write an output value
if(col < matrixWidth && row < matrixHeight)
139-
{
140-
TElem pValue{0.0f};
141-
for(int32_t fRow = 0; fRow < filterWidth; fRow++)
142-
{
143-
for(int32_t fCol = 0; fCol < filterWidth; fCol++)
144-
{
145-
// Position of input matrix element to be multiplied with the corresponding element at the filter.
146-
// The position is with respect to tile(block)
147-
auto const exactRowBlock = static_cast<int32_t>(blockThreadY) - filterWidth / 2 + fRow;
148-
auto const exactColBlock = static_cast<int32_t>(blockThreadX) - filterWidth / 2 + fCol;
149-
if(exactColBlock >= 0 && exactColBlock < blockThreadExtentX && exactRowBlock >= 0
150-
&& exactRowBlock < blockThreadExtentY)
151-
{
152-
// The element is inside the tile. Get the element from the shared memory
153-
pValue += filter[fRow * filterWidthAllocated + fCol]
154-
* sharedN[exactRowBlock * blockThreadExtentX + exactColBlock];
155-
}
156-
else
157-
{ // The element is not in the tile(block)
158-
// Position of input matrix element to be multiplied with the corresponding element at the
159-
// filter. The position is with respect to the input matrix
160-
auto const exactRow = static_cast<int32_t>(row) - filterWidth / 2 + fRow;
161-
auto const exactCol = static_cast<int32_t>(col) - filterWidth / 2 + fCol;
162-
// Positions outside the matrix contribute nothing to the sum
if(exactRow >= 0 && exactRow < matrixHeight && exactCol >= 0 && exactCol < matrixWidth)
163-
{
164-
// get the item from the global memory, use padded width!
165-
pValue += filter[fRow * filterWidthAllocated + fCol]
166-
* input[exactRow * intputWidthAllocated + exactCol];
167-
}
168-
}
169-
}
170-
}
171-
// The output is indexed with matrixWidth, not with the allocated (padded) input width.
// NOTE(review): presumably the output buffer is unpadded - confirm against the caller.
output[row * matrixWidth + col] = pValue;
172-
} // if
173-
}
174-
};
175-
176-
// The specialisation used for calculation of dynamic shared memory size
177-
namespace alpaka::trait
178-
{
179-
//! The trait for getting the size of the block shared dynamic memory for a kernel.
180-
//! Specialised here for ConvolutionKernel2DSharedMemory.
template<typename TAcc>
181-
struct BlockSharedMemDynSizeBytes<ConvolutionKernel2DSharedMemory, TAcc>
182-
{
183-
//! \tparam TVec type for extent array
184-
//! \tparam TElem element type of the matrix
185-
//! \param blockThreadExtent dimensions of threads per block
//! \param threadElemExtent dimensions of elements per thread
//! \return The size of the shared memory allocated for a block.
186-
//!
//! The remaining (unnamed) parameters mirror the kernel arguments; they are not used for the size.
template<typename TVec, typename TElem>
187-
ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
188-
ConvolutionKernel2DSharedMemory const& /* convolutionKernel */,
189-
TVec const& blockThreadExtent, // dimensions of thread per block
190-
TVec const& threadElemExtent, // dimensions of elements per thread
191-
TElem const* const, // input Matrix
192-
TElem*, // output array
193-
int32_t const, // matrixWidth
194-
int32_t const, // matrixHeight
195-
TElem const* const, // filter
196-
int32_t const, // filter width
197-
int32_t const, // allocated input width
198-
int32_t const) // allocated filter width
199-
{
200-
// Reserve the buffer: its size in bytes is
// (threads per block) * (elements per thread) * sizeof(TElem), i.e. one block-sized tile
201-
return static_cast<std::size_t>(blockThreadExtent.prod() * threadElemExtent.prod()) * sizeof(TElem);
202-
}
203-
};
204-
} // namespace alpaka::trait
205-
20681
auto FuzzyEqual(float a, float b) -> bool
20782
{
20883
return std::fabs(a - b) < std::numeric_limits<float>::epsilon() * 1000.0f;
@@ -301,9 +176,7 @@ auto example(TAccTag const&) -> int
301176
return static_cast<Idx>(rowPitchFilter / sizeof(DataType));
302177
}();
303178

304-
// Construct kernel object, choose one of the kernels provided. ConvolutionKernel2DGlobalMemory and
305-
// ConvolutionKernel2DSharedMemory
306-
ConvolutionKernel2DSharedMemory convolutionKernel2D;
179+
ConvolutionKernel2DGlobalMemory convolutionKernel2D;
307180

308181
alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};
309182

0 commit comments

Comments
 (0)