Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit d03c45f

Browse files
authored
GPU implementation improvements (#718)
* Set nwarp to a very big number for optimal parallelization and slightly improve the grid configuration of the CUDA solve_interleaved2 kernel
1 parent 1f01552 commit d03c45f

2 files changed

Lines changed: 13 additions & 8 deletions

File tree

coreneuron/apps/corenrn_parameters.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ struct corenrn_parameters {
4646
unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2
4747
unsigned spkcompress = 0; /// Spike Compression
4848
unsigned cell_interleave_permute = 0; /// Cell interleaving permutation
49-
unsigned nwarp = 1024; /// Number of warps to balance for cell_interleave_permute == 2
50-
unsigned num_gpus = 0; /// Number of gpus to use per node
49+
unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2
50+
unsigned num_gpus = 0; /// Number of gpus to use per node
5151
unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer.
5252
int seed = -1; /// Initialization seed for random number generator (int)
5353

coreneuron/permute/cellorder.cu

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,17 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
9292
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
9393
auto cuda_stream = static_cast<cudaStream_t>(stream);
9494

95-
// the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
96-
// 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
97-
// The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
98-
// In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
99-
int threadsPerBlock = 128;
100-
int blocksPerGrid = 512;
95+
/// the selection of these parameters has been done after running the channel-benchmark for
96+
/// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
97+
/// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the
98+
/// channel-benchmark circuits mentioned above we figured out that the best performance was
99+
/// achieved with this configuration.
100+
int threadsPerBlock = warpsize;
101+
/// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that the
102+
/// blocksPerGrid we launch the CUDA kernel with doesn't exceed this number
103+
const auto maxBlocksPerGrid = 65535;
104+
int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
105+
int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? provisionalBlocksPerGrid : maxBlocksPerGrid;
101106

102107
solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
103108

0 commit comments

Comments
 (0)