@@ -92,12 +92,17 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);
 
-    // the selection of these parameters has been done after running the channel-benchmark for typical production runs, i.e.
-    // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
-    // The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
-    // In our case, given that multiple threads share the available GPUs, we "guarantee" a sufficient occupancy of the GPUs.
-    int threadsPerBlock = 128;
-    int blocksPerGrid = 512;
+    /// The selection of these parameters was done after running the channel-benchmark for
+    /// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
+    /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the
+    /// channel-benchmark circuits mentioned above we found that the best performance was
+    /// achieved with this configuration.
+    int threadsPerBlock = warpsize;
+    /// The max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that
+    /// the blocksPerGrid we launch the CUDA kernel with doesn't exceed this number.
+    const auto maxBlocksPerGrid = 65535;
+    int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? provisionalBlocksPerGrid : maxBlocksPerGrid;
 
     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
 
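For illustration, here is a minimal standalone sketch of the launch-parameter logic from this hunk. It assumes `warpsize` is 32 (as the new comments state for the OpenACC/OpenMP implementations) and uses a hypothetical `toy_kernel` with a grid-stride loop to show how a kernel can still cover every core index once `blocksPerGrid` is clamped to 65535; the internals of the real `solve_interleaved2_kernel` are not shown in this diff, so the loop is an assumption, not CoreNEURON's actual code.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for CoreNEURON's warpsize constant (32 on NVIDIA GPUs).
constexpr int warpsize = 32;

// Toy kernel: a grid-stride loop lets a clamped grid still visit every core
// index, because blockIdx.x never exceeds blocksPerGrid - 1.
__global__ void toy_kernel(int ncore) {
    for (int icore = blockIdx.x * blockDim.x + threadIdx.x; icore < ncore;
         icore += gridDim.x * blockDim.x) {
        // per-core work would go here
    }
}

int main() {
    const int ncore = 3000000;  // hypothetical size, large enough to hit the cap
    int threadsPerBlock = warpsize;
    const int maxBlocksPerGrid = 65535;
    // Ceiling division: enough blocks to give each core index its own thread.
    int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
    int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid
                            ? provisionalBlocksPerGrid
                            : maxBlocksPerGrid;
    std::printf("provisional=%d launched=%d\n", provisionalBlocksPerGrid, blocksPerGrid);
    toy_kernel<<<blocksPerGrid, threadsPerBlock>>>(ncore);
    cudaDeviceSynchronize();
    return 0;
}
```

With ncore = 3000000 the provisional grid is 93750 blocks, which exceeds 65535, so the launch is clamped; the clamp only works correctly if the kernel itself compensates for having fewer threads than core indices, as the grid-stride loop above does.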