@@ -19,11 +19,6 @@ cudaStream_t outer_stream;
1919
2020// uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR;
2121
22- static void cuda_stream_cudaback_warmup (cudaStream_t stream, cudaError_t status, void *data)
23- {
24- DT_CUDA_DEBUG ( opal_cuda_output ( 0 , " cuda stream %d warm up is done\n " , (size_t )data); );
25- }
26-
2722
2823static inline ddt_cuda_buffer_t * obj_ddt_cuda_buffer_new ()
2924{
@@ -195,6 +190,10 @@ int32_t opal_ddt_cuda_kernel_init(void)
195190
196191 cuda_free_list = init_cuda_free_list ();
197192
193+ /* init cuda_iov */
194+ cuda_iov_cache_enabled = 1 ;
195+ cuda_iov_count = CUDA_NB_IOV;
196+
198197 /* init device */
199198 cuda_devices = (ddt_cuda_device_t *)malloc (sizeof (ddt_cuda_device_t )*NB_GPUS);
200199 for (i = 0 ; i < NB_GPUS; i++) {
@@ -229,7 +228,6 @@ int32_t opal_ddt_cuda_kernel_init(void)
229228 /* warm up call back */
230229 for (j = 0 ; j < NB_STREAMS; j++) {
231230 cudaStreamSynchronize (cuda_streams->ddt_cuda_stream [j]);
232- cudaStreamAddCallback (cuda_streams->ddt_cuda_stream [j], cuda_stream_cudaback_warmup, (void *)j, 0 );
233231 }
234232 cudaDeviceSynchronize ();
235233
@@ -238,34 +236,46 @@ int32_t opal_ddt_cuda_kernel_init(void)
238236 cudaEventCreate (&(cuda_devices[i].memcpy_event ), cudaEventDisableTiming);
239237
240238 /* init iov pipeline blocks */
241- ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL ;
242- for (j = 0 ; j < NB_PIPELINE_BLOCKS; j++) {
243- cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc (sizeof (ddt_cuda_iov_pipeline_block_t ));
244- cudaMallocHost ((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h )), sizeof (ddt_cuda_iov_dist_cached_t ) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
245- cudaMalloc ((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d )), sizeof (ddt_cuda_iov_dist_cached_t ) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
246- if (j == 0 ) {
247- cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc (sizeof (ddt_cuda_iov_dist_cached_t ) * NUM_CUDA_IOV_PER_DDT);
248- } else {
249- cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL ;
239+ ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL ;
240+ for (j = 0 ; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) {
241+ if (!cuda_iov_cache_enabled) {
242+ cuda_iov_pipeline_block_non_cached = (ddt_cuda_iov_pipeline_block_non_cached_t *)malloc (sizeof (ddt_cuda_iov_pipeline_block_non_cached_t ));
243+ cudaMallocHost ((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h )), sizeof (ddt_cuda_iov_dist_cached_t ) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
244+ cudaMalloc ((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d )), sizeof (ddt_cuda_iov_dist_cached_t ) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
245+ // cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]);
246+ // cuda_iov_pipeline_block->cuda_stream_id = 0;
247+ cudaEventCreateWithFlags (&(cuda_iov_pipeline_block_non_cached->cuda_event ), cudaEventDisableTiming);
248+ cuda_iov_pipeline_block_non_cached->cuda_stream = NULL ;
250249 }
251- // cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]);
252- // cuda_iov_pipeline_block->cuda_stream_id = 0;
253- cudaEventCreateWithFlags (&(cuda_iov_pipeline_block->cuda_event ), cudaEventDisableTiming);
254- cuda_devices[i].cuda_iov_pipeline_block [j] = cuda_iov_pipeline_block;
250+ cuda_devices[i].cuda_iov_pipeline_block_non_cached [j] = cuda_iov_pipeline_block_non_cached;
251+ cuda_devices[i].cuda_iov_pipeline_block_non_cached_first_avail = 0 ;
252+ }
253+
254+ /* init iov block for cached */
255+ ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL ;
256+ for (j = 0 ; j < NB_CACHED_BLOCKS; j++) {
257+ if (cuda_iov_cache_enabled) {
258+ cuda_iov_process_block_cached = (ddt_cuda_iov_process_block_cached_t *)malloc (sizeof (ddt_cuda_iov_process_block_cached_t ));
259+ cuda_iov_process_block_cached->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc (sizeof (ddt_cuda_iov_dist_cached_t ) * NUM_CUDA_IOV_PER_DDT);
260+ cudaEventCreateWithFlags (&(cuda_iov_process_block_cached->cuda_event ), cudaEventDisableTiming);
261+ cuda_iov_process_block_cached->cuda_stream = NULL ;
262+ }
263+ cuda_devices[i].cuda_iov_process_block_cached [j] = cuda_iov_process_block_cached;
264+ cuda_devices[i].cuda_iov_process_block_cached_first_avail = 0 ;
255265 }
256266 }
257267 current_cuda_device = &(cuda_devices[0 ]);
258268 outer_stream = NULL ;
259269
270+ #if defined(OPAL_DATATYPE_CUDA_TIMING)
271+ TIMER_DATA_TYPE start, end, start_total, end_total;
272+ long total_time;
273+ #endif
260274 /* init cuda event list */
261275 for (i = 0 ; i < MAX_CUDA_EVENTS; i++) {
262276 cudaEventCreateWithFlags (&(cuda_event_free_list[i].cuda_event ), cudaEventDisableTiming);
263277 }
264278
265- /* init cuda_iov */
266- cuda_iov_cache_enabled = 1 ;
267- cuda_iov_count = CUDA_NB_IOV;
268-
269279 // /* init size for double, float, char */
270280 // ALIGNMENT_DOUBLE = sizeof(double);
271281 // ALIGNMENT_FLOAT = sizeof(float);
@@ -288,26 +298,37 @@ int32_t opal_ddt_cuda_kernel_fini(void)
288298 }
289299 free (cuda_devices[i].cuda_streams );
290300
291- ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL ;
292- for (j = 0 ; j < NB_PIPELINE_BLOCKS ; j++) {
293- cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block [j];
294- if (cuda_iov_pipeline_block != NULL ) {
295- if (cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_h != NULL ) {
296- cudaFreeHost (cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_h );
297- cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_h = NULL ;
301+ ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL ;
302+ for (j = 0 ; j < NB_PIPELINE_NON_CACHED_BLOCKS ; j++) {
303+ cuda_iov_pipeline_block_non_cached = cuda_devices[i].cuda_iov_pipeline_block_non_cached [j];
304+ if (cuda_iov_pipeline_block_non_cached != NULL ) {
305+ if (cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_h != NULL ) {
306+ cudaFreeHost (cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_h );
307+ cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_h = NULL ;
298308 }
299- if (cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_d != NULL ) {
300- cudaFree (cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_d );
301- cuda_iov_pipeline_block ->cuda_iov_dist_non_cached_d = NULL ;
309+ if (cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_d != NULL ) {
310+ cudaFree (cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_d );
311+ cuda_iov_pipeline_block_non_cached ->cuda_iov_dist_non_cached_d = NULL ;
302312 }
303- if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL ) {
304- free (cuda_iov_pipeline_block->cuda_iov_dist_cached_h );
305- cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL ;
313+ cudaEventDestroy (cuda_iov_pipeline_block_non_cached->cuda_event );
314+ cuda_iov_pipeline_block_non_cached->cuda_stream = NULL ;
315+ free (cuda_iov_pipeline_block_non_cached);
316+ cuda_iov_pipeline_block_non_cached = NULL ;
317+ }
318+ }
319+
320+ ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL ;
321+ for (j = 0 ; j < NB_CACHED_BLOCKS; j++) {
322+ cuda_iov_process_block_cached = cuda_devices[i].cuda_iov_process_block_cached [j];
323+ if (cuda_iov_process_block_cached != NULL ) {
324+ if (cuda_iov_process_block_cached->cuda_iov_dist_cached_h != NULL ) {
325+ free (cuda_iov_process_block_cached->cuda_iov_dist_cached_h );
326+ cuda_iov_process_block_cached->cuda_iov_dist_cached_h = NULL ;
306327 }
307- cudaEventDestroy (cuda_iov_pipeline_block ->cuda_event );
308- cuda_iov_pipeline_block ->cuda_stream = NULL ;
309- free (cuda_iov_pipeline_block );
310- cuda_iov_pipeline_block = NULL ;
328+ cudaEventDestroy (cuda_iov_process_block_cached ->cuda_event );
329+ cuda_iov_process_block_cached ->cuda_stream = NULL ;
330+ free (cuda_iov_process_block_cached );
331+ cuda_iov_process_block_cached = NULL ;
311332 }
312333 }
313334 cuda_devices[i].cuda_streams = NULL ;
@@ -385,11 +406,12 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
385406 uint32_t thread_per_block, nb_blocks_used;
386407 size_t length_per_iovec;
387408 uint32_t alignment;
388- ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL ;
409+ ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL ;
389410 ddt_cuda_iov_total_cached_t * cached_cuda_iov = NULL ;
390411 ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL ;
391412 ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL ;
392413 cudaStream_t cuda_stream_iov = NULL ;
414+ cudaError_t cuda_err;
393415 const struct iovec *ddt_iov = NULL ;
394416 uint32_t ddt_iov_count = 0 ;
395417 size_t ncontig_disp_base;
@@ -413,10 +435,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
413435 }
414436 cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h ;
415437 nb_blocks_used = 0 ;
416- cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block [0 ];
417- cuda_iov_pipeline_block->cuda_stream = cuda_streams->ddt_cuda_stream [cuda_streams->current_stream_id ];
418- cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h ;
419- cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream ;
438+ cuda_iov_process_block_cached = current_cuda_device->cuda_iov_process_block_cached [current_cuda_device->cuda_iov_process_block_cached_first_avail ];
439+ current_cuda_device->cuda_iov_process_block_cached_first_avail ++;
440+ if (current_cuda_device->cuda_iov_process_block_cached_first_avail >= NB_CACHED_BLOCKS) {
441+ current_cuda_device->cuda_iov_process_block_cached_first_avail = 0 ;
442+ }
443+ cuda_err = cudaEventSynchronize (cuda_iov_process_block_cached->cuda_event );
444+ opal_cuda_check_error (cuda_err);
445+
446+ if (outer_stream == NULL ) {
447+ cuda_iov_process_block_cached->cuda_stream = cuda_streams->ddt_cuda_stream [cuda_streams->current_stream_id ];
448+ } else {
449+ cuda_iov_process_block_cached->cuda_stream = outer_stream;
450+ }
451+ cuda_iov_dist_h = cuda_iov_process_block_cached->cuda_iov_dist_cached_h ;
452+ cuda_stream_iov = cuda_iov_process_block_cached->cuda_stream ;
420453 thread_per_block = CUDA_WARP_SIZE * 64 ;
421454
422455 for (i = 0 ; i < ddt_iov_count; i++) {
@@ -430,11 +463,11 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
430463 residue_desc = length_per_iovec % alignment;
431464 nb_blocks_per_description = (count_desc + thread_per_block - 1 ) / thread_per_block;
432465 DT_CUDA_DEBUG ( opal_cuda_output (10 , " Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n " , i, count_desc, residue_desc, alignment, nb_blocks_per_description); );
433- if (opal_ddt_cached_cuda_iov_isfull (cached_cuda_iov, &(cuda_iov_pipeline_block ->cuda_iov_dist_cached_h ), nb_blocks_used + nb_blocks_per_description + 1 )) {
466+ if (opal_ddt_cached_cuda_iov_isfull (cached_cuda_iov, &(cuda_iov_process_block_cached ->cuda_iov_dist_cached_h ), nb_blocks_used + nb_blocks_per_description + 1 )) {
434467 cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h ;
435468 cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc (cuda_iov_dist_h, sizeof (ddt_cuda_iov_dist_cached_t )*cached_cuda_iov->cuda_iov_count );
436469 assert (cuda_iov_dist_h != NULL );
437- cuda_iov_pipeline_block ->cuda_iov_dist_cached_h = cuda_iov_dist_h;
470+ cuda_iov_process_block_cached ->cuda_iov_dist_cached_h = cuda_iov_dist_h;
438471 }
439472
440473 for (j = 0 ; j < nb_blocks_per_description; j++) {
@@ -481,6 +514,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
481514 cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d;
482515 datatype->cached_cuda_iov = (unsigned char *)cached_cuda_iov;
483516 *cuda_iov_count = nb_blocks_used;
517+ cuda_err = cudaEventRecord (cuda_iov_process_block_cached->cuda_event , cuda_stream_iov);
518+ opal_cuda_check_error (cuda_err);
484519 return OPAL_SUCCESS;
485520}
486521
@@ -594,7 +629,7 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor)
594629
595630void opal_ddt_set_cuda_iov_position (struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count)
596631{
597- int i;
632+ uint32_t i;
598633 size_t iov_size = 0 ;
599634 size_t ddt_size;
600635 convertor->current_iov_partial_length = 0 ;
@@ -622,7 +657,7 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d
622657
623658void opal_ddt_set_ddt_iov_position (struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count)
624659{
625- int i;
660+ uint32_t i;
626661 size_t iov_size = 0 ;
627662 size_t ddt_size;
628663 convertor->current_iov_partial_length = 0 ;
@@ -814,12 +849,24 @@ void opal_ddt_cuda_set_callback_current_stream(void *callback_func, void *callba
814849
815850void * opal_ddt_cuda_alloc_event (int32_t nb_events, int32_t *loc)
816851{
852+ int i;
817853 *loc = 0 ;
818- return (void *)&(cuda_event_free_list[0 ]);
854+ // return (void*)&(cuda_event_free_list[0]);
855+ ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)malloc (sizeof (ddt_cuda_event_t ) * nb_events);
856+ for (i = 0 ; i < nb_events; i++) {
857+ cudaEventCreateWithFlags (&(event_list[i].cuda_event ), cudaEventDisableTiming);
858+ }
859+ return (void *)event_list;
819860}
820861
821- void opal_ddt_cuda_free_event (int32_t loc )
862+ void opal_ddt_cuda_free_event (void *cuda_event_list, int32_t nb_events )
822863{
864+ ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list;
865+ int i;
866+ for (i = 0 ; i < nb_events; i++) {
867+ cudaEventDestroy (event_list[i].cuda_event );
868+ }
869+ free (event_list);
823870 return ;
824871}
825872
0 commit comments