@@ -111,6 +111,8 @@ struct cudaFunctionTable {
111111typedef struct cudaFunctionTable cudaFunctionTable_t ;
112112static cudaFunctionTable_t cuFunc ;
113113
114+ #define NB_IPC_STREAM 4 /* Number of entries in ipcStream[]; IPC asynchronous copies are spread over this many streams. */
115+
114116static int stage_one_init_ref_count = 0 ;
115117static bool stage_three_init_complete = false;
116118static bool common_cuda_initialized = false;
@@ -121,7 +123,8 @@ bool mca_common_cuda_enabled = false;
121123static bool mca_common_cuda_register_memory = true;
122124static bool mca_common_cuda_warning = false;
123125static opal_list_t common_cuda_memory_registrations ;
124- static CUstream ipcStream = NULL ;
126+ static CUstream ipcStream [NB_IPC_STREAM ]; /* Streams used for IPC asynchronous copies, NB_IPC_STREAM of them. */
127+ static int current_ipc_stream_id = 0 ; /* ipcStream[] slot used by the next asynchronous copy; advanced after each event record and reset to 0 on reaching NB_IPC_STREAM. NOTE(review): no locking is visible around the update -- confirm callers are serialized. */
125128static CUstream dtohStream = NULL ;
126129static CUstream htodStream = NULL ;
127130static CUstream memcpyStream = NULL ;
@@ -818,12 +821,14 @@ static int mca_common_cuda_stage_three_init(void)
818821 }
819822
820823 /* Create stream for use in ipc asynchronous copies */
821- res = cuFunc .cuStreamCreate (& ipcStream , 0 );
822- if (OPAL_UNLIKELY (res != CUDA_SUCCESS )) {
823- opal_show_help ("help-mpi-common-cuda.txt" , "cuStreamCreate failed" ,
824- true, OPAL_PROC_MY_HOSTNAME , res );
825- rc = OPAL_ERROR ;
826- goto cleanup_and_error ;
824+ for (i = 0 ; i < NB_IPC_STREAM ; i ++ ) {
825+ res = cuFunc .cuStreamCreate (& ipcStream [i ], 0 );
826+ if (OPAL_UNLIKELY (res != CUDA_SUCCESS )) {
827+ opal_show_help ("help-mpi-common-cuda.txt" , "cuStreamCreate failed" ,
828+ true, OPAL_PROC_MY_HOSTNAME , res );
829+ rc = OPAL_ERROR ;
830+ goto cleanup_and_error ;
831+ }
827832 }
828833
829834 /* Create stream for use in dtoh asynchronous copies */
@@ -1005,8 +1010,10 @@ void mca_common_cuda_fini(void)
10051010 if (NULL != cuda_event_unpack_callback_frag_array ) {
10061011 free (cuda_event_unpack_callback_frag_array );
10071012 }
1008- if ((NULL != ipcStream ) && ctx_ok ) {
1009- cuFunc .cuStreamDestroy (ipcStream );
1013+ for (i = 0 ; i < NB_IPC_STREAM ; i ++ ) {
1014+ if ((NULL != ipcStream [i ]) && ctx_ok ) {
1015+ cuFunc .cuStreamDestroy (ipcStream [i ]);
1016+ }
10101017 }
10111018 if ((NULL != dtohStream ) && ctx_ok ) {
10121019 cuFunc .cuStreamDestroy (dtohStream );
@@ -1419,7 +1426,8 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14191426 /* This is the standard way to run. Running with synchronous copies is available
14201427 * to measure the advantages of asynchronous copies. */
14211428 if (OPAL_LIKELY (mca_common_cuda_async )) {
1422- result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream );
1429+ /* Queue the copy on the currently selected IPC stream; the selection advances after the event is recorded. */
1430+ result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream [current_ipc_stream_id ]);
14231431 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
14241432 opal_show_help ("help-mpi-common-cuda.txt" , "cuMemcpyAsync failed" ,
14251433 true, dst , src , amount , result );
@@ -1430,7 +1438,11 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14301438 "CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d" ,
14311439 dst , src , (int )amount );
14321440 }
1433- result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream );
1441+ result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream [current_ipc_stream_id ]);
1442+ current_ipc_stream_id ++ ;
1443+ if (current_ipc_stream_id >= NB_IPC_STREAM ) {
1444+ current_ipc_stream_id = 0 ;
1445+ }
14341446 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
14351447 opal_show_help ("help-mpi-common-cuda.txt" , "cuEventRecord failed" ,
14361448 true, OPAL_PROC_MY_HOSTNAME , result );
@@ -1449,7 +1461,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14491461 * done = 0 ;
14501462 } else {
14511463 /* Mimic the async function so they use the same memcpy call. */
1452- result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream );
1464+ result = cuFunc .cuMemcpyAsync ((CUdeviceptr )dst , (CUdeviceptr )src , amount , ipcStream [ 0 ] );
14531465 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
14541466 opal_show_help ("help-mpi-common-cuda.txt" , "cuMemcpyAsync failed" ,
14551467 true, dst , src , amount , result );
@@ -1462,7 +1474,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14621474 }
14631475
14641476 /* Record an event, then wait for it to complete with calls to cuEventQuery */
1465- result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream );
1477+ result = cuFunc .cuEventRecord (cuda_event_ipc_array [cuda_event_ipc_first_avail ], ipcStream [ 0 ] );
14661478 if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
14671479 opal_show_help ("help-mpi-common-cuda.txt" , "cuEventRecord failed" ,
14681480 true, OPAL_PROC_MY_HOSTNAME , result );
0 commit comments