Skip to content

Commit d4aee08

Browse files
committed
use multiple cuda stream for P2P, it allows multiple send/recv working simutaniously
1 parent 1a1e57a commit d4aee08

7 files changed

Lines changed: 65 additions & 19 deletions

File tree

ompi/mca/coll/base/coll_base_bcast.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ ompi_coll_base_bcast_intra_chain( void* buffer,
302302
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:bcast_intra_chain rank %d fo %d ss %5d typelng %lu segcount %d",
303303
ompi_comm_rank(comm), chains, segsize, (unsigned long)typelng, segcount));
304304

305+
printf("&&&&&&&& im using chain\n");
305306
return ompi_coll_base_bcast_intra_generic( buffer, count, datatype, root, comm, module,
306307
segcount, data->cached_chain );
307308
}

opal/datatype/cuda/opal_datatype_cuda.cu

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,15 @@ int32_t opal_ddt_cuda_kernel_init(void)
240240
for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) {
241241
if (!cuda_iov_cache_enabled) {
242242
cuda_iov_pipeline_block_non_cached = (ddt_cuda_iov_pipeline_block_non_cached_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_non_cached_t));
243+
#if OPAL_DATATYPE_IOV_UNIFIED_MEM
244+
res = cudaMallocManaged((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d)),
245+
sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK, cudaMemAttachHost);
246+
opal_cuda_check_error(res);
247+
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h = cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d;
248+
#else
243249
cudaMallocHost((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
244250
cudaMalloc((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
251+
#endif
245252
// cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]);
246253
// cuda_iov_pipeline_block->cuda_stream_id = 0;
247254
cudaEventCreateWithFlags(&(cuda_iov_pipeline_block_non_cached->cuda_event), cudaEventDisableTiming);
@@ -302,6 +309,7 @@ int32_t opal_ddt_cuda_kernel_fini(void)
302309
for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) {
303310
cuda_iov_pipeline_block_non_cached = cuda_devices[i].cuda_iov_pipeline_block_non_cached[j];
304311
if (cuda_iov_pipeline_block_non_cached != NULL) {
312+
#if !OPAL_DATATYPE_IOV_UNIFIED_MEM
305313
if (cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h != NULL) {
306314
cudaFreeHost(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h);
307315
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h = NULL;
@@ -310,6 +318,13 @@ int32_t opal_ddt_cuda_kernel_fini(void)
310318
cudaFree(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d);
311319
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d = NULL;
312320
}
321+
#else
322+
if (cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d != NULL) {
323+
cudaFree(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d);
324+
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d = NULL;
325+
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h = NULL;
326+
}
327+
#endif
313328
cudaEventDestroy(cuda_iov_pipeline_block_non_cached->cuda_event);
314329
cuda_iov_pipeline_block_non_cached->cuda_stream = NULL;
315330
free(cuda_iov_pipeline_block_non_cached);

opal/datatype/cuda/opal_datatype_cuda_internal.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
#define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0
2222
#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 0
2323
#define OPAL_DATATYPE_CUDA_IOV_CACHE 1
24-
24+
#define OPAL_DATATYPE_IOV_UNIFIED_MEM 0
2525

2626

2727
#define NB_GPUS 1

opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto
10921092
cuda_stream_iov = cuda_iov_pipeline_block_non_cached->cuda_stream;
10931093
cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block_non_cached->cuda_event);
10941094
opal_cuda_check_error(cuda_err);
1095+
#if OPAL_DATATYPE_IOV_UNIFIED_MEM
1096+
cuda_err = cudaStreamAttachMemAsync(cuda_stream_iov, cuda_iov_dist_h_current, 0, cudaMemAttachHost);
1097+
opal_cuda_check_error(cuda_err);
1098+
cudaStreamSynchronize(cuda_stream_iov);
1099+
#endif
10951100

10961101
#if defined(OPAL_DATATYPE_CUDA_TIMING)
10971102
GET_TIME(start);
@@ -1104,8 +1109,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto
11041109
total_time = ELAPSED_TIME( start, end );
11051110
DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); );
11061111
#endif
1107-
1112+
#if OPAL_DATATYPE_IOV_UNIFIED_MEM
1113+
//cuda_err = cudaStreamAttachMemAsync(cuda_stream_iov, cuda_iov_dist_d_current);
1114+
//opal_cuda_check_error(cuda_err);
1115+
//cudaStreamSynchronize(cuda_stream_iov);
1116+
#else
11081117
cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, cuda_stream_iov);
1118+
#endif
11091119
opal_generic_simple_pack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base);
11101120
//cudaStreamSynchronize(*cuda_stream_iov);
11111121
cuda_err = cudaEventRecord(cuda_iov_pipeline_block_non_cached->cuda_event, cuda_stream_iov);

opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -779,7 +779,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver
779779
cuda_stream_iov = cuda_iov_pipeline_block_non_cached->cuda_stream;
780780
cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block_non_cached->cuda_event);
781781
opal_cuda_check_error(cuda_err);
782-
782+
#if OPAL_DATATYPE_IOV_UNIFIED_MEM
783+
cuda_err = cudaStreamAttachMemAsync(cuda_stream_iov, cuda_iov_dist_h_current, 0, cudaMemAttachHost);
784+
opal_cuda_check_error(cuda_err);
785+
cudaStreamSynchronize(cuda_stream_iov);
786+
#endif
783787

784788
#if defined (OPAL_DATATYPE_CUDA_TIMING)
785789
GET_TIME(start);
@@ -792,8 +796,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver
792796
total_time = ELAPSED_TIME( start, end );
793797
DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); );
794798
#endif
795-
799+
#if OPAL_DATATYPE_IOV_UNIFIED_MEM
800+
//cuda_err = cudaStreamAttachMemAsync(cuda_stream_iov, cuda_iov_dist_d_current);
801+
//cudaStreamSynchronize(cuda_stream_iov);
802+
#else
796803
cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, cuda_stream_iov);
804+
#endif
797805
opal_generic_simple_unpack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0);
798806
//cudaStreamSynchronize(*cuda_stream_iov);
799807
cuda_err = cudaEventRecord(cuda_iov_pipeline_block_non_cached->cuda_event, cuda_stream_iov);

opal/mca/btl/smcuda/btl_smcuda.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,7 +1074,7 @@ static int mca_btl_smcuda_register_convertor (struct mca_btl_base_module_t* btl,
10741074
struct mca_btl_base_registration_handle_t *handle,
10751075
struct opal_convertor_t *convertor)
10761076
{
1077-
printf("Hello, i register convertor, %p\n", (void*)convertor);
1077+
// printf("Hello, i register convertor, %p\n", (void*)convertor);
10781078
mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *)((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data));
10791079

10801080
int32_t local_device = 0;
@@ -1168,7 +1168,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl,
11681168
offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base);
11691169
remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset;
11701170
if (0 != offset) {
1171-
printf("!!!!!!offset %lu, ra %p, base %p, remote %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address);
1171+
// printf("!!!!!!offset %lu, ra %p, base %p, remote %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address);
11721172
opal_output(-1, "OFFSET=%d", (int)offset);
11731173
}
11741174

opal/mca/common/cuda/common_cuda.c

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ struct cudaFunctionTable {
111111
typedef struct cudaFunctionTable cudaFunctionTable_t;
112112
static cudaFunctionTable_t cuFunc;
113113

114+
#define NB_IPC_STREAM 4
115+
114116
static int stage_one_init_ref_count = 0;
115117
static bool stage_three_init_complete = false;
116118
static bool common_cuda_initialized = false;
@@ -121,7 +123,8 @@ bool mca_common_cuda_enabled = false;
121123
static bool mca_common_cuda_register_memory = true;
122124
static bool mca_common_cuda_warning = false;
123125
static opal_list_t common_cuda_memory_registrations;
124-
static CUstream ipcStream = NULL;
126+
static CUstream ipcStream[NB_IPC_STREAM];
127+
static int current_ipc_stream_id = 0;
125128
static CUstream dtohStream = NULL;
126129
static CUstream htodStream = NULL;
127130
static CUstream memcpyStream = NULL;
@@ -818,12 +821,14 @@ static int mca_common_cuda_stage_three_init(void)
818821
}
819822

820823
/* Create stream for use in ipc asynchronous copies */
821-
res = cuFunc.cuStreamCreate(&ipcStream, 0);
822-
if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
823-
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
824-
true, OPAL_PROC_MY_HOSTNAME, res);
825-
rc = OPAL_ERROR;
826-
goto cleanup_and_error;
824+
for (i = 0; i < NB_IPC_STREAM; i++) {
825+
res = cuFunc.cuStreamCreate(&ipcStream[i], 0);
826+
if (OPAL_UNLIKELY(res != CUDA_SUCCESS)) {
827+
opal_show_help("help-mpi-common-cuda.txt", "cuStreamCreate failed",
828+
true, OPAL_PROC_MY_HOSTNAME, res);
829+
rc = OPAL_ERROR;
830+
goto cleanup_and_error;
831+
}
827832
}
828833

829834
/* Create stream for use in dtoh asynchronous copies */
@@ -1005,8 +1010,10 @@ void mca_common_cuda_fini(void)
10051010
if (NULL != cuda_event_unpack_callback_frag_array) {
10061011
free(cuda_event_unpack_callback_frag_array);
10071012
}
1008-
if ((NULL != ipcStream) && ctx_ok) {
1009-
cuFunc.cuStreamDestroy(ipcStream);
1013+
for (i = 0; i < NB_IPC_STREAM; i++) {
1014+
if ((NULL != ipcStream[i]) && ctx_ok) {
1015+
cuFunc.cuStreamDestroy(ipcStream[i]);
1016+
}
10101017
}
10111018
if ((NULL != dtohStream) && ctx_ok) {
10121019
cuFunc.cuStreamDestroy(dtohStream);
@@ -1419,7 +1426,8 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14191426
/* This is the standard way to run. Running with synchronous copies is available
14201427
* to measure the advantages of asynchronous copies. */
14211428
if (OPAL_LIKELY(mca_common_cuda_async)) {
1422-
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1429+
// printf("I use async memcpy\n");
1430+
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream[current_ipc_stream_id]);
14231431
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14241432
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
14251433
true, dst, src, amount, result);
@@ -1430,7 +1438,11 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14301438
"CUDA: cuMemcpyAsync passed: dst=%p, src=%p, size=%d",
14311439
dst, src, (int)amount);
14321440
}
1433-
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1441+
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream[current_ipc_stream_id]);
1442+
current_ipc_stream_id ++;
1443+
if (current_ipc_stream_id >= NB_IPC_STREAM) {
1444+
current_ipc_stream_id = 0;
1445+
}
14341446
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14351447
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
14361448
true, OPAL_PROC_MY_HOSTNAME, result);
@@ -1449,7 +1461,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14491461
*done = 0;
14501462
} else {
14511463
/* Mimic the async function so they use the same memcpy call. */
1452-
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream);
1464+
result = cuFunc.cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, amount, ipcStream[0]);
14531465
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14541466
opal_show_help("help-mpi-common-cuda.txt", "cuMemcpyAsync failed",
14551467
true, dst, src, amount, result);
@@ -1462,7 +1474,7 @@ int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg,
14621474
}
14631475

14641476
/* Record an event, then wait for it to complete with calls to cuEventQuery */
1465-
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream);
1477+
result = cuFunc.cuEventRecord(cuda_event_ipc_array[cuda_event_ipc_first_avail], ipcStream[0]);
14661478
if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
14671479
opal_show_help("help-mpi-common-cuda.txt", "cuEventRecord failed",
14681480
true, OPAL_PROC_MY_HOSTNAME, result);

0 commit comments

Comments
 (0)