Skip to content

Commit 1a1e57a

Browse files
committed
fix a bug: we should also track the completion of unpack operation, but only track D2D.
checkpoint, cached iov now uses seperated block another checkpoint now non-cached shuuport async operations
1 parent 55e0564 commit 1a1e57a

10 files changed

Lines changed: 152 additions & 100 deletions

opal/datatype/cuda/Makefile.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/
1515
subdir = opal/datatype/cuda
1616

1717
CC = nvcc
18-
CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@'
18+
CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) --compiler-options '-fPIC @CFLAGS@'
1919
LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@'
2020

2121
SRC := \

opal/datatype/cuda/opal_datatype_cuda.cu

Lines changed: 98 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@ cudaStream_t outer_stream;
1919

2020
//uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR;
2121

22-
static void cuda_stream_cudaback_warmup(cudaStream_t stream, cudaError_t status, void *data)
23-
{
24-
DT_CUDA_DEBUG( opal_cuda_output( 0, "cuda stream %d warm up is done\n", (size_t)data); );
25-
}
26-
2722

2823
static inline ddt_cuda_buffer_t* obj_ddt_cuda_buffer_new()
2924
{
@@ -195,6 +190,10 @@ int32_t opal_ddt_cuda_kernel_init(void)
195190

196191
cuda_free_list = init_cuda_free_list();
197192

193+
/* init cuda_iov */
194+
cuda_iov_cache_enabled = 1;
195+
cuda_iov_count = CUDA_NB_IOV;
196+
198197
/* init device */
199198
cuda_devices = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*NB_GPUS);
200199
for (i = 0; i < NB_GPUS; i++) {
@@ -229,7 +228,6 @@ int32_t opal_ddt_cuda_kernel_init(void)
229228
/* warm up call back */
230229
for (j = 0; j < NB_STREAMS; j++) {
231230
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[j]);
232-
cudaStreamAddCallback(cuda_streams->ddt_cuda_stream[j], cuda_stream_cudaback_warmup, (void *)j, 0);
233231
}
234232
cudaDeviceSynchronize();
235233

@@ -238,34 +236,46 @@ int32_t opal_ddt_cuda_kernel_init(void)
238236
cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming);
239237

240238
/* init iov pipeline blocks */
241-
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL;
242-
for (j = 0; j < NB_PIPELINE_BLOCKS; j++) {
243-
cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t));
244-
cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
245-
cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
246-
if (j == 0) {
247-
cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT);
248-
} else {
249-
cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL;
239+
ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL;
240+
for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) {
241+
if (!cuda_iov_cache_enabled) {
242+
cuda_iov_pipeline_block_non_cached = (ddt_cuda_iov_pipeline_block_non_cached_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_non_cached_t));
243+
cudaMallocHost((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
244+
cudaMalloc((void **)(&(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK);
245+
// cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]);
246+
// cuda_iov_pipeline_block->cuda_stream_id = 0;
247+
cudaEventCreateWithFlags(&(cuda_iov_pipeline_block_non_cached->cuda_event), cudaEventDisableTiming);
248+
cuda_iov_pipeline_block_non_cached->cuda_stream = NULL;
250249
}
251-
// cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]);
252-
// cuda_iov_pipeline_block->cuda_stream_id = 0;
253-
cudaEventCreateWithFlags(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming);
254-
cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block;
250+
cuda_devices[i].cuda_iov_pipeline_block_non_cached[j] = cuda_iov_pipeline_block_non_cached;
251+
cuda_devices[i].cuda_iov_pipeline_block_non_cached_first_avail = 0;
252+
}
253+
254+
/* init iov block for cached */
255+
ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL;
256+
for (j = 0; j < NB_CACHED_BLOCKS; j++) {
257+
if (cuda_iov_cache_enabled) {
258+
cuda_iov_process_block_cached = (ddt_cuda_iov_process_block_cached_t *)malloc(sizeof(ddt_cuda_iov_process_block_cached_t));
259+
cuda_iov_process_block_cached->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT);
260+
cudaEventCreateWithFlags(&(cuda_iov_process_block_cached->cuda_event), cudaEventDisableTiming);
261+
cuda_iov_process_block_cached->cuda_stream = NULL;
262+
}
263+
cuda_devices[i].cuda_iov_process_block_cached[j] = cuda_iov_process_block_cached;
264+
cuda_devices[i].cuda_iov_process_block_cached_first_avail = 0;
255265
}
256266
}
257267
current_cuda_device = &(cuda_devices[0]);
258268
outer_stream = NULL;
259269

270+
#if defined(OPAL_DATATYPE_CUDA_TIMING)
271+
TIMER_DATA_TYPE start, end, start_total, end_total;
272+
long total_time;
273+
#endif
260274
/* init cuda event list */
261275
for (i = 0; i < MAX_CUDA_EVENTS; i++) {
262276
cudaEventCreateWithFlags(&(cuda_event_free_list[i].cuda_event), cudaEventDisableTiming);
263277
}
264278

265-
/* init cuda_iov */
266-
cuda_iov_cache_enabled = 1;
267-
cuda_iov_count = CUDA_NB_IOV;
268-
269279
// /* init size for double, float, char */
270280
// ALIGNMENT_DOUBLE = sizeof(double);
271281
// ALIGNMENT_FLOAT = sizeof(float);
@@ -288,26 +298,37 @@ int32_t opal_ddt_cuda_kernel_fini(void)
288298
}
289299
free(cuda_devices[i].cuda_streams);
290300

291-
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL;
292-
for (j = 0; j < NB_PIPELINE_BLOCKS; j++) {
293-
cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j];
294-
if (cuda_iov_pipeline_block != NULL) {
295-
if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) {
296-
cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h);
297-
cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL;
301+
ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached = NULL;
302+
for (j = 0; j < NB_PIPELINE_NON_CACHED_BLOCKS; j++) {
303+
cuda_iov_pipeline_block_non_cached = cuda_devices[i].cuda_iov_pipeline_block_non_cached[j];
304+
if (cuda_iov_pipeline_block_non_cached != NULL) {
305+
if (cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h != NULL) {
306+
cudaFreeHost(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h);
307+
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_h = NULL;
298308
}
299-
if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) {
300-
cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d);
301-
cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL;
309+
if (cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d != NULL) {
310+
cudaFree(cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d);
311+
cuda_iov_pipeline_block_non_cached->cuda_iov_dist_non_cached_d = NULL;
302312
}
303-
if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) {
304-
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h);
305-
cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL;
313+
cudaEventDestroy(cuda_iov_pipeline_block_non_cached->cuda_event);
314+
cuda_iov_pipeline_block_non_cached->cuda_stream = NULL;
315+
free(cuda_iov_pipeline_block_non_cached);
316+
cuda_iov_pipeline_block_non_cached = NULL;
317+
}
318+
}
319+
320+
ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL;
321+
for (j = 0; j < NB_CACHED_BLOCKS; j++) {
322+
cuda_iov_process_block_cached = cuda_devices[i].cuda_iov_process_block_cached[j];
323+
if (cuda_iov_process_block_cached != NULL) {
324+
if (cuda_iov_process_block_cached->cuda_iov_dist_cached_h != NULL) {
325+
free(cuda_iov_process_block_cached->cuda_iov_dist_cached_h);
326+
cuda_iov_process_block_cached->cuda_iov_dist_cached_h = NULL;
306327
}
307-
cudaEventDestroy(cuda_iov_pipeline_block->cuda_event);
308-
cuda_iov_pipeline_block->cuda_stream = NULL;
309-
free(cuda_iov_pipeline_block);
310-
cuda_iov_pipeline_block = NULL;
328+
cudaEventDestroy(cuda_iov_process_block_cached->cuda_event);
329+
cuda_iov_process_block_cached->cuda_stream = NULL;
330+
free(cuda_iov_process_block_cached);
331+
cuda_iov_process_block_cached = NULL;
311332
}
312333
}
313334
cuda_devices[i].cuda_streams = NULL;
@@ -385,11 +406,12 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
385406
uint32_t thread_per_block, nb_blocks_used;
386407
size_t length_per_iovec;
387408
uint32_t alignment;
388-
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL;
409+
ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached = NULL;
389410
ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL;
390411
ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL;
391412
ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL;
392413
cudaStream_t cuda_stream_iov = NULL;
414+
cudaError_t cuda_err;
393415
const struct iovec *ddt_iov = NULL;
394416
uint32_t ddt_iov_count = 0;
395417
size_t ncontig_disp_base;
@@ -413,10 +435,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
413435
}
414436
cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h;
415437
nb_blocks_used = 0;
416-
cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0];
417-
cuda_iov_pipeline_block->cuda_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id];
418-
cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h;
419-
cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream;
438+
cuda_iov_process_block_cached = current_cuda_device->cuda_iov_process_block_cached[current_cuda_device->cuda_iov_process_block_cached_first_avail];
439+
current_cuda_device->cuda_iov_process_block_cached_first_avail ++;
440+
if (current_cuda_device->cuda_iov_process_block_cached_first_avail >= NB_CACHED_BLOCKS) {
441+
current_cuda_device->cuda_iov_process_block_cached_first_avail = 0;
442+
}
443+
cuda_err = cudaEventSynchronize(cuda_iov_process_block_cached->cuda_event);
444+
opal_cuda_check_error(cuda_err);
445+
446+
if (outer_stream == NULL) {
447+
cuda_iov_process_block_cached->cuda_stream = cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id];
448+
} else {
449+
cuda_iov_process_block_cached->cuda_stream = outer_stream;
450+
}
451+
cuda_iov_dist_h = cuda_iov_process_block_cached->cuda_iov_dist_cached_h;
452+
cuda_stream_iov = cuda_iov_process_block_cached->cuda_stream;
420453
thread_per_block = CUDA_WARP_SIZE * 64;
421454

422455
for (i = 0; i < ddt_iov_count; i++) {
@@ -430,11 +463,11 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
430463
residue_desc = length_per_iovec % alignment;
431464
nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block;
432465
DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); );
433-
if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) {
466+
if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_process_block_cached->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) {
434467
cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h;
435468
cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count);
436469
assert(cuda_iov_dist_h != NULL);
437-
cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h;
470+
cuda_iov_process_block_cached->cuda_iov_dist_cached_h = cuda_iov_dist_h;
438471
}
439472

440473
for (j = 0; j < nb_blocks_per_description; j++) {
@@ -481,6 +514,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov
481514
cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d;
482515
datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov;
483516
*cuda_iov_count = nb_blocks_used;
517+
cuda_err = cudaEventRecord(cuda_iov_process_block_cached->cuda_event, cuda_stream_iov);
518+
opal_cuda_check_error(cuda_err);
484519
return OPAL_SUCCESS;
485520
}
486521

@@ -594,7 +629,7 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor)
594629

595630
void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count)
596631
{
597-
int i;
632+
uint32_t i;
598633
size_t iov_size = 0;
599634
size_t ddt_size;
600635
convertor->current_iov_partial_length = 0;
@@ -622,7 +657,7 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d
622657

623658
void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count)
624659
{
625-
int i;
660+
uint32_t i;
626661
size_t iov_size = 0;
627662
size_t ddt_size;
628663
convertor->current_iov_partial_length = 0;
@@ -814,12 +849,24 @@ void opal_ddt_cuda_set_callback_current_stream(void *callback_func, void *callba
814849

815850
void* opal_ddt_cuda_alloc_event(int32_t nb_events, int32_t *loc)
816851
{
852+
int i;
817853
*loc = 0;
818-
return (void*)&(cuda_event_free_list[0]);
854+
//return (void*)&(cuda_event_free_list[0]);
855+
ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)malloc(sizeof(ddt_cuda_event_t) * nb_events);
856+
for (i = 0; i < nb_events; i++) {
857+
cudaEventCreateWithFlags(&(event_list[i].cuda_event), cudaEventDisableTiming);
858+
}
859+
return (void*)event_list;
819860
}
820861

821-
void opal_ddt_cuda_free_event(int32_t loc)
862+
void opal_ddt_cuda_free_event(void *cuda_event_list, int32_t nb_events)
822863
{
864+
ddt_cuda_event_t *event_list = (ddt_cuda_event_t *)cuda_event_list;
865+
int i;
866+
for (i = 0; i < nb_events; i++) {
867+
cudaEventDestroy(event_list[i].cuda_event);
868+
}
869+
free (event_list);
823870
return;
824871
}
825872

opal/datatype/cuda/opal_datatype_cuda.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ void opal_ddt_cuda_set_callback_current_stream(void *callback_func, void *callba
141141

142142
void* opal_ddt_cuda_alloc_event(int32_t nb_events, int32_t *loc);
143143

144-
void opal_ddt_cuda_free_event(int32_t loc);
144+
void opal_ddt_cuda_free_event(void *cuda_event_list, int32_t nb_events);
145145

146146
int32_t opal_ddt_cuda_event_query(void *cuda_event_list, int32_t i);
147147

opal/datatype/cuda/opal_datatype_cuda_internal.cuh

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@
3232
#define THREAD_PER_BLOCK 32
3333
#define CUDA_WARP_SIZE 32
3434
#define TASK_PER_THREAD 2
35-
#define NB_STREAMS 8
36-
#define NB_PIPELINE_BLOCKS 4
35+
#define NB_STREAMS 4
36+
#define NB_PIPELINE_NON_CACHED_BLOCKS 4
37+
#define NB_CACHED_BLOCKS 4
3738
#define CUDA_NB_IOV 1024*20
3839
#define CUDA_IOV_LEN 1024*1204
3940
#define CUDA_MAX_NB_BLOCKS 1024
@@ -86,10 +87,15 @@ typedef struct {
8687
typedef struct {
8788
ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h;
8889
ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d;
90+
cudaStream_t cuda_stream;
91+
cudaEvent_t cuda_event;
92+
} ddt_cuda_iov_pipeline_block_non_cached_t;
93+
94+
typedef struct {
8995
ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h;
9096
cudaStream_t cuda_stream;
9197
cudaEvent_t cuda_event;
92-
} ddt_cuda_iov_pipeline_block_t;
98+
} ddt_cuda_iov_process_block_cached_t;
9399

94100
typedef struct ddt_cuda_buffer{
95101
unsigned char* gpu_addr;
@@ -112,7 +118,10 @@ typedef struct {
112118
size_t buffer_free_size;
113119
size_t buffer_used_size;
114120
ddt_cuda_stream_t *cuda_streams;
115-
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_PIPELINE_BLOCKS];
121+
ddt_cuda_iov_pipeline_block_non_cached_t *cuda_iov_pipeline_block_non_cached[NB_PIPELINE_NON_CACHED_BLOCKS];
122+
ddt_cuda_iov_process_block_cached_t *cuda_iov_process_block_cached[NB_CACHED_BLOCKS];
123+
uint32_t cuda_iov_process_block_cached_first_avail;
124+
uint32_t cuda_iov_pipeline_block_non_cached_first_avail;
116125
cudaEvent_t memcpy_event;
117126
} ddt_cuda_device_t;
118127

0 commit comments

Comments
 (0)