Skip to content

Commit 8975e0c

Browse files
committed
Transfer data to and from GPU.
This is a multi-part patch that allows the CPU to prepare a data copy mapped onto a device. 1. The first question is how such a device is selected. The allocation of such a copy happens well before the scheduler is invoked for a task, in fact before the task is even ready. Thus, we need to decide on the location of this copy based only on static information, such as the task affinity. Therefore, this approach only works for owner-compute type of tasks, where the task will be executed on the device that owns the data used for the task affinity. 2. Pass the correct data copy across the entire system, instead of falling back to the data copy of device 0 (CPU memory). Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent bd35c5d commit 8975e0c

23 files changed

Lines changed: 623 additions & 263 deletions

parsec/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ if( BUILD_PARSEC )
238238
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
239239
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
240240
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
241+
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
241242
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
242243
${EXTRA_LIBS}
243244
INTERFACE

parsec/arena.c

Lines changed: 90 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -235,43 +235,109 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
235235
return PARSEC_SUCCESS;
236236
}
237237

238-
parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
239-
size_t count, int device,
240-
parsec_datatype_t dtt)
238+
#include "parsec/utils/zone_malloc.h"
239+
#include "mca/device/device_gpu.h"
240+
241+
static inline parsec_data_copy_t *
242+
parsec_arena_internal_copy_new(parsec_arena_t *arena,
243+
parsec_data_t *data,
244+
size_t count, int device,
245+
parsec_datatype_t dtt)
241246
{
242-
parsec_data_t *data;
243-
parsec_data_copy_t *copy;
244-
int rc;
245-
246-
247-
data = parsec_data_new();
247+
parsec_data_copy_t *copy = NULL;
248+
parsec_data_t* ldata = data;
248249
if( NULL == data ) {
250+
ldata = parsec_data_new();
251+
if( NULL == ldata ) {
252+
return NULL;
253+
}
254+
}
255+
if( 0 == device ) {
256+
copy = parsec_data_copy_new(ldata, device, dtt,
257+
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
258+
if (NULL == copy) {
259+
goto free_and_return;
260+
}
261+
int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
262+
if (PARSEC_SUCCESS != rc) {
263+
goto free_and_return;
264+
}
265+
return copy;
266+
}
267+
/**
268+
* This part is not really nice, it breaks the separation between devices, and how their memory is
269+
* managed. But, it should give nice perfromance improvements if the communication layer is
270+
* capable of sending or receiving data directly to and from the accelerator memory. The only drawback
271+
* is that once the GPU memory is full, this will fail, so the soeftware will fall back to the
272+
* prior behavior, going through the CPU memory.
273+
*
274+
* The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
275+
* are released from the different LRU lists.
276+
*/
277+
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
278+
if (NULL == gpu_device) {
249279
return NULL;
250280
}
281+
size_t size = count * arena->elem_size;
282+
void* device_private = zone_malloc(gpu_device->memory, size);
283+
if( NULL == device_private ) {
284+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
285+
device, size, (void *)copy->arena_chunk);
286+
goto free_and_return;
287+
}
288+
copy = parsec_data_copy_new(ldata, device, dtt,
289+
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
290+
if (NULL == copy) {
291+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
292+
device, size, (void *)copy->arena_chunk);
293+
zone_free(gpu_device->memory, device_private);
294+
goto free_and_return;
295+
}
296+
copy->dtt = dtt;
297+
copy->device_private = device_private;
298+
copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
299+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
300+
"data ptr %p",
301+
device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
302+
copy->version = 0;
303+
copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
304+
copy->original->owner_device = device;
305+
copy->original->preferred_device = device;
306+
return copy;
307+
free_and_return:
308+
if( NULL != copy )
309+
PARSEC_OBJ_RELEASE(copy);
310+
if( NULL == data)
311+
PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */
312+
return NULL;
313+
}
251314

252-
copy = parsec_data_copy_new( data, device, dtt,
253-
PARSEC_DATA_FLAG_ARENA |
254-
PARSEC_DATA_FLAG_PARSEC_OWNED |
255-
PARSEC_DATA_FLAG_PARSEC_MANAGED);
315+
parsec_data_copy_t *
316+
parsec_arena_get_new_copy(parsec_arena_t *arena,
317+
size_t count, int device,
318+
parsec_datatype_t dtt)
319+
{
320+
parsec_data_copy_t *dev0_copy, *copy;
256321

257-
if(NULL == copy) {
258-
PARSEC_OBJ_RELEASE(data);
322+
dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
323+
if( NULL == dev0_copy ) {
259324
return NULL;
260325
}
326+
dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
327+
dev0_copy->version = 0; /* start from somewhere */
328+
if( 0 == device ) {
329+
return dev0_copy;
330+
}
261331

262-
rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
263-
332+
copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
333+
if( NULL == copy ) {
334+
copy = dev0_copy; /* return the main memory data copy */
335+
}
264336
/* This data is going to be released once all copies are released
265337
* It does not exist without at least a copy, and we don't give the
266338
* pointer to the user, so we must remove our retain from it
267339
*/
268-
PARSEC_OBJ_RELEASE(data);
269-
270-
if( PARSEC_SUCCESS != rc ) {
271-
PARSEC_OBJ_RELEASE(copy);
272-
return NULL;
273-
}
274-
340+
PARSEC_OBJ_RELEASE(dev0_copy->original);
275341
return copy;
276342
}
277343

parsec/arena.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
133133
* enough resource to allocate a new data copy of this type.
134134
*/
135135

136-
parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
137-
size_t count, int device,
138-
parsec_datatype_t dtt);
136+
parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
137+
size_t count, int device,
138+
parsec_datatype_t dtt);
139139

140140
/**
141141
* @brief Allocates memory for a given data copy. This is a function used by
142142
* DSLs to set the memory associated with a data copy they have created.
143-
* It is also used by parsec_arena_get_copy.
144-
*
143+
* It is also used by parsec_arena_get_new_copy.
144+
*
145145
* @param copy the (empty) data copy to allocate memory for. NB: the @p original
146146
* field of this data copy must be set. The operation overwrites the device
147147
* dtt and count of this data copy, as well as the device_private pointer.

parsec/data.c

Lines changed: 80 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ static void parsec_data_construct(parsec_data_t* obj )
6565
obj->preferred_device = -1;
6666
obj->key = 0;
6767
obj->nb_elts = 0;
68+
obj->nb_copies = 0;
6869
for( uint32_t i = 0; i < parsec_nb_devices;
6970
obj->device_copies[i] = NULL, i++ );
7071
obj->dc = NULL;
@@ -99,11 +100,12 @@ static void parsec_data_destruct(parsec_data_t* obj )
99100
* GPU copies are normally stored in LRU lists, and must be
100101
* destroyed by the release list to free the memory on the device
101102
*/
102-
PARSEC_OBJ_RELEASE( copy );
103+
PARSEC_DATA_COPY_RELEASE(copy);
103104
}
104105
}
105106
assert(NULL == obj->device_copies[i]);
106107
}
108+
assert(0 == obj->nb_copies);
107109
}
108110

109111
PARSEC_OBJ_CLASS_INSTANCE(parsec_data_t, parsec_object_t,
@@ -161,8 +163,8 @@ void parsec_data_delete(parsec_data_t* data)
161163

162164
inline int
163165
parsec_data_copy_attach(parsec_data_t* data,
164-
parsec_data_copy_t* copy,
165-
uint8_t device)
166+
parsec_data_copy_t* copy,
167+
uint8_t device)
166168
{
167169
assert(NULL == copy->original);
168170
assert(NULL == copy->older);
@@ -175,6 +177,7 @@ parsec_data_copy_attach(parsec_data_t* data,
175177
copy->older = NULL;
176178
return PARSEC_ERROR;
177179
}
180+
parsec_atomic_fetch_add_int32(&data->nb_copies, 1);
178181
PARSEC_OBJ_RETAIN(data);
179182
return PARSEC_SUCCESS;
180183
}
@@ -192,6 +195,7 @@ int parsec_data_copy_detach(parsec_data_t* data,
192195
return PARSEC_ERR_NOT_FOUND;
193196
}
194197
data->device_copies[device] = copy->older;
198+
parsec_atomic_fetch_add_int32(&data->nb_copies, -1);
195199

196200
copy->original = NULL;
197201
copy->older = NULL;
@@ -221,7 +225,7 @@ parsec_data_copy_t* parsec_data_copy_new(parsec_data_t* data, uint8_t device,
221225
}
222226
copy->flags = flags;
223227
if( PARSEC_SUCCESS != parsec_data_copy_attach(data, copy, device) ) {
224-
PARSEC_OBJ_RELEASE(copy);
228+
PARSEC_DATA_COPY_RELEASE(copy);
225229
return NULL;
226230
}
227231
copy->dtt = dtt;
@@ -330,6 +334,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
330334
copy = data->device_copies[device];
331335
assert( NULL != copy );
332336

337+
if( valid_copy == device ) {
338+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
339+
"DEV[%d]: already has ownership of data %p to copy %p in mode %d",
340+
device, data, copy, access_mode);
341+
goto bookkeeping;
342+
}
333343
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
334344
"DEV[%d]: start transfer ownership of data %p to copy %p in mode %d",
335345
device, data, copy, access_mode);
@@ -417,6 +427,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
417427
}
418428
}
419429

430+
bookkeeping:
420431
if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
421432
copy->readers++;
422433
}
@@ -435,40 +446,52 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
435446
return valid_copy;
436447
}
437448

438-
static char dump_coherency_codex(parsec_data_coherency_t state)
439-
{
440-
if( PARSEC_DATA_COHERENCY_INVALID == state ) return 'I';
441-
if( PARSEC_DATA_COHERENCY_OWNED == state ) return 'O';
442-
if( PARSEC_DATA_COHERENCY_EXCLUSIVE == state ) return 'E';
443-
if( PARSEC_DATA_COHERENCY_SHARED == state ) return 'S';
444-
return 'X';
445-
}
446-
447-
void parsec_dump_data_copy(parsec_data_copy_t* copy)
449+
/**
 * Print a two-line description of a data copy through parsec_debug_verbose:
 * device index, reference count, coherency state, readers, version, transfer
 * status, flags, and the older/original/arena/device_private pointers.
 * The line starts with "*" when the copy's device is the owner device of its
 * original data, and with a blank otherwise.
 */
void parsec_data_copy_dump(parsec_data_copy_t* copy)
{
    const char *transfer_str = "---";
    const char *coherency_str = "undef";
    const char *owner_mark = " ";
    char flag_str[] = "----";

    /* Both "not transferring" and "transfer complete" are reported as "no". */
    switch( copy->data_transfer_status ) {
    case PARSEC_DATA_STATUS_UNDER_TRANSFER:
        transfer_str = "yes";
        break;
    case PARSEC_DATA_STATUS_NOT_TRANSFER:
    case PARSEC_DATA_STATUS_COMPLETE_TRANSFER:
        transfer_str = "no";
        break;
    default:
        break;
    }

    /* One letter per flag, '-' when the flag is not set. */
    if( copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED )   flag_str[3] = 'O';
    if( copy->flags & PARSEC_DATA_FLAG_PARSEC_MANAGED ) flag_str[2] = 'M';
    if( copy->flags & PARSEC_DATA_FLAG_TRANSIT )        flag_str[1] = 'T';
    if( copy->flags & PARSEC_DATA_FLAG_ARENA )          flag_str[0] = 'A';

    if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) {
        coherency_str = "shared";
    } else if( PARSEC_DATA_COHERENCY_EXCLUSIVE == copy->coherency_state ) {
        coherency_str = "exclusive";
    } else if( PARSEC_DATA_COHERENCY_OWNED == copy->coherency_state ) {
        coherency_str = "owned";
    } else if( PARSEC_DATA_COHERENCY_INVALID == copy->coherency_state ) {
        coherency_str = "invalid";
    }

    if( (NULL != copy->original) && (copy->original->owner_device == copy->device_index) ) {
        owner_mark = "*";
    }

    parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n"
                               "          older %p orig %p arena %p dev_priv %p\n",
                         owner_mark,
                         (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency_str, copy->readers, copy->version, transfer_str, flag_str,
                         (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private);
}
452473

453-
void parsec_dump_data(parsec_data_t* data)
474+
/**
 * Print a one-line summary of a data (reference count, key, owner and
 * preferred devices, number of copies, data collection, element count),
 * followed by the dump of the copy stored in each non-empty device slot.
 */
void parsec_data_dump(parsec_data_t* data)
{
    parsec_debug_verbose(0, 0, "data %p [ref %d] key %lu owner dev %d pref dev %d copies %d dc %p [# elems %zu]\n",
                         data, data->super.obj_reference_count, data->key, data->owner_device, data->preferred_device, data->nb_copies,
                         (void*)data->dc, data->nb_elts);

    for( uint32_t dev = 0; dev < parsec_nb_devices; dev++ ) {
        if( NULL == data->device_copies[dev] ) {
            continue;  /* no copy on this device */
        }
        parsec_data_copy_dump(data->device_copies[dev]);
    }
}
462485

463486
parsec_data_copy_t*
464487
parsec_data_get_copy(parsec_data_t* data, uint32_t device)
465488
{
466-
return PARSEC_DATA_GET_COPY(data, device);
489+
return PARSEC_DATA_GET_COPY(data, device);
467490
}
468491

469492
/**
 * Function form of the PARSEC_DATA_COPY_RELEASE macro, applied to @p copy.
 */
void parsec_data_copy_release(parsec_data_copy_t* copy)
{
    /* TODO: Move the copy back to the CPU before destroying it */
    PARSEC_DATA_COPY_RELEASE(copy);
}
474497

@@ -509,7 +532,7 @@ parsec_data_create( parsec_data_t **holder,
509532

510533
if( !parsec_atomic_cas_ptr(holder, NULL, data) ) {
511534
parsec_data_copy_detach(data, data_copy, 0);
512-
PARSEC_OBJ_RELEASE(data_copy);
535+
PARSEC_DATA_COPY_RELEASE(data_copy);
513536
data = *holder;
514537
}
515538
} else {
@@ -560,3 +583,38 @@ parsec_data_destroy( parsec_data_t *data )
560583
#endif
561584
PARSEC_OBJ_RELEASE(data);
562585
}
586+
587+
#include "parsec/utils/debug.h"
588+
589+
/**
 * Try to tear down a data whose copies are only referenced by the data itself.
 *
 * If any attached copy has a reference count above 1, nothing is touched.
 * Otherwise the device 0 copy is released and every other copy is detached
 * from the data.
 *
 * @param data the data to examine.
 * @return 1 if the copies were released/detached, 0 if nothing was done.
 */
int parsec_data_release_self_contained_data(parsec_data_t *data)
{
    /* NOTE(review): the comment below describes the case where the reference
     * count EQUALS the number of copies, yet this test returns early in exactly
     * that case. Confirm whether `!=` was intended; left unchanged here. */
    if (data->super.obj_reference_count == data->nb_copies) return 0;
    parsec_data_copy_t *copy;
    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n",
                         data, data->nb_copies, __FILE__, __LINE__);
    /* this data is only referenced by its own copies. If these copies are also only referenced by
     * data, then we can release them all.
     */
    for( uint32_t i = 0; i < parsec_nb_devices; i++) {
        if (NULL == (copy = data->device_copies[i])) continue;
        if( copy->super.super.obj_reference_count > 1 )
            return 0;
    }
    /* The message is about the data; the previous argument, (COPY), is not a defined identifier. */
    PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d", (void *)data, __FILE__, __LINE__);
    for( uint32_t i = 0; i < parsec_nb_devices; i++) {
        if (NULL == (copy = data->device_copies[i])) continue;
        assert(1 == copy->super.super.obj_reference_count);
        if( 0 == copy->device_index ) {
            PARSEC_OBJ_RELEASE(copy);
            /* the copy held a single reference, so the release is expected to reset the pointer */
            assert(NULL == copy);
        } else {
            /* Do not release data copies that do not belong to the CPU or really bad things will happen.
             * Only the device manager can release these copies, the best we can do here is to detach them
             * from the data and eventually release their memory.
             */
            parsec_data_copy_detach(data, copy, copy->device_index);
        }
    }
    return 1;
}
620+

parsec/data.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t;
3131
#define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4)
3232

3333
typedef uint8_t parsec_data_status_t;
34-
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0)
35-
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1)
36-
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2)
34+
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0)
35+
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1)
36+
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2)
3737
/**
3838
* Data copies have three levels of 'ownership':
3939
* - a data copy can be owned and managed by PaRSEC.
@@ -124,8 +124,8 @@ PARSEC_DECLSPEC void
124124
parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data,
125125
uint8_t device,
126126
uint8_t access_mode);
127-
PARSEC_DECLSPEC void parsec_dump_data_copy(parsec_data_copy_t* copy);
128-
PARSEC_DECLSPEC void parsec_dump_data(parsec_data_t* copy);
127+
PARSEC_DECLSPEC void parsec_data_copy_dump(parsec_data_copy_t *copy);
128+
PARSEC_DECLSPEC void parsec_data_dump(parsec_data_t* copy);
129129

130130
PARSEC_DECLSPEC parsec_data_t *
131131
parsec_data_create( parsec_data_t **holder,

parsec/data_dist/matrix/map_operator.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -296,13 +296,13 @@ static int data_lookup(parsec_execution_stream_t *es,
296296
this_task->data[0].data_in = parsec_data_get_copy(src(m,n), 0);
297297
this_task->data[0].source_repo_entry = NULL;
298298
this_task->data[0].data_out = NULL;
299-
PARSEC_OBJ_RETAIN(this_task->data[0].data_in);
299+
PARSEC_DATA_COPY_RETAIN(this_task->data[0].data_in);
300300
}
301301
if( NULL != __tp->dest ) {
302302
this_task->data[1].data_in = parsec_data_get_copy(dest(m,n), 0);
303303
this_task->data[1].source_repo_entry = NULL;
304304
this_task->data[1].data_out = this_task->data[1].data_in;
305-
PARSEC_OBJ_RETAIN(this_task->data[1].data_in);
305+
PARSEC_DATA_COPY_RETAIN(this_task->data[1].data_in);
306306
}
307307
return PARSEC_HOOK_RETURN_DONE;
308308
}

0 commit comments

Comments
 (0)