@@ -235,43 +235,108 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
235235 return PARSEC_SUCCESS ;
236236}
237237
238- parsec_data_copy_t * parsec_arena_get_copy (parsec_arena_t * arena ,
239- size_t count , int device ,
240- parsec_datatype_t dtt )
238+ #include "parsec/utils/zone_malloc.h"
239+ #include "mca/device/device_gpu.h"
240+
241+ static inline parsec_data_copy_t *
242+ parsec_arena_internal_copy_new (parsec_arena_t * arena ,
243+ parsec_data_t * data ,
244+ size_t count , int device ,
245+ parsec_datatype_t dtt )
241246{
242- parsec_data_t * data ;
243- parsec_data_copy_t * copy ;
244- int rc ;
245-
246-
247- data = parsec_data_new ();
247+ parsec_data_copy_t * copy = NULL ;
248+ parsec_data_t * ldata = data ;
248249 if ( NULL == data ) {
250+ ldata = parsec_data_new ();
251+ if ( NULL == ldata ) {
252+ return NULL ;
253+ }
254+ }
255+ if ( 0 == device ) {
256+ copy = parsec_data_copy_new (ldata , device , dtt ,
257+ PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA );
258+ if (NULL == copy ) {
259+ goto free_and_return ;
260+ }
261+ int rc = parsec_arena_allocate_device_private (copy , arena , count , device , dtt );
262+ if (PARSEC_SUCCESS != rc ) {
263+ goto free_and_return ;
264+ }
265+ return copy ;
266+ }
267+ /**
268+ * This part is not really nice, it breaks the separation between devices, and how their memory is
269+ * managed. But, it should give nice perfromance improvements if the communication layer is
270+ * capable of sending or receiving data directly to and from the accelerator memory. The only drawback
271+ * is that once the GPU memory is full, this will fail, so the soeftware will fall back to the
272+ * prior behavior, going through the CPU memory.
273+ *
274+ * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
275+ * are released from the different LRU lists.
276+ */
277+ parsec_device_gpu_module_t * gpu_device = (parsec_device_gpu_module_t * )parsec_mca_device_get (device );
278+ if (NULL == gpu_device ) {
249279 return NULL ;
250280 }
281+ size_t size = count * arena -> elem_size ;
282+ void * device_private = zone_malloc (gpu_device -> memory , size );
283+ if ( NULL == device_private ) {
284+ PARSEC_DEBUG_VERBOSE (10 , parsec_debug_output , "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n" ,
285+ device , size , (void * )copy -> arena_chunk );
286+ goto free_and_return ;
287+ }
288+ copy = parsec_data_copy_new (ldata , device , dtt ,
289+ PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED );
290+ if (NULL == copy ) {
291+ PARSEC_DEBUG_VERBOSE (10 , parsec_debug_output , "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n" ,
292+ device , size , (void * )copy -> arena_chunk );
293+ zone_free (gpu_device -> memory , device_private );
294+ goto free_and_return ;
295+ }
296+ copy -> dtt = dtt ;
297+ copy -> device_private = device_private ;
298+ copy -> arena_chunk = (parsec_arena_chunk_t * )gpu_device -> memory ;
299+ PARSEC_DEBUG_VERBOSE (10 , parsec_debug_output , "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
300+ "data ptr %p" ,
301+ device , size , (void * )copy -> arena_chunk , (void * )copy -> device_private );
302+ copy -> version = 0 ;
303+ copy -> coherency_state = PARSEC_DATA_COHERENCY_INVALID ;
304+ copy -> original -> owner_device = device ;
305+ copy -> original -> preferred_device = device ;
306+ return copy ;
307+ free_and_return :
308+ if ( NULL != copy )
309+ PARSEC_OBJ_RELEASE (copy );
310+ if ( NULL == data )
311+ PARSEC_OBJ_RELEASE (ldata ); /* release the locally allocated data */
312+ return NULL ;
313+ }
251314
252- copy = parsec_data_copy_new ( data , device , dtt ,
253- PARSEC_DATA_FLAG_ARENA |
254- PARSEC_DATA_FLAG_PARSEC_OWNED |
255- PARSEC_DATA_FLAG_PARSEC_MANAGED );
315+ parsec_data_copy_t * parsec_arena_get_copy (parsec_arena_t * arena ,
316+ size_t count , int device ,
317+ parsec_datatype_t dtt )
318+ {
319+ parsec_data_copy_t * dev0_copy , * copy ;
256320
257- if ( NULL == copy ) {
258- PARSEC_OBJ_RELEASE ( data );
321+ dev0_copy = parsec_arena_internal_copy_new ( arena , NULL , count , 0 /* first allocate the copy on the device 0 */ , dtt );
322+ if ( NULL == dev0_copy ) {
259323 return NULL ;
260324 }
325+ dev0_copy -> coherency_state = PARSEC_DATA_COHERENCY_INVALID ;
326+ dev0_copy -> version = 0 ; /* start from somewhere */
327+ if ( 0 == device ) {
328+ return dev0_copy ;
329+ }
261330
262- rc = parsec_arena_allocate_device_private (copy , arena , count , device , dtt );
263-
331+ copy = parsec_arena_internal_copy_new (arena , dev0_copy -> original , count , device , dtt );
332+ if ( NULL == copy ) {
333+ copy = dev0_copy ; /* return the main memory data copy */
334+ }
264335 /* This data is going to be released once all copies are released
265336 * It does not exist without at least a copy, and we don't give the
266337 * pointer to the user, so we must remove our retain from it
267338 */
268- PARSEC_OBJ_RELEASE (data );
269-
270- if ( PARSEC_SUCCESS != rc ) {
271- PARSEC_OBJ_RELEASE (copy );
272- return NULL ;
273- }
274-
339+ PARSEC_OBJ_RELEASE (dev0_copy -> original );
275340 return copy ;
276341}
277342
0 commit comments