Skip to content

Commit 1470eba

Browse files
committed
Enable batched chores across device types
Replace the CUDA-specific batch build switch with PARSEC_HAVE_DEV_CAPABILITY_BATCH so batching is a runtime capability shared by all supported device types. Export the new option through parsec_options and PaRSECConfig. Add per-device MCA parameters to disable batching for CPU, recursive, CUDA, HIP, and Level Zero devices. Use shared helpers to sanitize batch chore types in DTD and to gate GPU task-ring batching on the selected device. Teach PTG to accept batch=true for CPU/default bodies as well as typed device bodies, and add CPU batch examples for both PTG and DTD with ctest coverage for the enabled and CPU-disabled DTD paths. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent 51e368e commit 1470eba

16 files changed

Lines changed: 386 additions & 24 deletions

CMakeLists.txt

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -179,15 +179,17 @@ option(PARSEC_DIST_COLLECTIVES
179179
set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING
180180
"Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU.")
181181

182+
### Batched task parameters
183+
option(PARSEC_HAVE_DEV_CAPABILITY_BATCH
184+
"Enable batched task operations on all devices that support batching" ON)
185+
182186
### GPU engine parameters
183187
option(PARSEC_GPU_ALLOC_PER_TILE
184188
"Tile based allocation engine for GPU memory (instead of internal management
185189
of a complete allocation)" OFF)
186190
mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
187191
option(PARSEC_GPU_WITH_CUDA
188192
"Enable GPU support using CUDA kernels" ON)
189-
option(PARSEC_GPU_WITH_CUDA_BATCH
190-
"Enable the runtime support for batched kernels" ON)
191193
option(PARSEC_GPU_WITH_HIP
192194
"Enable GPU support using HIP kernels" ON)
193195
option(PARSEC_GPU_WITH_LEVEL_ZERO
@@ -736,13 +738,6 @@ int main(int argc, char *argv[]) {
736738
endif (CUDAToolkit_FOUND)
737739
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
738740
endif( PARSEC_GPU_WITH_CUDA )
739-
if( PARSEC_GPU_WITH_CUDA_BATCH )
740-
if( NOT PARSEC_HAVE_CUDA)
741-
message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. Enable both or none")
742-
endif( NOT PARSEC_HAVE_CUDA)
743-
set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
744-
endif( PARSEC_GPU_WITH_CUDA_BATCH )
745-
746741
if( PARSEC_GPU_WITH_HIP )
747742
if( NOT DEFINED ROCM_ROOT_DIR AND IS_DIRECTORY /opt/rocm )
748743
set(ROCM_ROOT_DIR /opt/rocm)

cmake_modules/PaRSECConfig.cmake.in

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ list(APPEND CMAKE_MODULE_PATH "${PARSEC_CMAKE_DIRS}")
1818

1919
find_package(Threads)
2020

21+
if(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)
22+
set(PARSEC_HAVE_DEV_CAPABILITY_BATCH TRUE)
23+
endif(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)
24+
2125
if(@PARSEC_HAVE_HWLOC@)
2226
set_and_check(HWLOC_INCLUDE_DIR "@HWLOC_INCLUDE_DIR@")
2327
set(HWLOC_INCLUDE_DIR ${HWLOC_INCLUDE_DIR} CACHE PATH "Imported by PaRSECConfig.cmake" FORCE)
@@ -65,10 +69,6 @@ endif(@PARSEC_DIST_WITH_MPI@)
6569
if(@PARSEC_HAVE_CUDA@)
6670
find_package(CUDAToolkit REQUIRED)
6771
set(PARSEC_HAVE_CUDA TRUE)
68-
69-
if(@PARSEC_HAVE_CUDA_BATCH@)
70-
set(PARSEC_HAVE_CUDA_BATCH TRUE)
71-
endif(@PARSEC_HAVE_CUDA_BATCH@)
7272
endif(@PARSEC_HAVE_CUDA@)
7373

7474
if(@PARSEC_HAVE_HIP@)

parsec/include/parsec/parsec_options.h.in

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@
129129

130130
#cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
131131
#cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
132+
#cmakedefine PARSEC_HAVE_DEV_CAPABILITY_BATCH
132133
#cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
133-
#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT
134134
#cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
135135
#cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
136136
#cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT
@@ -156,4 +156,3 @@
156156
#include "parsec/parsec_config_bottom.h"
157157

158158
#endif /* PARSEC_CONFIG_H_HAS_BEEN_INCLUDED */
159-

parsec/interfaces/dtd/insert_function.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2337,6 +2337,12 @@ static parsec_hook_return_t parsec_dtd_cpu_task_submit(parsec_execution_stream_t
23372337
return dtd_tc->cpu_func_ptr(es, this_task);
23382338
}
23392339

2340+
/**
 * Compute the chore type DTD should use for a requested device type.
 *
 * DTD carries the chore type as an int while the device layer works on
 * uint32_t, hence the explicit conversions around the call to
 * parsec_mca_device_type_sanitize_batch().
 *
 * @param[in] device_type chore type requested by the caller.
 * @return the value returned by parsec_mca_device_type_sanitize_batch(),
 *         converted back to int.
 */
static inline int
parsec_dtd_effective_chore_type(int device_type)
{
    const uint32_t requested = (uint32_t)device_type;
    const uint32_t effective = parsec_mca_device_type_sanitize_batch(requested);

    return (int)effective;
}
2345+
23402346
int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
23412347
parsec_task_class_t *tc,
23422348
int device_type,
@@ -2354,11 +2360,13 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
23542360
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t*)tp;
23552361
parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)tc;
23562362

2363+
device_type = parsec_dtd_effective_chore_type(device_type);
2364+
23572365
/* We assume that incarnations is big enough, because it has been pre-allocated
23582366
* with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
23592367
incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
23602368
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
2361-
if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) {
2369+
if( (incarnations[i].type & PARSEC_DEV_ANY_TYPE) & (device_type & PARSEC_DEV_ANY_TYPE) ) {
23622370
parsec_warning("A chore for this device type has already been added to task class '%s'\n",
23632371
tc->name);
23642372
return PARSEC_ERROR;
@@ -3128,7 +3136,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task)
31283136

31293137
static inline parsec_task_t *
31303138
__parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
3131-
void *fpointer, int32_t priority, uint8_t device_type,
3139+
void *fpointer, int32_t priority, int32_t device_type,
31323140
const char *name_of_kernel, parsec_task_class_t *tc, va_list args)
31333141
{
31343142
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)tp;
@@ -3140,6 +3148,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
31403148
int nb_params = 0;
31413149
parsec_dtd_param_t params[PARSEC_DTD_MAX_PARAMS];
31423150

3151+
device_type = parsec_dtd_effective_chore_type(device_type);
3152+
31433153
if( dtd_tp == NULL) {
31443154
parsec_fatal("You need to pass a correct parsec taskpool in order to insert task. "
31453155
"Please use \"parsec_dtd_taskpool_new()\" to create new taskpool"

parsec/mca/device/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recu
1414
if(PARSEC_HAVE_CUDA)
1515
set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
1616
endif(PARSEC_HAVE_CUDA)
17-
if(PARSEC_HAVE_CUDA_BATCH)
18-
set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA")
19-
endif(PARSEC_HAVE_CUDA_BATCH)
2017
if(PARSEC_HAVE_HIP)
2118
set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP")
2219
endif(PARSEC_HAVE_HIP)

parsec/mca/device/cuda/device_cuda_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ static int device_cuda_component_close(void)
276276
/* Check that no CUDA devices are still registered with PaRSEC */
277277
for(i = 0; i < parsec_mca_device_enabled(); i++) {
278278
if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue;
279-
if(PARSEC_DEV_CUDA & cdev->super.super.type) continue;
279+
if( !(PARSEC_DEV_CUDA & cdev->super.super.type) ) continue;
280280

281281
PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream,
282282
"GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n"

parsec/mca/device/device.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ static parsec_device_module_t **modules_activated = NULL;
4646

4747
static mca_base_component_t **device_components = NULL;
4848

49+
#if defined(PARSEC_HAVE_DEV_CAPABILITY_BATCH)
50+
static int parsec_device_cpu_enable_batching = 1;
51+
static int parsec_device_recursive_enable_batching = 1;
52+
static int parsec_device_cuda_enable_batching = 1;
53+
static int parsec_device_hip_enable_batching = 1;
54+
static int parsec_device_level_zero_enable_batching = 1;
55+
#endif
56+
4957
/**
5058
* Load balance skew we are willing to accept to favor RO data reuse
5159
* on GPU: a value of 20% means that we will schedule tasks on the preferred
@@ -304,6 +312,52 @@ no_valid_device: {
304312
PARSEC_OBJ_CLASS_INSTANCE(parsec_device_module_t, parsec_object_t,
305313
NULL, NULL);
306314

315+
/**
 * Tell whether batching may be used for every device type named in a mask.
 *
 * @param[in] device_type chore/device type mask. Only the bits selected by
 *                        PARSEC_DEV_ANY_TYPE are examined; bits outside of
 *                        that mask are ignored.
 * @return 1 when the masked value is non-empty, contains only CPU, recursive,
 *         CUDA, HIP or Level Zero bits, and the enable_batching switch of
 *         each of the types present is non-zero; 0 otherwise. Always 0 when
 *         PARSEC_HAVE_DEV_CAPABILITY_BATCH is not defined.
 */
int
parsec_mca_device_type_supports_batch(uint32_t device_type)
{
#if defined(PARSEC_HAVE_DEV_CAPABILITY_BATCH)
    const uint32_t requested_type = device_type & PARSEC_DEV_ANY_TYPE;
    uint32_t unhandled = requested_type;  /* bits not yet matched to a known type */
    int enabled = 1;

    /* The switches are integer parameters used as booleans: any non-zero
     * value means "enabled". Compare them against zero instead of AND-ing
     * the raw value, otherwise an even setting such as 2 would clear the
     * flag while an odd one would keep it. */
    if( unhandled & PARSEC_DEV_CPU ) {
        enabled = enabled && (0 != parsec_device_cpu_enable_batching);
        unhandled &= ~PARSEC_DEV_CPU;
    }
    if( unhandled & PARSEC_DEV_RECURSIVE ) {
        enabled = enabled && (0 != parsec_device_recursive_enable_batching);
        unhandled &= ~PARSEC_DEV_RECURSIVE;
    }
    if( unhandled & PARSEC_DEV_CUDA ) {
        enabled = enabled && (0 != parsec_device_cuda_enable_batching);
        unhandled &= ~PARSEC_DEV_CUDA;
    }
    if( unhandled & PARSEC_DEV_HIP ) {
        enabled = enabled && (0 != parsec_device_hip_enable_batching);
        unhandled &= ~PARSEC_DEV_HIP;
    }
    if( unhandled & PARSEC_DEV_LEVEL_ZERO ) {
        enabled = enabled && (0 != parsec_device_level_zero_enable_batching);
        unhandled &= ~PARSEC_DEV_LEVEL_ZERO;
    }

    /* An empty request, or any leftover bit belonging to a device type that
     * has no batching switch, means batching is not supported. */
    return (0 != requested_type) && enabled && (0 == unhandled);
#else
    (void)device_type;
    return 0;
#endif
}
350+
351+
uint32_t
352+
parsec_mca_device_type_sanitize_batch(uint32_t device_type)
353+
{
354+
if( (device_type & PARSEC_DEV_CHORE_ALLOW_BATCH) &&
355+
!parsec_mca_device_type_supports_batch(device_type) ) {
356+
device_type &= ~PARSEC_DEV_CHORE_ALLOW_BATCH;
357+
}
358+
return device_type;
359+
}
360+
307361
int parsec_mca_device_init(void)
308362
{
309363
char** parsec_device_list = NULL;
@@ -328,6 +382,23 @@ int parsec_mca_device_init(void)
328382
(void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu",
329383
"Allow load balancing tasks with GPU incarnations to CPU cores",
330384
false, false, parsec_device_load_balance_allow_cpu, NULL);
385+
#if defined(PARSEC_HAVE_DEV_CAPABILITY_BATCH)
386+
(void)parsec_mca_param_reg_int_name("device_cpu", "enable_batching",
387+
"Boolean to allow batched task execution on CPU devices",
388+
false, false, 1, &parsec_device_cpu_enable_batching);
389+
(void)parsec_mca_param_reg_int_name("device_recursive", "enable_batching",
390+
"Boolean to allow batched task execution on recursive CPU devices",
391+
false, false, 1, &parsec_device_recursive_enable_batching);
392+
(void)parsec_mca_param_reg_int_name("device_cuda", "enable_batching",
393+
"Boolean to allow batched task execution on CUDA devices",
394+
false, false, 1, &parsec_device_cuda_enable_batching);
395+
(void)parsec_mca_param_reg_int_name("device_hip", "enable_batching",
396+
"Boolean to allow batched task execution on HIP devices",
397+
false, false, 1, &parsec_device_hip_enable_batching);
398+
(void)parsec_mca_param_reg_int_name("device_level_zero", "enable_batching",
399+
"Boolean to allow batched task execution on Level Zero devices",
400+
false, false, 1, &parsec_device_level_zero_enable_batching);
401+
#endif
331402
if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_skew")) ) {
332403
parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_skew);
333404
}

parsec/mca/device/device.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,18 @@ extern int parsec_device_output;
209209
*/
210210
PARSEC_DECLSPEC extern int parsec_select_best_device( parsec_task_t* this_task);
211211

212+
/**
213+
* Return true if batching is enabled for all concrete device types present
214+
* in the provided non-empty mask.
215+
*/
216+
PARSEC_DECLSPEC int parsec_mca_device_type_supports_batch(uint32_t device_type);
217+
218+
/**
219+
* Drop the batching hint from a chore type if batching is disabled for the
220+
* selected device type.
221+
*/
222+
PARSEC_DECLSPEC uint32_t parsec_mca_device_type_sanitize_batch(uint32_t device_type);
223+
212224
/**
213225
* Initialize the internal structures for managing external devices such as
214226
* accelerators and GPU. Memory nodes can as well be managed using the same

parsec/mca/device/device_gpu.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1944,15 +1944,19 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device,
19441944
return PARSEC_HOOK_RETURN_DONE;
19451945
}
19461946
/* Should we allow the tasks to be batched */
1947+
#if defined(PARSEC_HAVE_DEV_CAPABILITY_BATCH)
19471948
if (PARSEC_GPU_TASK_TYPE_KERNEL == task->task_type ) {
1948-
if( PARSEC_DEV_CHORE_ALLOW_BATCH & task->ec->task_class->incarnations[0].type ) {
1949+
assert(task->ec->selected_chore >= 0);
1950+
if( (PARSEC_DEV_CHORE_ALLOW_BATCH & task->ec->task_class->incarnations[task->ec->selected_chore].type) &&
1951+
parsec_mca_device_type_supports_batch(task->ec->selected_device->type) ) {
19491952
/* Don't singleton the task, allowing the kernel to extract the tasks it wants
19501953
* from the task ring, and singleton it or replace it with the aggregated tasks
19511954
* as necessary.
19521955
*/
19531956
goto move_forward_with_this_task;
19541957
}
19551958
}
1959+
#endif
19561960
PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t *)task);
19571961

19581962
move_forward_with_this_task:

parsec/mca/device/template/device_template_module.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ parsec_template_taskpool_register(parsec_device_module_t* device,
7979
const parsec_task_class_t* tc = tp->task_classes_array[i];
8080
__parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
8181
for( j = 0; NULL != chores[j].hook; j++ ) {
82-
if( chores[j].type & device->type )
82+
if( !(chores[j].type & device->type) )
8383
continue;
8484
if( NULL != chores[j].dyld_fn ) {
8585
/* the function has been set for another device of the same type */
@@ -193,4 +193,3 @@ parsec_device_template_module_fini(parsec_device_module_t* device)
193193
(void)dev; (void)device;
194194
return PARSEC_SUCCESS;
195195
}
196-

0 commit comments

Comments
 (0)