diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d1af82b..00665f37f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,6 +180,10 @@ option(PARSEC_DIST_COLLECTIVES set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING "Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU.") +### Batched task parameters +option(PARSEC_HAVE_DEV_CAPABILITY_BATCH + "Enable batched task operations on all devices that support batching" ON) + ### GPU engine parameters option(PARSEC_GPU_ALLOC_PER_TILE "Tile based allocation engine for GPU memory (instead of internal management @@ -769,7 +773,6 @@ int main(int argc, char *argv[]) { endif (CUDAToolkit_FOUND) set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files") endif( PARSEC_GPU_WITH_CUDA ) - if( PARSEC_GPU_WITH_HIP ) if( NOT DEFINED ROCM_ROOT_DIR AND IS_DIRECTORY /opt/rocm ) set(ROCM_ROOT_DIR /opt/rocm) diff --git a/cmake_modules/PaRSECConfig.cmake.in b/cmake_modules/PaRSECConfig.cmake.in index 9c2a16919..42390f41a 100644 --- a/cmake_modules/PaRSECConfig.cmake.in +++ b/cmake_modules/PaRSECConfig.cmake.in @@ -18,6 +18,10 @@ list(APPEND CMAKE_MODULE_PATH "${PARSEC_CMAKE_DIRS}") find_package(Threads) +if(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@) + set(PARSEC_HAVE_DEV_CAPABILITY_BATCH TRUE) +endif(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@) + if(@PARSEC_HAVE_HWLOC@) set_and_check(HWLOC_INCLUDE_DIR "@HWLOC_INCLUDE_DIR@") set(HWLOC_INCLUDE_DIR ${HWLOC_INCLUDE_DIR} CACHE PATH "Imported by PaRSECConfig.cmake" FORCE) diff --git a/docs/doxygen/Doxyfile.in b/docs/doxygen/Doxyfile.in index 23cf11de9..60c82f208 100644 --- a/docs/doxygen/Doxyfile.in +++ b/docs/doxygen/Doxyfile.in @@ -659,6 +659,7 @@ WARN_LOGFILE = INPUT = @PROJECT_SOURCE_DIR@/docs/doxygen/mainpage.md INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/groups.dox +INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/task-batching.md INPUT += @PROJECT_SOURCE_DIR@/parsec INPUT += @PROJECT_BINARY_DIR@/parsec/include INPUT += @PARSEC_DOX_SRCS@ diff --git a/docs/doxygen/mainpage.md b/docs/doxygen/mainpage.md index ae65f5833..1d7a9501a 100644 --- a/docs/doxygen/mainpage.md +++ b/docs/doxygen/mainpage.md @@ -72,6 +72,9 @@ subdirectory of the source. It is separated in a few modules: - [Virtual Processes](@ref parsec_internal_virtualprocess) allow to isolate groups of threads and avoid work stealing between threads belonging to different virtual processes. + - [Device task batching](@ref task_batching) documents how + accelerator submit hooks can combine compatible ready tasks into one + batched device operation. - [The Internal Runtime Module](@ref parsec_internal_runtime) holds all other functions and data structures that allow to build the PaRSEC runtime system. @@ -117,4 +120,3 @@ following components have specific documentation: - [schedulers](@ref parsec/mca/sched/sched.h) in `parsec/mca/sched` - [PaRSEC INStrumentation](@ref parsec/mca/pins/pins.h) in `parsec/mca/pins` - diff --git a/docs/doxygen/task-batching.md b/docs/doxygen/task-batching.md new file mode 100644 index 000000000..4ff9aea6b --- /dev/null +++ b/docs/doxygen/task-batching.md @@ -0,0 +1,189 @@ +Task Batching {#task_batching} +============== + +Task batching lets a device submit hook combine several compatible ready tasks +into one device operation. The runtime still owns dependency management, data +movement, completion, and release; the submit hook only decides which pending +tasks are compatible with the task it was asked to submit. 
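+
+For reference, batching does not change the submit hook signature; a device
+submit hook keeps the `parsec_advance_task_function_t` type declared in
+`parsec/mca/device/device_gpu.h`. Only the shape of the `gpu_task` argument
+changes: a batching hook may turn it into a ring of tasks before returning.
+
+```c
+/* Submit hook signature (from parsec/mca/device/device_gpu.h). */
+typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_device,
+                                              parsec_gpu_task_t *gpu_task,
+                                              parsec_gpu_exec_stream_t *gpu_stream);
+```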
+ +Batching is opt-in at the chore level. A task that does not advertise batching +is always delivered to the submit hook as a singleton task. + +Enabling batching +----------------- + +For PTG-generated tasks, use the `batch = true` body property on a device body: + +```c +BODY [type=CUDA + batch = true + dyld=cublasDgemm dyldtype=cublas_dgemm_t] +{ + /* GPU submit body. */ +} +``` + +For DTD tasks, add `PARSEC_DEV_CHORE_ALLOW_BATCH` to the device type when +registering or selecting the chore: + +```c +parsec_dtd_task_class_add_chore(tp, tc, + PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH, + kernel_cuda); +``` + +The selected device type must also support batching at runtime. The device layer +uses `parsec_mca_device_type_supports_batch()` to check this and +`parsec_mca_device_type_sanitize_batch()` to drop the batching hint when the +selected device cannot batch. The MCA parameter `device_enable_batching` +defaults to the compile-time batching capability and can be used to disable +batching globally at runtime. +It is read-only when batching support is not compiled in. + +Recommended collection helper +----------------------------- + +The preferred interface for GPU submit hooks is +`parsec_gpu_task_collect_batch()`. The runtime passes the submit hook a +singleton `parsec_gpu_task_t *gpu_task`. The hook calls the collector with a +callback that decides, for each task currently pending on the same stream, +whether that candidate can be added to the batch headed by `gpu_task`. + +The callback has the type `parsec_gpu_task_batch_cb_t` and receives: + +- `candidate`: a pending task from `gpu_stream->fifo_pending`; +- `batch_head`: the task originally passed to the submit hook; +- `callback_data`: user data passed through by the caller. + +The callback return value controls the iterator: + +- negative: stop immediately and return that error code; +- zero: remove `candidate` from the pending FIFO and append it to + `batch_head`'s task ring; +- positive: leave `candidate` pending and continue to the next pending task. + +The callback must not modify `gpu_stream->fifo_pending` directly. + +Example: + +```c +static int +gemm_batch_match(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data) +{ + (void)callback_data; + + if( (batch_head->ec->task_class == candidate->ec->task_class) && + (batch_head->ec->selected_chore == candidate->ec->selected_chore) && + (batch_head->ec->selected_device == candidate->ec->selected_device) ) { + return 0; + } + return 1; +} + +int +gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device, + parsec_gpu_task_t *gpu_task, + parsec_gpu_exec_stream_t *gpu_stream) +{ + int batch_count; + parsec_gpu_task_t *current; + + (void)gpu_device; + + batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task, + gemm_batch_match, NULL); + if( batch_count < 0 ) { + return batch_count; + } + + current = gpu_task; + do { + parsec_task_t *task = current->ec; + + /* Submit one device operation for task, or use the whole ring to + * issue a real batched operation. + */ + + current = (parsec_gpu_task_t *)current->list_item.list_next; + } while( current != gpu_task ); + + return PARSEC_HOOK_RETURN_DONE; +} +``` + +`parsec_gpu_task_collect_batch()` returns the number of tasks in the ring on +success, including the original `gpu_task`, or the negative callback error. +Tasks accepted before an error remain attached to `gpu_task`; tasks not accepted +remain in `gpu_stream->fifo_pending`. 
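+
+The `callback_data` argument can carry hook-specific state across the
+iteration, for example to bound the batch size. Below is a minimal sketch,
+using a hypothetical `capped_batch_arg_t` helper that is not part of the
+PaRSEC API: a positive return leaves the extra candidates pending, so they
+are simply picked up by later submissions.
+
+```c
+typedef struct {
+    int max_tasks; /* upper bound on the ring size, including batch_head */
+    int nb_tasks;  /* tasks accepted so far, starts at 1 for batch_head */
+} capped_batch_arg_t;
+
+static int
+capped_batch_match(parsec_gpu_task_t *candidate,
+                   parsec_gpu_task_t *batch_head,
+                   void *callback_data)
+{
+    capped_batch_arg_t *arg = (capped_batch_arg_t *)callback_data;
+
+    if( arg->nb_tasks >= arg->max_tasks ) {
+        return 1; /* cap reached: keep the candidate pending */
+    }
+    /* A real hook would also compare selected_chore and selected_device,
+     * as in the gemm_batch_match example above. */
+    if( batch_head->ec->task_class != candidate->ec->task_class ) {
+        return 1; /* incompatible: keep the candidate pending */
+    }
+    arg->nb_tasks++;
+    return 0; /* extract the candidate and append it to the ring */
+}
+
+/* In the submit hook:
+ *   capped_batch_arg_t arg = { .max_tasks = 8, .nb_tasks = 1 };
+ *   batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
+ *                                               capped_batch_match, &arg);
+ */
+```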
+ +The submit hook does not need a completion callback merely to return the ring to +the runtime. If a batched submit hook returns a non-singleton task ring, the GPU +progress engine automatically chains that ring into the next stream's pending +FIFO after the recorded device event completes. The normal data retrieval, +epilog, ownership, pushout, and task completion paths then process the tasks one +at a time. + +Iterating over the returned ring +-------------------------------- + +A batched submit hook should treat `gpu_task` as the head of a circular task +ring. This works for both singleton and batched cases: + +```c +parsec_gpu_task_t *current = gpu_task; + +do { + parsec_task_t *task = current->ec; + + /* Use task. */ + + current = (parsec_gpu_task_t *)current->list_item.list_next; +} while( current != gpu_task ); +``` + +Original direct collection style +-------------------------------- + +The helper above is intentionally conservative: it keeps FIFO ownership inside +the device layer and exposes only a compatibility callback to the submit hook. +In very high load scenarios, the repeated callback call can become visible. A +specialized submit hook can still use the original direct style and manipulate +the pending FIFO and task ring itself. + +This style is more fragile and should be reserved for code that is already +device-runtime aware. The hook must preserve FIFO correctness, keep rejected +tasks pending, and unlock the FIFO on every exit path. + +```c +parsec_list_t *pending = gpu_stream->fifo_pending; +parsec_list_item_t *item; +parsec_list_item_t *next; +int batch_count = 1; + +PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item); + +parsec_list_lock(pending); +for(item = (parsec_list_item_t *)pending->ghost_element.list_next; + item != &pending->ghost_element; + item = next) { + parsec_gpu_task_t *candidate; + + next = (parsec_list_item_t *)item->list_next; + candidate = (parsec_gpu_task_t *)item; + + if( compatible_with_batch(candidate, gpu_task) ) { + (void)parsec_list_nolock_remove(pending, item); + (void)parsec_list_item_ring_push(&gpu_task->list_item, item); + batch_count++; + } +} +parsec_list_unlock(pending); +``` + +The direct style avoids the generic iterator and callback dispatch, and it can +fold the compatibility test into a tight kernel-specific loop. The cost is that +the submit hook now depends on internal list and stream details and must be +updated if the GPU stream internals change. + diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index f05bbc63c..e0b1ee881 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -143,6 +143,7 @@ #cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT #cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT +#cmakedefine01 PARSEC_HAVE_DEV_CAPABILITY_BATCH #cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT #cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT #cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 98d9c85b7..805d9d76b 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -1479,7 +1479,7 @@ parsec_dtd_startup(parsec_context_t *context, if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */ // If CUDA is enabled, let the CUDA device activated for this // taskpool. 
- if( PARSEC_DEV_CUDA == device->type ) continue; + if( PARSEC_DEV_CUDA & device->type ) continue; if( NULL != device->taskpool_register ) if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t *)tp)) { @@ -2345,6 +2345,12 @@ static parsec_hook_return_t parsec_dtd_cpu_task_submit(parsec_execution_stream_t return dtd_tc->cpu_func_ptr(es, this_task); } +static inline int +parsec_dtd_effective_chore_type(int device_type) +{ + return (int)parsec_mca_device_type_sanitize_batch((uint32_t)device_type); +} + int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, parsec_task_class_t *tc, int device_type, @@ -2362,11 +2368,13 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t*)tp; parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)tc; + device_type = parsec_dtd_effective_chore_type(device_type); + /* We assume that incarnations is big enough, because it has been pre-allocated * with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */ incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations; - for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) { - if( incarnations[i].type == device_type ) { + for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) { + if( (incarnations[i].type & PARSEC_DEV_ANY_TYPE) & (device_type & PARSEC_DEV_ANY_TYPE) ) { parsec_warning("A chore for this device type has already been added to task class '%s'\n", tc->name); return PARSEC_ERROR; @@ -2379,7 +2387,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; - if(PARSEC_DEV_CUDA == device_type) { + if(PARSEC_DEV_CUDA & device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; } @@ -3136,7 +3144,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) static inline parsec_task_t * __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, - void *fpointer, int32_t priority, uint8_t device_type, + void *fpointer, int32_t priority, int32_t device_type, const char *name_of_kernel, parsec_task_class_t *tc, va_list args) { parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)tp; @@ -3148,6 +3156,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, int nb_params = 0; parsec_dtd_param_t params[PARSEC_DTD_MAX_PARAMS]; + device_type = parsec_dtd_effective_chore_type(device_type); + if( dtd_tp == NULL) { parsec_fatal("You need to pass a correct parsec taskpool in order to insert task. 
" "Please use \"parsec_dtd_taskpool_new()\" to create new taskpool" @@ -3268,19 +3278,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params); tc = &dtd_tc->super; - __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations; - (*incarnations)[0].type = device_type; - if( device_type == PARSEC_DEV_CUDA ) { + __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; + incarnations[0].type = device_type; + if( device_type & PARSEC_DEV_CUDA ) { /* Special case for CUDA: we need an intermediate */ - (*incarnations)[0].hook = parsec_dtd_gpu_task_submit; + incarnations[0].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer; } else { /* Default case: the user-provided function is directly the hook to call */ - (*incarnations)[0].hook = fpointer; // We can directly call the CPU hook + incarnations[0].hook = fpointer; // We can directly call the CPU hook dtd_tc->cpu_func_ptr = fpointer; } - (*incarnations)[1].type = PARSEC_DEV_NONE; + incarnations[1].type = PARSEC_DEV_NONE; + incarnations[1].hook = NULL; /* Bookkeeping of the task class */ parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc); diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index c72ad011d..5bee838b1 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -3951,21 +3951,29 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_def_list_t* dyld_property; jdf_def_list_t* evaluate_property = NULL; jdf_def_list_t* device_property = NULL; - char* dev_upper; + jdf_def_list_t* batch_property = NULL; (void)jdf; string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name); do { + char* dev_upper; jdf_find_property(body->properties, "type", &type_property); jdf_find_property(body->properties, "dyld", &dyld_property); jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property); + jdf_find_property(body->properties, "batch", &batch_property); if( NULL == type_property) { dev_upper = strdup_upper("CPU"); } else { dev_upper = strdup_upper(type_property->expr->jdf_var); } string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); - string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper); + if( NULL != batch_property) { +#if PARSEC_HAVE_DEV_CAPABILITY_BATCH + string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH"); +#endif /* PARSEC_HAVE_DEV_CAPABILITY_BATCH */ + } + string_arena_add_string(sa, ",\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -3974,7 +3982,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, if ( NULL == dyld_proptotype_property ) { fprintf(stderr, "Internal Error: function prototype (dyldtype) of dyld function (%s) is not defined in %s body of task %s at line %d\n", - dyld_property->expr->jdf_var, type_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO( body ) ); + dyld_property->expr->jdf_var, dev_upper, f->fname, JDF_OBJECT_LINENO( body ) ); assert( NULL != dyld_proptotype_property ); } string_arena_add_string(sa, " .dyld = \"%s\",\n", dyld_property->expr->jdf_var); @@ -3993,7 +4001,6 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s_%s },\n", base_name, 
dev_upper); string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_%s_SUPPORT) */\n", dev_upper); free(dev_upper); - body = body->next; } while (NULL != body); string_arena_add_string(sa, @@ -4500,7 +4507,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n" " __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n" " uint32_t idx = 0, j;\n" - " for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n" + " for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n" " if( !(supported_dev & chores[j].type) ) continue;\n" " if( j != idx ) {\n" " chores[idx] = chores[j];\n" @@ -4684,7 +4691,7 @@ static void jdf_generate_constructor( const jdf_t* jdf ) coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n" " __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n" " memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n" - " for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n" + " for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n" " tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n " " memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n" " /* Add a placeholder for initialization and startup task */\n" @@ -6747,8 +6754,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, coutput(" /* Pointer to dynamic gpu function */\n" " {\n" " int chore_idx = 0;\n" - " for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n" - " if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n" + " for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n" + " if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n" " }\n" " /* The void* cast prevents the compiler from complaining about the type change */\n" " parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n" diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index 3529917b0..e29f144e9 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -276,7 +276,7 @@ static int device_cuda_component_close(void) /* Check that no CUDA devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_CUDA != cdev->super.super.type) continue; + if( !(PARSEC_DEV_CUDA & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n" diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c index c73582ff6..ee641df7c 100644 --- a/parsec/mca/device/device.c +++ b/parsec/mca/device/device.c @@ -46,6 +46,8 @@ static parsec_device_module_t **modules_activated = NULL; static mca_base_component_t **device_components = NULL; +static int parsec_device_enable_batching = !!PARSEC_HAVE_DEV_CAPABILITY_BATCH; + /** * Load balance skew we are willing to accept to favor RO data reuse * on 
GPU: a value of 20% means that we will schedule tasks on the preferred
@@ -112,7 +114,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
 
     /* Run the evaluates for the incarnation types to determine if they can
      * execute this task */
-    for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
+    for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
         if( 0 == (this_task->chore_mask & (1<<chore_id)) )
             continue;
         if( NULL == tc->incarnations[chore_id].hook )
             continue; /* dyld hook not found during initialization */
@@ -121,7 +123,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
         if( PARSEC_HOOK_RETURN_DONE != rc ) {
             if( PARSEC_HOOK_RETURN_NEXT != rc ) {
                 PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
-                                     tmp, tc->incarnations[chore_id].type,
+                                     tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
                                      chore_id);
             }
             /* Mark this chore as tested */
@@ -129,10 +131,11 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
                 continue;
             }
         }
-        valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
-        if( NULL != this_task->selected_device ) { /* When Evaluate picked a device, abide by it */
+        valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */
+        /* Evaluate may have picked a device, abide by it */
+        if( NULL != this_task->selected_device ) {
            assert( (1<<this_task->selected_device->device_index) & tp->devices_index_mask /* only valid devices! */ );
-            assert( this_task->selected_device->type & valid_types /* only valid device types! */ );
+            assert( this_task->selected_device->type & valid_types );
             PARSEC_DEBUG_VERBOSE(30, parsec_device_output, "%s: Task %s evaluate set selected_device %d:%s",
                                  __func__, tmp, this_task->selected_device->device_index, this_task->selected_device->name);
             goto device_selected;
@@ -145,7 +148,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
     if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */
         this_task->selected_device = dev = parsec_mca_device_get(0);
         this_task->load = 0;
-        for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++);
+        for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++);
         this_task->selected_chore = chore_id;
         PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s",
                              __func__, tmp, dev->device_index, dev->name);
@@ -233,6 +236,8 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             dev = parsec_mca_device_get(dev_index);
             /* Skip the device if no incarnations for its type */
             if(!(dev->type & valid_types)) continue;
+            /* Skip recursive devices: time estimates are computed on the associated CPU device */
+            if(dev->type & PARSEC_DEV_RECURSIVE) continue;
 
             eta = dev->device_load + time_estimate(this_task, dev);
             if( best_eta > eta ) {
@@ -253,14 +258,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             goto no_valid_device;
 
         this_task->selected_device = parsec_mca_device_get(best_index);
-        assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE );
+        assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) );
     }
 
   device_selected:
     dev = this_task->selected_device;
     assert( NULL != dev );
     assert( tp->devices_index_mask & (1 << dev->device_index) );
-    for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++)
+    for(chore_id = 0; !(tc->incarnations[chore_id].type & 
dev->type); chore_id++) assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */); this_task->selected_chore = chore_id; this_task->load = time_estimate(this_task, dev); @@ -301,6 +306,31 @@ no_valid_device: { PARSEC_OBJ_CLASS_INSTANCE(parsec_device_module_t, parsec_object_t, NULL, NULL); +int +parsec_mca_device_type_supports_batch(uint32_t device_type) +{ +#if PARSEC_HAVE_DEV_CAPABILITY_BATCH + const uint32_t batch_capable_types = PARSEC_DEV_CUDA | PARSEC_DEV_HIP | PARSEC_DEV_LEVEL_ZERO; + uint32_t type = device_type & PARSEC_DEV_ANY_TYPE; + + return parsec_device_enable_batching && (0 != type) && + (0 == (type & ~batch_capable_types)); +#else + (void)device_type; + return 0; +#endif /* PARSEC_HAVE_DEV_CAPABILITY_BATCH */ +} + +uint32_t +parsec_mca_device_type_sanitize_batch(uint32_t device_type) +{ + if( (device_type & PARSEC_DEV_CHORE_ALLOW_BATCH) && + !parsec_mca_device_type_supports_batch(device_type) ) { + device_type &= ~PARSEC_DEV_CHORE_ALLOW_BATCH; + } + return device_type; +} + int parsec_mca_device_init(void) { char** parsec_device_list = NULL; @@ -325,6 +355,11 @@ int parsec_mca_device_init(void) (void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu", "Allow load balancing tasks with GPU incarnations to CPU cores", false, false, parsec_device_load_balance_allow_cpu, NULL); + (void)parsec_mca_param_reg_int_name("device", "enable_batching", + "Boolean to allow batched task execution on all devices", + false, !parsec_device_enable_batching, + parsec_device_enable_batching, + &parsec_device_enable_batching); if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_skew")) ) { parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_skew); } @@ -765,10 +800,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; - if(NULL != device->all_devices_attached) - device->all_devices_attached(device); - if( PARSEC_DEV_CPU == device->type ) { + if( PARSEC_DEV_RECURSIVE & device->type ) continue; + if( PARSEC_DEV_CPU & device->type ) { c = 0; for(int p = 0; p < context->nb_vp; p++) c += context->virtual_processes[p]->nb_cores; @@ -787,7 +820,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; + if( PARSEC_DEV_RECURSIVE & device->type ) continue; device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64; parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s", i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? 
"GUESSED": ""); @@ -976,7 +1009,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_ continue; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { continue; /* the function has been set for another device of the same type */ diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index c3950457b..26345afd8 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -2,6 +2,7 @@ * Copyright (c) 2013-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ /** @addtogroup parsec_device @@ -69,6 +70,8 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t #define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f) #define PARSEC_DEV_ALL ((uint8_t) 0x3f) #define PARSEC_DEV_MAX_NB_TYPE (7) +/* The following flags are extensions to the device type */ +#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100) #define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO) #define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK)) @@ -205,6 +208,18 @@ extern int parsec_device_output; */ PARSEC_DECLSPEC extern int parsec_select_best_device( parsec_task_t* this_task); +/** + * Return true if batching is enabled globally and the provided non-empty + * mask contains only batching-capable device types. + */ +PARSEC_DECLSPEC int parsec_mca_device_type_supports_batch(uint32_t device_type); + +/** + * Drop the batching hint from a chore type if batching is disabled or the + * selected device type cannot batch. + */ +PARSEC_DECLSPEC uint32_t parsec_mca_device_type_sanitize_batch(uint32_t device_type); + /** * Initialize the internal structures for managing external devices such as * accelerators and GPU. Memory nodes can as well be managed using the same diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 7fd4405ab..b120008b0 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1,5 +1,4 @@ /* - * * Copyright (c) 2021-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -528,7 +527,7 @@ parsec_device_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ @@ -1203,7 +1202,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - if(src_dev->super.type == dst_dev->super.type) { + if((src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE)) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1254,7 +1253,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, count = (source->original->nb_elts <= dest->original->nb_elts) ? 
source->original->nb_elts : dest->original->nb_elts; - if( src_dev->super.type == dst_dev->super.type ) { + if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1381,7 +1380,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, "GPU[%d:%s]:\tSelecting candidate data copy %p [ref_count %d] on data %p", gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original); - if( gpu_device->super.type == candidate_dev->super.type ) { + if( (gpu_device->super.type & PARSEC_DEV_ANY_TYPE) == (candidate_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { if( gpu_device->peer_access_mask & (1 << candidate_dev->super.device_index) ) { /* We can directly do D2D, so let's skip the selection */ PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, @@ -1531,7 +1530,8 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + nb_elts, + (candidate_dev->super.type & gpu_device->super.type & PARSEC_DEV_ANY_TYPE)? "D2D": "H2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; @@ -1573,6 +1573,74 @@ static inline parsec_list_item_t* parsec_device_push_task_ordered( parsec_list_t #define PARSEC_PUSH_TASK parsec_list_push_back #endif +static inline int +parsec_gpu_task_is_singleton(parsec_gpu_task_t *task) +{ + parsec_list_item_t *item = &task->list_item; + +#if defined(PARSEC_DEBUG_PARANOID) + if( ((parsec_list_item_t *)(void *)0xdeadbeefL == item->list_next) || + ((parsec_list_item_t *)(void *)0xdeadbeefL == item->list_prev) ) { + return 0; + } +#endif + return (item->list_next == item) && (item->list_prev == item); +} + +static inline void +parsec_gpu_stream_push_pending(parsec_gpu_exec_stream_t *stream, + parsec_gpu_task_t *task) +{ + /* A completed batched kernel returns a proper task ring. Preserve that + * order when feeding the tasks to the next stream. 
+ */ + if( !parsec_gpu_task_is_singleton(task) ) { + parsec_list_chain_back(stream->fifo_pending, &task->list_item); + return; + } + PARSEC_PUSH_TASK(stream->fifo_pending, &task->list_item); +} + +int +parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, + parsec_gpu_task_t *batch_head, + parsec_gpu_task_batch_cb_t callback, + void *callback_data) +{ + parsec_list_t *fifo_pending; + parsec_list_item_t *item, *next; + int nb_tasks = 1; + int rc; + + assert(NULL != gpu_stream); + assert(NULL != batch_head); + assert(NULL != callback); + + fifo_pending = gpu_stream->fifo_pending; + assert(NULL != fifo_pending); + parsec_list_item_singleton(&batch_head->list_item); + + parsec_list_lock(fifo_pending); + for(item = (parsec_list_item_t *)fifo_pending->ghost_element.list_next; + item != &fifo_pending->ghost_element; + item = next) { + next = (parsec_list_item_t *)item->list_next; + rc = callback((parsec_gpu_task_t *)item, batch_head, callback_data); + if( rc < 0 ) { + parsec_list_unlock(fifo_pending); + return rc; + } + if( 0 == rc ) { + (void)parsec_list_nolock_remove(fifo_pending, item); + (void)parsec_list_item_ring_push(&batch_head->list_item, item); + nb_tasks++; + } + } + parsec_list_unlock(fifo_pending); + + return nb_tasks; +} + static parsec_flow_t parsec_device_d2d_complete_flow = { .name = "D2D FLOW", .flow_flags = PARSEC_FLOW_ACCESS_READ, @@ -1867,8 +1935,9 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, * The progress function is either specified by the caller via the * upstream_progress_fct input argument or by the next task to be progresses * via the submit function associated with the task. In any case, this - * function progresses a single task, which is then returned as the - * out_task parameter. + * function progresses a task. If a batched submit function returns a task + * ring, the ring is returned as the out_task parameter and chained into the + * next stream by the caller's next invocation of this helper. * * Beware: this function does not generate errors by itself, instead * it propagates upward the return code of the progress function. @@ -1891,7 +1960,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, * local list (possibly by reordering the list). Also, as we can return a single * task first try to see if anything completed. */ if( NULL != task ) { - PARSEC_PUSH_TASK(stream->fifo_pending, (parsec_list_item_t*)task); + parsec_gpu_stream_push_pending(stream, task); task = NULL; } *out_task = NULL; @@ -1936,14 +2005,14 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, } } - grab_a_task: + grab_a_task: if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; } - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t *)task); assert( NULL == stream->tasks[stream->start] ); @@ -1956,7 +2025,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, * the resubmission of this task as much as possible, but without losing track of it * (aka. returning it to the upper level). 
*/ - parsec_list_push_back(stream->fifo_pending, (parsec_list_item_t*)task); + parsec_gpu_stream_push_pending(stream, task); } else { /* Something else is going on with this task, remove it from the stream queues * and return it to the upper level for final decision on its fate. @@ -2151,6 +2220,12 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, } #endif /* defined(PARSEC_DEBUG_PARANOID) */ + /* The submit hook may turn gpu_task into a batch ring. Start from a clean + * singleton so stale list links left by release-mode list operations cannot + * be mistaken for a preexisting ring. + */ + PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item); + (void)this_task; return progress_fct( gpu_device, gpu_task, gpu_stream ); } @@ -2670,6 +2745,10 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, gpu_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); if( NULL != gpu_task ) { pop_null = 0; + /* parsec_fifo_try_pop() detaches the task but does not reset list links + * in release builds; normalize before the stream FIFO inspects them. + */ + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)gpu_task); gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\tGet from shared queue %s", gpu_device->super.device_index, gpu_device->super.name, parsec_device_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task)); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 8037f7fb7..35c367289 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -50,6 +50,19 @@ typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_d parsec_gpu_task_t *gpu_task, parsec_gpu_exec_stream_t *gpu_stream); +/* Callback used by parsec_gpu_task_collect_batch() to decide whether a + * pending task can be appended to the current batched task ring. + * + * Return values: + * < 0: stop iteration and return this error code to the caller. + * 0: extract candidate from the stream pending queue and append it to + * batch_head. + * > 0: leave candidate in the stream pending queue and continue. + */ +typedef int (*parsec_gpu_task_batch_cb_t)(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data); + /* Function type to transfer data to the GPU device. * Transfer transfer the contiguous bytes from * task->data[i].data_in to task->data[i].data_out. @@ -307,6 +320,23 @@ int parsec_device_sort_pending_list(parsec_device_module_t *gpu_device); parsec_gpu_task_t* parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_execution_stream_t *es); int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *w2r_task, parsec_execution_stream_t *es); +/** + * Iterate over gpu_stream->fifo_pending and append accepted tasks to + * batch_head. The callback receives each pending candidate, the task passed to + * the submit function, and user data. The callback should return 0 to append + * the candidate to batch_head's ring, a positive value to leave it pending, or + * a negative error code to stop the iteration. + * The callback must not modify gpu_stream->fifo_pending directly. + * + * Returns the number of tasks in batch_head's ring on success, or the negative + * callback error code. If an error is returned, tasks already accepted remain + * attached to batch_head and the remaining candidates stay in fifo_pending. 
+ */ +int parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, + parsec_gpu_task_t *batch_head, + parsec_gpu_task_batch_cb_t callback, + void *callback_data); + void parsec_device_enable_debug(void); #if defined(PARSEC_DEBUG_VERBOSE) diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index f50f2a817..fec2bff88 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec.h" @@ -410,7 +411,7 @@ static int device_level_zero_component_close(void) /* Check that no LEVEL_ZERO devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_level_zero_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_LEVEL_ZERO != cdev->super.super.type) continue; + if( !(PARSEC_DEV_LEVEL_ZERO & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d] LEVEL_ZERO device still registered with PaRSEC at the end of LEVEL_ZERO finalize.\n" diff --git a/parsec/mca/device/template/device_template_module.c b/parsec/mca/device/template/device_template_module.c index ec8f5dcc8..453d938d3 100644 --- a/parsec/mca/device/template/device_template_module.c +++ b/parsec/mca/device/template/device_template_module.c @@ -2,6 +2,7 @@ * Copyright (c) 2019-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -78,7 +79,7 @@ parsec_template_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ diff --git a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c index 6d602117a..aff5bc692 100644 --- a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c +++ b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c @@ -86,7 +86,7 @@ copy_chores(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp) parsec_hook_t **hook_not_const = (parsec_hook_t **)&(tp->task_classes_array[i]->incarnations[j].hook); /* saving the CPU hook only */ - if (tp->task_classes_array[i]->incarnations[j].type == PARSEC_DEV_CPU) { + if (tp->task_classes_array[i]->incarnations[j].type & PARSEC_DEV_CPU) { dtd_tp->actual_hook[i].hook = tp->task_classes_array[i]->incarnations[j].hook; } /* copying the fake hook in all the hooks (CPU, GPU etc) */ @@ -312,8 +312,8 @@ parsec_dtd_taskpool_insert_task_ptg_to_dtd( parsec_dtd_taskpool_t *dtd_tp, free(params); __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; - for(int i = 0; PARSEC_DEV_NONE != incarnations[i].type; i++ ) { - if( PARSEC_DEV_CPU == incarnations[i].type ) { + for(int i = 0; PARSEC_DEV_NONE != (incarnations[i].type & PARSEC_DEV_ANY_TYPE); i++ ) { + if( PARSEC_DEV_CPU & incarnations[i].type ) { incarnations[i] = dtd_chore_for_testing; } } diff --git a/tests/dsl/dtd/CMakeLists.txt b/tests/dsl/dtd/CMakeLists.txt index 209875a05..b5da79051 100644 --- a/tests/dsl/dtd/CMakeLists.txt +++ b/tests/dsl/dtd/CMakeLists.txt @@ -14,6 +14,9 @@ parsec_addtest_executable(C dtd_test_reduce SOURCES dtd_test_reduce.c) parsec_addtest_executable(C dtd_test_allreduce SOURCES dtd_test_allreduce.c) parsec_addtest_executable(C dtd_test_template_counter SOURCES dtd_test_template_counter.c) parsec_addtest_executable(C dtd_test_untie SOURCES dtd_test_untie.c) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_executable(C dtd_test_batch_cpu SOURCES dtd_test_batch_cpu.c) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) parsec_addtest_executable(C dtd_test_hierarchy SOURCES dtd_test_hierarchy.c) parsec_addtest_executable(C dtd_test_task_placement SOURCES dtd_test_task_placement.c) parsec_addtest_executable(C dtd_test_insert_task_interface SOURCES dtd_test_insert_task_interface.c) diff --git a/tests/dsl/dtd/Testings.cmake b/tests/dsl/dtd/Testings.cmake index 7305a202c..ff85b63d6 100644 --- a/tests/dsl/dtd/Testings.cmake +++ b/tests/dsl/dtd/Testings.cmake @@ -7,6 +7,10 @@ parsec_addtest_cmd(dsl/dtd/task_generation ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test parsec_addtest_cmd(dsl/dtd/task_inserting_task ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_task_inserting_task) parsec_addtest_cmd(dsl/dtd/task_insertion ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_task_insertion) parsec_addtest_cmd(dsl/dtd/war ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_war) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_cmd(dsl/dtd/batch_cpu ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_batch_cpu) + parsec_addtest_cmd(dsl/dtd/batch_cpu:disabled ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_batch_cpu --mca device_enable_batching 0) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) parsec_addtest_cmd(dsl/dtd/new_tile:cpu 
${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_new_tile --mca device_cuda_enabled 0)
if(PARSEC_HAVE_CUDA AND CMAKE_CUDA_COMPILER)
  parsec_addtest_cmd(dsl/dtd/new_tile:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} dsl/dtd/dtd_test_new_tile --mca device_cuda_enabled 1 --mca device cuda)
diff --git a/tests/dsl/dtd/dtd_test_batch_cpu.c b/tests/dsl/dtd/dtd_test_batch_cpu.c
new file mode 100644
index 000000000..c32893f7c
--- /dev/null
+++ b/tests/dsl/dtd/dtd_test_batch_cpu.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2024 The University of Tennessee and The University
+ *                    of Tennessee Research Foundation. All rights
+ *                    reserved.
+ * Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+ */
+
+#include "parsec.h"
+#include "parsec/interfaces/dtd/insert_function_internal.h"
+#include "parsec/mca/device/device.h"
+
+#if defined(PARSEC_HAVE_MPI)
+#include <mpi.h>
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+
+static int32_t observed_sum = 0;
+
+static parsec_hook_return_t
+batch_cpu_task(parsec_execution_stream_t *es, parsec_task_t *this_task)
+{
+    int value;
+
+    parsec_dtd_unpack_args(this_task, &value);
+    (void)parsec_atomic_fetch_add_int32(&observed_sum, value);
+
+    (void)es;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+int
+main(int argc, char **argv)
+{
+    parsec_context_t *parsec;
+    parsec_taskpool_t *dtd_tp;
+    parsec_task_class_t *tc;
+    int rc, rank = 0;
+    int ntasks = 32;
+    int expected = 0;
+    int ret = 0;
+
+#if defined(PARSEC_HAVE_MPI)
+    {
+        int provided;
+        MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+#endif
+
+    if( NULL != argv[1] ) {
+        ntasks = atoi(argv[1]);
+    }
+    if( ntasks <= 0 ) {
+        ntasks = 32;
+    }
+
+    parsec = parsec_init(-1, &argc, &argv);
+    assert(NULL != parsec);
+
+    dtd_tp = parsec_dtd_taskpool_new();
+
+    rc = parsec_context_start(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_start");
+
+    rc = parsec_context_add_taskpool(parsec, dtd_tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool");
+
+    tc = parsec_dtd_create_task_class(dtd_tp, "BatchCpu",
+                                      sizeof(int), PARSEC_VALUE,
+                                      PARSEC_DTD_ARG_END);
+    assert(NULL != tc);
+
+    rc = parsec_dtd_task_class_add_chore(dtd_tp, tc,
+                                         PARSEC_DEV_CPU | PARSEC_DEV_CHORE_ALLOW_BATCH,
+                                         batch_cpu_task);
+    PARSEC_CHECK_ERROR(rc, "parsec_dtd_task_class_add_chore");
+    assert(tc->incarnations[0].type & PARSEC_DEV_CPU);
+    assert(!!(tc->incarnations[0].type & PARSEC_DEV_CHORE_ALLOW_BATCH) ==
+           !!parsec_mca_device_type_supports_batch(PARSEC_DEV_CPU));
+
+    for( int i = 0; i < ntasks; i++ ) {
+        int value = i + 1;
+        expected += value;
+        parsec_dtd_insert_task_with_task_class(dtd_tp, tc, 0,
+                                               PARSEC_DEV_CPU | PARSEC_DEV_CHORE_ALLOW_BATCH,
+                                               PARSEC_DTD_EMPTY_FLAG, &value,
+                                               PARSEC_DTD_ARG_END);
+    }
+
+    rc = parsec_taskpool_wait(dtd_tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_taskpool_wait");
+
+    rc = parsec_context_wait(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_wait");
+
+    if( expected != observed_sum ) {
+        parsec_warning("Rank %d observed %d, expected %d\n",
+                       rank, observed_sum, expected);
+        ret = 1;
+    }
+
+    parsec_dtd_task_class_release(dtd_tp, tc);
+    parsec_taskpool_free(dtd_tp);
+    parsec_fini(&parsec);
+
+#if defined(PARSEC_HAVE_MPI)
+    MPI_Finalize();
+#endif
+
+    return ret;
+}
diff --git a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
index d37d27d32..0b0b78de5 100644
--- a/tests/dsl/dtd/dtd_test_cuda_task_insert.c
+++ b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
@@ -2,6 +2,7 @@
 * Copyright (c) 2021-2024 The University of Tennessee and The University
 *                         of 
Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. */ #include "parsec.h" @@ -624,7 +625,7 @@ int get_nb_cuda_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -640,7 +641,7 @@ int *get_cuda_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c index bbffe3ea7..c0ef693f1 100644 --- a/tests/dsl/dtd/dtd_test_new_tile.c +++ b/tests/dsl/dtd/dtd_test_new_tile.c @@ -2,6 +2,7 @@ * Copyright (c) 2022-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. */ /* parsec things */ @@ -283,7 +284,7 @@ int main(int argc, char **argv) #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(PARSEC_HAVE_CU_COMPILER) for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA ) + if( dev->type & PARSEC_DEV_CUDA ) nb_gpus++; } if(nb_gpus > 0) { @@ -292,7 +293,7 @@ int main(int argc, char **argv) nb_gpus = 0; for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA) { + if( dev->type & PARSEC_DEV_CUDA) { cudaError_t status; parsec_device_cuda_module_t *gpu_device = (parsec_device_cuda_module_t *)dev; status = cudaSetDevice( gpu_device->cuda_index ); diff --git a/tests/dsl/dtd/dtd_test_simple_gemm.c b/tests/dsl/dtd/dtd_test_simple_gemm.c index 2939518d9..6ef30179b 100644 --- a/tests/dsl/dtd/dtd_test_simple_gemm.c +++ b/tests/dsl/dtd/dtd_test_simple_gemm.c @@ -2,7 +2,7 @@ * Copyright (c) 2021-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec.h" @@ -10,6 +10,7 @@ #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function_internal.h" +#include "parsec/mca/device/device.h" // The file is not compiled if CUDA is not present or CUBLAS is not found #include "parsec/mca/device/cuda/device_cuda.h" @@ -57,6 +58,7 @@ static parsec_info_id_t CuHI = -1; static parsec_info_id_t Cu1 = -1; static int verbose = 0; static int device = PARSEC_DEV_CUDA; +static int use_cuda_batch = 0; static int P = -1; static int Q = -1; @@ -182,55 +184,104 @@ int initialize_matrix(parsec_context_t *parsec_context, int rank, parsec_matrix_ return 0; } +static int +gemm_cuda_task_allows_batch(parsec_gpu_task_t *gpu_task) +{ + parsec_task_t *this_task = gpu_task->ec; + int selected_chore = this_task->selected_chore; + + return use_cuda_batch && + (selected_chore >= 0) && + (this_task->task_class->incarnations[selected_chore].type & PARSEC_DEV_CHORE_ALLOW_BATCH) && + parsec_mca_device_type_supports_batch(this_task->selected_device->type); +} + +static int +gemm_cuda_batch_match(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data) +{ + (void)callback_data; + + if( (batch_head->ec->task_class == candidate->ec->task_class) && + (batch_head->ec->selected_chore == candidate->ec->selected_chore) && + (batch_head->ec->selected_device == candidate->ec->selected_device) ) { + return 0; + } + return 1; +} + int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task, parsec_gpu_exec_stream_t *gpu_stream) { - double *A, *B, *C; - int m, n, k, mb, nb, kb; - parsec_task_t *this_task = gpu_task->ec; cublasStatus_t status; cublasHandle_t handle; double *one_device = NULL; - struct timeval start, end, diff; - double delta; - double *a_gpu, *b_gpu, *c_gpu; - - (void)gpu_stream; - (void)gpu_device; - - parsec_dtd_unpack_args(this_task, - &A, &B, &C, - &m, &n, &k, - &mb, &nb, &kb); - - a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); - b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); - c_gpu = parsec_dtd_get_dev_ptr(this_task, 2); + parsec_gpu_task_t *current_gpu_task; + int batch_count = 1; + + if( gemm_cuda_task_allows_batch(gpu_task) ) { + batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task, + gemm_cuda_batch_match, NULL); + if( batch_count < 0 ) { + return batch_count; + } + } handle = parsec_info_get(&gpu_stream->infos, CuHI); assert(NULL != handle); one_device = parsec_info_get(&gpu_device->super.infos, Cu1); assert(NULL != one_device); - gettimeofday(&start, NULL); - status = cublasDgemm_v2(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - mb, nb, kb, - one_device, a_gpu, mb, - b_gpu, kb, - one_device, c_gpu, mb); - gettimeofday(&end, NULL); - timersub(&end, &start, &diff); - delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; - if(verbose) - fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s\n", - m, n, k, mb, kb, kb, nb, mb, kb, - this_task->taskpool->context->my_rank, - gpu_stream->name, delta); + current_gpu_task = gpu_task; + do { + double *A, *B, *C; + int m, n, k, mb, nb, kb; + parsec_task_t *this_task = current_gpu_task->ec; + struct timeval start, end, diff; + double delta; + double *a_gpu, *b_gpu, *c_gpu; + + parsec_dtd_unpack_args(this_task, + &A, &B, &C, + &m, &n, &k, + &mb, &nb, &kb); + (void)A; (void)B; (void)C; + + a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); + b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); + c_gpu = 
parsec_dtd_get_dev_ptr(this_task, 2); - PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, - { return PARSEC_HOOK_RETURN_ERROR; }); + gettimeofday(&start, NULL); + + status = cublasDgemm_v2(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + mb, nb, kb, + one_device, a_gpu, mb, + b_gpu, kb, + one_device, c_gpu, mb); + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; + if(verbose) { + fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n", + m, n, k, mb, kb, kb, nb, mb, kb, + this_task->taskpool->context->my_rank, + gpu_stream->name, delta, + batch_count > 1 ? " as part of a batch" : ""); + } + + PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, + { return PARSEC_HOOK_RETURN_ERROR; }); + + current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next; + } while( current_gpu_task != gpu_task ); + + if( verbose && batch_count > 1 ) { + fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n", + batch_count, gpu_stream->name); + } return PARSEC_HOOK_RETURN_DONE; } @@ -297,7 +348,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t * sizeof(int), PARSEC_VALUE, /* nb */ sizeof(int), PARSEC_VALUE, /* kb */ PARSEC_DTD_ARG_END); - parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CUDA, gemm_kernel_cuda); + parsec_dtd_task_class_add_chore(tp, gemm_tc, + use_cuda_batch ? (PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH) : PARSEC_DEV_CUDA, + gemm_kernel_cuda); #if defined(HAVE_BLAS) parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CPU, gemm_kernel_cpu); #endif @@ -308,7 +361,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t * for( int k = 0; k < A->super.nt; k++ ) { keyA = A->super.super.data_key(&A->super.super, i, k); keyB = B->super.super.data_key(&B->super.super, k, j); - parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, device, + parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, + use_cuda_batch && (PARSEC_DEV_CUDA == device) ? + (device | PARSEC_DEV_CHORE_ALLOW_BATCH) : device, PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&A->super.super, keyA), PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&B->super.super, keyB), k == A->super.nt - 1 ? 
(PARSEC_INOUT | PARSEC_PUSHOUT) : PARSEC_INOUT, @@ -347,7 +402,7 @@ int get_nb_gpu_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -363,7 +418,7 @@ int *get_gpu_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } @@ -507,13 +562,14 @@ int main(int argc, char **argv) {"device", required_argument, 0, 'd'}, {"nruns", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, + {"batch", no_argument, 0, 'b'}, {"Debug", required_argument, 0, 'D'}, {"Alarm", required_argument, 0, 'A'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; - int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vh", + int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vbh", long_options, &option_index); if( c == -1 ) break; @@ -549,6 +605,9 @@ int main(int argc, char **argv) case 'v': verbose = !verbose; break; + case 'b': + use_cuda_batch = 1; + break; case 'd': if(strcmp(optarg, "GPU") == 0) { device=PARSEC_DEV_CUDA; @@ -587,6 +646,7 @@ int main(int argc, char **argv) " --mb|-m / --kb/-k / --nb|-n: set mb, kb and nb (resp.)\n" " --nruns|-t: set the number of runs to do\n" " --device|-d: which device to use (CPU or GPU)\n" + " --batch|-b: enable CUDA batched GEMM chores\n" " --verbose|-v: display which GEMM runs on which GPU\n" " as execution is unfolding\n" " --help|-h|-?: display this help\n" @@ -602,7 +662,9 @@ int main(int argc, char **argv) "\n", argv[0]); } +#if defined(PARSEC_HAVE_MPI) MPI_Finalize(); +#endif exit(0); } } @@ -636,7 +698,9 @@ int main(int argc, char **argv) rc = !(nbgpus >= 1); if( rc != 0 ) { fprintf(stderr, "Rank %d doesn't have CUDA accelerators\n", rank); +#if defined(PARSEC_HAVE_MPI) MPI_Abort(MPI_COMM_WORLD, 0); +#endif return -1; } gpu_device_index = get_gpu_device_index(); diff --git a/tests/dsl/ptg/CMakeLists.txt b/tests/dsl/ptg/CMakeLists.txt index ed03f549f..25ed08aea 100644 --- a/tests/dsl/ptg/CMakeLists.txt +++ b/tests/dsl/ptg/CMakeLists.txt @@ -14,6 +14,11 @@ endif(PARSEC_HAVE_RANDOM) parsec_addtest_executable(C complex_deps) target_ptg_sources(complex_deps PRIVATE "complex_deps.jdf") +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_executable(C batch_cpu) + target_ptg_sources(batch_cpu PRIVATE "batch_cpu.jdf") +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + add_subdirectory(branching) add_subdirectory(choice) add_subdirectory(controlgather) diff --git a/tests/dsl/ptg/Testings.cmake b/tests/dsl/ptg/Testings.cmake index 697ad425f..4e3057dbe 100644 --- a/tests/dsl/ptg/Testings.cmake +++ b/tests/dsl/ptg/Testings.cmake @@ -7,3 +7,6 @@ parsec_addtest_cmd(dsl/ptg/startup1 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 - parsec_addtest_cmd(dsl/ptg/startup2 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 -j=20 -k=30 -v=5) parsec_addtest_cmd(dsl/ptg/startup3 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=30 -j=30 -k=30 -v=5) parsec_addtest_cmd(dsl/ptg/strange ${SHM_TEST_CMD_LIST} dsl/ptg/strange) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_cmd(dsl/ptg/batch_cpu ${SHM_TEST_CMD_LIST} dsl/ptg/batch_cpu) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) diff --git a/tests/dsl/ptg/batch_cpu.jdf b/tests/dsl/ptg/batch_cpu.jdf new file mode 100644 index 000000000..60e3a16bd --- /dev/null +++ 
diff --git a/tests/dsl/ptg/batch_cpu.jdf b/tests/dsl/ptg/batch_cpu.jdf
new file mode 100644
index 000000000..60e3a16bd
--- /dev/null
+++ b/tests/dsl/ptg/batch_cpu.jdf
@@ -0,0 +1,147 @@
+extern "C" %{
+/*
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2026      NVIDIA Corporation.  All rights reserved.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
+#include "parsec/mca/device/device.h"
+#include "batch_cpu.h"
+
+#define TYPE PARSEC_MATRIX_INTEGER
+
+static int
+has_cpu_batch_chore(parsec_taskpool_t *tp)
+{
+    for( unsigned int i = 0; i < tp->nb_task_classes; i++ ) {
+        const parsec_task_class_t *tc = tp->task_classes_array[i];
+        const __parsec_chore_t *chores;
+
+        if( NULL == tc ) {
+            continue;
+        }
+        chores = tc->incarnations;
+        for( int j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {
+            if( (chores[j].type & PARSEC_DEV_CPU) &&
+                (chores[j].type & PARSEC_DEV_CHORE_ALLOW_BATCH) ) {
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+%}
+
+DATA    [type = "parsec_data_collection_t*"]
+N       [type = int]
+
+BATCH_CPU(k)
+
+  k = 0 .. N-1
+
+  : DATA(k, 0)
+
+  RW TILE <- DATA(k, 0)
+          -> DATA(k, 0)
+
+BODY [batch=true]
+{
+    assert(NULL != TILE);
+    ((int*)TILE)[0] = k + 1;
+}
+END
+
+extern "C" %{
+
+int main(int argc, char **argv)
+{
+    parsec_batch_cpu_taskpool_t *tp;
+    parsec_matrix_block_cyclic_t descA;
+    parsec_arena_datatype_t adt;
+    parsec_datatype_t dt;
+    parsec_context_t *parsec;
+    int n = 32, rc;
+    int rank = 0;
+    int ret = 0;
+
+#if defined(PARSEC_HAVE_MPI)
+    {
+        int provided;
+        MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+#endif
+
+    if( NULL != argv[1] ) {
+        n = atoi(argv[1]);
+    }
+    if( n <= 0 ) {
+        n = 32;
+    }
+
+    parsec = parsec_init(-1, &argc, &argv);
+    assert(NULL != parsec);
+
+    parsec_matrix_block_cyclic_init(&descA, TYPE, PARSEC_MATRIX_TILE,
+                                    rank,
+                                    1, 1, n, 1,
+                                    0, 0, n, 1, 1, 1, 1, 1, 0, 0);
+    descA.mat = parsec_data_allocate(descA.super.nb_local_tiles *
+                                     descA.super.bsiz *
+                                     parsec_datadist_getsizeoftype(TYPE));
+    memset(descA.mat, 0, descA.super.nb_local_tiles *
+                         descA.super.bsiz *
+                         parsec_datadist_getsizeoftype(TYPE));
+
+    parsec_translate_matrix_type(TYPE, &dt);
+    parsec_add2arena_rect(&adt, dt, 1, 1, 1);
+
+    tp = parsec_batch_cpu_new((parsec_data_collection_t*)&descA, n);
+    assert(NULL != tp);
+    if( !has_cpu_batch_chore(&tp->super) ) {
+        parsec_warning("The generated PTG task class does not expose a CPU batch chore\n");
+        ret = 1;
+        goto cleanup;
+    }
+    tp->arenas_datatypes[PARSEC_batch_cpu_DEFAULT_ADT_IDX] = adt;
+    PARSEC_OBJ_RETAIN(adt.arena);
+
+    rc = parsec_context_start(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_start");
+
+    rc = parsec_context_add_taskpool(parsec, (parsec_taskpool_t*)tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool");
+
+    rc = parsec_context_wait(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_wait");
+
+    for( int i = 0; i < n; i++ ) {
+        if( ((int*)descA.mat)[i] != i + 1 ) {
+            parsec_warning("Rank %d found value %d at position %d, expected %d\n",
+                           rank, ((int*)descA.mat)[i], i, i + 1);
+            ret = 1;
+            break;
+        }
+    }
+
+  cleanup:
+    parsec_taskpool_free(&tp->super);
+    free(descA.mat);
+    PARSEC_OBJ_RELEASE(adt.arena);
+    parsec_del2arena(&adt);
+    parsec_fini(&parsec);
+
+#if defined(PARSEC_HAVE_MPI)
+    MPI_Finalize();
+#endif
+
+    return ret;
+}
+
+%}
diff --git a/tests/runtime/cuda/get_best_device_check.jdf b/tests/runtime/cuda/get_best_device_check.jdf
index db588b625..3fac93b87 100644
--- a/tests/runtime/cuda/get_best_device_check.jdf
+++ b/tests/runtime/cuda/get_best_device_check.jdf
@@ -3,6 +3,7 @@ extern "C" %{
  * Copyright (c) 2021-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  */
 
 #include "cuda_test_internal.h"
@@ -148,7 +149,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info)
     int nb = 0;
     for(int i = 0; i < (int)parsec_nb_devices; i++) {
         parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             nb++;
         }
     }
@@ -160,7 +161,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info)
     nb = 0;
     for(int i = 0; i < (int)parsec_nb_devices; i++) {
         parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
            dev_index[nb++] = device->device_index;
         }
     }
diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c
index aa2b840e9..2d97b7749 100644
--- a/tests/runtime/cuda/nvlink_wrapper.c
+++ b/tests/runtime/cuda/nvlink_wrapper.c
@@ -72,7 +72,7 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool)
     userM = nvlink_taskpool->_g_userM;
     for(g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) {
+        if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) {
             parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, userM->super.super.myrank);
             parsec_data_copy_t *gpu_copy = parsec_data_get_copy(dta, cuda_device->super.super.device_index);
             cudaError_t status = cudaSetDevice( cuda_device->cuda_index );
@@ -108,7 +108,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             dev_index[nb++] = device->device_index;
         }
     }
@@ -169,7 +169,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
      * in the JDF, this also pins the task on the GPU that we chose to host the tile */
     for(int g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) {
+        if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) {
             /* We get the data from the data collection */
             parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, ctx->my_rank);
             /* The corresponding data copy on CPU RAM */
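The recurring `==` to `&` change across these tests reflects that a device type word is treated as a bit field, presumably because capability bits such as `PARSEC_DEV_CHORE_ALLOW_BATCH` can now be OR-ed into type words, where an equality test against `PARSEC_DEV_CUDA` would miss a match. A small standalone illustration (the flag values are made up, the real ones come from `parsec/mca/device`):

```c
#include <assert.h>

/* Illustrative values only; the real flags live in parsec/mca/device. */
#define DEV_CUDA              (1 << 2)
#define DEV_CHORE_ALLOW_BATCH (1 << 7)

int main(void)
{
    int type = DEV_CUDA | DEV_CHORE_ALLOW_BATCH;
    assert( type != DEV_CUDA );    /* equality test misses the device    */
    assert( type & DEV_CUDA );     /* bitmask test still identifies it   */
    return 0;
}
```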
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
index 87e30a3f9..9fd975122 100644
--- a/tests/runtime/cuda/stage_custom.jdf
+++ b/tests/runtime/cuda/stage_custom.jdf
@@ -45,8 +45,8 @@ stage_stride_in(parsec_gpu_task_t *gtask,
         dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
         elem_sz = parsec_datadist_getsizeoftype(dc->mtype);
         in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index);
-        if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){
-            /* copy width bytes height times, skipping pitch - width bytes every time */
+        if( !(in_elem_dev->super.type & PARSEC_DEV_CUDA) ) {
+            /* copy width bytes height times, skipping pitch - width bytes every time */
             size_t dpitch = dc->mb * elem_sz;
             size_t spitch = dc->llm * elem_sz;
             size_t width  = dc->mb * elem_sz;
@@ -117,6 +117,23 @@ typedef void (*cublas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k,
 /* Pre-declare function used as a property of some parameterized task */
 static int64_t gemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
 
+static int
+stage_custom_batch_match(parsec_gpu_task_t *candidate,
+                         parsec_gpu_task_t *batch_head,
+                         void *callback_data)
+{
+    int *how_many = (int *)callback_data;
+
+    if( (*how_many < 5) &&
+        (batch_head->ec->task_class == candidate->ec->task_class) ) {
+        (*how_many)++;
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
+                             "Add task %p to the %p batch\n",
+                             candidate, batch_head);
+        return 0;
+    }
+    return 1;
+}
 %}
 
 %option no_taskpool_instance = true  /* can be anything */
@@ -146,8 +163,21 @@ RW A <- descA(m, k)
      -> A TASK_CHECK(m,k)
 
 BODY [type=CUDA
+      batch = true
      dyld=cublasDgemm dyldtype=cublas_dgemm_t]
 {
+    int how_many = 1;
+    how_many = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
+                                             stage_custom_batch_match,
+                                             &how_many);
+    if( how_many < 0 ) {
+        return how_many;
+    }
+    if( how_many > 1 ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
+                             "submit multiple tasks into one %p on stream %s{%p}\n",
+                             gpu_task, gpu_stream->name, (void*)gpu_stream);
+    }
     double lalpha = 1.0;
     double lbeta = 2.0;
     int tempmm = descA->mb;
@@ -155,14 +185,22 @@ BODY [type=CUDA
     cublasStatus_t status;
 
     cublasSetKernelStream( parsec_body.stream );
-    parsec_body.dyld_fn( 'N', 'N',
-                         tempmm, tempmm, tempmm,
-                         lalpha, (double*)A, ldam,
-                                 (double*)A, ldam,
-                         lbeta,  (double*)A, ldam );
-    status = cublasGetError();
-    PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                             {return PARSEC_HOOK_RETURN_ERROR;} );
+
+    parsec_gpu_task_t* current_gpu_task = gpu_task;
+    do {
+        __parsec_stage_custom_TASK_GPU_task_t *task = (__parsec_stage_custom_TASK_GPU_task_t*)current_gpu_task->ec;
+        _f_A = task->data._f_A.data_out;
+        A = PARSEC_DATA_COPY_GET_PTR(_f_A);
+        parsec_body.dyld_fn( 'N', 'N',
+                             tempmm, tempmm, tempmm,
+                             lalpha, (double*)A, ldam,
+                                     (double*)A, ldam,
+                             lbeta,  (double*)A, ldam );
+        status = cublasGetError();
+        PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
+                                 {return PARSEC_HOOK_RETURN_ERROR;} );
+        current_gpu_task = (parsec_gpu_task_t*)current_gpu_task->list_item.list_next;
+    } while( current_gpu_task != gpu_task );
 }
 END
diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c
index ee1f73d35..542dc744d 100644
--- a/tests/runtime/cuda/stress_wrapper.c
+++ b/tests/runtime/cuda/stress_wrapper.c
@@ -2,7 +2,9 @@
  * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024-2026 NVIDIA Corporation.  All rights reserved.
  */
+
 #include "parsec.h"
 #include "parsec/execution_stream.h"
 #include "parsec/data_distribution.h"
@@ -46,7 +48,7 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             dev_index[nb++] = device->device_index;
         }
     }
@@ -77,4 +79,3 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
                             parsec_datatype_double_complex_t, mb );
     return &testing_handle->super;
 }
-
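One subtlety in the batched body of stage_custom.jdf above: the JDF preamble binds the flow variables (`_f_A`, `A`) and sizes (`tempmm`, `ldam`) from the head task only, so the loop rebinds the flow data for each ring member while reusing the head task's tile sizes. Reusing the sizes is safe here because `stage_custom_batch_match()` only accepts candidates of the same task class. The rebinding step, isolated as a sketch of the pattern (not additional patch content):

```c
/* Rebind per-task flow data inside the ring walk: current_gpu_task is
 * the ring cursor, and _f_A/A shadow the head task's preamble bindings. */
__parsec_stage_custom_TASK_GPU_task_t *task =
    (__parsec_stage_custom_TASK_GPU_task_t *)current_gpu_task->ec;
_f_A = task->data._f_A.data_out;       /* data copy owned by this task */
A = PARSEC_DATA_COPY_GET_PTR(_f_A);    /* device pointer for the call  */
```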