Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ option(PARSEC_DIST_COLLECTIVES
set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING
"Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU.")

### Batched task parameters
option(PARSEC_HAVE_DEV_CAPABILITY_BATCH
"Enable batched task operations on all devices that support batching" ON)

### GPU engine parameters
option(PARSEC_GPU_ALLOC_PER_TILE
"Tile based allocation engine for GPU memory (instead of internal management
Expand Down Expand Up @@ -769,7 +773,6 @@ int main(int argc, char *argv[]) {
endif (CUDAToolkit_FOUND)
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
endif( PARSEC_GPU_WITH_CUDA )

if( PARSEC_GPU_WITH_HIP )
if( NOT DEFINED ROCM_ROOT_DIR AND IS_DIRECTORY /opt/rocm )
set(ROCM_ROOT_DIR /opt/rocm)
Expand Down
4 changes: 4 additions & 0 deletions cmake_modules/PaRSECConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ list(APPEND CMAKE_MODULE_PATH "${PARSEC_CMAKE_DIRS}")

find_package(Threads)

if(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)
set(PARSEC_HAVE_DEV_CAPABILITY_BATCH TRUE)
endif(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)

if(@PARSEC_HAVE_HWLOC@)
set_and_check(HWLOC_INCLUDE_DIR "@HWLOC_INCLUDE_DIR@")
set(HWLOC_INCLUDE_DIR ${HWLOC_INCLUDE_DIR} CACHE PATH "Imported by PaRSECConfig.cmake" FORCE)
Expand Down
1 change: 1 addition & 0 deletions docs/doxygen/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ WARN_LOGFILE =

INPUT = @PROJECT_SOURCE_DIR@/docs/doxygen/mainpage.md
INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/groups.dox
INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/task-batching.md
INPUT += @PROJECT_SOURCE_DIR@/parsec
INPUT += @PROJECT_BINARY_DIR@/parsec/include
INPUT += @PARSEC_DOX_SRCS@
Expand Down
4 changes: 3 additions & 1 deletion docs/doxygen/mainpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ subdirectory of the source. It is separated in a few modules:
- [Virtual Processes](@ref parsec_internal_virtualprocess) make it
possible to isolate groups of threads and avoid work stealing between
threads belonging to different virtual processes.
- [Device task batching](@ref task_batching) documents how
accelerator submit hooks can combine compatible ready tasks into one
batched device operation.
- [The Internal Runtime Module](@ref parsec_internal_runtime) holds
all other functions and data structures that are used to build the
PaRSEC runtime system.
Expand Down Expand Up @@ -117,4 +120,3 @@ following components have specific documentation:

- [schedulers](@ref parsec/mca/sched/sched.h) in `parsec/mca/sched`
- [PaRSEC INStrumentation](@ref parsec/mca/pins/pins.h) in `parsec/mca/pins`

189 changes: 189 additions & 0 deletions docs/doxygen/task-batching.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
Task Batching {#task_batching}
==============

Task batching lets a device submit hook combine several compatible ready tasks
into one device operation. The runtime still owns dependency management, data
movement, completion, and release; the submit hook only decides which pending
tasks are compatible with the task it was asked to submit.

Batching is opt-in at the chore level. A task that does not advertise batching
is always delivered to the submit hook as a singleton task.

Enabling batching
-----------------

For PTG-generated tasks, use the `batch = true` body property on a device body:

```c
BODY [type=CUDA
batch = true
dyld=cublasDgemm dyldtype=cublas_dgemm_t]
{
/* GPU submit body. */
}
Comment on lines +15 to +23
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear how PTG tasks can actually batch kernel invocations. Simply stringing kernels together on the same stream won't save much.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same way as DTD tasks, and the same way we did two years ago for the GB submission.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GB submissions are not part of the docs.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR has been lingering here for a very long time. Let's get it in.

```

For DTD tasks, add `PARSEC_DEV_CHORE_ALLOW_BATCH` to the device type when
registering or selecting the chore:

```c
parsec_dtd_task_class_add_chore(tp, tc,
PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH,
kernel_cuda);
```

The selected device type must also support batching at runtime. The device layer
uses `parsec_mca_device_type_supports_batch()` to check this and
`parsec_mca_device_type_sanitize_batch()` to drop the batching hint when the
selected device cannot batch. The MCA parameter `device_enable_batching`
defaults to the compile-time batching capability and can be used to disable
batching globally at runtime.
It is read-only when batching support is not compiled in.

Recommended collection helper
-----------------------------

The preferred interface for GPU submit hooks is
`parsec_gpu_task_collect_batch()`. The runtime passes the submit hook a
singleton `parsec_gpu_task_t *gpu_task`. The hook calls the collector with a
callback that decides, for each task currently pending on the same stream,
whether that candidate can be added to the batch headed by `gpu_task`.

The callback has the type `parsec_gpu_task_batch_cb_t` and receives:

- `candidate`: a pending task from `gpu_stream->fifo_pending`;
- `batch_head`: the task originally passed to the submit hook;
- `callback_data`: user data passed through by the caller.

The callback return value controls the iterator:

- negative: stop immediately and return that error code;
- zero: remove `candidate` from the pending FIFO and append it to
`batch_head`'s task ring;
- positive: leave `candidate` pending and continue to the next pending task.

The callback must not modify `gpu_stream->fifo_pending` directly.

Example:

```c
static int
gemm_batch_match(parsec_gpu_task_t *candidate,
parsec_gpu_task_t *batch_head,
void *callback_data)
{
(void)callback_data;

if( (batch_head->ec->task_class == candidate->ec->task_class) &&
(batch_head->ec->selected_chore == candidate->ec->selected_chore) &&
(batch_head->ec->selected_device == candidate->ec->selected_device) ) {
return 0;
}
return 1;
}

int
gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device,
parsec_gpu_task_t *gpu_task,
parsec_gpu_exec_stream_t *gpu_stream)
{
int batch_count;
parsec_gpu_task_t *current;

(void)gpu_device;

batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
gemm_batch_match, NULL);
if( batch_count < 0 ) {
return batch_count;
}

current = gpu_task;
do {
parsec_task_t *task = current->ec;

/* Submit one device operation for task, or use the whole ring to
* issue a real batched operation.
*/

current = (parsec_gpu_task_t *)current->list_item.list_next;
} while( current != gpu_task );

return PARSEC_HOOK_RETURN_DONE;
}
```

`parsec_gpu_task_collect_batch()` returns the number of tasks in the ring on
success, including the original `gpu_task`, or, on failure, the negative error
code returned by the callback.
Tasks accepted before an error remain attached to `gpu_task`; tasks not accepted
remain in `gpu_stream->fifo_pending`.

The submit hook does not need a completion callback merely to return the ring to
the runtime. If a batched submit hook returns a non-singleton task ring, the GPU
progress engine automatically chains that ring into the next stream's pending
FIFO after the recorded device event completes. The normal data retrieval,
epilog, ownership, pushout, and task completion paths then process the tasks one
at a time.

Iterating over the returned ring
--------------------------------

A batched submit hook should treat `gpu_task` as the head of a circular task
ring. This works for both singleton and batched cases:

```c
parsec_gpu_task_t *current = gpu_task;

do {
parsec_task_t *task = current->ec;

/* Use task. */

current = (parsec_gpu_task_t *)current->list_item.list_next;
} while( current != gpu_task );
```

Original direct collection style
--------------------------------

The helper above is intentionally conservative: it keeps FIFO ownership inside
the device layer and exposes only a compatibility callback to the submit hook.
Under very high load, the overhead of the repeated callback invocations can become noticeable. A
specialized submit hook can still use the original direct style and manipulate
the pending FIFO and task ring itself.

This style is more fragile and should be reserved for code that is already
device-runtime aware. The hook must preserve FIFO correctness, keep rejected
tasks pending, and unlock the FIFO on every exit path.

```c
parsec_list_t *pending = gpu_stream->fifo_pending;
parsec_list_item_t *item;
parsec_list_item_t *next;
int batch_count = 1;

PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item);

parsec_list_lock(pending);
for(item = (parsec_list_item_t *)pending->ghost_element.list_next;
item != &pending->ghost_element;
item = next) {
parsec_gpu_task_t *candidate;

next = (parsec_list_item_t *)item->list_next;
candidate = (parsec_gpu_task_t *)item;

if( compatible_with_batch(candidate, gpu_task) ) {
(void)parsec_list_nolock_remove(pending, item);
(void)parsec_list_item_ring_push(&gpu_task->list_item, item);
batch_count++;
}
}
parsec_list_unlock(pending);
```

The direct style avoids the generic iterator and callback dispatch, and it can
fold the compatibility test into a tight kernel-specific loop. The cost is that
the submit hook now depends on internal list and stream details and must be
updated if the GPU stream internals change.

1 change: 1 addition & 0 deletions parsec/include/parsec/parsec_options.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@

#cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
#cmakedefine01 PARSEC_HAVE_DEV_CAPABILITY_BATCH
#cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
Expand Down
33 changes: 22 additions & 11 deletions parsec/interfaces/dtd/insert_function.c
Original file line number Diff line number Diff line change
Expand Up @@ -1479,7 +1479,7 @@ parsec_dtd_startup(parsec_context_t *context,
if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */
// If CUDA is enabled, let the CUDA device activated for this
// taskpool.
if( PARSEC_DEV_CUDA == device->type ) continue;
if( PARSEC_DEV_CUDA & device->type ) continue;
if( NULL != device->taskpool_register )
if( PARSEC_SUCCESS !=
device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
Expand Down Expand Up @@ -2345,6 +2345,12 @@ static parsec_hook_return_t parsec_dtd_cpu_task_submit(parsec_execution_stream_t
return dtd_tc->cpu_func_ptr(es, this_task);
}

static inline int
parsec_dtd_effective_chore_type(int device_type)
{
return (int)parsec_mca_device_type_sanitize_batch((uint32_t)device_type);
}

int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
parsec_task_class_t *tc,
int device_type,
Expand All @@ -2362,11 +2368,13 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t*)tp;
parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)tc;

device_type = parsec_dtd_effective_chore_type(device_type);

/* We assume that incarnations is big enough, because it has been pre-allocated
* with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) {
if( incarnations[i].type == device_type ) {
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
if( (incarnations[i].type & PARSEC_DEV_ANY_TYPE) & (device_type & PARSEC_DEV_ANY_TYPE) ) {
parsec_warning("A chore for this device type has already been added to task class '%s'\n",
tc->name);
return PARSEC_ERROR;
Expand All @@ -2379,7 +2387,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
}

incarnations[i].type = device_type;
if(PARSEC_DEV_CUDA == device_type) {
if(PARSEC_DEV_CUDA & device_type) {
incarnations[i].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
}
Expand Down Expand Up @@ -3136,7 +3144,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task)

static inline parsec_task_t *
__parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
void *fpointer, int32_t priority, uint8_t device_type,
void *fpointer, int32_t priority, int32_t device_type,
const char *name_of_kernel, parsec_task_class_t *tc, va_list args)
{
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)tp;
Expand All @@ -3148,6 +3156,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
int nb_params = 0;
parsec_dtd_param_t params[PARSEC_DTD_MAX_PARAMS];

device_type = parsec_dtd_effective_chore_type(device_type);

if( dtd_tp == NULL) {
parsec_fatal("You need to pass a correct parsec taskpool in order to insert task. "
"Please use \"parsec_dtd_taskpool_new()\" to create new taskpool"
Expand Down Expand Up @@ -3268,19 +3278,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params);
tc = &dtd_tc->super;

__parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
(*incarnations)[0].type = device_type;
if( device_type == PARSEC_DEV_CUDA ) {
__parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations;
incarnations[0].type = device_type;
if( device_type & PARSEC_DEV_CUDA ) {
/* Special case for CUDA: we need an intermediate */
(*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
incarnations[0].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
}
else {
/* Default case: the user-provided function is directly the hook to call */
(*incarnations)[0].hook = fpointer; // We can directly call the CPU hook
incarnations[0].hook = fpointer; // We can directly call the CPU hook
dtd_tc->cpu_func_ptr = fpointer;
}
(*incarnations)[1].type = PARSEC_DEV_NONE;
incarnations[1].type = PARSEC_DEV_NONE;
incarnations[1].hook = NULL;

/* Bookkeeping of the task class */
parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc);
Expand Down
Loading
Loading