Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ option(PARSEC_DIST_COLLECTIVES
set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING
"Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU.")

### Batched task parameters
option(PARSEC_HAVE_DEV_CAPABILITY_BATCH
"Enable batched task operations on all devices that support batching" ON)

### GPU engine parameters
option(PARSEC_GPU_ALLOC_PER_TILE
"Tile based allocation engine for GPU memory (instead of internal management
Expand Down Expand Up @@ -769,7 +773,6 @@ int main(int argc, char *argv[]) {
endif (CUDAToolkit_FOUND)
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
endif( PARSEC_GPU_WITH_CUDA )

if( PARSEC_GPU_WITH_HIP )
if( NOT DEFINED ROCM_ROOT_DIR AND IS_DIRECTORY /opt/rocm )
set(ROCM_ROOT_DIR /opt/rocm)
Expand Down
4 changes: 4 additions & 0 deletions cmake_modules/PaRSECConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ list(APPEND CMAKE_MODULE_PATH "${PARSEC_CMAKE_DIRS}")

find_package(Threads)

if(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)
set(PARSEC_HAVE_DEV_CAPABILITY_BATCH TRUE)
endif(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@)

if(@PARSEC_HAVE_HWLOC@)
set_and_check(HWLOC_INCLUDE_DIR "@HWLOC_INCLUDE_DIR@")
set(HWLOC_INCLUDE_DIR ${HWLOC_INCLUDE_DIR} CACHE PATH "Imported by PaRSECConfig.cmake" FORCE)
Expand Down
1 change: 1 addition & 0 deletions docs/doxygen/Doxyfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ WARN_LOGFILE =

INPUT = @PROJECT_SOURCE_DIR@/docs/doxygen/mainpage.md
INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/groups.dox
INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/task-batching.md
INPUT += @PROJECT_SOURCE_DIR@/parsec
INPUT += @PROJECT_BINARY_DIR@/parsec/include
INPUT += @PARSEC_DOX_SRCS@
Expand Down
4 changes: 3 additions & 1 deletion docs/doxygen/mainpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ subdirectory of the source. It is separated in a few modules:
- [Virtual Processes](@ref parsec_internal_virtualprocess) make it
possible to isolate groups of threads and avoid work stealing between
threads belonging to different virtual processes.
- [Device task batching](@ref task_batching) documents how
accelerator submit hooks can combine compatible ready tasks into one
batched device operation.
- [The Internal Runtime Module](@ref parsec_internal_runtime) holds
all other functions and data structures that are used to build the
PaRSEC runtime system.
Expand Down Expand Up @@ -117,4 +120,3 @@ following components have specific documentation:

- [schedulers](@ref parsec/mca/sched/sched.h) in `parsec/mca/sched`
- [PaRSEC INStrumentation](@ref parsec/mca/pins/pins.h) in `parsec/mca/pins`

189 changes: 189 additions & 0 deletions docs/doxygen/task-batching.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
Task Batching {#task_batching}
==============

Task batching lets a device submit hook combine several compatible ready tasks
into one device operation. The runtime still owns dependency management, data
movement, completion, and release; the submit hook only decides which pending
tasks are compatible with the task it was asked to submit.

Batching is opt-in at the chore level. A task that does not advertise batching
is always delivered to the submit hook as a singleton task.

Enabling batching
-----------------

For PTG-generated tasks, use the `batch = true` body property on a device body:

```c
BODY [type=CUDA
batch = true
dyld=cublasDgemm dyldtype=cublas_dgemm_t]
{
/* GPU submit body. */
}
Comment on lines +15 to +23
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear how PTG tasks can actually batch kernel invocations. Simply stringing kernels together on the same stream won't save much.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same way as DTD tasks, and the same way we did two years ago for the GB submission.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GB submissions are not part of the docs.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR has been lingering here for a very long time. Let's get it in.

```

For DTD tasks, add `PARSEC_DEV_CHORE_ALLOW_BATCH` to the device type when
registering or selecting the chore:

```c
parsec_dtd_task_class_add_chore(tp, tc,
PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH,
kernel_cuda);
```

The selected device type must also support batching at runtime. The device layer
uses `parsec_mca_device_type_supports_batch()` to check this and
`parsec_mca_device_type_sanitize_batch()` to drop the batching hint when the
selected device cannot batch. The MCA parameter `device_enable_batching`
defaults to the compile-time batching capability and can be used to disable
batching globally at runtime.
It is read-only when batching support is not compiled in.

Recommended collection helper
-----------------------------

The preferred interface for GPU submit hooks is
`parsec_gpu_task_collect_batch()`. The runtime passes the submit hook a
singleton `parsec_gpu_task_t *gpu_task`. The hook calls the collector with a
callback that decides, for each task currently pending on the same stream,
whether that candidate can be added to the batch headed by `gpu_task`.

The callback has the type `parsec_gpu_task_batch_cb_t` and receives:

- `candidate`: a pending task from `gpu_stream->fifo_pending`;
- `batch_head`: the task originally passed to the submit hook;
- `callback_data`: user data passed through by the caller.

The callback return value controls the iterator:

- negative: stop immediately and return that error code;
- zero: remove `candidate` from the pending FIFO and append it to
`batch_head`'s task ring;
- positive: leave `candidate` pending and continue to the next pending task.

The callback must not modify `gpu_stream->fifo_pending` directly.

Example:

```c
static int
gemm_batch_match(parsec_gpu_task_t *candidate,
parsec_gpu_task_t *batch_head,
void *callback_data)
{
(void)callback_data;

if( (batch_head->ec->task_class == candidate->ec->task_class) &&
(batch_head->ec->selected_chore == candidate->ec->selected_chore) &&
(batch_head->ec->selected_device == candidate->ec->selected_device) ) {
return 0;
}
return 1;
}

int
gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device,
parsec_gpu_task_t *gpu_task,
parsec_gpu_exec_stream_t *gpu_stream)
{
int batch_count;
parsec_gpu_task_t *current;

(void)gpu_device;

batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
gemm_batch_match, NULL);
if( batch_count < 0 ) {
return batch_count;
}

current = gpu_task;
do {
parsec_task_t *task = current->ec;

/* Submit one device operation for task, or use the whole ring to
* issue a real batched operation.
*/

current = (parsec_gpu_task_t *)current->list_item.list_next;
} while( current != gpu_task );

return PARSEC_HOOK_RETURN_DONE;
}
```

`parsec_gpu_task_collect_batch()` returns the number of tasks in the ring on
success, including the original `gpu_task`, or, on failure, the negative error
code returned by the callback.
Tasks accepted before an error remain attached to `gpu_task`; tasks not accepted
remain in `gpu_stream->fifo_pending`.

The submit hook does not need a completion callback merely to return the ring to
the runtime. If a batched submit hook returns a non-singleton task ring, the GPU
progress engine automatically chains that ring into the next stream's pending
FIFO after the recorded device event completes. The normal data retrieval,
epilog, ownership, pushout, and task completion paths then process the tasks one
at a time.

Iterating over the returned ring
--------------------------------

A batched submit hook should treat `gpu_task` as the head of a circular task
ring. This works for both singleton and batched cases:

```c
parsec_gpu_task_t *current = gpu_task;

do {
parsec_task_t *task = current->ec;

/* Use task. */

current = (parsec_gpu_task_t *)current->list_item.list_next;
} while( current != gpu_task );
```

Original direct collection style
--------------------------------

The helper above is intentionally conservative: it keeps FIFO ownership inside
the device layer and exposes only a compatibility callback to the submit hook.
Under very high load, the overhead of the repeated callback invocations can become noticeable. A
specialized submit hook can still use the original direct style and manipulate
the pending FIFO and task ring itself.

This style is more fragile and should be reserved for code that is already
device-runtime aware. The hook must preserve FIFO correctness, keep rejected
tasks pending, and unlock the FIFO on every exit path.

```c
parsec_list_t *pending = gpu_stream->fifo_pending;
parsec_list_item_t *item;
parsec_list_item_t *next;
int batch_count = 1;

PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item);

parsec_list_lock(pending);
for(item = (parsec_list_item_t *)pending->ghost_element.list_next;
item != &pending->ghost_element;
item = next) {
parsec_gpu_task_t *candidate;

next = (parsec_list_item_t *)item->list_next;
candidate = (parsec_gpu_task_t *)item;

if( compatible_with_batch(candidate, gpu_task) ) {
(void)parsec_list_nolock_remove(pending, item);
(void)parsec_list_item_ring_push(&gpu_task->list_item, item);
batch_count++;
}
}
parsec_list_unlock(pending);
```

The direct style avoids the generic iterator and callback dispatch, and it can
fold the compatibility test into a tight kernel-specific loop. The cost is that
the submit hook now depends on internal list and stream details and must be
updated if the GPU stream internals change.

1 change: 1 addition & 0 deletions parsec/include/parsec/parsec_options.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@

#cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
#cmakedefine01 PARSEC_HAVE_DEV_CAPABILITY_BATCH
#cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
#cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
Expand Down
33 changes: 22 additions & 11 deletions parsec/interfaces/dtd/insert_function.c
Original file line number Diff line number Diff line change
Expand Up @@ -1479,7 +1479,7 @@ parsec_dtd_startup(parsec_context_t *context,
if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */
// If CUDA is enabled, let the CUDA device activated for this
// taskpool.
if( PARSEC_DEV_CUDA == device->type ) continue;
if( PARSEC_DEV_CUDA & device->type ) continue;
if( NULL != device->taskpool_register )
if( PARSEC_SUCCESS !=
device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
Expand Down Expand Up @@ -2345,6 +2345,12 @@ static parsec_hook_return_t parsec_dtd_cpu_task_submit(parsec_execution_stream_t
return dtd_tc->cpu_func_ptr(es, this_task);
}

static inline int
parsec_dtd_effective_chore_type(int device_type)
{
return (int)parsec_mca_device_type_sanitize_batch((uint32_t)device_type);
}

int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
parsec_task_class_t *tc,
int device_type,
Expand All @@ -2362,11 +2368,13 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t*)tp;
parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)tc;

device_type = parsec_dtd_effective_chore_type(device_type);

/* We assume that incarnations is big enough, because it has been pre-allocated
* with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) {
if( incarnations[i].type == device_type ) {
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
if( (incarnations[i].type & PARSEC_DEV_ANY_TYPE) & (device_type & PARSEC_DEV_ANY_TYPE) ) {
parsec_warning("A chore for this device type has already been added to task class '%s'\n",
tc->name);
return PARSEC_ERROR;
Expand All @@ -2379,7 +2387,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
}

incarnations[i].type = device_type;
if(PARSEC_DEV_CUDA == device_type) {
if(PARSEC_DEV_CUDA & device_type) {
incarnations[i].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
}
Expand Down Expand Up @@ -3136,7 +3144,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task)

static inline parsec_task_t *
__parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
void *fpointer, int32_t priority, uint8_t device_type,
void *fpointer, int32_t priority, int32_t device_type,
const char *name_of_kernel, parsec_task_class_t *tc, va_list args)
{
parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)tp;
Expand All @@ -3148,6 +3156,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
int nb_params = 0;
parsec_dtd_param_t params[PARSEC_DTD_MAX_PARAMS];

device_type = parsec_dtd_effective_chore_type(device_type);

if( dtd_tp == NULL) {
parsec_fatal("You need to pass a correct parsec taskpool in order to insert task. "
"Please use \"parsec_dtd_taskpool_new()\" to create new taskpool"
Expand Down Expand Up @@ -3268,19 +3278,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params);
tc = &dtd_tc->super;

__parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
(*incarnations)[0].type = device_type;
if( device_type == PARSEC_DEV_CUDA ) {
__parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations;
incarnations[0].type = device_type;
if( device_type & PARSEC_DEV_CUDA ) {
/* Special case for CUDA: we need an intermediate */
(*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
incarnations[0].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
}
else {
/* Default case: the user-provided function is directly the hook to call */
(*incarnations)[0].hook = fpointer; // We can directly call the CPU hook
incarnations[0].hook = fpointer; // We can directly call the CPU hook
dtd_tc->cpu_func_ptr = fpointer;
}
(*incarnations)[1].type = PARSEC_DEV_NONE;
incarnations[1].type = PARSEC_DEV_NONE;
incarnations[1].hook = NULL;

/* Bookkeeping of the task class */
parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc);
Expand Down
Loading
Loading