diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d1af82b..00665f37f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,6 +180,10 @@ option(PARSEC_DIST_COLLECTIVES set (PARSEC_DIST_SHORT_LIMIT 1 CACHE STRING "Use the short protocol (no flow control) for messages smaller than the limit in KB. Performs better if smaller than the MTU.") +### Batched task parameters +option(PARSEC_HAVE_DEV_CAPABILITY_BATCH + "Enable batched task operations on all devices that support batching" ON) + ### GPU engine parameters option(PARSEC_GPU_ALLOC_PER_TILE "Tile based allocation engine for GPU memory (instead of internal management @@ -769,7 +773,6 @@ int main(int argc, char *argv[]) { endif (CUDAToolkit_FOUND) set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files") endif( PARSEC_GPU_WITH_CUDA ) - if( PARSEC_GPU_WITH_HIP ) if( NOT DEFINED ROCM_ROOT_DIR AND IS_DIRECTORY /opt/rocm ) set(ROCM_ROOT_DIR /opt/rocm) diff --git a/cmake_modules/PaRSECConfig.cmake.in b/cmake_modules/PaRSECConfig.cmake.in index 9c2a16919..42390f41a 100644 --- a/cmake_modules/PaRSECConfig.cmake.in +++ b/cmake_modules/PaRSECConfig.cmake.in @@ -18,6 +18,10 @@ list(APPEND CMAKE_MODULE_PATH "${PARSEC_CMAKE_DIRS}") find_package(Threads) +if(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@) + set(PARSEC_HAVE_DEV_CAPABILITY_BATCH TRUE) +endif(@PARSEC_HAVE_DEV_CAPABILITY_BATCH@) + if(@PARSEC_HAVE_HWLOC@) set_and_check(HWLOC_INCLUDE_DIR "@HWLOC_INCLUDE_DIR@") set(HWLOC_INCLUDE_DIR ${HWLOC_INCLUDE_DIR} CACHE PATH "Imported by PaRSECConfig.cmake" FORCE) diff --git a/docs/doxygen/Doxyfile.in b/docs/doxygen/Doxyfile.in index 23cf11de9..60c82f208 100644 --- a/docs/doxygen/Doxyfile.in +++ b/docs/doxygen/Doxyfile.in @@ -659,6 +659,7 @@ WARN_LOGFILE = INPUT = @PROJECT_SOURCE_DIR@/docs/doxygen/mainpage.md INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/groups.dox +INPUT += @PROJECT_SOURCE_DIR@/docs/doxygen/task-batching.md INPUT += @PROJECT_SOURCE_DIR@/parsec INPUT += @PROJECT_BINARY_DIR@/parsec/include INPUT += @PARSEC_DOX_SRCS@ diff --git a/docs/doxygen/mainpage.md b/docs/doxygen/mainpage.md index ae65f5833..1d7a9501a 100644 --- a/docs/doxygen/mainpage.md +++ b/docs/doxygen/mainpage.md @@ -72,6 +72,9 @@ subdirectory of the source. It is separated in a few modules: - [Virtual Processes](@ref parsec_internal_virtualprocess) allow to isolate groups of threads and avoid work stealing between threads belonging to different virtual processes. + - [Device task batching](@ref task_batching) documents how + accelerator submit hooks can combine compatible ready tasks into one + batched device operation. - [The Internal Runtime Module](@ref parsec_internal_runtime) holds all other functions and data structures that allow to build the PaRSEC runtime system. @@ -117,4 +120,3 @@ following components have specific documentation: - [schedulers](@ref parsec/mca/sched/sched.h) in `parsec/mca/sched` - [PaRSEC INStrumentation](@ref parsec/mca/pins/pins.h) in `parsec/mca/pins` - diff --git a/docs/doxygen/task-batching.md b/docs/doxygen/task-batching.md new file mode 100644 index 000000000..4ff9aea6b --- /dev/null +++ b/docs/doxygen/task-batching.md @@ -0,0 +1,189 @@ +Task Batching {#task_batching} +============== + +Task batching lets a device submit hook combine several compatible ready tasks +into one device operation. The runtime still owns dependency management, data +movement, completion, and release; the submit hook only decides which pending +tasks are compatible with the task it was asked to submit. 
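+
+For reference, batching does not change the submit hook signature; a device
+submit hook keeps the `parsec_advance_task_function_t` type declared in
+`parsec/mca/device/device_gpu.h`. Only the shape of the `gpu_task` argument
+changes: a batching hook may turn it into a ring of tasks before returning.
+
+```c
+/* Submit hook signature (from parsec/mca/device/device_gpu.h). */
+typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_device,
+                                              parsec_gpu_task_t *gpu_task,
+                                              parsec_gpu_exec_stream_t *gpu_stream);
+```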
+ +Batching is opt-in at the chore level. A task that does not advertise batching +is always delivered to the submit hook as a singleton task. + +Enabling batching +----------------- + +For PTG-generated tasks, use the `batch = true` body property on a device body: + +```c +BODY [type=CUDA + batch = true + dyld=cublasDgemm dyldtype=cublas_dgemm_t] +{ + /* GPU submit body. */ +} +``` + +For DTD tasks, add `PARSEC_DEV_CHORE_ALLOW_BATCH` to the device type when +registering or selecting the chore: + +```c +parsec_dtd_task_class_add_chore(tp, tc, + PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH, + kernel_cuda); +``` + +The selected device type must also support batching at runtime. The device layer +uses `parsec_mca_device_type_supports_batch()` to check this and +`parsec_mca_device_type_sanitize_batch()` to drop the batching hint when the +selected device cannot batch. The MCA parameter `device_enable_batching` +defaults to the compile-time batching capability and can be used to disable +batching globally at runtime. +It is read-only when batching support is not compiled in. + +Recommended collection helper +----------------------------- + +The preferred interface for GPU submit hooks is +`parsec_gpu_task_collect_batch()`. The runtime passes the submit hook a +singleton `parsec_gpu_task_t *gpu_task`. The hook calls the collector with a +callback that decides, for each task currently pending on the same stream, +whether that candidate can be added to the batch headed by `gpu_task`. + +The callback has the type `parsec_gpu_task_batch_cb_t` and receives: + +- `candidate`: a pending task from `gpu_stream->fifo_pending`; +- `batch_head`: the task originally passed to the submit hook; +- `callback_data`: user data passed through by the caller. + +The callback return value controls the iterator: + +- negative: stop immediately and return that error code; +- zero: remove `candidate` from the pending FIFO and append it to + `batch_head`'s task ring; +- positive: leave `candidate` pending and continue to the next pending task. + +The callback must not modify `gpu_stream->fifo_pending` directly. + +Example: + +```c +static int +gemm_batch_match(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data) +{ + (void)callback_data; + + if( (batch_head->ec->task_class == candidate->ec->task_class) && + (batch_head->ec->selected_chore == candidate->ec->selected_chore) && + (batch_head->ec->selected_device == candidate->ec->selected_device) ) { + return 0; + } + return 1; +} + +int +gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device, + parsec_gpu_task_t *gpu_task, + parsec_gpu_exec_stream_t *gpu_stream) +{ + int batch_count; + parsec_gpu_task_t *current; + + (void)gpu_device; + + batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task, + gemm_batch_match, NULL); + if( batch_count < 0 ) { + return batch_count; + } + + current = gpu_task; + do { + parsec_task_t *task = current->ec; + + /* Submit one device operation for task, or use the whole ring to + * issue a real batched operation. + */ + + current = (parsec_gpu_task_t *)current->list_item.list_next; + } while( current != gpu_task ); + + return PARSEC_HOOK_RETURN_DONE; +} +``` + +`parsec_gpu_task_collect_batch()` returns the number of tasks in the ring on +success, including the original `gpu_task`, or the negative callback error. +Tasks accepted before an error remain attached to `gpu_task`; tasks not accepted +remain in `gpu_stream->fifo_pending`. 
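+
+The `callback_data` argument can carry hook-specific state across the
+iteration, for example to bound the batch size. Below is a minimal sketch,
+using a hypothetical `capped_batch_arg_t` helper that is not part of the
+PaRSEC API: a positive return leaves the extra candidates pending, so they
+are simply picked up by later submissions.
+
+```c
+typedef struct {
+    int max_tasks; /* upper bound on the ring size, including batch_head */
+    int nb_tasks;  /* tasks accepted so far, starts at 1 for batch_head */
+} capped_batch_arg_t;
+
+static int
+capped_batch_match(parsec_gpu_task_t *candidate,
+                   parsec_gpu_task_t *batch_head,
+                   void *callback_data)
+{
+    capped_batch_arg_t *arg = (capped_batch_arg_t *)callback_data;
+
+    if( arg->nb_tasks >= arg->max_tasks ) {
+        return 1; /* cap reached: keep the candidate pending */
+    }
+    /* A real hook would also compare selected_chore and selected_device,
+     * as in the gemm_batch_match example above. */
+    if( batch_head->ec->task_class != candidate->ec->task_class ) {
+        return 1; /* incompatible: keep the candidate pending */
+    }
+    arg->nb_tasks++;
+    return 0; /* extract the candidate and append it to the ring */
+}
+
+/* In the submit hook:
+ *   capped_batch_arg_t arg = { .max_tasks = 8, .nb_tasks = 1 };
+ *   batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
+ *                                               capped_batch_match, &arg);
+ */
+```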
+ +The submit hook does not need a completion callback merely to return the ring to +the runtime. If a batched submit hook returns a non-singleton task ring, the GPU +progress engine automatically chains that ring into the next stream's pending +FIFO after the recorded device event completes. The normal data retrieval, +epilog, ownership, pushout, and task completion paths then process the tasks one +at a time. + +Iterating over the returned ring +-------------------------------- + +A batched submit hook should treat `gpu_task` as the head of a circular task +ring. This works for both singleton and batched cases: + +```c +parsec_gpu_task_t *current = gpu_task; + +do { + parsec_task_t *task = current->ec; + + /* Use task. */ + + current = (parsec_gpu_task_t *)current->list_item.list_next; +} while( current != gpu_task ); +``` + +Original direct collection style +-------------------------------- + +The helper above is intentionally conservative: it keeps FIFO ownership inside +the device layer and exposes only a compatibility callback to the submit hook. +In very high load scenarios, the repeated callback call can become visible. A +specialized submit hook can still use the original direct style and manipulate +the pending FIFO and task ring itself. + +This style is more fragile and should be reserved for code that is already +device-runtime aware. The hook must preserve FIFO correctness, keep rejected +tasks pending, and unlock the FIFO on every exit path. + +```c +parsec_list_t *pending = gpu_stream->fifo_pending; +parsec_list_item_t *item; +parsec_list_item_t *next; +int batch_count = 1; + +PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item); + +parsec_list_lock(pending); +for(item = (parsec_list_item_t *)pending->ghost_element.list_next; + item != &pending->ghost_element; + item = next) { + parsec_gpu_task_t *candidate; + + next = (parsec_list_item_t *)item->list_next; + candidate = (parsec_gpu_task_t *)item; + + if( compatible_with_batch(candidate, gpu_task) ) { + (void)parsec_list_nolock_remove(pending, item); + (void)parsec_list_item_ring_push(&gpu_task->list_item, item); + batch_count++; + } +} +parsec_list_unlock(pending); +``` + +The direct style avoids the generic iterator and callback dispatch, and it can +fold the compatibility test into a tight kernel-specific loop. The cost is that +the submit hook now depends on internal list and stream details and must be +updated if the GPU stream internals change. + diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index f05bbc63c..e0b1ee881 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -143,6 +143,7 @@ #cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT #cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT +#cmakedefine01 PARSEC_HAVE_DEV_CAPABILITY_BATCH #cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT #cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT #cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 98d9c85b7..805d9d76b 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -1479,7 +1479,7 @@ parsec_dtd_startup(parsec_context_t *context, if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */ // If CUDA is enabled, let the CUDA device activated for this // taskpool. 
- if( PARSEC_DEV_CUDA == device->type ) continue; + if( PARSEC_DEV_CUDA & device->type ) continue; if( NULL != device->taskpool_register ) if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t *)tp)) { @@ -2345,6 +2345,12 @@ static parsec_hook_return_t parsec_dtd_cpu_task_submit(parsec_execution_stream_t return dtd_tc->cpu_func_ptr(es, this_task); } +static inline int +parsec_dtd_effective_chore_type(int device_type) +{ + return (int)parsec_mca_device_type_sanitize_batch((uint32_t)device_type); +} + int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, parsec_task_class_t *tc, int device_type, @@ -2362,11 +2368,13 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t*)tp; parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)tc; + device_type = parsec_dtd_effective_chore_type(device_type); + /* We assume that incarnations is big enough, because it has been pre-allocated * with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */ incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations; - for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) { - if( incarnations[i].type == device_type ) { + for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) { + if( (incarnations[i].type & PARSEC_DEV_ANY_TYPE) & (device_type & PARSEC_DEV_ANY_TYPE) ) { parsec_warning("A chore for this device type has already been added to task class '%s'\n", tc->name); return PARSEC_ERROR; @@ -2379,7 +2387,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp, } incarnations[i].type = device_type; - if(PARSEC_DEV_CUDA == device_type) { + if(PARSEC_DEV_CUDA & device_type) { incarnations[i].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function; } @@ -3136,7 +3144,7 @@ parsec_insert_dtd_task(parsec_task_t *__this_task) static inline parsec_task_t * __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, - void *fpointer, int32_t priority, uint8_t device_type, + void *fpointer, int32_t priority, int32_t device_type, const char *name_of_kernel, parsec_task_class_t *tc, va_list args) { parsec_dtd_taskpool_t *dtd_tp = (parsec_dtd_taskpool_t *)tp; @@ -3148,6 +3156,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, int nb_params = 0; parsec_dtd_param_t params[PARSEC_DTD_MAX_PARAMS]; + device_type = parsec_dtd_effective_chore_type(device_type); + if( dtd_tp == NULL) { parsec_fatal("You need to pass a correct parsec taskpool in order to insert task. 
" "Please use \"parsec_dtd_taskpool_new()\" to create new taskpool" @@ -3268,19 +3278,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp, dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params); tc = &dtd_tc->super; - __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations; - (*incarnations)[0].type = device_type; - if( device_type == PARSEC_DEV_CUDA ) { + __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; + incarnations[0].type = device_type; + if( device_type & PARSEC_DEV_CUDA ) { /* Special case for CUDA: we need an intermediate */ - (*incarnations)[0].hook = parsec_dtd_gpu_task_submit; + incarnations[0].hook = parsec_dtd_gpu_task_submit; dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer; } else { /* Default case: the user-provided function is directly the hook to call */ - (*incarnations)[0].hook = fpointer; // We can directly call the CPU hook + incarnations[0].hook = fpointer; // We can directly call the CPU hook dtd_tc->cpu_func_ptr = fpointer; } - (*incarnations)[1].type = PARSEC_DEV_NONE; + incarnations[1].type = PARSEC_DEV_NONE; + incarnations[1].hook = NULL; /* Bookkeeping of the task class */ parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc); diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index c72ad011d..5bee838b1 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -3951,21 +3951,29 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, jdf_def_list_t* dyld_property; jdf_def_list_t* evaluate_property = NULL; jdf_def_list_t* device_property = NULL; - char* dev_upper; + jdf_def_list_t* batch_property = NULL; (void)jdf; string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name); do { + char* dev_upper; jdf_find_property(body->properties, "type", &type_property); jdf_find_property(body->properties, "dyld", &dyld_property); jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property); + jdf_find_property(body->properties, "batch", &batch_property); if( NULL == type_property) { dev_upper = strdup_upper("CPU"); } else { dev_upper = strdup_upper(type_property->expr->jdf_var); } string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper); - string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper); + string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper); + if( NULL != batch_property) { +#if PARSEC_HAVE_DEV_CAPABILITY_BATCH + string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH"); +#endif /* PARSEC_HAVE_DEV_CAPABILITY_BATCH */ + } + string_arena_add_string(sa, ",\n"); if( NULL == dyld_property ) { string_arena_add_string(sa, " .dyld = NULL,\n"); } else { @@ -3974,7 +3982,7 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, if ( NULL == dyld_proptotype_property ) { fprintf(stderr, "Internal Error: function prototype (dyldtype) of dyld function (%s) is not defined in %s body of task %s at line %d\n", - dyld_property->expr->jdf_var, type_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO( body ) ); + dyld_property->expr->jdf_var, dev_upper, f->fname, JDF_OBJECT_LINENO( body ) ); assert( NULL != dyld_proptotype_property ); } string_arena_add_string(sa, " .dyld = \"%s\",\n", dyld_property->expr->jdf_var); @@ -3993,7 +4001,6 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf, string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s_%s },\n", base_name, 
dev_upper); string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_%s_SUPPORT) */\n", dev_upper); free(dev_upper); - body = body->next; } while (NULL != body); string_arena_add_string(sa, @@ -4500,7 +4507,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n" " __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n" " uint32_t idx = 0, j;\n" - " for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n" + " for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n" " if( !(supported_dev & chores[j].type) ) continue;\n" " if( j != idx ) {\n" " chores[idx] = chores[j];\n" @@ -4684,7 +4691,7 @@ static void jdf_generate_constructor( const jdf_t* jdf ) coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n" " __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n" " memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n" - " for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n" + " for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n" " tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n " " memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n" " /* Add a placeholder for initialization and startup task */\n" @@ -6747,8 +6754,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, coutput(" /* Pointer to dynamic gpu function */\n" " {\n" " int chore_idx = 0;\n" - " for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n" - " if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n" + " for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n" + " if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n" " }\n" " /* The void* cast prevents the compiler from complaining about the type change */\n" " parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n" diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c index 3529917b0..e29f144e9 100644 --- a/parsec/mca/device/cuda/device_cuda_component.c +++ b/parsec/mca/device/cuda/device_cuda_component.c @@ -276,7 +276,7 @@ static int device_cuda_component_close(void) /* Check that no CUDA devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_CUDA != cdev->super.super.type) continue; + if( !(PARSEC_DEV_CUDA & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n" diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c index c73582ff6..ee641df7c 100644 --- a/parsec/mca/device/device.c +++ b/parsec/mca/device/device.c @@ -46,6 +46,8 @@ static parsec_device_module_t **modules_activated = NULL; static mca_base_component_t **device_components = NULL; +static int parsec_device_enable_batching = !!PARSEC_HAVE_DEV_CAPABILITY_BATCH; + /** * Load balance skew we are willing to accept to favor RO data reuse * on 
GPU: a value of 20% means that we will schedule tasks on the preferred
@@ -112,7 +114,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
 
     /* Run the evaluates for the incarnation types to determine if they can
      * execute this task */
-    for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
+    for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
         if( 0 == (this_task->chore_mask & (1<<chore_id)) )
             continue;
         if( NULL == tc->incarnations[chore_id].hook )
             continue; /* dyld hook not found during initialization */
@@ -121,7 +123,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
         if( PARSEC_HOOK_RETURN_DONE != rc ) {
             if( PARSEC_HOOK_RETURN_NEXT != rc ) {
                 PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
-                                     tmp, tc->incarnations[chore_id].type,
+                                     tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
                                      chore_id);
             }
             /* Mark this chore as tested */
@@ -129,10 +131,11 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
                 continue;
             }
         }
-        valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
-        if( NULL != this_task->selected_device ) { /* When Evaluate picked a device, abide by it */
+        valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */
+        /* Evaluate may have picked a device, abide by it */
+        if( NULL != this_task->selected_device ) {
            assert( (1<<this_task->selected_device->device_index) & tp->devices_index_mask /* only valid devices! */ );
-            assert( this_task->selected_device->type & valid_types /* only valid device types! */ );
+            assert( this_task->selected_device->type & valid_types );
             PARSEC_DEBUG_VERBOSE(30, parsec_device_output, "%s: Task %s evaluate set selected_device %d:%s",
                                  __func__, tmp, this_task->selected_device->device_index, this_task->selected_device->name);
             goto device_selected;
@@ -145,7 +148,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
     if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */
         this_task->selected_device = dev = parsec_mca_device_get(0);
         this_task->load = 0;
-        for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++);
+        for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++);
         this_task->selected_chore = chore_id;
         PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s",
                              __func__, tmp, dev->device_index, dev->name);
@@ -233,6 +236,8 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             dev = parsec_mca_device_get(dev_index);
             /* Skip the device if no incarnations for its type */
             if(!(dev->type & valid_types)) continue;
+            /* Skip recursive devices: time estimates are computed on the associated CPU device */
+            if(dev->type & PARSEC_DEV_RECURSIVE) continue;
 
             eta = dev->device_load + time_estimate(this_task, dev);
             if( best_eta > eta ) {
@@ -253,14 +258,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
             goto no_valid_device;
 
         this_task->selected_device = parsec_mca_device_get(best_index);
-        assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE );
+        assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) );
     }
 
   device_selected:
     dev = this_task->selected_device;
     assert( NULL != dev );
     assert( tp->devices_index_mask & (1 << dev->device_index) );
-    for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++)
+    for(chore_id = 0; !(tc->incarnations[chore_id].type & 
dev->type); chore_id++) assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */); this_task->selected_chore = chore_id; this_task->load = time_estimate(this_task, dev); @@ -301,6 +306,31 @@ no_valid_device: { PARSEC_OBJ_CLASS_INSTANCE(parsec_device_module_t, parsec_object_t, NULL, NULL); +int +parsec_mca_device_type_supports_batch(uint32_t device_type) +{ +#if PARSEC_HAVE_DEV_CAPABILITY_BATCH + const uint32_t batch_capable_types = PARSEC_DEV_CUDA | PARSEC_DEV_HIP | PARSEC_DEV_LEVEL_ZERO; + uint32_t type = device_type & PARSEC_DEV_ANY_TYPE; + + return parsec_device_enable_batching && (0 != type) && + (0 == (type & ~batch_capable_types)); +#else + (void)device_type; + return 0; +#endif /* PARSEC_HAVE_DEV_CAPABILITY_BATCH */ +} + +uint32_t +parsec_mca_device_type_sanitize_batch(uint32_t device_type) +{ + if( (device_type & PARSEC_DEV_CHORE_ALLOW_BATCH) && + !parsec_mca_device_type_supports_batch(device_type) ) { + device_type &= ~PARSEC_DEV_CHORE_ALLOW_BATCH; + } + return device_type; +} + int parsec_mca_device_init(void) { char** parsec_device_list = NULL; @@ -325,6 +355,11 @@ int parsec_mca_device_init(void) (void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu", "Allow load balancing tasks with GPU incarnations to CPU cores", false, false, parsec_device_load_balance_allow_cpu, NULL); + (void)parsec_mca_param_reg_int_name("device", "enable_batching", + "Boolean to allow batched task execution on all devices", + false, !parsec_device_enable_batching, + parsec_device_enable_batching, + &parsec_device_enable_batching); if( 0 < (rc = parsec_mca_param_find("device", NULL, "load_balance_skew")) ) { parsec_mca_param_lookup_int(rc, &parsec_device_load_balance_skew); } @@ -765,10 +800,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; - if(NULL != device->all_devices_attached) - device->all_devices_attached(device); - if( PARSEC_DEV_CPU == device->type ) { + if( PARSEC_DEV_RECURSIVE & device->type ) continue; + if( PARSEC_DEV_CPU & device->type ) { c = 0; for(int p = 0; p < context->nb_vp; p++) c += context->virtual_processes[p]->nb_cores; @@ -787,7 +820,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context) for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { parsec_device_module_t* device = parsec_devices[i]; if( NULL == device ) continue; - if( PARSEC_DEV_RECURSIVE == device->type ) continue; + if( PARSEC_DEV_RECURSIVE & device->type ) continue; device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64; parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s", i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? 
"GUESSED": ""); @@ -976,7 +1009,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_ continue; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { continue; /* the function has been set for another device of the same type */ diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h index c3950457b..26345afd8 100644 --- a/parsec/mca/device/device.h +++ b/parsec/mca/device/device.h @@ -2,6 +2,7 @@ * Copyright (c) 2013-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ /** @addtogroup parsec_device @@ -69,6 +70,8 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t #define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f) #define PARSEC_DEV_ALL ((uint8_t) 0x3f) #define PARSEC_DEV_MAX_NB_TYPE (7) +/* The following flags are extensions to the device type */ +#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100) #define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO) #define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK)) @@ -205,6 +208,18 @@ extern int parsec_device_output; */ PARSEC_DECLSPEC extern int parsec_select_best_device( parsec_task_t* this_task); +/** + * Return true if batching is enabled globally and the provided non-empty + * mask contains only batching-capable device types. + */ +PARSEC_DECLSPEC int parsec_mca_device_type_supports_batch(uint32_t device_type); + +/** + * Drop the batching hint from a chore type if batching is disabled or the + * selected device type cannot batch. + */ +PARSEC_DECLSPEC uint32_t parsec_mca_device_type_sanitize_batch(uint32_t device_type); + /** * Initialize the internal structures for managing external devices such as * accelerators and GPU. Memory nodes can as well be managed using the same diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 7fd4405ab..b120008b0 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1,5 +1,4 @@ /* - * * Copyright (c) 2021-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -528,7 +527,7 @@ parsec_device_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ @@ -1203,7 +1202,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - if(src_dev->super.type == dst_dev->super.type) { + if((src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE)) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1254,7 +1253,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, count = (source->original->nb_elts <= dest->original->nb_elts) ? 
source->original->nb_elts : dest->original->nb_elts; - if( src_dev->super.type == dst_dev->super.type ) { + if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1381,7 +1380,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, "GPU[%d:%s]:\tSelecting candidate data copy %p [ref_count %d] on data %p", gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original); - if( gpu_device->super.type == candidate_dev->super.type ) { + if( (gpu_device->super.type & PARSEC_DEV_ANY_TYPE) == (candidate_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { if( gpu_device->peer_access_mask & (1 << candidate_dev->super.device_index) ) { /* We can directly do D2D, so let's skip the selection */ PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, @@ -1531,7 +1530,8 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + nb_elts, + (candidate_dev->super.type & gpu_device->super.type & PARSEC_DEV_ANY_TYPE)? "D2D": "H2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; @@ -1573,6 +1573,74 @@ static inline parsec_list_item_t* parsec_device_push_task_ordered( parsec_list_t #define PARSEC_PUSH_TASK parsec_list_push_back #endif +static inline int +parsec_gpu_task_is_singleton(parsec_gpu_task_t *task) +{ + parsec_list_item_t *item = &task->list_item; + +#if defined(PARSEC_DEBUG_PARANOID) + if( ((parsec_list_item_t *)(void *)0xdeadbeefL == item->list_next) || + ((parsec_list_item_t *)(void *)0xdeadbeefL == item->list_prev) ) { + return 0; + } +#endif + return (item->list_next == item) && (item->list_prev == item); +} + +static inline void +parsec_gpu_stream_push_pending(parsec_gpu_exec_stream_t *stream, + parsec_gpu_task_t *task) +{ + /* A completed batched kernel returns a proper task ring. Preserve that + * order when feeding the tasks to the next stream. 
+ */ + if( !parsec_gpu_task_is_singleton(task) ) { + parsec_list_chain_back(stream->fifo_pending, &task->list_item); + return; + } + PARSEC_PUSH_TASK(stream->fifo_pending, &task->list_item); +} + +int +parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, + parsec_gpu_task_t *batch_head, + parsec_gpu_task_batch_cb_t callback, + void *callback_data) +{ + parsec_list_t *fifo_pending; + parsec_list_item_t *item, *next; + int nb_tasks = 1; + int rc; + + assert(NULL != gpu_stream); + assert(NULL != batch_head); + assert(NULL != callback); + + fifo_pending = gpu_stream->fifo_pending; + assert(NULL != fifo_pending); + parsec_list_item_singleton(&batch_head->list_item); + + parsec_list_lock(fifo_pending); + for(item = (parsec_list_item_t *)fifo_pending->ghost_element.list_next; + item != &fifo_pending->ghost_element; + item = next) { + next = (parsec_list_item_t *)item->list_next; + rc = callback((parsec_gpu_task_t *)item, batch_head, callback_data); + if( rc < 0 ) { + parsec_list_unlock(fifo_pending); + return rc; + } + if( 0 == rc ) { + (void)parsec_list_nolock_remove(fifo_pending, item); + (void)parsec_list_item_ring_push(&batch_head->list_item, item); + nb_tasks++; + } + } + parsec_list_unlock(fifo_pending); + + return nb_tasks; +} + static parsec_flow_t parsec_device_d2d_complete_flow = { .name = "D2D FLOW", .flow_flags = PARSEC_FLOW_ACCESS_READ, @@ -1867,8 +1935,9 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, * The progress function is either specified by the caller via the * upstream_progress_fct input argument or by the next task to be progresses * via the submit function associated with the task. In any case, this - * function progresses a single task, which is then returned as the - * out_task parameter. + * function progresses a task. If a batched submit function returns a task + * ring, the ring is returned as the out_task parameter and chained into the + * next stream by the caller's next invocation of this helper. * * Beware: this function does not generate errors by itself, instead * it propagates upward the return code of the progress function. @@ -1891,7 +1960,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, * local list (possibly by reordering the list). Also, as we can return a single * task first try to see if anything completed. */ if( NULL != task ) { - PARSEC_PUSH_TASK(stream->fifo_pending, (parsec_list_item_t*)task); + parsec_gpu_stream_push_pending(stream, task); task = NULL; } *out_task = NULL; @@ -1936,14 +2005,14 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, } } - grab_a_task: + grab_a_task: if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ } if( NULL == task ) { /* No tasks, we're done */ return PARSEC_HOOK_RETURN_DONE; } - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t *)task); assert( NULL == stream->tasks[stream->start] ); @@ -1956,7 +2025,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device, * the resubmission of this task as much as possible, but without losing track of it * (aka. returning it to the upper level). 
*/ - parsec_list_push_back(stream->fifo_pending, (parsec_list_item_t*)task); + parsec_gpu_stream_push_pending(stream, task); } else { /* Something else is going on with this task, remove it from the stream queues * and return it to the upper level for final decision on its fate. @@ -2151,6 +2220,12 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, } #endif /* defined(PARSEC_DEBUG_PARANOID) */ + /* The submit hook may turn gpu_task into a batch ring. Start from a clean + * singleton so stale list links left by release-mode list operations cannot + * be mistaken for a preexisting ring. + */ + PARSEC_LIST_ITEM_SINGLETON(&gpu_task->list_item); + (void)this_task; return progress_fct( gpu_device, gpu_task, gpu_stream ); } @@ -2670,6 +2745,10 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, gpu_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); if( NULL != gpu_task ) { pop_null = 0; + /* parsec_fifo_try_pop() detaches the task but does not reset list links + * in release builds; normalize before the stream FIFO inspects them. + */ + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)gpu_task); gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\tGet from shared queue %s", gpu_device->super.device_index, gpu_device->super.name, parsec_device_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task)); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 8037f7fb7..35c367289 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -50,6 +50,19 @@ typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_d parsec_gpu_task_t *gpu_task, parsec_gpu_exec_stream_t *gpu_stream); +/* Callback used by parsec_gpu_task_collect_batch() to decide whether a + * pending task can be appended to the current batched task ring. + * + * Return values: + * < 0: stop iteration and return this error code to the caller. + * 0: extract candidate from the stream pending queue and append it to + * batch_head. + * > 0: leave candidate in the stream pending queue and continue. + */ +typedef int (*parsec_gpu_task_batch_cb_t)(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data); + /* Function type to transfer data to the GPU device. * Transfer transfer the contiguous bytes from * task->data[i].data_in to task->data[i].data_out. @@ -307,6 +320,23 @@ int parsec_device_sort_pending_list(parsec_device_module_t *gpu_device); parsec_gpu_task_t* parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_execution_stream_t *es); int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *w2r_task, parsec_execution_stream_t *es); +/** + * Iterate over gpu_stream->fifo_pending and append accepted tasks to + * batch_head. The callback receives each pending candidate, the task passed to + * the submit function, and user data. The callback should return 0 to append + * the candidate to batch_head's ring, a positive value to leave it pending, or + * a negative error code to stop the iteration. + * The callback must not modify gpu_stream->fifo_pending directly. + * + * Returns the number of tasks in batch_head's ring on success, or the negative + * callback error code. If an error is returned, tasks already accepted remain + * attached to batch_head and the remaining candidates stay in fifo_pending. 
+ */ +int parsec_gpu_task_collect_batch(parsec_gpu_exec_stream_t *gpu_stream, + parsec_gpu_task_t *batch_head, + parsec_gpu_task_batch_cb_t callback, + void *callback_data); + void parsec_device_enable_debug(void); #if defined(PARSEC_DEBUG_VERBOSE) diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c index f50f2a817..fec2bff88 100644 --- a/parsec/mca/device/level_zero/device_level_zero_component.c +++ b/parsec/mca/device/level_zero/device_level_zero_component.c @@ -2,6 +2,7 @@ * Copyright (c) 2023-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. */ #include "parsec.h" @@ -410,7 +411,7 @@ static int device_level_zero_component_close(void) /* Check that no LEVEL_ZERO devices are still registered with PaRSEC */ for(i = 0; i < parsec_mca_device_enabled(); i++) { if( NULL == (cdev = (parsec_device_level_zero_module_t*)parsec_mca_device_get(i)) ) continue; - if(PARSEC_DEV_LEVEL_ZERO != cdev->super.super.type) continue; + if( !(PARSEC_DEV_LEVEL_ZERO & cdev->super.super.type) ) continue; PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream, "GPU[%d] LEVEL_ZERO device still registered with PaRSEC at the end of LEVEL_ZERO finalize.\n" diff --git a/parsec/mca/device/template/device_template_module.c b/parsec/mca/device/template/device_template_module.c index ec8f5dcc8..453d938d3 100644 --- a/parsec/mca/device/template/device_template_module.c +++ b/parsec/mca/device/template/device_template_module.c @@ -2,6 +2,7 @@ * Copyright (c) 2019-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -78,7 +79,7 @@ parsec_template_taskpool_register(parsec_device_module_t* device, const parsec_task_class_t* tc = tp->task_classes_array[i]; __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations; for( j = 0; NULL != chores[j].hook; j++ ) { - if( chores[j].type != device->type ) + if( !(chores[j].type & device->type) ) continue; if( NULL != chores[j].dyld_fn ) { /* the function has been set for another device of the same type */ diff --git a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c index 6d602117a..aff5bc692 100644 --- a/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c +++ b/parsec/mca/pins/ptg_to_dtd/pins_ptg_to_dtd_module.c @@ -86,7 +86,7 @@ copy_chores(parsec_taskpool_t *tp, parsec_dtd_taskpool_t *dtd_tp) parsec_hook_t **hook_not_const = (parsec_hook_t **)&(tp->task_classes_array[i]->incarnations[j].hook); /* saving the CPU hook only */ - if (tp->task_classes_array[i]->incarnations[j].type == PARSEC_DEV_CPU) { + if (tp->task_classes_array[i]->incarnations[j].type & PARSEC_DEV_CPU) { dtd_tp->actual_hook[i].hook = tp->task_classes_array[i]->incarnations[j].hook; } /* copying the fake hook in all the hooks (CPU, GPU etc) */ @@ -312,8 +312,8 @@ parsec_dtd_taskpool_insert_task_ptg_to_dtd( parsec_dtd_taskpool_t *dtd_tp, free(params); __parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations; - for(int i = 0; PARSEC_DEV_NONE != incarnations[i].type; i++ ) { - if( PARSEC_DEV_CPU == incarnations[i].type ) { + for(int i = 0; PARSEC_DEV_NONE != (incarnations[i].type & PARSEC_DEV_ANY_TYPE); i++ ) { + if( PARSEC_DEV_CPU & incarnations[i].type ) { incarnations[i] = dtd_chore_for_testing; } } diff --git a/tests/dsl/dtd/CMakeLists.txt b/tests/dsl/dtd/CMakeLists.txt index 209875a05..b5da79051 100644 --- a/tests/dsl/dtd/CMakeLists.txt +++ b/tests/dsl/dtd/CMakeLists.txt @@ -14,6 +14,9 @@ parsec_addtest_executable(C dtd_test_reduce SOURCES dtd_test_reduce.c) parsec_addtest_executable(C dtd_test_allreduce SOURCES dtd_test_allreduce.c) parsec_addtest_executable(C dtd_test_template_counter SOURCES dtd_test_template_counter.c) parsec_addtest_executable(C dtd_test_untie SOURCES dtd_test_untie.c) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_executable(C dtd_test_batch_cpu SOURCES dtd_test_batch_cpu.c) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) parsec_addtest_executable(C dtd_test_hierarchy SOURCES dtd_test_hierarchy.c) parsec_addtest_executable(C dtd_test_task_placement SOURCES dtd_test_task_placement.c) parsec_addtest_executable(C dtd_test_insert_task_interface SOURCES dtd_test_insert_task_interface.c) diff --git a/tests/dsl/dtd/Testings.cmake b/tests/dsl/dtd/Testings.cmake index 7305a202c..ff85b63d6 100644 --- a/tests/dsl/dtd/Testings.cmake +++ b/tests/dsl/dtd/Testings.cmake @@ -7,6 +7,10 @@ parsec_addtest_cmd(dsl/dtd/task_generation ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test parsec_addtest_cmd(dsl/dtd/task_inserting_task ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_task_inserting_task) parsec_addtest_cmd(dsl/dtd/task_insertion ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_task_insertion) parsec_addtest_cmd(dsl/dtd/war ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_war) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_cmd(dsl/dtd/batch_cpu ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_batch_cpu) + parsec_addtest_cmd(dsl/dtd/batch_cpu:disabled ${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_batch_cpu --mca device_enable_batching 0) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) parsec_addtest_cmd(dsl/dtd/new_tile:cpu 
${SHM_TEST_CMD_LIST} dsl/dtd/dtd_test_new_tile --mca device_cuda_enabled 0)
if(PARSEC_HAVE_CUDA AND CMAKE_CUDA_COMPILER)
  parsec_addtest_cmd(dsl/dtd/new_tile:gpu ${SHM_TEST_CMD_LIST} ${CTEST_CUDA_LAUNCHER_OPTIONS} dsl/dtd/dtd_test_new_tile --mca device_cuda_enabled 1 --mca device cuda)
diff --git a/tests/dsl/dtd/dtd_test_batch_cpu.c b/tests/dsl/dtd/dtd_test_batch_cpu.c
new file mode 100644
index 000000000..c32893f7c
--- /dev/null
+++ b/tests/dsl/dtd/dtd_test_batch_cpu.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2024 The University of Tennessee and The University
+ *                    of Tennessee Research Foundation. All rights
+ *                    reserved.
+ * Copyright (c) 2026 NVIDIA Corporation. All rights reserved.
+ */
+
+#include "parsec.h"
+#include "parsec/interfaces/dtd/insert_function_internal.h"
+#include "parsec/mca/device/device.h"
+
+#if defined(PARSEC_HAVE_MPI)
+#include <mpi.h>
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+
+static int32_t observed_sum = 0;
+
+static parsec_hook_return_t
+batch_cpu_task(parsec_execution_stream_t *es, parsec_task_t *this_task)
+{
+    int value;
+
+    parsec_dtd_unpack_args(this_task, &value);
+    (void)parsec_atomic_fetch_add_int32(&observed_sum, value);
+
+    (void)es;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+int
+main(int argc, char **argv)
+{
+    parsec_context_t *parsec;
+    parsec_taskpool_t *dtd_tp;
+    parsec_task_class_t *tc;
+    int rc, rank = 0;
+    int ntasks = 32;
+    int expected = 0;
+    int ret = 0;
+
+#if defined(PARSEC_HAVE_MPI)
+    {
+        int provided;
+        MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+#endif
+
+    if( NULL != argv[1] ) {
+        ntasks = atoi(argv[1]);
+    }
+    if( ntasks <= 0 ) {
+        ntasks = 32;
+    }
+
+    parsec = parsec_init(-1, &argc, &argv);
+    assert(NULL != parsec);
+
+    dtd_tp = parsec_dtd_taskpool_new();
+
+    rc = parsec_context_start(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_start");
+
+    rc = parsec_context_add_taskpool(parsec, dtd_tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool");
+
+    tc = parsec_dtd_create_task_class(dtd_tp, "BatchCpu",
+                                      sizeof(int), PARSEC_VALUE,
+                                      PARSEC_DTD_ARG_END);
+    assert(NULL != tc);
+
+    rc = parsec_dtd_task_class_add_chore(dtd_tp, tc,
+                                         PARSEC_DEV_CPU | PARSEC_DEV_CHORE_ALLOW_BATCH,
+                                         batch_cpu_task);
+    PARSEC_CHECK_ERROR(rc, "parsec_dtd_task_class_add_chore");
+    assert(tc->incarnations[0].type & PARSEC_DEV_CPU);
+    assert(!!(tc->incarnations[0].type & PARSEC_DEV_CHORE_ALLOW_BATCH) ==
+           !!parsec_mca_device_type_supports_batch(PARSEC_DEV_CPU));
+
+    for( int i = 0; i < ntasks; i++ ) {
+        int value = i + 1;
+        expected += value;
+        parsec_dtd_insert_task_with_task_class(dtd_tp, tc, 0,
+                                               PARSEC_DEV_CPU | PARSEC_DEV_CHORE_ALLOW_BATCH,
+                                               PARSEC_DTD_EMPTY_FLAG, &value,
+                                               PARSEC_DTD_ARG_END);
+    }
+
+    rc = parsec_taskpool_wait(dtd_tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_taskpool_wait");
+
+    rc = parsec_context_wait(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_wait");
+
+    if( expected != observed_sum ) {
+        parsec_warning("Rank %d observed %d, expected %d\n",
+                       rank, observed_sum, expected);
+        ret = 1;
+    }
+
+    parsec_dtd_task_class_release(dtd_tp, tc);
+    parsec_taskpool_free(dtd_tp);
+    parsec_fini(&parsec);
+
+#if defined(PARSEC_HAVE_MPI)
+    MPI_Finalize();
+#endif
+
+    return ret;
+}
diff --git a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
index d37d27d32..0b0b78de5 100644
--- a/tests/dsl/dtd/dtd_test_cuda_task_insert.c
+++ b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
@@ -2,6 +2,7 @@
 * Copyright (c) 2021-2024 The University of Tennessee and The University
 *                         of 
Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. */ #include "parsec.h" @@ -624,7 +625,7 @@ int get_nb_cuda_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -640,7 +641,7 @@ int *get_cuda_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c index bbffe3ea7..c0ef693f1 100644 --- a/tests/dsl/dtd/dtd_test_new_tile.c +++ b/tests/dsl/dtd/dtd_test_new_tile.c @@ -2,6 +2,7 @@ * Copyright (c) 2022-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. */ /* parsec things */ @@ -283,7 +284,7 @@ int main(int argc, char **argv) #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(PARSEC_HAVE_CU_COMPILER) for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA ) + if( dev->type & PARSEC_DEV_CUDA ) nb_gpus++; } if(nb_gpus > 0) { @@ -292,7 +293,7 @@ int main(int argc, char **argv) nb_gpus = 0; for(unsigned int i = 0; i < parsec_nb_devices; i++) { parsec_device_module_t *dev = parsec_mca_device_get(i); - if( dev->type == PARSEC_DEV_CUDA) { + if( dev->type & PARSEC_DEV_CUDA) { cudaError_t status; parsec_device_cuda_module_t *gpu_device = (parsec_device_cuda_module_t *)dev; status = cudaSetDevice( gpu_device->cuda_index ); diff --git a/tests/dsl/dtd/dtd_test_simple_gemm.c b/tests/dsl/dtd/dtd_test_simple_gemm.c index 2939518d9..6ef30179b 100644 --- a/tests/dsl/dtd/dtd_test_simple_gemm.c +++ b/tests/dsl/dtd/dtd_test_simple_gemm.c @@ -2,7 +2,7 @@ * Copyright (c) 2021-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2026 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec.h" @@ -10,6 +10,7 @@ #include "parsec/data_dist/matrix/matrix.h" #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" #include "parsec/interfaces/dtd/insert_function_internal.h" +#include "parsec/mca/device/device.h" // The file is not compiled if CUDA is not present or CUBLAS is not found #include "parsec/mca/device/cuda/device_cuda.h" @@ -57,6 +58,7 @@ static parsec_info_id_t CuHI = -1; static parsec_info_id_t Cu1 = -1; static int verbose = 0; static int device = PARSEC_DEV_CUDA; +static int use_cuda_batch = 0; static int P = -1; static int Q = -1; @@ -182,55 +184,104 @@ int initialize_matrix(parsec_context_t *parsec_context, int rank, parsec_matrix_ return 0; } +static int +gemm_cuda_task_allows_batch(parsec_gpu_task_t *gpu_task) +{ + parsec_task_t *this_task = gpu_task->ec; + int selected_chore = this_task->selected_chore; + + return use_cuda_batch && + (selected_chore >= 0) && + (this_task->task_class->incarnations[selected_chore].type & PARSEC_DEV_CHORE_ALLOW_BATCH) && + parsec_mca_device_type_supports_batch(this_task->selected_device->type); +} + +static int +gemm_cuda_batch_match(parsec_gpu_task_t *candidate, + parsec_gpu_task_t *batch_head, + void *callback_data) +{ + (void)callback_data; + + if( (batch_head->ec->task_class == candidate->ec->task_class) && + (batch_head->ec->selected_chore == candidate->ec->selected_chore) && + (batch_head->ec->selected_device == candidate->ec->selected_device) ) { + return 0; + } + return 1; +} + int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task, parsec_gpu_exec_stream_t *gpu_stream) { - double *A, *B, *C; - int m, n, k, mb, nb, kb; - parsec_task_t *this_task = gpu_task->ec; cublasStatus_t status; cublasHandle_t handle; double *one_device = NULL; - struct timeval start, end, diff; - double delta; - double *a_gpu, *b_gpu, *c_gpu; - - (void)gpu_stream; - (void)gpu_device; - - parsec_dtd_unpack_args(this_task, - &A, &B, &C, - &m, &n, &k, - &mb, &nb, &kb); - - a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); - b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); - c_gpu = parsec_dtd_get_dev_ptr(this_task, 2); + parsec_gpu_task_t *current_gpu_task; + int batch_count = 1; + + if( gemm_cuda_task_allows_batch(gpu_task) ) { + batch_count = parsec_gpu_task_collect_batch(gpu_stream, gpu_task, + gemm_cuda_batch_match, NULL); + if( batch_count < 0 ) { + return batch_count; + } + } handle = parsec_info_get(&gpu_stream->infos, CuHI); assert(NULL != handle); one_device = parsec_info_get(&gpu_device->super.infos, Cu1); assert(NULL != one_device); - gettimeofday(&start, NULL); - status = cublasDgemm_v2(handle, - CUBLAS_OP_N, CUBLAS_OP_N, - mb, nb, kb, - one_device, a_gpu, mb, - b_gpu, kb, - one_device, c_gpu, mb); - gettimeofday(&end, NULL); - timersub(&end, &start, &diff); - delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; - if(verbose) - fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s\n", - m, n, k, mb, kb, kb, nb, mb, kb, - this_task->taskpool->context->my_rank, - gpu_stream->name, delta); + current_gpu_task = gpu_task; + do { + double *A, *B, *C; + int m, n, k, mb, nb, kb; + parsec_task_t *this_task = current_gpu_task->ec; + struct timeval start, end, diff; + double delta; + double *a_gpu, *b_gpu, *c_gpu; + + parsec_dtd_unpack_args(this_task, + &A, &B, &C, + &m, &n, &k, + &mb, &nb, &kb); + (void)A; (void)B; (void)C; + + a_gpu = parsec_dtd_get_dev_ptr(this_task, 0); + b_gpu = parsec_dtd_get_dev_ptr(this_task, 1); + c_gpu = 
parsec_dtd_get_dev_ptr(this_task, 2); - PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, - { return PARSEC_HOOK_RETURN_ERROR; }); + gettimeofday(&start, NULL); + + status = cublasDgemm_v2(handle, + CUBLAS_OP_N, CUBLAS_OP_N, + mb, nb, kb, + one_device, a_gpu, mb, + b_gpu, kb, + one_device, c_gpu, mb); + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6; + if(verbose) { + fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n", + m, n, k, mb, kb, kb, nb, mb, kb, + this_task->taskpool->context->my_rank, + gpu_stream->name, delta, + batch_count > 1 ? " as part of a batch" : ""); + } + + PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status, + { return PARSEC_HOOK_RETURN_ERROR; }); + + current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next; + } while( current_gpu_task != gpu_task ); + + if( verbose && batch_count > 1 ) { + fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n", + batch_count, gpu_stream->name); + } return PARSEC_HOOK_RETURN_DONE; } @@ -297,7 +348,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t * sizeof(int), PARSEC_VALUE, /* nb */ sizeof(int), PARSEC_VALUE, /* kb */ PARSEC_DTD_ARG_END); - parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CUDA, gemm_kernel_cuda); + parsec_dtd_task_class_add_chore(tp, gemm_tc, + use_cuda_batch ? (PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH) : PARSEC_DEV_CUDA, + gemm_kernel_cuda); #if defined(HAVE_BLAS) parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CPU, gemm_kernel_cpu); #endif @@ -308,7 +361,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t * for( int k = 0; k < A->super.nt; k++ ) { keyA = A->super.super.data_key(&A->super.super, i, k); keyB = B->super.super.data_key(&B->super.super, k, j); - parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, device, + parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, + use_cuda_batch && (PARSEC_DEV_CUDA == device) ? + (device | PARSEC_DEV_CHORE_ALLOW_BATCH) : device, PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&A->super.super, keyA), PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&B->super.super, keyB), k == A->super.nt - 1 ? 
(PARSEC_INOUT | PARSEC_PUSHOUT) : PARSEC_INOUT, @@ -347,7 +402,7 @@ int get_nb_gpu_devices() for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { nb++; } } @@ -363,7 +418,7 @@ int *get_gpu_device_index() int i = 0; for( int dev = 0; dev < (int)parsec_nb_devices; dev++ ) { parsec_device_module_t *device = parsec_mca_device_get(dev); - if( PARSEC_DEV_CUDA == device->type ) { + if( PARSEC_DEV_CUDA & device->type ) { dev_index[i++] = device->device_index; } } @@ -507,13 +562,14 @@ int main(int argc, char **argv) {"device", required_argument, 0, 'd'}, {"nruns", required_argument, 0, 't'}, {"verbose", no_argument, 0, 'v'}, + {"batch", no_argument, 0, 'b'}, {"Debug", required_argument, 0, 'D'}, {"Alarm", required_argument, 0, 'A'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; - int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vh", + int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vbh", long_options, &option_index); if( c == -1 ) break; @@ -549,6 +605,9 @@ int main(int argc, char **argv) case 'v': verbose = !verbose; break; + case 'b': + use_cuda_batch = 1; + break; case 'd': if(strcmp(optarg, "GPU") == 0) { device=PARSEC_DEV_CUDA; @@ -587,6 +646,7 @@ int main(int argc, char **argv) " --mb|-m / --kb/-k / --nb|-n: set mb, kb and nb (resp.)\n" " --nruns|-t: set the number of runs to do\n" " --device|-d: which device to use (CPU or GPU)\n" + " --batch|-b: enable CUDA batched GEMM chores\n" " --verbose|-v: display which GEMM runs on which GPU\n" " as execution is unfolding\n" " --help|-h|-?: display this help\n" @@ -602,7 +662,9 @@ int main(int argc, char **argv) "\n", argv[0]); } +#if defined(PARSEC_HAVE_MPI) MPI_Finalize(); +#endif exit(0); } } @@ -636,7 +698,9 @@ int main(int argc, char **argv) rc = !(nbgpus >= 1); if( rc != 0 ) { fprintf(stderr, "Rank %d doesn't have CUDA accelerators\n", rank); +#if defined(PARSEC_HAVE_MPI) MPI_Abort(MPI_COMM_WORLD, 0); +#endif return -1; } gpu_device_index = get_gpu_device_index(); diff --git a/tests/dsl/ptg/CMakeLists.txt b/tests/dsl/ptg/CMakeLists.txt index ed03f549f..25ed08aea 100644 --- a/tests/dsl/ptg/CMakeLists.txt +++ b/tests/dsl/ptg/CMakeLists.txt @@ -14,6 +14,11 @@ endif(PARSEC_HAVE_RANDOM) parsec_addtest_executable(C complex_deps) target_ptg_sources(complex_deps PRIVATE "complex_deps.jdf") +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_executable(C batch_cpu) + target_ptg_sources(batch_cpu PRIVATE "batch_cpu.jdf") +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + add_subdirectory(branching) add_subdirectory(choice) add_subdirectory(controlgather) diff --git a/tests/dsl/ptg/Testings.cmake b/tests/dsl/ptg/Testings.cmake index 697ad425f..4e3057dbe 100644 --- a/tests/dsl/ptg/Testings.cmake +++ b/tests/dsl/ptg/Testings.cmake @@ -7,3 +7,6 @@ parsec_addtest_cmd(dsl/ptg/startup1 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 - parsec_addtest_cmd(dsl/ptg/startup2 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=10 -j=20 -k=30 -v=5) parsec_addtest_cmd(dsl/ptg/startup3 ${SHM_TEST_CMD_LIST} dsl/ptg/startup -i=30 -j=30 -k=30 -v=5) parsec_addtest_cmd(dsl/ptg/strange ${SHM_TEST_CMD_LIST} dsl/ptg/strange) +if(PARSEC_HAVE_DEV_CAPABILITY_BATCH) + parsec_addtest_cmd(dsl/ptg/batch_cpu ${SHM_TEST_CMD_LIST} dsl/ptg/batch_cpu) +endif(PARSEC_HAVE_DEV_CAPABILITY_BATCH) diff --git a/tests/dsl/ptg/batch_cpu.jdf b/tests/dsl/ptg/batch_cpu.jdf new file mode 100644 index 000000000..60e3a16bd --- /dev/null +++ 
diff --git a/tests/dsl/ptg/batch_cpu.jdf b/tests/dsl/ptg/batch_cpu.jdf
new file mode 100644
index 000000000..60e3a16bd
--- /dev/null
+++ b/tests/dsl/ptg/batch_cpu.jdf
@@ -0,0 +1,147 @@
+extern "C" %{
+/*
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
+ * Copyright (c) 2026      NVIDIA Corporation.  All rights reserved.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
+#include "parsec/mca/device/device.h"
+#include "batch_cpu.h"
+
+#define TYPE PARSEC_MATRIX_INTEGER
+
+static int
+has_cpu_batch_chore(parsec_taskpool_t *tp)
+{
+    for( unsigned int i = 0; i < tp->nb_task_classes; i++ ) {
+        const parsec_task_class_t *tc = tp->task_classes_array[i];
+        const __parsec_chore_t *chores;
+
+        if( NULL == tc ) {
+            continue;
+        }
+        chores = tc->incarnations;
+        for( int j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {
+            if( (chores[j].type & PARSEC_DEV_CPU) &&
+                (chores[j].type & PARSEC_DEV_CHORE_ALLOW_BATCH) ) {
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+%}
+
+DATA    [type = "parsec_data_collection_t*"]
+N       [type = int]
+
+BATCH_CPU(k)
+
+  k = 0 .. N-1
+
+  : DATA(k, 0)
+
+  RW TILE <- DATA(k, 0)
+          -> DATA(k, 0)
+
+BODY [batch=true]
+{
+    assert(NULL != TILE);
+    ((int*)TILE)[0] = k + 1;
+}
+END
+
+extern "C" %{
+
+int main(int argc, char **argv)
+{
+    parsec_batch_cpu_taskpool_t *tp;
+    parsec_matrix_block_cyclic_t descA;
+    parsec_arena_datatype_t adt;
+    parsec_datatype_t dt;
+    parsec_context_t *parsec;
+    int n = 32, rc;
+    int rank = 0;
+    int ret = 0;
+
+#if defined(PARSEC_HAVE_MPI)
+    {
+        int provided;
+        MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+#endif
+
+    if( NULL != argv[1] ) {
+        n = atoi(argv[1]);
+    }
+    if( n <= 0 ) {
+        n = 32;
+    }
+
+    parsec = parsec_init(-1, &argc, &argv);
+    assert(NULL != parsec);
+
+    parsec_matrix_block_cyclic_init(&descA, TYPE, PARSEC_MATRIX_TILE,
+                                    rank,
+                                    1, 1, n, 1,
+                                    0, 0, n, 1, 1, 1, 1, 1, 0, 0);
+    descA.mat = parsec_data_allocate(descA.super.nb_local_tiles *
+                                     descA.super.bsiz *
+                                     parsec_datadist_getsizeoftype(TYPE));
+    memset(descA.mat, 0, descA.super.nb_local_tiles *
+                         descA.super.bsiz *
+                         parsec_datadist_getsizeoftype(TYPE));
+
+    parsec_translate_matrix_type(TYPE, &dt);
+    parsec_add2arena_rect(&adt, dt, 1, 1, 1);
+
+    tp = parsec_batch_cpu_new((parsec_data_collection_t*)&descA, n);
+    assert(NULL != tp);
+    if( !has_cpu_batch_chore(&tp->super) ) {
+        parsec_warning("The generated PTG task class does not expose a CPU batch chore\n");
+        ret = 1;
+        goto cleanup;
+    }
+    tp->arenas_datatypes[PARSEC_batch_cpu_DEFAULT_ADT_IDX] = adt;
+    PARSEC_OBJ_RETAIN(adt.arena);
+
+    rc = parsec_context_start(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_start");
+
+    rc = parsec_context_add_taskpool(parsec, (parsec_taskpool_t*)tp);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_add_taskpool");
+
+    rc = parsec_context_wait(parsec);
+    PARSEC_CHECK_ERROR(rc, "parsec_context_wait");
+
+    for( int i = 0; i < n; i++ ) {
+        if( ((int*)descA.mat)[i] != i + 1 ) {
+            parsec_warning("Rank %d found value %d at position %d, expected %d\n",
+                           rank, ((int*)descA.mat)[i], i, i + 1);
+            ret = 1;
+            break;
+        }
+    }
+
+  cleanup:
+    parsec_taskpool_free(&tp->super);
+    free(descA.mat);
+    PARSEC_OBJ_RELEASE(adt.arena);
+    parsec_del2arena(&adt);
+    parsec_fini(&parsec);
+
+#if defined(PARSEC_HAVE_MPI)
+    MPI_Finalize();
+#endif
+
+    return ret;
+}
+
+%}
diff --git a/tests/runtime/cuda/get_best_device_check.jdf b/tests/runtime/cuda/get_best_device_check.jdf
index db588b625..3fac93b87 100644
--- a/tests/runtime/cuda/get_best_device_check.jdf
+++ b/tests/runtime/cuda/get_best_device_check.jdf
@@ -3,6 +3,7 @@ extern "C" %{
  * Copyright (c) 2021-2023 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
  */
 
 #include "cuda_test_internal.h"
@@ -148,7 +149,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info)
     int nb = 0;
     for(int i = 0; i < (int)parsec_nb_devices; i++) {
         parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             nb++;
         }
     }
@@ -160,7 +161,7 @@ parsec_get_best_device_check_New(parsec_tiled_matrix_t *dcA, int *info)
     nb = 0;
     for(int i = 0; i < (int)parsec_nb_devices; i++) {
         parsec_device_module_t *device = parsec_mca_device_get(i);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
            dev_index[nb++] = device->device_index;
         }
     }
diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c
index aa2b840e9..2d97b7749 100644
--- a/tests/runtime/cuda/nvlink_wrapper.c
+++ b/tests/runtime/cuda/nvlink_wrapper.c
@@ -72,7 +72,7 @@ __parsec_nvlink_destructor( parsec_nvlink_taskpool_t* nvlink_taskpool)
     userM = nvlink_taskpool->_g_userM;
     for(g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) {
+        if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) {
             parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, userM->super.super.myrank);
             parsec_data_copy_t *gpu_copy = parsec_data_get_copy(dta, cuda_device->super.super.device_index);
             cudaError_t status = cudaSetDevice( cuda_device->cuda_index );
@@ -108,7 +108,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             dev_index[nb++] = device->device_index;
         }
     }
@@ -169,7 +169,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
      * in the JDF, this also pins the task on the GPU that we chose to host the tile */
     for(int g = 0, dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_cuda_module_t *cuda_device = (parsec_device_cuda_module_t*)parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == cuda_device->super.super.type ) {
+        if( PARSEC_DEV_CUDA & cuda_device->super.super.type ) {
             /* We get the data from the data collection */
             parsec_data_t *dta = ((parsec_dc_t*)userM)->data_of((parsec_dc_t*)userM, g, ctx->my_rank);
             /* The corresponding data copy on CPU RAM */
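The recurring `==` to `&` change across these tests reflects that a device type word is treated as a bit field, presumably because capability bits such as `PARSEC_DEV_CHORE_ALLOW_BATCH` can now be OR-ed into type words, where an equality test against `PARSEC_DEV_CUDA` would miss a match. A small standalone illustration (the flag values are made up, the real ones come from `parsec/mca/device`):

```c
#include <assert.h>

/* Illustrative values only; the real flags live in parsec/mca/device. */
#define DEV_CUDA              (1 << 2)
#define DEV_CHORE_ALLOW_BATCH (1 << 7)

int main(void)
{
    int type = DEV_CUDA | DEV_CHORE_ALLOW_BATCH;
    assert( type != DEV_CUDA );    /* equality test misses the device    */
    assert( type & DEV_CUDA );     /* bitmask test still identifies it   */
    return 0;
}
```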
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
index 87e30a3f9..9fd975122 100644
--- a/tests/runtime/cuda/stage_custom.jdf
+++ b/tests/runtime/cuda/stage_custom.jdf
@@ -45,8 +45,8 @@ stage_stride_in(parsec_gpu_task_t *gtask,
         dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
         elem_sz = parsec_datadist_getsizeoftype(dc->mtype);
         in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index);
-        if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){
-            /* copy width bytes height times, skipping pitch - width bytes every time */
+        if( !(in_elem_dev->super.type & PARSEC_DEV_CUDA) ) {
+            /* copy width bytes height times, skipping pitch - width bytes every time */
             size_t dpitch = dc->mb * elem_sz;
             size_t spitch = dc->llm * elem_sz;
             size_t width  = dc->mb * elem_sz;
@@ -117,6 +117,23 @@ typedef void (*cublas_dgemm_t) ( char TRANSA, char TRANSB, int m, int n, int k,
 /* Pre-declare function used as a property of some parameterized task */
 static int64_t gemm_time_estimate(const parsec_task_t *task, parsec_device_module_t *dev);
 
+static int
+stage_custom_batch_match(parsec_gpu_task_t *candidate,
+                         parsec_gpu_task_t *batch_head,
+                         void *callback_data)
+{
+    int *how_many = (int *)callback_data;
+
+    if( (*how_many < 5) &&
+        (batch_head->ec->task_class == candidate->ec->task_class) ) {
+        (*how_many)++;
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
+                             "Add task %p to the %p batch\n",
+                             candidate, batch_head);
+        return 0;
+    }
+    return 1;
+}
 %}
 
 %option no_taskpool_instance = true  /* can be anything */
@@ -146,8 +163,21 @@ RW A <- descA(m, k)
      -> A TASK_CHECK(m,k)
 
 BODY [type=CUDA
+      batch = true
      dyld=cublasDgemm dyldtype=cublas_dgemm_t]
 {
+    int how_many = 1;
+    how_many = parsec_gpu_task_collect_batch(gpu_stream, gpu_task,
+                                             stage_custom_batch_match,
+                                             &how_many);
+    if( how_many < 0 ) {
+        return how_many;
+    }
+    if( how_many > 1 ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
+                             "submit multiple tasks into one %p on stream %s{%p}\n",
+                             gpu_task, gpu_stream->name, (void*)gpu_stream);
+    }
     double lalpha = 1.0;
     double lbeta = 2.0;
     int tempmm = descA->mb;
@@ -155,14 +185,22 @@ BODY [type=CUDA
     cublasStatus_t status;
 
     cublasSetKernelStream( parsec_body.stream );
-    parsec_body.dyld_fn( 'N', 'N',
-                         tempmm, tempmm, tempmm,
-                         lalpha, (double*)A, ldam,
-                                 (double*)A, ldam,
-                         lbeta,  (double*)A, ldam );
-    status = cublasGetError();
-    PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                             {return PARSEC_HOOK_RETURN_ERROR;} );
+
+    parsec_gpu_task_t* current_gpu_task = gpu_task;
+    do {
+        __parsec_stage_custom_TASK_GPU_task_t *task = (__parsec_stage_custom_TASK_GPU_task_t*)current_gpu_task->ec;
+        _f_A = task->data._f_A.data_out;
+        A = PARSEC_DATA_COPY_GET_PTR(_f_A);
+        parsec_body.dyld_fn( 'N', 'N',
+                             tempmm, tempmm, tempmm,
+                             lalpha, (double*)A, ldam,
+                                     (double*)A, ldam,
+                             lbeta,  (double*)A, ldam );
+        status = cublasGetError();
+        PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
+                                 {return PARSEC_HOOK_RETURN_ERROR;} );
+        current_gpu_task = (parsec_gpu_task_t*)current_gpu_task->list_item.list_next;
+    } while( current_gpu_task != gpu_task );
 }
 END
diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c
index ee1f73d35..542dc744d 100644
--- a/tests/runtime/cuda/stress_wrapper.c
+++ b/tests/runtime/cuda/stress_wrapper.c
@@ -2,7 +2,9 @@
  * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
+ * Copyright (c) 2024-2026 NVIDIA Corporation.  All rights reserved.
  */
+
 #include "parsec.h"
 #include "parsec/execution_stream.h"
 #include "parsec/data_distribution.h"
@@ -46,7 +48,7 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
         parsec_device_module_t *device = parsec_mca_device_get(dev);
-        if( PARSEC_DEV_CUDA == device->type ) {
+        if( PARSEC_DEV_CUDA & device->type ) {
             dev_index[nb++] = device->device_index;
         }
     }
@@ -77,4 +79,3 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
                             parsec_datatype_double_complex_t, mb );
     return &testing_handle->super;
 }
-
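One subtlety in the batched body of stage_custom.jdf above: the JDF preamble binds the flow variables (`_f_A`, `A`) and sizes (`tempmm`, `ldam`) from the head task only, so the loop rebinds the flow data for each ring member while reusing the head task's tile sizes. Reusing the sizes is safe here because `stage_custom_batch_match()` only accepts candidates of the same task class. The rebinding step, isolated as a sketch of the pattern (not additional patch content):

```c
/* Rebind per-task flow data inside the ring walk: current_gpu_task is
 * the ring cursor, and _f_A/A shadow the head task's preamble bindings. */
__parsec_stage_custom_TASK_GPU_task_t *task =
    (__parsec_stage_custom_TASK_GPU_task_t *)current_gpu_task->ec;
_f_A = task->data._f_A.data_out;       /* data copy owned by this task */
A = PARSEC_DATA_COPY_GET_PTR(_f_A);    /* device pointer for the call  */
```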