Skip to content

Commit 41fa201

Browse files
committed
Enable batched chores across device types
Replace the CUDA-specific batch build switch with PARSEC_HAVE_DEV_CAPABILITY_BATCH so batching is a runtime capability shared by all supported device types. Export the new option through parsec_options and PaRSECConfig. Add per-device MCA parameters to disable batching for CPU, recursive, CUDA, HIP, and Level Zero devices. Use shared helpers to sanitize batch chore types in DTD and to gate GPU task-ring batching on the selected device. Teach PTG to accept batch=true for CPU/default bodies as well as typed device bodies, and add CPU batch examples for both PTG and DTD with ctest coverage for the enabled and CPU-disabled DTD paths. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent 1470eba commit 41fa201

2 files changed

Lines changed: 147 additions & 38 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ include(GNUInstallDirs)
1919
# When making a backward compatible addition to the API
2020
# PARSEC_VERSION_MAJOR does not change
2121
# PARSEC_VERSION_MINOR increases by 1
22-
# When making a backward incompabilte change to an API (or exposed structure)
22+
# When making a backward incompatible change to an API (or exposed structure)
2323
# PARSEC_VERSION_MAJOR increases by 1
2424
# PARSEC_VERSION_MINOR resets to 0
2525
# Unlike strict libtool numbering, PARSEC_VERSION_RELEASE is an monotonous

tests/dsl/dtd/dtd_test_simple_gemm.c

Lines changed: 146 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "parsec/data_dist/matrix/matrix.h"
1111
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
1212
#include "parsec/interfaces/dtd/insert_function_internal.h"
13+
#include "parsec/mca/device/device.h"
1314

1415
// The file is not compiled if CUDA is not present or CUBLAS is not found
1516
#include "parsec/mca/device/cuda/device_cuda.h"
@@ -44,6 +45,7 @@ static parsec_info_id_t CuHI = -1;
4445
static parsec_info_id_t Cu1 = -1;
4546
static int verbose = 0;
4647
static int device = PARSEC_DEV_CUDA;
48+
static int use_cuda_batch = 0;
4749
static int P = -1;
4850
static int Q = -1;
4951

@@ -169,55 +171,149 @@ int initialize_matrix(parsec_context_t *parsec_context, int rank, parsec_matrix_
169171
return 0;
170172
}
171173

174+
static int
175+
gemm_cuda_task_allows_batch(parsec_gpu_task_t *gpu_task)
176+
{
177+
parsec_task_t *this_task = gpu_task->ec;
178+
int selected_chore = this_task->selected_chore;
179+
180+
return use_cuda_batch &&
181+
(selected_chore >= 0) &&
182+
(this_task->task_class->incarnations[selected_chore].type & PARSEC_DEV_CHORE_ALLOW_BATCH) &&
183+
parsec_mca_device_type_supports_batch(this_task->selected_device->type);
184+
}
185+
186+
/**
 * Completion stage installed on the head task of a batched ring.
 *
 * @param gpu_device  device that executed the batch; its exec_stream[1] is
 *                    presumably the dedicated output stream — TODO confirm
 *                    against the PaRSEC GPU engine's stream layout.
 * @param gpu_task    in/out: head of the ring of batched tasks; set to NULL
 *                    so the caller does not complete it a second time.
 * @param gpu_stream  unused (the whole ring completed under one event).
 * @return PARSEC_HOOK_RETURN_DONE always.
 */
static int
gemm_cuda_complete_batch(parsec_device_gpu_module_t *gpu_device,
                         parsec_gpu_task_t **gpu_task,
                         parsec_gpu_exec_stream_t *gpu_stream)
{
    parsec_list_item_t *output_stream_ghost = &gpu_device->exec_stream[1]->fifo_pending->ghost_element;

    (void)gpu_stream;

    /* The whole ring was submitted on one CUDA stream and completed under one
     * event. Move every task in the ring to the output stream so the regular
     * GPU pop/epilog path still handles ownership, pushout, and completion one
     * task at a time.
     */
    parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item);
    /* Clear the stage so the head task cannot re-enter this completion path. */
    (*gpu_task)->complete_stage = NULL;
    *gpu_task = NULL;

    return PARSEC_HOOK_RETURN_DONE;
}
206+
207+
/**
 * Drain the stream's pending FIFO and gather compatible tasks into a ring
 * rooted at gpu_task.
 *
 * A pending task is "compatible" when it shares the same task class, the
 * same selected chore index, and the same selected device as gpu_task.
 * Incompatible tasks are collected aside and pushed back onto the pending
 * FIFO (as one merged ring) so they are not lost.
 *
 * @param gpu_task    head task; turned into a singleton then grown into a ring.
 * @param gpu_stream  stream whose fifo_pending is drained.
 * @return number of tasks in the resulting ring (>= 1, counting gpu_task).
 *
 * NOTE(review): the loop mixes parsec_list_nolock_is_empty with the locked
 * parsec_list_pop_front — presumably safe because only this thread feeds the
 * stream's FIFO at this point; verify against the GPU engine's threading model.
 */
static int
gemm_cuda_collect_batch(parsec_gpu_task_t *gpu_task,
                        parsec_gpu_exec_stream_t *gpu_stream)
{
    parsec_list_item_t *store_back = NULL;  /* ring of incompatible tasks to restore */
    int how_many = 1;                       /* gpu_task itself counts */

    parsec_list_item_singleton(&gpu_task->list_item);
    while( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ) {
        parsec_list_item_t *item = parsec_list_pop_front(gpu_stream->fifo_pending);
        parsec_gpu_task_t *task;

        if( NULL == item ) {
            /* Raced to empty between the check and the pop; nothing left. */
            break;
        }

        parsec_list_item_singleton(item);
        task = (parsec_gpu_task_t *)item;

        /* Batch only tasks that would run the exact same kernel on the same device. */
        if( (gpu_task->ec->task_class == task->ec->task_class) &&
            (gpu_task->ec->selected_chore == task->ec->selected_chore) &&
            (gpu_task->ec->selected_device == task->ec->selected_device) ) {
            (void)parsec_list_item_ring_push(&gpu_task->list_item, item);
            how_many++;
        } else {
            /* Keep incompatible tasks in a side ring, preserving pop order. */
            if( NULL == store_back ) {
                store_back = item;
            } else {
                (void)parsec_list_item_ring_push(store_back, item);
            }
        }
    }

    /* Return the incompatible tasks to the pending FIFO in one merge. */
    if( NULL != store_back ) {
        parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back);
    }

    return how_many;
}
246+
172247
/**
 * CUDA chore for the DTD GEMM task: C += A * B via cublasDgemm_v2.
 *
 * When batching is enabled and allowed (see gemm_cuda_task_allows_batch),
 * compatible pending tasks are collected into a ring and all submitted on
 * this stream; the ring head then gets gemm_cuda_complete_batch as its
 * completion stage so the whole ring is completed under one event.
 *
 * @param gpu_device  device executing the task (holds the per-device "1.0"
 *                    constant under the Cu1 info key).
 * @param gpu_task    task to execute; may become the head of a batch ring.
 * @param gpu_stream  CUDA stream wrapper (holds the cuBLAS handle under CuHI).
 * @return PARSEC_HOOK_RETURN_DONE on success, PARSEC_HOOK_RETURN_ERROR if
 *         cublasDgemm_v2 reports failure.
 */
int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device,
                     parsec_gpu_task_t *gpu_task,
                     parsec_gpu_exec_stream_t *gpu_stream)
{
    cublasStatus_t status;
    cublasHandle_t handle;
    double *one_device = NULL;
    parsec_gpu_task_t *current_gpu_task;
    int batch_count = 1;

    /* Try to grow this task into a batch of identical GEMMs. */
    if( gemm_cuda_task_allows_batch(gpu_task) ) {
        batch_count = gemm_cuda_collect_batch(gpu_task, gpu_stream);
        if( batch_count > 1 ) {
            gpu_task->complete_stage = gemm_cuda_complete_batch;
        }
    }

    /* The cuBLAS handle is per-stream, the device-resident 1.0 is per-device. */
    handle = parsec_info_get(&gpu_stream->infos, CuHI);
    assert(NULL != handle);
    one_device = parsec_info_get(&gpu_device->super.infos, Cu1);
    assert(NULL != one_device);

    /* Submit every task in the ring (a singleton ring when not batching). */
    current_gpu_task = gpu_task;
    do {
        double *A, *B, *C;
        int m, n, k, mb, nb, kb;
        parsec_task_t *this_task = current_gpu_task->ec;
        struct timeval start, end, diff;
        double delta;
        double *a_gpu, *b_gpu, *c_gpu;

        parsec_dtd_unpack_args(this_task,
                               &A, &B, &C,
                               &m, &n, &k,
                               &mb, &nb, &kb);
        /* Host tile pointers are unpacked but only device copies are used. */
        (void)A; (void)B; (void)C;

        a_gpu = parsec_dtd_get_dev_ptr(this_task, 0);
        b_gpu = parsec_dtd_get_dev_ptr(this_task, 1);
        c_gpu = parsec_dtd_get_dev_ptr(this_task, 2);

        gettimeofday(&start, NULL);

        /* C = 1.0*A*B + 1.0*C; alpha/beta must live in device memory here
         * (presumably the handle uses CUBLAS_POINTER_MODE_DEVICE — confirm
         * against the handle setup in main). */
        status = cublasDgemm_v2(handle,
                                CUBLAS_OP_N, CUBLAS_OP_N,
                                mb, nb, kb,
                                one_device, a_gpu, mb,
                                b_gpu, kb,
                                one_device, c_gpu, mb);
        gettimeofday(&end, NULL);
        timersub(&end, &start, &diff);
        /* Measures submission latency only; the kernel runs asynchronously. */
        delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6;
        if(verbose) {
            fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n",
                    m, n, k, mb, kb, kb, nb, mb, kb,
                    this_task->taskpool->context->my_rank,
                    gpu_stream->name, delta,
                    batch_count > 1 ? " as part of a batch" : "");
        }

        PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status,
                                { return PARSEC_HOOK_RETURN_ERROR; });

        /* Walk the intrusive ring; wraps back to the head when done. */
        current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next;
    } while( current_gpu_task != gpu_task );

    if( verbose && batch_count > 1 ) {
        fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n",
                batch_count, gpu_stream->name);
    }

    return PARSEC_HOOK_RETURN_DONE;
}
@@ -284,7 +380,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
284380
sizeof(int), PARSEC_VALUE, /* nb */
285381
sizeof(int), PARSEC_VALUE, /* kb */
286382
PARSEC_DTD_ARG_END);
287-
parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CUDA, gemm_kernel_cuda);
383+
parsec_dtd_task_class_add_chore(tp, gemm_tc,
384+
use_cuda_batch ? (PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH) : PARSEC_DEV_CUDA,
385+
gemm_kernel_cuda);
288386
#if defined(HAVE_BLAS)
289387
parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CPU, gemm_kernel_cpu);
290388
#endif
@@ -295,7 +393,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
295393
for( int k = 0; k < A->super.nt; k++ ) {
296394
keyA = A->super.super.data_key(&A->super.super, i, k);
297395
keyB = B->super.super.data_key(&B->super.super, k, j);
298-
parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, device,
396+
parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j,
397+
use_cuda_batch && (PARSEC_DEV_CUDA == device) ?
398+
(device | PARSEC_DEV_CHORE_ALLOW_BATCH) : device,
299399
PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&A->super.super, keyA),
300400
PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&B->super.super, keyB),
301401
k == A->super.nt - 1 ? (PARSEC_INOUT | PARSEC_PUSHOUT) : PARSEC_INOUT,
@@ -494,13 +594,14 @@ int main(int argc, char **argv)
494594
{"device", required_argument, 0, 'd'},
495595
{"nruns", required_argument, 0, 't'},
496596
{"verbose", no_argument, 0, 'v'},
597+
{"batch", no_argument, 0, 'b'},
497598
{"Debug", required_argument, 0, 'D'},
498599
{"Alarm", required_argument, 0, 'A'},
499600
{"help", no_argument, 0, 'h'},
500601
{0, 0, 0, 0}
501602
};
502603

503-
int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vh",
604+
int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vbh",
504605
long_options, &option_index);
505606
if( c == -1 )
506607
break;
@@ -536,6 +637,9 @@ int main(int argc, char **argv)
536637
case 'v':
537638
verbose = !verbose;
538639
break;
640+
case 'b':
641+
use_cuda_batch = 1;
642+
break;
539643
case 'd':
540644
if(strcmp(optarg, "GPU") == 0) {
541645
device=PARSEC_DEV_CUDA;
@@ -574,6 +678,7 @@ int main(int argc, char **argv)
574678
" --mb|-m / --kb/-k / --nb|-n: set mb, kb and nb (resp.)\n"
575679
" --nruns|-t: set the number of runs to do\n"
576680
" --device|-d: which device to use (CPU or GPU)\n"
681+
" --batch|-b: enable CUDA batched GEMM chores\n"
577682
" --verbose|-v: display which GEMM runs on which GPU\n"
578683
" as execution is unfolding\n"
579684
" --help|-h|-?: display this help\n"
@@ -589,7 +694,9 @@ int main(int argc, char **argv)
589694
"\n",
590695
argv[0]);
591696
}
697+
#if defined(PARSEC_HAVE_MPI)
592698
MPI_Finalize();
699+
#endif
593700
exit(0);
594701
}
595702
}
@@ -623,7 +730,9 @@ int main(int argc, char **argv)
623730
rc = !(nbgpus >= 1);
624731
if( rc != 0 ) {
625732
fprintf(stderr, "Rank %d doesn't have CUDA accelerators\n", rank);
733+
#if defined(PARSEC_HAVE_MPI)
626734
MPI_Abort(MPI_COMM_WORLD, 0);
735+
#endif
627736
return -1;
628737
}
629738
gpu_device_index = get_gpu_device_index();

0 commit comments

Comments
 (0)