Skip to content

Commit 41fa201

Browse files
committed
Enable batched chores across device types
Replace the CUDA-specific batch build switch with PARSEC_HAVE_DEV_CAPABILITY_BATCH so batching is a runtime capability shared by all supported device types. Export the new option through parsec_options and PaRSECConfig. Add per-device MCA parameters to disable batching for CPU, recursive, CUDA, HIP, and Level Zero devices. Use shared helpers to sanitize batch chore types in DTD and to gate GPU task-ring batching on the selected device. Teach PTG to accept batch=true for CPU/default bodies as well as typed device bodies, and add CPU batch examples for both PTG and DTD with ctest coverage for the enabled and CPU-disabled DTD paths. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent 1470eba commit 41fa201

2 files changed

Lines changed: 147 additions & 38 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ include(GNUInstallDirs)
1919
# When making a backward compatible addition to the API
2020
# PARSEC_VERSION_MAJOR does not change
2121
# PARSEC_VERSION_MINOR increases by 1
22-
# When making a backward incompabilte change to an API (or exposed structure)
22+
# When making a backward incompatible change to an API (or exposed structure)
2323
# PARSEC_VERSION_MAJOR increases by 1
2424
# PARSEC_VERSION_MINOR resets to 0
2525
# Unlike strict libtool numbering, PARSEC_VERSION_RELEASE is an monotonous

tests/dsl/dtd/dtd_test_simple_gemm.c

Lines changed: 146 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "parsec/data_dist/matrix/matrix.h"
1111
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
1212
#include "parsec/interfaces/dtd/insert_function_internal.h"
13+
#include "parsec/mca/device/device.h"
1314

1415
// The file is not compiled if CUDA is not present or CUBLAS is not found
1516
#include "parsec/mca/device/cuda/device_cuda.h"
@@ -44,6 +45,7 @@ static parsec_info_id_t CuHI = -1;
4445
static parsec_info_id_t Cu1 = -1;
4546
static int verbose = 0;
4647
static int device = PARSEC_DEV_CUDA;
48+
static int use_cuda_batch = 0;
4749
static int P = -1;
4850
static int Q = -1;
4951

@@ -169,55 +171,149 @@ int initialize_matrix(parsec_context_t *parsec_context, int rank, parsec_matrix_
169171
return 0;
170172
}
171173

174+
static int
175+
gemm_cuda_task_allows_batch(parsec_gpu_task_t *gpu_task)
176+
{
177+
parsec_task_t *this_task = gpu_task->ec;
178+
int selected_chore = this_task->selected_chore;
179+
180+
return use_cuda_batch &&
181+
(selected_chore >= 0) &&
182+
(this_task->task_class->incarnations[selected_chore].type & PARSEC_DEV_CHORE_ALLOW_BATCH) &&
183+
parsec_mca_device_type_supports_batch(this_task->selected_device->type);
184+
}
185+
186+
/**
 * Completion stage installed on the head task of a batched ring.
 *
 * @param gpu_device  device that executed the batch; its exec_stream[1] is
 *                    presumably the dedicated output stream — TODO confirm
 *                    against the PaRSEC GPU engine's stream layout.
 * @param gpu_task    in/out: head of the ring of batched tasks; set to NULL
 *                    so the caller does not complete it a second time.
 * @param gpu_stream  unused (the whole ring completed under one event).
 * @return PARSEC_HOOK_RETURN_DONE always.
 */
static int
gemm_cuda_complete_batch(parsec_device_gpu_module_t *gpu_device,
                         parsec_gpu_task_t **gpu_task,
                         parsec_gpu_exec_stream_t *gpu_stream)
{
    parsec_list_item_t *output_stream_ghost = &gpu_device->exec_stream[1]->fifo_pending->ghost_element;

    (void)gpu_stream;

    /* The whole ring was submitted on one CUDA stream and completed under one
     * event. Move every task in the ring to the output stream so the regular
     * GPU pop/epilog path still handles ownership, pushout, and completion one
     * task at a time.
     */
    parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item);
    /* Clear the stage so the head task cannot re-enter this completion path. */
    (*gpu_task)->complete_stage = NULL;
    *gpu_task = NULL;

    return PARSEC_HOOK_RETURN_DONE;
}
206+
207+
/**
 * Drain the stream's pending FIFO and gather compatible tasks into a ring
 * rooted at gpu_task.
 *
 * A pending task is "compatible" when it shares the same task class, the
 * same selected chore index, and the same selected device as gpu_task.
 * Incompatible tasks are collected aside and pushed back onto the pending
 * FIFO (as one merged ring) so they are not lost.
 *
 * @param gpu_task    head task; turned into a singleton then grown into a ring.
 * @param gpu_stream  stream whose fifo_pending is drained.
 * @return number of tasks in the resulting ring (>= 1, counting gpu_task).
 *
 * NOTE(review): the loop mixes parsec_list_nolock_is_empty with the locked
 * parsec_list_pop_front — presumably safe because only this thread feeds the
 * stream's FIFO at this point; verify against the GPU engine's threading model.
 */
static int
gemm_cuda_collect_batch(parsec_gpu_task_t *gpu_task,
                        parsec_gpu_exec_stream_t *gpu_stream)
{
    parsec_list_item_t *store_back = NULL;  /* ring of incompatible tasks to restore */
    int how_many = 1;                       /* gpu_task itself counts */

    parsec_list_item_singleton(&gpu_task->list_item);
    while( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ) {
        parsec_list_item_t *item = parsec_list_pop_front(gpu_stream->fifo_pending);
        parsec_gpu_task_t *task;

        if( NULL == item ) {
            /* Raced to empty between the check and the pop; nothing left. */
            break;
        }

        parsec_list_item_singleton(item);
        task = (parsec_gpu_task_t *)item;

        /* Batch only tasks that would run the exact same kernel on the same device. */
        if( (gpu_task->ec->task_class == task->ec->task_class) &&
            (gpu_task->ec->selected_chore == task->ec->selected_chore) &&
            (gpu_task->ec->selected_device == task->ec->selected_device) ) {
            (void)parsec_list_item_ring_push(&gpu_task->list_item, item);
            how_many++;
        } else {
            /* Keep incompatible tasks in a side ring, preserving pop order. */
            if( NULL == store_back ) {
                store_back = item;
            } else {
                (void)parsec_list_item_ring_push(store_back, item);
            }
        }
    }

    /* Return the incompatible tasks to the pending FIFO in one merge. */
    if( NULL != store_back ) {
        parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back);
    }

    return how_many;
}
246+
172247
/**
 * CUDA chore for the DTD GEMM task: C += A * B via cublasDgemm_v2.
 *
 * When batching is enabled and allowed (see gemm_cuda_task_allows_batch),
 * compatible pending tasks are collected into a ring and all submitted on
 * this stream; the ring head then gets gemm_cuda_complete_batch as its
 * completion stage so the whole ring is completed under one event.
 *
 * @param gpu_device  device executing the task (holds the per-device "1.0"
 *                    constant under the Cu1 info key).
 * @param gpu_task    task to execute; may become the head of a batch ring.
 * @param gpu_stream  CUDA stream wrapper (holds the cuBLAS handle under CuHI).
 * @return PARSEC_HOOK_RETURN_DONE on success, PARSEC_HOOK_RETURN_ERROR if
 *         cublasDgemm_v2 reports failure.
 */
int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device,
                     parsec_gpu_task_t *gpu_task,
                     parsec_gpu_exec_stream_t *gpu_stream)
{
    cublasStatus_t status;
    cublasHandle_t handle;
    double *one_device = NULL;
    parsec_gpu_task_t *current_gpu_task;
    int batch_count = 1;

    /* Try to grow this task into a batch of identical GEMMs. */
    if( gemm_cuda_task_allows_batch(gpu_task) ) {
        batch_count = gemm_cuda_collect_batch(gpu_task, gpu_stream);
        if( batch_count > 1 ) {
            gpu_task->complete_stage = gemm_cuda_complete_batch;
        }
    }

    /* The cuBLAS handle is per-stream, the device-resident 1.0 is per-device. */
    handle = parsec_info_get(&gpu_stream->infos, CuHI);
    assert(NULL != handle);
    one_device = parsec_info_get(&gpu_device->super.infos, Cu1);
    assert(NULL != one_device);

    /* Submit every task in the ring (a singleton ring when not batching). */
    current_gpu_task = gpu_task;
    do {
        double *A, *B, *C;
        int m, n, k, mb, nb, kb;
        parsec_task_t *this_task = current_gpu_task->ec;
        struct timeval start, end, diff;
        double delta;
        double *a_gpu, *b_gpu, *c_gpu;

        parsec_dtd_unpack_args(this_task,
                               &A, &B, &C,
                               &m, &n, &k,
                               &mb, &nb, &kb);
        /* Host tile pointers are unpacked but only device copies are used. */
        (void)A; (void)B; (void)C;

        a_gpu = parsec_dtd_get_dev_ptr(this_task, 0);
        b_gpu = parsec_dtd_get_dev_ptr(this_task, 1);
        c_gpu = parsec_dtd_get_dev_ptr(this_task, 2);

        gettimeofday(&start, NULL);

        /* C = 1.0*A*B + 1.0*C; alpha/beta must live in device memory here
         * (presumably the handle uses CUBLAS_POINTER_MODE_DEVICE — confirm
         * against the handle setup in main). */
        status = cublasDgemm_v2(handle,
                                CUBLAS_OP_N, CUBLAS_OP_N,
                                mb, nb, kb,
                                one_device, a_gpu, mb,
                                b_gpu, kb,
                                one_device, c_gpu, mb);
        gettimeofday(&end, NULL);
        timersub(&end, &start, &diff);
        /* Measures submission latency only; the kernel runs asynchronously. */
        delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6;
        if(verbose) {
            fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n",
                    m, n, k, mb, kb, kb, nb, mb, kb,
                    this_task->taskpool->context->my_rank,
                    gpu_stream->name, delta,
                    batch_count > 1 ? " as part of a batch" : "");
        }

        PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status,
                                { return PARSEC_HOOK_RETURN_ERROR; });

        /* Walk the intrusive ring; wraps back to the head when done. */
        current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next;
    } while( current_gpu_task != gpu_task );

    if( verbose && batch_count > 1 ) {
        fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n",
                batch_count, gpu_stream->name);
    }

    return PARSEC_HOOK_RETURN_DONE;
}
@@ -284,7 +380,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
284380
sizeof(int), PARSEC_VALUE, /* nb */
285381
sizeof(int), PARSEC_VALUE, /* kb */
286382
PARSEC_DTD_ARG_END);
287-
parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CUDA, gemm_kernel_cuda);
383+
parsec_dtd_task_class_add_chore(tp, gemm_tc,
384+
use_cuda_batch ? (PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH) : PARSEC_DEV_CUDA,
385+
gemm_kernel_cuda);
288386
#if defined(HAVE_BLAS)
289387
parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CPU, gemm_kernel_cpu);
290388
#endif
@@ -295,7 +393,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
295393
for( int k = 0; k < A->super.nt; k++ ) {
296394
keyA = A->super.super.data_key(&A->super.super, i, k);
297395
keyB = B->super.super.data_key(&B->super.super, k, j);
298-
parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j, device,
396+
parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt*C->super.nt*A->super.nt - i*C->super.nt + j,
397+
use_cuda_batch && (PARSEC_DEV_CUDA == device) ?
398+
(device | PARSEC_DEV_CHORE_ALLOW_BATCH) : device,
299399
PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&A->super.super, keyA),
300400
PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&B->super.super, keyB),
301401
k == A->super.nt - 1 ? (PARSEC_INOUT | PARSEC_PUSHOUT) : PARSEC_INOUT,
@@ -494,13 +594,14 @@ int main(int argc, char **argv)
494594
{"device", required_argument, 0, 'd'},
495595
{"nruns", required_argument, 0, 't'},
496596
{"verbose", no_argument, 0, 'v'},
597+
{"batch", no_argument, 0, 'b'},
497598
{"Debug", required_argument, 0, 'D'},
498599
{"Alarm", required_argument, 0, 'A'},
499600
{"help", no_argument, 0, 'h'},
500601
{0, 0, 0, 0}
501602
};
502603

503-
int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vh",
604+
int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vbh",
504605
long_options, &option_index);
505606
if( c == -1 )
506607
break;
@@ -536,6 +637,9 @@ int main(int argc, char **argv)
536637
case 'v':
537638
verbose = !verbose;
538639
break;
640+
case 'b':
641+
use_cuda_batch = 1;
642+
break;
539643
case 'd':
540644
if(strcmp(optarg, "GPU") == 0) {
541645
device=PARSEC_DEV_CUDA;
@@ -574,6 +678,7 @@ int main(int argc, char **argv)
574678
" --mb|-m / --kb/-k / --nb|-n: set mb, kb and nb (resp.)\n"
575679
" --nruns|-t: set the number of runs to do\n"
576680
" --device|-d: which device to use (CPU or GPU)\n"
681+
" --batch|-b: enable CUDA batched GEMM chores\n"
577682
" --verbose|-v: display which GEMM runs on which GPU\n"
578683
" as execution is unfolding\n"
579684
" --help|-h|-?: display this help\n"
@@ -589,7 +694,9 @@ int main(int argc, char **argv)
589694
"\n",
590695
argv[0]);
591696
}
697+
#if defined(PARSEC_HAVE_MPI)
592698
MPI_Finalize();
699+
#endif
593700
exit(0);
594701
}
595702
}
@@ -623,7 +730,9 @@ int main(int argc, char **argv)
623730
rc = !(nbgpus >= 1);
624731
if( rc != 0 ) {
625732
fprintf(stderr, "Rank %d doesn't have CUDA accelerators\n", rank);
733+
#if defined(PARSEC_HAVE_MPI)
626734
MPI_Abort(MPI_COMM_WORLD, 0);
735+
#endif
627736
return -1;
628737
}
629738
gpu_device_index = get_gpu_device_index();

0 commit comments

Comments
 (0)