 #include "parsec/data_dist/matrix/matrix.h"
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 #include "parsec/interfaces/dtd/insert_function_internal.h"
+#include "parsec/mca/device/device.h"
 
 // The file is not compiled if CUDA is not present or CUBLAS is not found
 #include "parsec/mca/device/cuda/device_cuda.h"
@@ -44,6 +45,7 @@ static parsec_info_id_t CuHI = -1;
 static parsec_info_id_t Cu1 = -1;
 static int verbose = 0;
 static int device = PARSEC_DEV_CUDA;
+static int use_cuda_batch = 0;
 static int P = -1;
 static int Q = -1;
 
@@ -169,55 +171,149 @@ int initialize_matrix(parsec_context_t *parsec_context, int rank, parsec_matrix_
     return 0;
 }
 
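+/* A task may join a CUDA batch only when batching was requested on the
+ * command line, the selected incarnation advertises the batch-capable
+ * chore flag, and the selected device type reports batch support. */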
+static int
+gemm_cuda_task_allows_batch(parsec_gpu_task_t *gpu_task)
+{
+    parsec_task_t *this_task = gpu_task->ec;
+    int selected_chore = this_task->selected_chore;
+
+    return use_cuda_batch &&
+           (selected_chore >= 0) &&
+           (this_task->task_class->incarnations[selected_chore].type & PARSEC_DEV_CHORE_ALLOW_BATCH) &&
+           parsec_mca_device_type_supports_batch(this_task->selected_device->type);
+}
+
+static int
+gemm_cuda_complete_batch(parsec_device_gpu_module_t *gpu_device,
+                         parsec_gpu_task_t **gpu_task,
+                         parsec_gpu_exec_stream_t *gpu_stream)
+{
+    parsec_list_item_t *output_stream_ghost = &gpu_device->exec_stream[1]->fifo_pending->ghost_element;
+
+    (void)gpu_stream;
+
+    /* The whole ring was submitted on one CUDA stream and completed under one
+     * event. Move every task in the ring to the output stream so the regular
+     * GPU pop/epilog path still handles ownership, pushout, and completion one
+     * task at a time.
+     */
+    parsec_list_item_ring_merge(output_stream_ghost, &(*gpu_task)->list_item);
+    (*gpu_task)->complete_stage = NULL;
+    *gpu_task = NULL;
+
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
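+/* Drain the stream's pending FIFO and ring up every task that is compatible
+ * with gpu_task (same task class, chore, and device); incompatible tasks are
+ * pushed back onto the FIFO. Returns the number of tasks in the batch ring. */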
+static int
+gemm_cuda_collect_batch(parsec_gpu_task_t *gpu_task,
+                        parsec_gpu_exec_stream_t *gpu_stream)
+{
+    parsec_list_item_t *store_back = NULL;
+    int how_many = 1;
+
+    parsec_list_item_singleton(&gpu_task->list_item);
+    while( !parsec_list_nolock_is_empty(gpu_stream->fifo_pending) ) {
+        parsec_list_item_t *item = parsec_list_pop_front(gpu_stream->fifo_pending);
+        parsec_gpu_task_t *task;
+
+        if( NULL == item ) {
+            break;
+        }
+
+        parsec_list_item_singleton(item);
+        task = (parsec_gpu_task_t *)item;
+
+        if( (gpu_task->ec->task_class == task->ec->task_class) &&
+            (gpu_task->ec->selected_chore == task->ec->selected_chore) &&
+            (gpu_task->ec->selected_device == task->ec->selected_device) ) {
+            (void)parsec_list_item_ring_push(&gpu_task->list_item, item);
+            how_many++;
+        } else {
+            if( NULL == store_back ) {
+                store_back = item;
+            } else {
+                (void)parsec_list_item_ring_push(store_back, item);
+            }
+        }
+    }
+
+    if( NULL != store_back ) {
+        parsec_list_item_ring_merge(&gpu_stream->fifo_pending->ghost_element, store_back);
+    }
+
+    return how_many;
+}
+
 int gemm_kernel_cuda(parsec_device_gpu_module_t *gpu_device,
                      parsec_gpu_task_t *gpu_task,
                      parsec_gpu_exec_stream_t *gpu_stream)
 {
-    double *A, *B, *C;
-    int m, n, k, mb, nb, kb;
-    parsec_task_t *this_task = gpu_task->ec;
     cublasStatus_t status;
     cublasHandle_t handle;
     double *one_device = NULL;
-    struct timeval start, end, diff;
-    double delta;
-    double *a_gpu, *b_gpu, *c_gpu;
+    parsec_gpu_task_t *current_gpu_task;
+    int batch_count = 1;
 
-    (void)gpu_stream;
-    (void)gpu_device;
-
-    parsec_dtd_unpack_args(this_task,
-                           &A, &B, &C,
-                           &m, &n, &k,
-                           &mb, &nb, &kb);
-
-    a_gpu = parsec_dtd_get_dev_ptr(this_task, 0);
-    b_gpu = parsec_dtd_get_dev_ptr(this_task, 1);
-    c_gpu = parsec_dtd_get_dev_ptr(this_task, 2);
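+    /* Try to grow this task into a batch: pull compatible tasks off the
+     * pending FIFO and, if more than one was gathered, install the batched
+     * completion callback so the whole ring retires together. */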
+    if( gemm_cuda_task_allows_batch(gpu_task) ) {
+        batch_count = gemm_cuda_collect_batch(gpu_task, gpu_stream);
+        if( batch_count > 1 ) {
+            gpu_task->complete_stage = gemm_cuda_complete_batch;
+        }
+    }
 
     handle = parsec_info_get(&gpu_stream->infos, CuHI);
     assert(NULL != handle);
     one_device = parsec_info_get(&gpu_device->super.infos, Cu1);
     assert(NULL != one_device);
-    gettimeofday(&start, NULL);
 
-    status = cublasDgemm_v2(handle,
-                            CUBLAS_OP_N, CUBLAS_OP_N,
-                            mb, nb, kb,
-                            one_device, a_gpu, mb,
-                            b_gpu, kb,
-                            one_device, c_gpu, mb);
-    gettimeofday(&end, NULL);
-    timersub(&end, &start, &diff);
-    delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6;
-    if(verbose)
-        fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s\n",
-                m, n, k, mb, kb, kb, nb, mb, kb,
-                this_task->taskpool->context->my_rank,
-                gpu_stream->name, delta);
+    current_gpu_task = gpu_task;
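+    /* Walk the ring of batched tasks (just this task when no batch was
+     * formed) and submit one cuBLAS GEMM per task on this stream. */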
+    do {
+        double *A, *B, *C;
+        int m, n, k, mb, nb, kb;
+        parsec_task_t *this_task = current_gpu_task->ec;
+        struct timeval start, end, diff;
+        double delta;
+        double *a_gpu, *b_gpu, *c_gpu;
 
-    PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status,
-                            { return PARSEC_HOOK_RETURN_ERROR; });
+        parsec_dtd_unpack_args(this_task,
+                               &A, &B, &C,
+                               &m, &n, &k,
+                               &mb, &nb, &kb);
+        (void)A; (void)B; (void)C;
+
+        a_gpu = parsec_dtd_get_dev_ptr(this_task, 0);
+        b_gpu = parsec_dtd_get_dev_ptr(this_task, 1);
+        c_gpu = parsec_dtd_get_dev_ptr(this_task, 2);
+
+        gettimeofday(&start, NULL);
+
+        status = cublasDgemm_v2(handle,
+                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                mb, nb, kb,
+                                one_device, a_gpu, mb,
+                                b_gpu, kb,
+                                one_device, c_gpu, mb);
+        gettimeofday(&end, NULL);
+        timersub(&end, &start, &diff);
+        delta = (double)diff.tv_sec + (double)diff.tv_usec/1e6;
+        if(verbose) {
+            fprintf(stderr, "GEMM(%d, %d, %d) with tiles of %dx%d, %dx%d, %dx%d on node %d, GPU %s submitted in %g s%s\n",
+                    m, n, k, mb, kb, kb, nb, mb, nb,
+                    this_task->taskpool->context->my_rank,
+                    gpu_stream->name, delta,
+                    batch_count > 1 ? " as part of a batch" : "");
+        }
+
+        PARSEC_CUDA_CHECK_ERROR("cublasDgemm_v2", status,
+                                { return PARSEC_HOOK_RETURN_ERROR; });
+
+        current_gpu_task = (parsec_gpu_task_t *)current_gpu_task->list_item.list_next;
+    } while( current_gpu_task != gpu_task );
+
+    if( verbose && batch_count > 1 ) {
+        fprintf(stderr, "Submitted %d batched GEMM tasks on GPU stream %s\n",
+                batch_count, gpu_stream->name);
+    }
 
     return PARSEC_HOOK_RETURN_DONE;
 }
@@ -284,7 +380,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
                                              sizeof(int), PARSEC_VALUE,    /* nb */
                                              sizeof(int), PARSEC_VALUE,    /* kb */
                                              PARSEC_DTD_ARG_END);
-    parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CUDA, gemm_kernel_cuda);
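+    /* Register the CUDA chore; when batching is requested, OR in
+     * PARSEC_DEV_CHORE_ALLOW_BATCH so the runtime may coalesce these tasks. */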
+    parsec_dtd_task_class_add_chore(tp, gemm_tc,
+                                    use_cuda_batch ? (PARSEC_DEV_CUDA | PARSEC_DEV_CHORE_ALLOW_BATCH) : PARSEC_DEV_CUDA,
+                                    gemm_kernel_cuda);
 #if defined(HAVE_BLAS)
     parsec_dtd_task_class_add_chore(tp, gemm_tc, PARSEC_DEV_CPU, gemm_kernel_cpu);
 #endif
@@ -295,7 +393,9 @@ int simple_gemm(parsec_context_t *parsec_context, parsec_matrix_block_cyclic_t *
             for( int k = 0; k < A->super.nt; k++ ) {
                 keyA = A->super.super.data_key(&A->super.super, i, k);
                 keyB = B->super.super.data_key(&B->super.super, k, j);
-                parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt * C->super.nt * A->super.nt - i * C->super.nt + j, device,
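+                /* Mirror the chore registration: tag the task with the batch
+                 * flag only for CUDA execution with batching enabled. */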
+                parsec_dtd_insert_task_with_task_class(tp, gemm_tc, C->super.mt * C->super.nt * A->super.nt - i * C->super.nt + j,
+                                                       use_cuda_batch && (PARSEC_DEV_CUDA == device) ?
+                                                           (device | PARSEC_DEV_CHORE_ALLOW_BATCH) : device,
                                                        PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&A->super.super, keyA),
                                                        PARSEC_INPUT, PARSEC_DTD_TILE_OF_KEY(&B->super.super, keyB),
                                                        k == A->super.nt - 1 ? (PARSEC_INOUT | PARSEC_PUSHOUT) : PARSEC_INOUT,
@@ -494,13 +594,14 @@ int main(int argc, char **argv)
             {"device", required_argument, 0, 'd'},
             {"nruns", required_argument, 0, 't'},
             {"verbose", no_argument, 0, 'v'},
+            {"batch", no_argument, 0, 'b'},
             {"Debug", required_argument, 0, 'D'},
             {"Alarm", required_argument, 0, 'A'},
             {"help", no_argument, 0, 'h'},
             {0, 0, 0, 0}
         };
 
-        int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vh",
+        int c = getopt_long(argc, argv, "M:N:K:m:n:k:P:Q:t:d:D:A:vbh",
                             long_options, &option_index);
         if( c == -1 )
             break;
@@ -536,6 +637,9 @@ int main(int argc, char **argv)
         case 'v':
             verbose = !verbose;
             break;
+        case 'b':
+            use_cuda_batch = 1;
+            break;
         case 'd':
             if(strcmp(optarg, "GPU") == 0) {
                 device = PARSEC_DEV_CUDA;
@@ -574,6 +678,7 @@ int main(int argc, char **argv)
574678 " --mb|-m / --kb/-k / --nb|-n: set mb, kb and nb (resp.)\n"
575679 " --nruns|-t: set the number of runs to do\n"
576680 " --device|-d: which device to use (CPU or GPU)\n"
681+ " --batch|-b: enable CUDA batched GEMM chores\n"
577682 " --verbose|-v: display which GEMM runs on which GPU\n"
578683 " as execution is unfolding\n"
579684 " --help|-h|-?: display this help\n"
@@ -589,7 +694,9 @@ int main(int argc, char **argv)
589694 "\n" ,
590695 argv [0 ]);
591696 }
697+ #if defined(PARSEC_HAVE_MPI )
592698 MPI_Finalize ();
699+ #endif
593700 exit (0 );
594701 }
595702 }
@@ -623,7 +730,9 @@ int main(int argc, char **argv)
     rc = !(nbgpus >= 1);
     if( rc != 0 ) {
         fprintf(stderr, "Rank %d doesn't have CUDA accelerators\n", rank);
+#if defined(PARSEC_HAVE_MPI)
         MPI_Abort(MPI_COMM_WORLD, 0);
+#endif
         return -1;
     }
     gpu_device_index = get_gpu_device_index();