@@ -1073,9 +1073,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op,
10731073 CeedInt strides [3 ] = {1 , num_elem * Q , Q }; /* *NOPAD* */
10741074
10751075 // Create output restriction
1076- CeedCallBackend (CeedElemRestrictionCreateStrided (ceed_parent , num_elem , Q , num_active_in * num_active_out ,
1077- (CeedSize )num_active_in * (CeedSize )num_active_out * (CeedSize )num_elem * (CeedSize )Q , strides ,
1078- rstr ));
1076+ CeedCallBackend (CeedElemRestrictionCreateStrided (ceed_parent , num_elem , Q , num_active_in * num_active_out , l_size , strides , rstr ));
10791077 // Create assembled vector
10801078 CeedCallBackend (CeedVectorCreate (ceed_parent , l_size , assembled ));
10811079 }
@@ -1534,12 +1532,12 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
15341532 CeedInt num_input_fields , num_output_fields , num_eval_modes_in = 0 , num_eval_modes_out = 0 ;
15351533 CeedInt elem_size_in , num_qpts_in = 0 , num_comp_in , elem_size_out , num_qpts_out , num_comp_out ;
15361534 CeedSize num_output_components ;
1537- CeedSize eval_mode_offset_in = 0 , eval_mode_offset_out = 0 ;
15381535 const CeedScalar * h_B_in , * h_B_out ;
15391536 CeedElemRestriction rstr_in = NULL , rstr_out = NULL ;
15401537 CeedBasis basis_in = NULL , basis_out = NULL ;
15411538 CeedOperatorField * input_fields , * output_fields ;
15421539 CeedOperator_Cuda * impl ;
1540+ char * eval_mode_offsets_in_str , * eval_mode_offsets_out_str ;
15431541
15441542 CeedCallBackend (CeedOperatorGetCeed (op , & ceed ));
15451543 CeedCallBackend (CeedOperatorGetData (op , & impl ));
@@ -1573,14 +1571,16 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
15731571 impl -> num_blocks_out = num_active_bases_out ;
15741572 }
15751573
1576- rstr_in = active_rstrs_in [active_input ];
1577- basis_in = active_bases_in [active_input ];
1578- eval_mode_offset_in = eval_modes_offsets_in [active_input ][0 ];
1579- h_B_in = B_mats_in [active_input ];
1580- rstr_out = active_rstrs_out [active_output ];
1581- basis_out = active_bases_out [active_output ];
1582- eval_mode_offset_out = eval_modes_offsets_out [active_output ][0 ];
1583- h_B_out = B_mats_out [active_output ];
1574+ rstr_in = active_rstrs_in [active_input ];
1575+ basis_in = active_bases_in [active_input ];
1576+ CeedCallBackend (CeedBuildArrayConstantSize_Cuda (ceed , "EVAL_MODE_OFFSETS_IN" , num_eval_modes_in , eval_modes_offsets_in [active_input ],
1577+ & eval_mode_offsets_in_str ));
1578+ h_B_in = B_mats_in [active_input ];
1579+ rstr_out = active_rstrs_out [active_output ];
1580+ basis_out = active_bases_out [active_output ];
1581+ CeedCallBackend (CeedBuildArrayConstantSize_Cuda (ceed , "EVAL_MODE_OFFSETS_OUT" , num_eval_modes_out , eval_modes_offsets_out [active_output ],
1582+ & eval_mode_offsets_out_str ));
1583+ h_B_out = B_mats_out [active_output ];
15841584 }
15851585
15861586 CeedCallBackend (CeedElemRestrictionGetElementSize (rstr_in , & elem_size_in ));
@@ -1611,12 +1611,12 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
16111611 const char assembly_kernel_source [] = "// Full assembly source\n#include <ceed/jit-source/cuda/cuda-ref-operator-assemble-block.h>\n" ;
16121612 CeedCallBackend (CeedElemRestrictionGetNumComponents (rstr_in , & num_comp_in ));
16131613 CeedCallBackend (CeedElemRestrictionGetNumComponents (rstr_out , & num_comp_out ));
1614- CeedCallBackend (CeedCompile_Cuda (ceed , assembly_kernel_source , & asmb -> module , 13 , "NUM_EVAL_MODES_IN" , num_eval_modes_in , "NUM_EVAL_MODES_OUT" ,
1615- num_eval_modes_out , "EVAL_MODE_OFFSET_IN " , eval_mode_offset_in , "EVAL_MODE_OFFSET_OUT " , eval_mode_offset_out ,
1616- "NUM_COMP_IN " , num_comp_in , "NUM_COMP_OUT " , num_comp_out , "TOTAL_NUM_COMP_OUT " , num_output_components ,
1617- "NUM_NODES_IN" , elem_size_in , "NUM_NODES_OUT" , elem_size_out , "NUM_QPTS" , num_qpts_in , "BLOCK_SIZE" ,
1618- asmb -> block_size_x * asmb -> block_size_y * asmb -> elems_per_block , "BLOCK_SIZE_Y" , asmb -> block_size_y ,
1619- "USE_CEEDSIZE" , use_ceedsize_idx ));
1614+ CeedCallBackend (CeedCompileExtra_Cuda (ceed , assembly_kernel_source , & asmb -> module , 11 , 2 , "NUM_EVAL_MODES_IN" , num_eval_modes_in ,
1615+ "NUM_EVAL_MODES_OUT" , num_eval_modes_out , "NUM_COMP_IN " , num_comp_in , "NUM_COMP_OUT " , num_comp_out ,
1616+ "TOTAL_NUM_COMP_OUT " , num_output_components , "NUM_NODES_IN " , elem_size_in , "NUM_NODES_OUT " , elem_size_out ,
1617+ "NUM_QPTS" , num_qpts_in , "BLOCK_SIZE" , asmb -> block_size_x * asmb -> block_size_y * asmb -> elems_per_block ,
1618+ "BLOCK_SIZE_Y" , asmb -> block_size_y , "USE_CEEDSIZE" , use_ceedsize_idx , eval_mode_offsets_in_str ,
1619+ eval_mode_offsets_out_str ));
16201620 CeedCallBackend (CeedGetKernel_Cuda (ceed , asmb -> module , "LinearAssembleBlock" , & asmb -> LinearAssemble ));
16211621
16221622 // Load into B_in, in order that they will be used in eval_modes_in
@@ -1634,6 +1634,8 @@ static int CeedOperatorAssembleSingleBlockSetup_Cuda(CeedOperator op, CeedInt ac
16341634 CeedCallCuda (ceed , cudaMalloc ((void * * )& asmb -> d_B_out , out_bytes ));
16351635 CeedCallCuda (ceed , cudaMemcpy (asmb -> d_B_out , h_B_out , out_bytes , cudaMemcpyHostToDevice ));
16361636 }
1637+ CeedCallBackend (CeedFree (& eval_mode_offsets_in_str ));
1638+ CeedCallBackend (CeedFree (& eval_mode_offsets_out_str ));
16371639 CeedCallBackend (CeedDestroy (& ceed ));
16381640 return CEED_ERROR_SUCCESS ;
16391641}
0 commit comments