@@ -500,13 +500,17 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
500500 CeedInt comp_stride;
501501
502502 CeedCallBackend (CeedElemRestrictionGetLVectorSize (elem_rstr, &l_size));
503+ code << tab << " if (e < num_elem) {\n " ;
504+ tab.push ();
503505 code << tab << " const CeedInt l_size" << var_suffix << " = " << l_size << " ;\n " ;
504506 CeedCallBackend (CeedElemRestrictionGetCompStride (elem_rstr, &comp_stride));
505- code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n " ;
507+ code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n\n " ;
506508 data->indices .outputs [i] = (CeedInt *)rstr_data->d_offsets ;
507509 code << tab << " WriteLVecStandard" << (is_all_tensor ? max_dim : 1 ) << " d<num_comp" << var_suffix << " , comp_stride" << var_suffix << " , "
508510 << P_name << " >(data, l_size" << var_suffix << " , elem, indices.outputs[" << i << " ], r_e" << var_suffix << " , d" << var_suffix
509511 << " );\n " ;
512+ tab.pop ();
513+ code << tab << " }\n " ;
510514 break ;
511515 }
512516 case CEED_RESTRICTION_STRIDED: {
@@ -520,11 +524,15 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
520524 if (!has_backend_strides) {
521525 CeedCallBackend (CeedElemRestrictionGetStrides (elem_rstr, strides));
522526 }
527+ code << tab << " if (e < num_elem) {\n " ;
528+ tab.push ();
523529 code << tab << " const CeedInt strides" << var_suffix << " _0 = " << strides[0 ] << " , strides" << var_suffix << " _1 = " << strides[1 ]
524- << " , strides" << var_suffix << " _2 = " << strides[2 ] << " ;\n " ;
530+ << " , strides" << var_suffix << " _2 = " << strides[2 ] << " ;\n\n " ;
525531 code << tab << " WriteLVecStrided" << (is_all_tensor ? max_dim : 1 ) << " d<num_comp" << var_suffix << " , " << P_name << " , strides"
526532 << var_suffix << " _0, strides" << var_suffix << " _1, strides" << var_suffix << " _2>(data, elem, r_e" << var_suffix << " , d" << var_suffix
527533 << " );\n " ;
534+ tab.pop ();
535+ code << tab << " }\n " ;
528536 break ;
529537 }
530538 case CEED_RESTRICTION_POINTS:
@@ -1060,10 +1068,14 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
10601068 CeedCallBackend (CeedOperatorFieldGetElemRestriction (op_output_fields[i], &elem_rstr));
10611069 CeedCallBackend (CeedElemRestrictionGetCompStride (elem_rstr, &comp_stride));
10621070 CeedCallBackend (CeedElemRestrictionDestroy (&elem_rstr));
1063- code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n " ;
1071+ code << tab << " if (e < num_elem) {\n " ;
1072+ tab.push ();
1073+ code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n\n " ;
10641074 code << tab << " WritePoint<num_comp" << var_suffix << " , comp_stride" << var_suffix
10651075 << " , max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << " ]"
10661076 << " , r_s" << var_suffix << " , d" << var_suffix << " );\n " ;
1077+ tab.pop ();
1078+ code << tab << " }\n " ;
10671079 break ;
10681080 }
10691081 case CEED_EVAL_INTERP:
@@ -1495,8 +1507,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
14951507 // Loop over all elements
14961508 code << " \n " << tab << " // Element loop\n " ;
14971509 code << tab << " __syncthreads();\n " ;
1498- code << tab << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n " ;
1510+ code << tab << " #if CEED_HIP_USE_CHIPSTAR\n " ;
1511+ code << tab << " // Pad out elements so all threads hit syncthreads()\n " ;
1512+ code << tab << " const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n " ;
1513+ code << tab << " #else\n " ;
1514+ code << tab << " const CeedInt elem_loop_bound = num_elem;\n\n " ;
1515+ code << tab << " #endif\n " ;
1516+ code << tab << " for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n " ;
14991517 tab.push ();
1518+ code << tab << " const CeedInt elem = e % num_elem;\n\n " ;
15001519
15011520 // -- Compute minimum buffer space needed
15021521 CeedInt max_rstr_buffer_size = 1 ;
@@ -1853,8 +1872,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
18531872 // Loop over all elements
18541873 code << " \n " << tab << " // Element loop\n " ;
18551874 code << tab << " __syncthreads();\n " ;
1856- code << tab << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n " ;
1875+ code << tab << " #if CEED_HIP_USE_CHIPSTAR\n " ;
1876+ code << tab << " // Pad out elements so all threads hit syncthreads()\n " ;
1877+ code << tab << " const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n " ;
1878+ code << tab << " #else\n " ;
1879+ code << tab << " const CeedInt elem_loop_bound = num_elem;\n\n " ;
1880+ code << tab << " #endif\n " ;
1881+ code << tab << " for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n " ;
18571882 tab.push ();
1883+ code << tab << " const CeedInt elem = e % num_elem;\n\n " ;
18581884
18591885 // -- Compute minimum buffer space needed
18601886 CeedInt max_rstr_buffer_size = 1 ;
@@ -2047,11 +2073,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
20472073
20482074 CeedCallBackend (CeedOperatorFieldGetElemRestriction (op_output_fields[i], &elem_rstr));
20492075 CeedCallBackend (CeedElemRestrictionGetLVectorSize (elem_rstr, &l_size));
2076+ code << tab << " if (e < num_elem) {\n " ;
2077+ tab.push ();
20502078 code << tab << " const CeedInt l_size" << var_suffix << " = " << l_size << " ;\n " ;
20512079 CeedCallBackend (CeedElemRestrictionGetCompStride (elem_rstr, &comp_stride));
2052- code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n " ;
2080+ code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n\n " ;
20532081 code << tab << " WriteLVecStandard" << max_dim << " d_Assembly<num_comp" << var_suffix << " , comp_stride" << var_suffix << " , P_1d" + var_suffix
20542082 << " >(data, l_size" << var_suffix << " , elem, n, r_e" << var_suffix << " , values_array);\n " ;
2083+ tab.pop ();
2084+ code << tab << " }\n " ;
20552085 CeedCallBackend (CeedElemRestrictionDestroy (&elem_rstr));
20562086 } else {
20572087 std::string var_suffix = " _out_" + std::to_string (i);
@@ -2061,11 +2091,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
20612091
20622092 CeedCallBackend (CeedOperatorFieldGetElemRestriction (op_output_fields[i], &elem_rstr));
20632093 CeedCallBackend (CeedElemRestrictionGetLVectorSize (elem_rstr, &l_size));
2094+ code << tab << " if (e < num_elem) {\n " ;
2095+ tab.push ();
20642096 code << tab << " const CeedInt l_size" << var_suffix << " = " << l_size << " ;\n " ;
20652097 CeedCallBackend (CeedElemRestrictionGetCompStride (elem_rstr, &comp_stride));
2066- code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n " ;
2098+ code << tab << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << " ;\n\n " ;
20672099 code << tab << " WriteLVecStandard" << max_dim << " d_Single<num_comp" << var_suffix << " , comp_stride" << var_suffix << " , P_1d" + var_suffix
20682100 << " >(data, l_size" << var_suffix << " , elem, n, indices.outputs[" << i << " ], r_e" << var_suffix << " , values_array);\n " ;
2101+ tab.pop ();
2102+ code << tab << " }\n " ;
20692103 CeedCallBackend (CeedElemRestrictionDestroy (&elem_rstr));
20702104 }
20712105 }
@@ -2638,8 +2672,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
26382672 // ---- Restriction
26392673 CeedInt field_size;
26402674
2675+ code << tab << " if (e < num_elem) {\n " ;
2676+ tab.push ();
26412677 code << tab << " WriteLVecStandard" << (is_all_tensor ? max_dim : 1 ) << " d_QFAssembly<total_size_out, field_size_out_" << i << " , "
26422678 << (is_all_tensor ? " Q_1d" : " Q" ) << " >(data, num_elem, elem, input_offset + s, " << offset << " , r_q_out_" << i << " , values_array);\n " ;
2679+ tab.pop ();
2680+ code << tab << " }\n " ;
26432681 CeedCallBackend (CeedQFunctionFieldGetSize (qf_output_fields[i], &field_size));
26442682 offset += field_size;
26452683 }
0 commit comments