Skip to content

Commit 59b5803

Browse files
gpu - pad out elem loop for shared/gen (#1950)
* gpu - pad out elem loop for shared/gen * typo - fix bad copypasta Co-authored-by: Zach Atkins <zach.atkins@colorado.edu> * cuda - don't padd threads on CUDA * hip - fix element loop bound * hip - set Chipstar modifications off by default * hip - comment on logic * hip - move chipstar jit macro definition --------- Co-authored-by: Zach Atkins <zach.atkins@colorado.edu>
1 parent 2cbb475 commit 59b5803

6 files changed

Lines changed: 542 additions & 118 deletions

File tree

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -473,13 +473,17 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
473473
CeedInt comp_stride;
474474

475475
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
476+
code << tab << "{\n";
477+
tab.push();
476478
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
477479
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
478-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
480+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
479481
data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
480482
code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
481483
<< P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
482484
<< ");\n";
485+
tab.pop();
486+
code << tab << "}\n";
483487
break;
484488
}
485489
case CEED_RESTRICTION_STRIDED: {
@@ -493,11 +497,15 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
493497
if (!has_backend_strides) {
494498
CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
495499
}
500+
code << tab << "{\n";
501+
tab.push();
496502
code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
497-
<< ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
503+
<< ", strides" << var_suffix << "_2 = " << strides[2] << ";\n\n";
498504
code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
499505
<< var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
500506
<< ");\n";
507+
tab.pop();
508+
code << tab << "}\n";
501509
break;
502510
}
503511
case CEED_RESTRICTION_POINTS:
@@ -1033,10 +1041,14 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
10331041
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
10341042
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
10351043
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1036-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
1044+
code << tab << "{\n";
1045+
tab.push();
1046+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
10371047
code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
10381048
<< ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
10391049
<< ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
1050+
tab.pop();
1051+
code << tab << "}\n";
10401052
break;
10411053
}
10421054
case CEED_EVAL_INTERP:
@@ -1848,7 +1860,7 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
18481860
// Loop over all elements
18491861
code << "\n" << tab << "// Element loop\n";
18501862
code << tab << "__syncthreads();\n";
1851-
code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1863+
code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; e < num_elem; elem += gridDim.x*blockDim.z) {\n";
18521864
tab.push();
18531865

18541866
// -- Compute minimum buffer space needed
@@ -2042,11 +2054,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
20422054

20432055
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20442056
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2057+
code << tab << "{\n";
2058+
tab.push();
20452059
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20462060
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2047-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2061+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
20482062
code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20492063
<< ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2064+
tab.pop();
2065+
code << tab << "}\n";
20502066
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20512067
} else {
20522068
std::string var_suffix = "_out_" + std::to_string(i);
@@ -2056,11 +2072,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
20562072

20572073
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20582074
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2075+
code << tab << "{\n";
2076+
tab.push();
20592077
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20602078
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2061-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2079+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
20622080
code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20632081
<< ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2082+
tab.pop();
2083+
code << tab << "}\n";
20642084
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20652085
}
20662086
}
@@ -2642,8 +2662,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
26422662
// ---- Restriction
26432663
CeedInt field_size;
26442664

2665+
code << tab << "{\n";
2666+
tab.push();
26452667
code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
26462668
<< (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2669+
tab.pop();
2670+
code << tab << "}\n";
26472671
CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
26482672
offset += field_size;
26492673
}

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -500,13 +500,17 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
500500
CeedInt comp_stride;
501501

502502
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
503+
code << tab << "if (e < num_elem) {\n";
504+
tab.push();
503505
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
504506
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
505-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
507+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
506508
data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
507509
code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
508510
<< P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
509511
<< ");\n";
512+
tab.pop();
513+
code << tab << "}\n";
510514
break;
511515
}
512516
case CEED_RESTRICTION_STRIDED: {
@@ -520,11 +524,15 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
520524
if (!has_backend_strides) {
521525
CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
522526
}
527+
code << tab << "if (e < num_elem) {\n";
528+
tab.push();
523529
code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
524-
<< ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
530+
<< ", strides" << var_suffix << "_2 = " << strides[2] << ";\n\n";
525531
code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
526532
<< var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
527533
<< ");\n";
534+
tab.pop();
535+
code << tab << "}\n";
528536
break;
529537
}
530538
case CEED_RESTRICTION_POINTS:
@@ -1060,10 +1068,14 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
10601068
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
10611069
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
10621070
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1063-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
1071+
code << tab << "if (e < num_elem) {\n";
1072+
tab.push();
1073+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
10641074
code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
10651075
<< ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
10661076
<< ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
1077+
tab.pop();
1078+
code << tab << "}\n";
10671079
break;
10681080
}
10691081
case CEED_EVAL_INTERP:
@@ -1495,8 +1507,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
14951507
// Loop over all elements
14961508
code << "\n" << tab << "// Element loop\n";
14971509
code << tab << "__syncthreads();\n";
1498-
code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1510+
code << tab << "#if CEED_HIP_USE_CHIPSTAR\n";
1511+
code << tab << "// Pad out elements so all threads hit syncthreads()\n";
1512+
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1513+
code << tab << "#else\n";
1514+
code << tab << "const CeedInt elem_loop_bound = num_elem;\n\n";
1515+
code << tab << "#endif\n";
1516+
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
14991517
tab.push();
1518+
code << tab << "const CeedInt elem = e % num_elem;\n\n";
15001519

15011520
// -- Compute minimum buffer space needed
15021521
CeedInt max_rstr_buffer_size = 1;
@@ -1853,8 +1872,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
18531872
// Loop over all elements
18541873
code << "\n" << tab << "// Element loop\n";
18551874
code << tab << "__syncthreads();\n";
1856-
code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1875+
code << tab << "#if CEED_HIP_USE_CHIPSTAR\n";
1876+
code << tab << "// Pad out elements so all threads hit syncthreads()\n";
1877+
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1878+
code << tab << "#else\n";
1879+
code << tab << "const CeedInt elem_loop_bound = num_elem;\n\n";
1880+
code << tab << "#endif\n";
1881+
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
18571882
tab.push();
1883+
code << tab << "const CeedInt elem = e % num_elem;\n\n";
18581884

18591885
// -- Compute minimum buffer space needed
18601886
CeedInt max_rstr_buffer_size = 1;
@@ -2047,11 +2073,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
20472073

20482074
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20492075
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2076+
code << tab << "if (e < num_elem) {\n";
2077+
tab.push();
20502078
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20512079
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2052-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2080+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
20532081
code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20542082
<< ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2083+
tab.pop();
2084+
code << tab << "}\n";
20552085
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20562086
} else {
20572087
std::string var_suffix = "_out_" + std::to_string(i);
@@ -2061,11 +2091,15 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
20612091

20622092
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20632093
CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2094+
code << tab << "if (e < num_elem) {\n";
2095+
tab.push();
20642096
code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20652097
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2066-
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2098+
code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n\n";
20672099
code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20682100
<< ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2101+
tab.pop();
2102+
code << tab << "}\n";
20692103
CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20702104
}
20712105
}
@@ -2638,8 +2672,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
26382672
// ---- Restriction
26392673
CeedInt field_size;
26402674

2675+
code << tab << "if (e < num_elem) {\n";
2676+
tab.push();
26412677
code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
26422678
<< (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2679+
tab.pop();
2680+
code << tab << "}\n";
26432681
CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
26442682
offset += field_size;
26452683
}

include/ceed/jit-source/hip/hip-jit.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,12 @@
1313
#define CeedPragmaSIMD
1414
#define CEED_Q_VLA 1
1515

16+
// If we are using Chipstar, then we have to ensure all threads have the same workloads
17+
// and hit __syncthreads() at the same places/number of times
18+
#ifdef __HIP_PLATFORM_SPIRV__
19+
#define CEED_HIP_USE_CHIPSTAR true
20+
#else
21+
#define CEED_HIP_USE_CHIPSTAR false
22+
#endif
23+
1624
#include "hip-types.h"

0 commit comments

Comments
 (0)