Skip to content

Commit f70b67d

Browse files
committed
hip - set Chipstar modifications off by default
1 parent 446df38 commit f70b67d

5 files changed

Lines changed: 123 additions & 0 deletions

File tree

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,7 +1507,11 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
15071507
// Loop over all elements
15081508
code << "\n" << tab << "// Element loop\n";
15091509
code << tab << "__syncthreads();\n";
1510+
code << tab << "#if CEED_HIP_USE_CHIPSTAR\n";
15101511
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1512+
code << tab << "#else\n";
1513+
code << tab << "const CeedInt elem_loop_bound = num_elem;\n\n";
1514+
code << tab << "#endif\n";
15111515
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
15121516
tab.push();
15131517
code << tab << "const CeedInt elem = e % num_elem;\n\n";
@@ -1867,7 +1871,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
18671871
// Loop over all elements
18681872
code << "\n" << tab << "// Element loop\n";
18691873
code << tab << "__syncthreads();\n";
1874+
code << tab << "#if CEED_HIP_USE_CHIPSTAR\n";
18701875
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1876+
code << tab << "#else\n";
1877+
code << tab << "const CeedInt elem_loop_bound = num_elem;\n\n";
1878+
code << tab << "#endif\n";
18711879
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
18721880
tab.push();
18731881
code << tab << "const CeedInt elem = e % num_elem;\n\n";

backends/hip/ceed-hip-common.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,17 @@ int CeedInit_Hip(Ceed ceed, const char *resource) {
3838
CeedDebug(ceed, "Using unified memory addressing");
3939
}
4040
data->opt_block_size = 256;
41+
42+
// Set CHIPSTAR modifications off by default
43+
{
44+
#ifdef __HIP_PLATFORM_SPIRV__
45+
const char *define = "CEED_HIP_USE_CHIPSTAR=true";
46+
#else
47+
const char *define = "CEED_HIP_USE_CHIPSTAR=false";
48+
#endif
49+
50+
CeedCallBackend(CeedAddJitDefine(ceed, define));
51+
}
4152
return CEED_ERROR_SUCCESS;
4253
}
4354

include/ceed/jit-source/hip/hip-shared-basis-nontensor.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
3535
__syncthreads();
3636

3737
// Apply basis element by element
38+
#if CEED_HIP_USE_CHIPSTAR
3839
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
40+
#else
41+
const CeedInt elem_loop_bound = num_elem;
42+
#endif
3943

4044
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4145
const CeedInt elem = e % num_elem;
@@ -68,7 +72,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
6872
__syncthreads();
6973

7074
// Apply basis element by element
75+
#if CEED_HIP_USE_CHIPSTAR
7176
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
77+
#else
78+
const CeedInt elem_loop_bound = num_elem;
79+
#endif
7280

7381
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
7482
const CeedInt elem = e % num_elem;
@@ -101,7 +109,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
101109
__syncthreads();
102110

103111
// Apply basis element by element
112+
#if CEED_HIP_USE_CHIPSTAR
104113
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
114+
#else
115+
const CeedInt elem_loop_bound = num_elem;
116+
#endif
105117

106118
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
107119
const CeedInt elem = e % num_elem;
@@ -137,7 +149,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
137149
__syncthreads();
138150

139151
// Apply basis element by element
152+
#if CEED_HIP_USE_CHIPSTAR
140153
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
154+
#else
155+
const CeedInt elem_loop_bound = num_elem;
156+
#endif
141157

142158
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
143159
const CeedInt elem = e % num_elem;
@@ -170,7 +186,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
170186
__syncthreads();
171187

172188
// Apply basis element by element
189+
#if CEED_HIP_USE_CHIPSTAR
173190
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
191+
#else
192+
const CeedInt elem_loop_bound = num_elem;
193+
#endif
174194

175195
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
176196
const CeedInt elem = e % num_elem;
@@ -203,7 +223,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
203223
__syncthreads();
204224

205225
// Apply basis element by element
226+
#if CEED_HIP_USE_CHIPSTAR
206227
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
228+
#else
229+
const CeedInt elem_loop_bound = num_elem;
230+
#endif
207231

208232
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
209233
const CeedInt elem = e % num_elem;
@@ -232,7 +256,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
232256

233257
CeedScalar r_W[1];
234258

259+
#if CEED_HIP_USE_CHIPSTAR
235260
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
261+
#else
262+
const CeedInt elem_loop_bound = num_elem;
263+
#endif
236264

237265
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
238266
const CeedInt elem = e % num_elem;

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
4343
__syncthreads();
4444

4545
// Apply basis element by element
46+
#if CEED_HIP_USE_CHIPSTAR
4647
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
48+
#else
49+
const CeedInt elem_loop_bound = num_elem;
50+
#endif
4751

4852
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4953
const CeedInt elem = e % num_elem;
@@ -105,7 +109,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
105109
__syncthreads();
106110

107111
// Apply basis element by element
112+
#if CEED_HIP_USE_CHIPSTAR
108113
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
114+
#else
115+
const CeedInt elem_loop_bound = num_elem;
116+
#endif
109117

110118
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
111119
const CeedInt elem = e % num_elem;
@@ -185,7 +193,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
185193
__syncthreads();
186194

187195
// Apply basis element by element
196+
#if CEED_HIP_USE_CHIPSTAR
188197
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
198+
#else
199+
const CeedInt elem_loop_bound = num_elem;
200+
#endif
189201

190202
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
191203
const CeedInt elem = e % num_elem;
@@ -257,7 +269,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
257269
__syncthreads();
258270

259271
// Apply basis element by element
272+
#if CEED_HIP_USE_CHIPSTAR
260273
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
274+
#else
275+
const CeedInt elem_loop_bound = num_elem;
276+
#endif
261277

262278
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
263279
const CeedInt elem = e % num_elem;
@@ -319,7 +335,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
319335
__syncthreads();
320336

321337
// Apply basis element by element
338+
#if CEED_HIP_USE_CHIPSTAR
322339
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
340+
#else
341+
const CeedInt elem_loop_bound = num_elem;
342+
#endif
323343

324344
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
325345
const CeedInt elem = e % num_elem;
@@ -400,7 +420,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
400420
__syncthreads();
401421

402422
// Apply basis element by element
423+
#if CEED_HIP_USE_CHIPSTAR
403424
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
425+
#else
426+
const CeedInt elem_loop_bound = num_elem;
427+
#endif
404428

405429
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
406430
const CeedInt elem = e % num_elem;

include/ceed/jit-source/hip/hip-shared-basis-tensor.h

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
3535
__syncthreads();
3636

3737
// Apply basis element by element
38+
#if CEED_HIP_USE_CHIPSTAR
3839
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
40+
#else
41+
const CeedInt elem_loop_bound = num_elem;
42+
#endif
3943

4044
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4145
const CeedInt elem = e % num_elem;
@@ -78,7 +82,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
7882
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
7983

8084
// Apply basis element by element
85+
#if CEED_HIP_USE_CHIPSTAR
8186
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
87+
#else
88+
const CeedInt elem_loop_bound = num_elem;
89+
#endif
8290

8391
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
8492
const CeedInt elem = e % num_elem;
@@ -124,7 +132,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
124132
__syncthreads();
125133

126134
// Apply basis element by element
135+
#if CEED_HIP_USE_CHIPSTAR
127136
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
137+
#else
138+
const CeedInt elem_loop_bound = num_elem;
139+
#endif
128140

129141
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
130142
const CeedInt elem = e % num_elem;
@@ -167,7 +179,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
167179
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
168180

169181
// Apply basis element by element
182+
#if CEED_HIP_USE_CHIPSTAR
170183
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
184+
#else
185+
const CeedInt elem_loop_bound = num_elem;
186+
#endif
171187

172188
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
173189
const CeedInt elem = e % num_elem;
@@ -213,7 +229,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
213229
__syncthreads();
214230

215231
// Apply basis element by element
232+
#if CEED_HIP_USE_CHIPSTAR
216233
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
234+
#else
235+
const CeedInt elem_loop_bound = num_elem;
236+
#endif
217237

218238
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
219239
const CeedInt elem = e % num_elem;
@@ -257,7 +277,11 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
257277
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
258278

259279
// Apply basis element by element
280+
#if CEED_HIP_USE_CHIPSTAR
260281
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
282+
#else
283+
const CeedInt elem_loop_bound = num_elem;
284+
#endif
261285

262286
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
263287
const CeedInt elem = e % num_elem;
@@ -308,7 +332,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
308332
__syncthreads();
309333

310334
// Apply basis element by element
335+
#if CEED_HIP_USE_CHIPSTAR
311336
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
337+
#else
338+
const CeedInt elem_loop_bound = num_elem;
339+
#endif
312340

313341
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
314342
const CeedInt elem = e % num_elem;
@@ -360,7 +388,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
360388
__syncthreads();
361389

362390
// Apply basis element by element
391+
#if CEED_HIP_USE_CHIPSTAR
363392
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
393+
#else
394+
const CeedInt elem_loop_bound = num_elem;
395+
#endif
364396

365397
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
366398
const CeedInt elem = e % num_elem;
@@ -413,7 +445,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
413445
__syncthreads();
414446

415447
// Apply basis element by element
448+
#if CEED_HIP_USE_CHIPSTAR
416449
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
450+
#else
451+
const CeedInt elem_loop_bound = num_elem;
452+
#endif
417453

418454
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
419455
const CeedInt elem = e % num_elem;
@@ -465,7 +501,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
465501
__syncthreads();
466502

467503
// Apply basis element by element
504+
#if CEED_HIP_USE_CHIPSTAR
468505
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
506+
#else
507+
const CeedInt elem_loop_bound = num_elem;
508+
#endif
469509

470510
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
471511
const CeedInt elem = e % num_elem;
@@ -518,7 +558,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
518558
__syncthreads();
519559

520560
// Apply basis element by element
561+
#if CEED_HIP_USE_CHIPSTAR
521562
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
563+
#else
564+
const CeedInt elem_loop_bound = num_elem;
565+
#endif
522566

523567
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
524568
const CeedInt elem = e % num_elem;
@@ -570,7 +614,11 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
570614
__syncthreads();
571615

572616
// Apply basis element by element
617+
#if CEED_HIP_USE_CHIPSTAR
573618
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
619+
#else
620+
const CeedInt elem_loop_bound = num_elem;
621+
#endif
574622

575623
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
576624
const CeedInt elem = e % num_elem;
@@ -616,7 +664,11 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
616664

617665
CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
618666

667+
#if CEED_HIP_USE_CHIPSTAR
619668
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
669+
#else
670+
const CeedInt elem_loop_bound = num_elem;
671+
#endif
620672

621673
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
622674
const CeedInt elem = e % num_elem;

0 commit comments

Comments
 (0)