Skip to content

Commit 446df38

Browse files
committed
hip - fix element loop bound
1 parent bd3602b commit 446df38

4 files changed

Lines changed: 28 additions & 28 deletions

File tree

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1507,7 +1507,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
15071507
// Loop over all elements
15081508
code << "\n" << tab << "// Element loop\n";
15091509
code << tab << "__syncthreads();\n";
1510-
code << tab << "const CeedInt elem_loop_bound = num_elem * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1510+
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
15111511
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
15121512
tab.push();
15131513
code << tab << "const CeedInt elem = e % num_elem;\n\n";
@@ -1867,7 +1867,7 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
18671867
// Loop over all elements
18681868
code << "\n" << tab << "// Element loop\n";
18691869
code << tab << "__syncthreads();\n";
1870-
code << tab << "const CeedInt elem_loop_bound = num_elem * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
1870+
code << tab << "const CeedInt elem_loop_bound = (gridDim.x*blockDim.z) * ceil(1.0*num_elem/(gridDim.x*blockDim.z));\n\n";
18711871
code << tab << "for (CeedInt e = blockIdx.x*blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x*blockDim.z) {\n";
18721872
tab.push();
18731873
code << tab << "const CeedInt elem = e % num_elem;\n\n";

include/ceed/jit-source/hip/hip-shared-basis-nontensor.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
3535
__syncthreads();
3636

3737
// Apply basis element by element
38-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
38+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
3939

4040
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4141
const CeedInt elem = e % num_elem;
@@ -68,7 +68,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
6868
__syncthreads();
6969

7070
// Apply basis element by element
71-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
71+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
7272

7373
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
7474
const CeedInt elem = e % num_elem;
@@ -101,7 +101,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
101101
__syncthreads();
102102

103103
// Apply basis element by element
104-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
104+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
105105

106106
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
107107
const CeedInt elem = e % num_elem;
@@ -137,7 +137,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
137137
__syncthreads();
138138

139139
// Apply basis element by element
140-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
140+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
141141

142142
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
143143
const CeedInt elem = e % num_elem;
@@ -170,7 +170,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
170170
__syncthreads();
171171

172172
// Apply basis element by element
173-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
173+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
174174

175175
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
176176
const CeedInt elem = e % num_elem;
@@ -203,7 +203,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
203203
__syncthreads();
204204

205205
// Apply basis element by element
206-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
206+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
207207

208208
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
209209
const CeedInt elem = e % num_elem;
@@ -232,7 +232,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
232232

233233
CeedScalar r_W[1];
234234

235-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
235+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
236236

237237
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
238238
const CeedInt elem = e % num_elem;

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
4343
__syncthreads();
4444

4545
// Apply basis element by element
46-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
46+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
4747

4848
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4949
const CeedInt elem = e % num_elem;
@@ -105,7 +105,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
105105
__syncthreads();
106106

107107
// Apply basis element by element
108-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
108+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
109109

110110
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
111111
const CeedInt elem = e % num_elem;
@@ -185,7 +185,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
185185
__syncthreads();
186186

187187
// Apply basis element by element
188-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
188+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
189189

190190
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
191191
const CeedInt elem = e % num_elem;
@@ -257,7 +257,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
257257
__syncthreads();
258258

259259
// Apply basis element by element
260-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
260+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
261261

262262
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
263263
const CeedInt elem = e % num_elem;
@@ -319,7 +319,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
319319
__syncthreads();
320320

321321
// Apply basis element by element
322-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
322+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
323323

324324
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
325325
const CeedInt elem = e % num_elem;
@@ -400,7 +400,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
400400
__syncthreads();
401401

402402
// Apply basis element by element
403-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
403+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
404404

405405
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
406406
const CeedInt elem = e % num_elem;

include/ceed/jit-source/hip/hip-shared-basis-tensor.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
3535
__syncthreads();
3636

3737
// Apply basis element by element
38-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
38+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
3939

4040
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
4141
const CeedInt elem = e % num_elem;
@@ -78,7 +78,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
7878
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
7979

8080
// Apply basis element by element
81-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
81+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
8282

8383
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
8484
const CeedInt elem = e % num_elem;
@@ -124,7 +124,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
124124
__syncthreads();
125125

126126
// Apply basis element by element
127-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
127+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
128128

129129
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
130130
const CeedInt elem = e % num_elem;
@@ -167,7 +167,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
167167
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
168168

169169
// Apply basis element by element
170-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
170+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
171171

172172
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
173173
const CeedInt elem = e % num_elem;
@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
213213
__syncthreads();
214214

215215
// Apply basis element by element
216-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
216+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
217217

218218
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
219219
const CeedInt elem = e % num_elem;
@@ -257,7 +257,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
257257
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
258258

259259
// Apply basis element by element
260-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
260+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
261261

262262
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
263263
const CeedInt elem = e % num_elem;
@@ -308,7 +308,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
308308
__syncthreads();
309309

310310
// Apply basis element by element
311-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
311+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
312312

313313
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
314314
const CeedInt elem = e % num_elem;
@@ -360,7 +360,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
360360
__syncthreads();
361361

362362
// Apply basis element by element
363-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
363+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
364364

365365
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
366366
const CeedInt elem = e % num_elem;
@@ -413,7 +413,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
413413
__syncthreads();
414414

415415
// Apply basis element by element
416-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
416+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
417417

418418
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
419419
const CeedInt elem = e % num_elem;
@@ -465,7 +465,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
465465
__syncthreads();
466466

467467
// Apply basis element by element
468-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
468+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
469469

470470
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
471471
const CeedInt elem = e % num_elem;
@@ -518,7 +518,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
518518
__syncthreads();
519519

520520
// Apply basis element by element
521-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
521+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
522522

523523
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
524524
const CeedInt elem = e % num_elem;
@@ -570,7 +570,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
570570
__syncthreads();
571571

572572
// Apply basis element by element
573-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
573+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
574574

575575
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
576576
const CeedInt elem = e % num_elem;
@@ -616,7 +616,7 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
616616

617617
CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
618618

619-
const CeedInt elem_loop_bound = num_elem * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
619+
const CeedInt elem_loop_bound = (gridDim.x * blockDim.z) * ceil(1.0 * num_elem / (gridDim.x * blockDim.z));
620620

621621
for (CeedInt e = blockIdx.x * blockDim.z + threadIdx.z; e < elem_loop_bound; e += gridDim.x * blockDim.z) {
622622
const CeedInt elem = e % num_elem;

0 commit comments

Comments
 (0)