@@ -35,7 +35,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
3535 __syncthreads ();
3636
3737 // Apply basis element by element
38- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
38+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
3939
4040 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
4141 const CeedInt elem = e % num_elem ;
@@ -78,7 +78,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
7878 CeedScalar r_U [BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1 )];
7979
8080 // Apply basis element by element
81- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
81+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
8282
8383 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
8484 const CeedInt elem = e % num_elem ;
@@ -124,7 +124,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
124124 __syncthreads ();
125125
126126 // Apply basis element by element
127- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
127+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
128128
129129 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
130130 const CeedInt elem = e % num_elem ;
@@ -167,7 +167,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
167167 CeedScalar r_U [BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1 )];
168168
169169 // Apply basis element by element
170- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
170+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
171171
172172 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
173173 const CeedInt elem = e % num_elem ;
@@ -213,7 +213,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
213213 __syncthreads ();
214214
215215 // Apply basis element by element
216- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
216+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
217217
218218 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
219219 const CeedInt elem = e % num_elem ;
@@ -257,7 +257,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
257257 CeedScalar r_U [BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1 )];
258258
259259 // Apply basis element by element
260- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
260+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
261261
262262 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
263263 const CeedInt elem = e % num_elem ;
@@ -308,7 +308,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__ void Grad(const C
308308 __syncthreads ();
309309
310310 // Apply basis element by element
311- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
311+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
312312
313313 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
314314 const CeedInt elem = e % num_elem ;
@@ -360,7 +360,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
360360 __syncthreads ();
361361
362362 // Apply basis element by element
363- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
363+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
364364
365365 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
366366 const CeedInt elem = e % num_elem ;
@@ -413,7 +413,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
413413 __syncthreads ();
414414
415415 // Apply basis element by element
416- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
416+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
417417
418418 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
419419 const CeedInt elem = e % num_elem ;
@@ -465,7 +465,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
465465 __syncthreads ();
466466
467467 // Apply basis element by element
468- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
468+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
469469
470470 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
471471 const CeedInt elem = e % num_elem ;
@@ -518,7 +518,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
518518 __syncthreads ();
519519
520520 // Apply basis element by element
521- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
521+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
522522
523523 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
524524 const CeedInt elem = e % num_elem ;
@@ -570,7 +570,7 @@ extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
570570 __syncthreads ();
571571
572572 // Apply basis element by element
573- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
573+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
574574
575575 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
576576 const CeedInt elem = e % num_elem ;
@@ -616,7 +616,7 @@ extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
616616
617617 CeedScalar r_W [BASIS_DIM > 2 ? BASIS_Q_1D : 1 ];
618618
619- const CeedInt elem_loop_bound = num_elem * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
619+ const CeedInt elem_loop_bound = ( gridDim . x * blockDim . z ) * ceil (1.0 * num_elem / (gridDim .x * blockDim .z ));
620620
621621 for (CeedInt e = blockIdx .x * blockDim .z + threadIdx .z ; e < elem_loop_bound ; e += gridDim .x * blockDim .z ) {
622622 const CeedInt elem = e % num_elem ;
0 commit comments