Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 9e90429

Browse files
committed
Fixed openacc async clauses
1 parent 8c9e6a7 commit 9e90429

10 files changed

Lines changed: 42 additions & 42 deletions

File tree

coreneuron/gpu/nrn_acc_manager.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
108108
/* Here is the example of using OpenACC data enter/exit
109109
* Remember that we are not allowed to use nt->_data but we have to use:
110110
* double *dtmp = nt->_data; // now use dtmp!
111-
#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
112-
#pragma acc wait(nt->stream_id)
111+
#pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->streams[nt->stream_id])
112+
#pragma acc wait(nt->streams[nt->stream_id])
113113
*/
114114

115115
/*update d_nt._data to point to device copy */
@@ -610,7 +610,7 @@ void update_net_receive_buffer(NrnThread* nt) {
610610
nrb->_nrb_flag[:nrb->_cnt],
611611
nrb->_displ[:nrb->_displ_cnt + 1],
612612
nrb->_nrb_index[:nrb->_cnt])
613-
async(nt->stream_id))
613+
async(nt->streams[nt->stream_id]))
614614
nrn_pragma_omp(target update to(nrb->_cnt,
615615
nrb->_displ_cnt,
616616
nrb->_pnt_index[:nrb->_cnt],
@@ -623,7 +623,7 @@ void update_net_receive_buffer(NrnThread* nt) {
623623
}
624624
}
625625
}
626-
nrn_pragma_acc(wait(nt->stream_id))
626+
nrn_pragma_acc(wait(nt->streams[nt->stream_id]))
627627
nrn_pragma_omp(taskwait)
628628
}
629629

coreneuron/mechanism/capac.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
7070
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
7171
ni [0:_cntml_actual],
7272
_vec_d [0:_nt->end]) if (_nt->compute_gpu)
73-
async(_nt->stream_id))
73+
async(_nt->streams[_nt->stream_id]))
7474
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
7575
for (_iml = 0; _iml < _cntml_actual; _iml++) {
7676
_vec_d[ni[_iml]] += cfac * cm;
@@ -116,7 +116,7 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
116116
nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
117117
ni [0:_cntml_actual],
118118
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
119-
async(_nt->stream_id))
119+
async(_nt->streams[_nt->stream_id]))
120120
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
121121
for (int _iml = 0; _iml < _cntml_actual; _iml++) {
122122
i_cap = cfac * cm * _vec_rhs[ni[_iml]];

coreneuron/mechanism/eion.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
267267
pd [0:_cntml_padded * 5],
268268
nrn_ion_global_map
269269
[0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
270-
async(nt->stream_id))
270+
async(nt->streams[nt->stream_id]))
271271
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
272272
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
273273
dcurdv = 0.;
@@ -338,7 +338,7 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
338338
nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
339339
ni [0:_cntml_actual],
340340
_vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
341-
async(_nt->stream_id))
341+
async(_nt->streams[_nt->stream_id]))
342342
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
343343
for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
344344
cur += dcurdv * (_vec_rhs[ni[_iml]]);

coreneuron/network/netcvode.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
536536

537537
nrn_pragma_acc(parallel loop present(
538538
nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
539-
copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
539+
copy(net_send_buf_count) if (nt->compute_gpu) async(nt->streams[nt->stream_id]))
540540
nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
541541
for (int i = 0; i < nt->ncell; ++i) {
542542
PreSyn* ps = presyns + i;
@@ -564,15 +564,15 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
564564
nt->_net_send_buffer[idx] = i;
565565
}
566566
}
567-
nrn_pragma_acc(wait(nt->stream_id))
567+
nrn_pragma_acc(wait(nt->streams[nt->stream_id]))
568568
nt->_net_send_buffer_cnt = net_send_buf_count;
569569

570570
if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
571571
#ifdef CORENEURON_ENABLE_GPU
572572
int* nsbuffer = nt->_net_send_buffer;
573573
#endif
574-
nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
575-
nrn_pragma_acc(wait(nt->stream_id))
574+
nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->streams[nt->stream_id]))
575+
nrn_pragma_acc(wait(nt->streams[nt->stream_id]))
576576
nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
577577
}
578578

coreneuron/network/partrans.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,13 @@ void nrnmpi_v_transfer() {
5656
nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
5757
src_data [0:nt->_ndata],
5858
src_gather [0:n_src_gather]) if (nt->compute_gpu)
59-
async(nt->stream_id))
59+
async(nt->streams[nt->stream_id]))
6060
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
6161
for (int i = 0; i < n_src_gather; ++i) {
6262
src_gather[i] = src_data[src_indices[i]];
6363
}
6464
nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
65-
async(nt->stream_id))
65+
async(nt->streams[nt->stream_id]))
6666
nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
6767
}
6868

@@ -71,7 +71,7 @@ void nrnmpi_v_transfer() {
7171
for (int tid = 0; tid < nrn_nthread; ++tid) {
7272
if (nrn_threads[tid].compute_gpu) {
7373
compute_gpu = true;
74-
nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
74+
nrn_pragma_acc(wait(nrn_threads[tid].streams[nrn_threads[tid].stream_id]))
7575
nrn_pragma_omp(taskwait)
7676
}
7777
TransferThreadData& ttd = transfer_thread_data_[tid];
@@ -122,7 +122,7 @@ void nrnthread_v_transfer(NrnThread* _nt) {
122122
nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
123123
tar_data [0:ndata],
124124
insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
125-
async(_nt->stream_id))
125+
async(_nt->streams[_nt->stream_id]))
126126
nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
127127
for (size_t i = 0; i < ntar; ++i) {
128128
tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];

coreneuron/permute/cellorder.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -605,7 +605,7 @@ void solve_interleaved2(int ith) {
605605
ncycles [0:nwarp],
606606
stridedispl [0:nwarp + 1],
607607
rootbegin [0:nwarp + 1],
608-
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
608+
nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->streams[nt->stream_id]))
609609
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
610610
for (int icore = 0; icore < ncore; ++icore) {
611611
int iwarp = icore / warpsize; // figure out the >> value
@@ -625,7 +625,7 @@ void solve_interleaved2(int ith) {
625625
} // serial test mode
626626
#endif
627627
}
628-
nrn_pragma_acc(wait(nt->stream_id))
628+
nrn_pragma_acc(wait(nt->streams[nt->stream_id]))
629629
#ifdef _OPENACC
630630
}
631631
#endif
@@ -659,14 +659,14 @@ void solve_interleaved1(int ith) {
659659
firstnode [0:ncell],
660660
lastnode [0:ncell],
661661
cellsize [0:ncell]) if (nt->compute_gpu)
662-
async(nt->stream_id))
662+
async(nt->streams[nt->stream_id]))
663663
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
664664
for (int icell = 0; icell < ncell; ++icell) {
665665
int icellsize = cellsize[icell];
666666
triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
667667
bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
668668
}
669-
nrn_pragma_acc(wait(nt->stream_id))
669+
nrn_pragma_acc(wait(nt->streams[nt->stream_id]))
670670
}
671671

672672
void solve_interleaved(int ith) {

coreneuron/sim/fadvance_core.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
7979
nt->cj = 1.0 / dt;
8080
}
8181
nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
82-
async(nt->stream_id) if (nt->compute_gpu))
82+
async(nt->streams[nt->stream_id]) if (nt->compute_gpu))
8383
// clang-format off
8484
nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
8585
if(nt->compute_gpu))
@@ -206,14 +206,14 @@ void update(NrnThread* _nt) {
206206
/* do not need to worry about linmod or extracellular*/
207207
if (secondorder) {
208208
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
209-
async(_nt->stream_id))
209+
async(_nt->streams[_nt->stream_id]))
210210
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
211211
for (int i = 0; i < i2; ++i) {
212212
vec_v[i] += 2. * vec_rhs[i];
213213
}
214214
} else {
215215
nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
216-
async(_nt->stream_id))
216+
async(_nt->streams[_nt->stream_id]))
217217
nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
218218
for (int i = 0; i < i2; ++i) {
219219
vec_v[i] += vec_rhs[i];
@@ -295,7 +295,7 @@ void nrncore2nrn_send_values(NrnThread* nth) {
295295
assert(vs < tr->bsize);
296296

297297
nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
298-
async(nth->stream_id))
298+
async(nth->streams[nth->stream_id]))
299299
nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
300300
for (int i = 0; i < tr->n_trajec; ++i) {
301301
tr->varrays[i][vs] = *tr->gather[i];
@@ -316,10 +316,10 @@ void nrncore2nrn_send_values(NrnThread* nth) {
316316
for (int i = 0; i < tr->n_trajec; ++i) {
317317
double* gather_i = tr->gather[i];
318318
nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
319-
async(nth->stream_id))
319+
async(nth->streams[nth->stream_id]))
320320
nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
321321
}
322-
nrn_pragma_acc(wait(nth->stream_id))
322+
nrn_pragma_acc(wait(nth->streams[nth->stream_id]))
323323
nrn_pragma_omp(taskwait)
324324
for (int i = 0; i < tr->n_trajec; ++i) {
325325
*(tr->scatter[i]) = *(tr->gather[i]);
@@ -342,8 +342,8 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
342342
if (nth->ncell) {
343343
/*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can
344344
launch kernel) */
345-
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
346-
nrn_pragma_acc(wait(nth->stream_id))
345+
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id]))
346+
nrn_pragma_acc(wait(nth->streams[nth->stream_id]))
347347
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
348348
fixed_play_continuous(nth);
349349

@@ -378,8 +378,8 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
378378

379379
if (nth->ncell) {
380380
/*@todo: do we need to update nth->_t on GPU */
381-
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
382-
nrn_pragma_acc(wait(nth->stream_id))
381+
nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->streams[nth->stream_id]))
382+
nrn_pragma_acc(wait(nth->streams[nth->stream_id]))
383383
nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
384384
fixed_play_continuous(nth);
385385
nonvint(nth);

coreneuron/sim/fast_imem.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ void nrn_calc_fast_imem(NrnThread* nt) {
5252
double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
5353
nrn_pragma_acc(
5454
parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
55-
async(nt->stream_id))
55+
async(nt->streams[nt->stream_id]))
5656
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
5757
for (int i = i1; i < i3; ++i) {
5858
fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
@@ -69,7 +69,7 @@ void nrn_calc_fast_imem_init(NrnThread* nt) {
6969

7070
double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
7171
nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
72-
async(nt->stream_id))
72+
async(nt->streams[nt->stream_id]))
7373
nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
7474
for (int i = i1; i < i3; ++i) {
7575
fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;

coreneuron/sim/solve_core.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ static void triang(NrnThread* _nt) {
4141

4242
nrn_pragma_acc(parallel loop seq present(
4343
vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
44-
async(_nt->stream_id) if (_nt->compute_gpu))
44+
async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
4545
for (int i = i3 - 1; i >= i2; --i) {
4646
double p = vec_a[i] / vec_d[i];
4747
vec_d[parent_index[i]] -= p * vec_b[i];
@@ -61,21 +61,21 @@ static void bksub(NrnThread* _nt) {
6161
int* parent_index = _nt->_v_parent_index;
6262

6363
nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2])
64-
async(_nt->stream_id) if (_nt->compute_gpu))
64+
async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
6565
for (int i = i1; i < i2; ++i) {
6666
vec_rhs[i] /= vec_d[i];
6767
}
6868

6969
nrn_pragma_acc(
7070
parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
71-
async(_nt->stream_id) if (_nt->compute_gpu))
71+
async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
7272
for (int i = i2; i < i3; ++i) {
7373
vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]];
7474
vec_rhs[i] /= vec_d[i];
7575
}
7676

7777
if (_nt->compute_gpu) {
78-
nrn_pragma_acc(wait(_nt->stream_id))
78+
nrn_pragma_acc(wait(_nt->streams[_nt->stream_id]))
7979
}
8080
}
8181
} // namespace coreneuron

coreneuron/sim/treeset_core.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ static void nrn_rhs(NrnThread* _nt) {
3333
int* parent_index = _nt->_v_parent_index;
3434

3535
nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
36-
async(_nt->stream_id))
36+
async(_nt->streams[_nt->stream_id]))
3737
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
3838
for (int i = i1; i < i3; ++i) {
3939
vec_rhs[i] = 0.;
@@ -45,7 +45,7 @@ static void nrn_rhs(NrnThread* _nt) {
4545
double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
4646
nrn_pragma_acc(
4747
parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
48-
async(_nt->stream_id))
48+
async(_nt->streams[_nt->stream_id]))
4949
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
5050
for (int i = i1; i < i3; ++i) {
5151
fast_imem_d[i] = 0.;
@@ -75,7 +75,7 @@ static void nrn_rhs(NrnThread* _nt) {
7575
*/
7676
double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
7777
nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
78-
async(_nt->stream_id))
78+
async(_nt->streams[_nt->stream_id]))
7979
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
8080
for (int i = i1; i < i3; ++i) {
8181
p[i] -= vec_rhs[i];
@@ -92,7 +92,7 @@ static void nrn_rhs(NrnThread* _nt) {
9292
vec_b [0:i3],
9393
vec_v [0:i3],
9494
parent_index [0:i3]) if (_nt->compute_gpu)
95-
async(_nt->stream_id))
95+
async(_nt->streams[_nt->stream_id]))
9696
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
9797
for (int i = i2; i < i3; ++i) {
9898
double dv = vec_v[parent_index[i]] - vec_v[i];
@@ -152,7 +152,7 @@ static void nrn_lhs(NrnThread* _nt) {
152152
so here we transform so it only has membrane current contribution
153153
*/
154154
double* p = _nt->nrn_fast_imem->nrn_sav_d;
155-
nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
155+
nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->streams[_nt->stream_id]))
156156
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
157157
for (int i = i1; i < i3; ++i) {
158158
p[i] += vec_d[i];
@@ -162,7 +162,7 @@ static void nrn_lhs(NrnThread* _nt) {
162162
/* now add the axial currents */
163163
nrn_pragma_acc(parallel loop present(
164164
vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
165-
async(_nt->stream_id))
165+
async(_nt->streams[_nt->stream_id]))
166166
nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
167167
for (int i = i2; i < i3; ++i) {
168168
nrn_pragma_acc(atomic update)

0 commit comments

Comments
 (0)