@@ -79,20 +79,6 @@ InitParticles (const amrex::RealVect& a_u_std,
7979 {
8080 amrex::Box tile_box = mfi.tilebox (box_nodal, box_grow);
8181
82- if (a_radius != std::numeric_limits<amrex::Real>::max ()) {
83- amrex::IntVect lo_limit {
84- static_cast <int >(std::round ((-a_radius - plo[0 ])/dx[0 ] - 2 )),
85- static_cast <int >(std::round ((-a_radius - plo[1 ])/dx[1 ] - 2 )),
86- tile_box.smallEnd (2 )
87- };
88- amrex::IntVect hi_limit {
89- static_cast <int >(std::round (( a_radius - plo[0 ])/dx[0 ] + 2 )),
90- static_cast <int >(std::round (( a_radius - plo[1 ])/dx[1 ] + 2 )),
91- tile_box.bigEnd (2 )
92- };
93- tile_box &= amrex::Box (lo_limit, hi_limit, box_nodal);
94- }
95-
9682 const amrex::Real radius_sq = a_radius == std::numeric_limits<amrex::Real>::max () ?
9783 std::numeric_limits<amrex::Real>::max () : a_radius * a_radius;
9884
@@ -105,7 +91,6 @@ InitParticles (const amrex::RealVect& a_u_std,
10591 const amrex::Real c_t = c_light * Hipace::m_physical_time;
10692 const amrex::Real min_density = m_min_density;
10793
108-
10994 amrex::BaseFab<int > fab_fine{};
11095 if (use_fine_patch) {
11196 fab_fine.resize (tile_box, 2 );
@@ -151,37 +136,98 @@ InitParticles (const amrex::RealVect& a_u_std,
151136 }
152137 }
153138
139+ auto check_position = [=] AMREX_GPU_DEVICE (int i, int j, int i_part)
140+ -> amrex::GpuTuple<bool , amrex::Real, amrex::Real, amrex::Real>
141+ {
142+ amrex::Real r[2 ];
143+ bool do_init = false ;
144+ ParticleUtil::get_position_unit_cell_fine (r, do_init, i_part,
145+ ppc_lev, fine_transition_cells,
146+ use_fine_patch ? arr_fine (i, j, comp_a) : 0 );
147+
148+ if (!do_init) return {false , 0 , 0 , 0 };
149+
150+ const amrex::Real x = plo[0 ] + (i + r[0 ] + x_offset)*dx[0 ];
151+ const amrex::Real y = plo[1 ] + (j + r[1 ] + y_offset)*dx[1 ];
152+
153+ const amrex::Real rsq = x*x + y*y;
154+ const amrex::Real density = density_func (x, y, c_t );
155+ if (x >= a_bounds.hi (0 ) || x < a_bounds.lo (0 ) ||
156+ y >= a_bounds.hi (1 ) || y < a_bounds.lo (1 ) ||
157+ rsq > radius_sq ||
158+ rsq < a_hollow_core_radius*a_hollow_core_radius ||
159+ density <= min_density) return {false , 0 , 0 , 0 };
160+
161+ return {true , x, y, density};
162+ };
163+
164+ amrex::TypeMultiplier<amrex::ReduceOps,
165+ amrex::ReduceOpSum,
166+ amrex::ReduceOpMin[2 *n_lev],
167+ amrex::ReduceOpMax[2 *n_lev]> reduce_op;
168+ amrex::TypeMultiplier<amrex::ReduceData,
169+ amrex::Long,
170+ int [4 *n_lev]> reduce_data (reduce_op);
171+ using ReduceTuple = typename decltype (reduce_data)::Type;
172+
154173 // Count the total number of particles so only one resize is needed
155- amrex::Long total_num_particles = amrex::Reduce::Sum<amrex::Long>(tile_box. numPts () ,
156- [=] AMREX_GPU_DEVICE (amrex::Long idx) noexcept
174+ reduce_op. eval (tile_box, reduce_data ,
175+ [=] AMREX_GPU_DEVICE (int i, int j, int ) -> ReduceTuple
157176 {
158- auto [i,j,k] = tile_box.atOffset3d (idx).arr ;
159-
160177 amrex::Long num_particles_cell = 0 ;
161178 for (int i_part=0 ; i_part<max_ppc; ++i_part)
162179 {
163- amrex::Real r[2 ];
164- bool do_init = false ;
165- ParticleUtil::get_position_unit_cell_fine (r, do_init, i_part,
166- ppc_lev, fine_transition_cells,
167- use_fine_patch ? arr_fine (i, j, comp_a) : 0 );
180+ auto [do_init, x, y, density] = check_position (i, j, i_part);
168181
169- if (!do_init) continue ;
182+ if (do_init) {
183+ num_particles_cell += 1 ;
184+ }
185+ }
186+ const bool is_lev_0 = num_particles_cell > 0 ;
187+ const bool is_lev_1 = num_particles_cell > ppc_lev[0 ][0 ] * ppc_lev[0 ][1 ];
188+ const bool is_lev_2 = num_particles_cell > ppc_lev[1 ][0 ] * ppc_lev[1 ][1 ];
189+ return {
190+ num_particles_cell,
191+ is_lev_0 ? i : tile_box.bigEnd (0 ), is_lev_0 ? j : tile_box.bigEnd (1 ),
192+ is_lev_1 ? i : tile_box.bigEnd (0 ), is_lev_1 ? j : tile_box.bigEnd (1 ),
193+ is_lev_2 ? i : tile_box.bigEnd (0 ), is_lev_2 ? j : tile_box.bigEnd (1 ),
194+ is_lev_0 ? i : tile_box.smallEnd (0 ), is_lev_0 ? j : tile_box.smallEnd (1 ),
195+ is_lev_1 ? i : tile_box.smallEnd (0 ), is_lev_1 ? j : tile_box.smallEnd (1 ),
196+ is_lev_2 ? i : tile_box.smallEnd (0 ), is_lev_2 ? j : tile_box.smallEnd (1 )
197+ };
198+ });
170199
171- amrex::Real x = plo[0 ] + (i + r[0 ] + x_offset)*dx[0 ];
172- amrex::Real y = plo[1 ] + (j + r[1 ] + y_offset)*dx[1 ];
200+ auto [np_tup, lo_tup, hi_tup] = amrex::TupleSplit<1 , 2 *n_lev, 2 *n_lev>(reduce_data.value ());
173201
174- const amrex::Real rsq = x*x + y*y;
175- if (x >= a_bounds.hi (0 ) || x < a_bounds.lo (0 ) ||
176- y >= a_bounds.hi (1 ) || y < a_bounds.lo (1 ) ||
177- rsq > radius_sq ||
178- rsq < a_hollow_core_radius*a_hollow_core_radius ||
179- density_func (x, y, c_t ) <= min_density) continue ;
202+ amrex::Long total_num_particles = amrex::get<0 >(np_tup);
203+ const amrex::Long total_non_mirrored_particles = total_num_particles;
204+ auto lo_arr = amrex::tupleToArray (lo_tup);
205+ auto hi_arr = amrex::tupleToArray (hi_tup);
180206
181- num_particles_cell += 1 ;
182- }
183- return num_particles_cell;
184- });
207+ amrex::Array<amrex::Box, n_lev> lev_bounds;
208+
209+ #ifdef AMREX_USE_GPU
210+ // The loop over particles is outside the loop over cells
211+ // so that particles in the same cell are far apart.
212+ // This makes current deposition faster.
213+ constexpr int idx_x = 0 ;
214+ constexpr int idx_y = 1 ;
215+ constexpr int idx_ppc = 2 ;
216+ #else
217+ constexpr int idx_ppc = 0 ;
218+ constexpr int idx_x = 1 ;
219+ constexpr int idx_y = 2 ;
220+ #endif
221+
222+ for (int ilev=0 ; ilev < n_lev; ++ilev) {
223+ lev_bounds[ilev].setSmall (idx_x, lo_arr[2 *ilev]);
224+ lev_bounds[ilev].setSmall (idx_y, lo_arr[2 *ilev+1 ]);
225+ lev_bounds[ilev].setSmall (idx_ppc,
226+ ilev == 0 ? 0 : ppc_lev[ilev-1 ][0 ] * ppc_lev[ilev-1 ][1 ]);
227+ lev_bounds[ilev].setBig (idx_x, hi_arr[2 *ilev]);
228+ lev_bounds[ilev].setBig (idx_y, hi_arr[2 *ilev+1 ]);
229+ lev_bounds[ilev].setBig (idx_ppc, ppc_lev[ilev][0 ] * ppc_lev[ilev][1 ] - 1 );
230+ }
185231
186232 if (m_do_symmetrize) {
187233 total_num_particles *= 4 ;
@@ -190,131 +236,76 @@ InitParticles (const amrex::RealVect& a_u_std,
190236 }
191237 }
192238
193- auto & particles = GetParticles (0 );
194- auto & particle_tile = particles[std::make_pair (mfi.index (), mfi.LocalTileIndex ())];
239+ auto & particle_tile = DefineAndReturnParticleTile (0 , mfi);
195240
196- auto old_size = particle_tile.size ();
197- const auto new_size = old_size + total_num_particles ;
198- particle_tile.resize (new_size );
241+ AMREX_ALWAYS_ASSERT ( particle_tile.size () == 0 );
242+ unsigned int current_size = 0 ;
243+ particle_tile.resize (total_num_particles );
199244
200245 auto ptd = particle_tile.getParticleTileData ();
201246 const int init_ion_lev = m_init_ion_lev;
202247
203- // The loop over particles is outside the loop over cells
204- // so that particles in the same cell are far apart.
205- // This makes current deposition faster.
206- for (int i_part=0 ; i_part<max_ppc; ++i_part)
207- {
208- amrex::Gpu::DeviceVector<unsigned int > counts (tile_box.numPts (), 0 );
209- unsigned int * pcount = counts.dataPtr ();
210-
211- amrex::Gpu::DeviceVector<unsigned int > offsets (tile_box.numPts ());
212- unsigned int * poffset = offsets.dataPtr ();
213-
214- amrex::ParallelFor (tile_box,
215- [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
216- {
217- amrex::Real r[2 ];
218- bool do_init = false ;
219- ParticleUtil::get_position_unit_cell_fine (r, do_init, i_part,
220- ppc_lev, fine_transition_cells,
221- use_fine_patch ? arr_fine (i, j, comp_a) : 0 );
222-
223- if (!do_init) return ;
224-
225- amrex::Real x = plo[0 ] + (i + r[0 ] + x_offset)*dx[0 ];
226- amrex::Real y = plo[1 ] + (j + r[1 ] + y_offset)*dx[1 ];
227-
228- const amrex::Real rsq = x*x + y*y;
229- if (x >= a_bounds.hi (0 ) || x < a_bounds.lo (0 ) ||
230- y >= a_bounds.hi (1 ) || y < a_bounds.lo (1 ) ||
231- rsq > radius_sq ||
232- rsq < a_hollow_core_radius*a_hollow_core_radius ||
233- density_func (x, y, c_t ) <= min_density) return ;
234-
235- int ix = i - lo.x ;
236- int iy = j - lo.y ;
237- int iz = k - lo.z ;
238- int nx = hi.x -lo.x +1 ;
239- int ny = hi.y -lo.y +1 ;
240- int nz = hi.z -lo.z +1 ;
241- unsigned int uix = amrex::min (nx-1 ,amrex::max (0 ,ix));
242- unsigned int uiy = amrex::min (ny-1 ,amrex::max (0 ,iy));
243- unsigned int uiz = amrex::min (nz-1 ,amrex::max (0 ,iz));
244-
245- // Ordering of axes from fastest to slowest:
246- // x
247- // y
248- // z (not used)
249- // ppc
250- unsigned int cellid = (uiz * ny + uiy) * nx + uix;
251-
252- pcount[cellid] = 1 ;
253- });
254-
255- unsigned int num_to_add =
256- amrex::Scan::ExclusiveSum (counts.size (), counts.data (), offsets.data ());
257-
258- if (num_to_add == 0 ) continue ;
259-
260- amrex::ParallelForRNG (tile_box,
261- [=] AMREX_GPU_DEVICE (int i, int j, int k, const amrex::RandomEngine& engine) noexcept
262- {
263- int ix = i - lo.x ;
264- int iy = j - lo.y ;
265- int iz = k - lo.z ;
266- int nx = hi.x -lo.x +1 ;
267- int ny = hi.y -lo.y +1 ;
268- int nz = hi.z -lo.z +1 ;
269- unsigned int uix = amrex::min (nx-1 ,amrex::max (0 ,ix));
270- unsigned int uiy = amrex::min (ny-1 ,amrex::max (0 ,iy));
271- unsigned int uiz = amrex::min (nz-1 ,amrex::max (0 ,iz));
272-
273- unsigned int cellid = (uiz * ny + uiy) * nx + uix;
274-
275- const amrex::Long pidx = poffset[cellid] - poffset[0 ] + old_size;
276-
277- amrex::Real r[2 ] = {0 .,0 .};
278- bool do_init = false ;
279- ParticleUtil::get_position_unit_cell_fine (r, do_init, i_part,
280- ppc_lev, fine_transition_cells,
281- use_fine_patch ? arr_fine (i, j, comp_a) : 0 );
282-
283- if (!do_init) return ;
248+ for (int ilev=0 ; ilev < n_lev; ++ilev) {
249+ const amrex::Box bx = lev_bounds[ilev];
250+ if (!bx.ok ()) {
251+ continue ;
252+ }
253+ const amrex::BoxIndexer bxi{bx};
254+
255+ current_size += amrex::Scan::PrefixSum<unsigned int >(bx.numPts (),
256+ [=] AMREX_GPU_DEVICE (amrex::Long idx) -> unsigned int {
257+ const auto iv = bxi.intVect (idx);
258+ const int i = iv[idx_x];
259+ const int j = iv[idx_y];
260+ const int i_part = iv[idx_ppc];
261+
262+ auto [do_init, x, y, density] = check_position (i, j, i_part);
263+ return do_init;
264+ },
265+ [=] AMREX_GPU_DEVICE (amrex::Long idx, unsigned int pidx) {
266+ const auto iv = bxi.intVect (idx);
267+ const int i = iv[idx_x];
268+ const int j = iv[idx_y];
269+ const int i_part = iv[idx_ppc];
270+
271+ auto [do_init, x, y, density] = check_position (i, j, i_part);
272+ if (!do_init) {
273+ return ;
274+ }
284275
285- amrex::Real x = plo[0 ] + (i + r[0 ] + x_offset)*dx[0 ];
286- amrex::Real y = plo[1 ] + (j + r[1 ] + y_offset)*dx[1 ];
276+ pidx += current_size;
277+ ptd.id (pidx) = 1 ; // plasma id is only used to distinguish between valid/invalid
278+ ptd.cpu (pidx) = 0 ; // level 0
279+ ptd.rdata (PlasmaIdx::x)[pidx] = x;
280+ ptd.rdata (PlasmaIdx::y)[pidx] = y;
281+
282+ if (use_fine_patch) {
283+ const int fine_loc = arr_fine (i, j, comp_a);
284+ if (fine_loc == 0 ) {
285+ ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[0 ];
286+ } else if (fine_loc <= fine_transition_cells + 1 ) {
287+ ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[1 ];
288+ } else {
289+ ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[2 ];
290+ }
291+ } else {
292+ ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[0 ];
293+ }
294+ },
295+ amrex::Scan::Type::exclusive,
296+ amrex::Scan::retSum);
297+ }
287298
288- const amrex::Real density = density_func (x, y, c_t );
299+ AMREX_ALWAYS_ASSERT (total_non_mirrored_particles == current_size );
289300
290- const amrex::Real rsq = x*x + y*y;
291- if (x >= a_bounds.hi (0 ) || x < a_bounds.lo (0 ) ||
292- y >= a_bounds.hi (1 ) || y < a_bounds.lo (1 ) ||
293- rsq > radius_sq ||
294- rsq < a_hollow_core_radius*a_hollow_core_radius ||
295- density <= min_density) return ;
301+ amrex::ParallelForRNG (current_size,
302+ [=] AMREX_GPU_DEVICE (unsigned int pidx, const amrex::RandomEngine& engine) {
303+ const amrex::Real x = ptd.rdata (PlasmaIdx::x)[pidx];
304+ const amrex::Real y = ptd.rdata (PlasmaIdx::y)[pidx];
296305
297306 amrex::Real u[3 ] = {0 .,0 .,0 .};
298307 ParticleUtil::get_gaussian_random_momentum (u, a_u_mean, a_u_std, engine);
299308
300- ptd.id (pidx) = 1 ; // plasma id is only used to distinguish between valid/invalid
301- ptd.cpu (pidx) = 0 ; // level 0
302- ptd.rdata (PlasmaIdx::x)[pidx] = x;
303- ptd.rdata (PlasmaIdx::y)[pidx] = y;
304-
305- if (use_fine_patch) {
306- const int fine_loc = arr_fine (i, j, comp_a);
307- if (fine_loc == 0 ) {
308- ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[0 ];
309- } else if (fine_loc <= fine_transition_cells + 1 ) {
310- ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[1 ];
311- } else {
312- ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[2 ];
313- }
314- } else {
315- ptd.rdata (PlasmaIdx::w)[pidx] = density * scale_fac_lev[0 ];
316- }
317-
318309 ptd.rdata (PlasmaIdx::ux)[pidx] = u[0 ];
319310 ptd.rdata (PlasmaIdx::uy)[pidx] = u[1 ];
320311 ptd.rdata (PlasmaIdx::psi)[pidx] = plasma_psi (u[0 ], u[1 ], u[2 ],
@@ -333,14 +324,11 @@ InitParticles (const amrex::RealVect& a_u_std,
333324 ptd.idata (PlasmaIdx::ion_lev)[pidx] = init_ion_lev;
334325 });
335326
336- old_size += num_to_add;
337- }
338327 if (m_do_symmetrize) {
339328
340329 const amrex::Real x_mid2 = (a_bounds.lo (0 ) + a_bounds.hi (0 ));
341330 const amrex::Real y_mid2 = (a_bounds.lo (1 ) + a_bounds.hi (1 ));
342- const amrex::Long mirror_offset = total_num_particles/4 ;
343- amrex::ParallelFor (mirror_offset,
331+ amrex::ParallelFor (total_non_mirrored_particles,
344332 [=] AMREX_GPU_DEVICE (amrex::Long pidx) noexcept
345333 {
346334 const amrex::Real x = ptd.rdata (PlasmaIdx::x)[pidx];
@@ -355,7 +343,7 @@ InitParticles (const amrex::RealVect& a_u_std,
355343
356344 HIPACE_LOOP_UNROLL
357345 for (int imirror=0 ; imirror<3 ; ++imirror) {
358- const amrex::Long midx = (imirror+1 )*mirror_offset + pidx;
346+ const amrex::Long midx = (imirror+1 )*total_non_mirrored_particles + pidx;
359347
360348 ptd.id (midx) = 1 ; // plasma id is only used to distinguish between valid/invalid
361349 ptd.cpu (midx) = 0 ; // level 0
@@ -386,12 +374,6 @@ InitParticles (const amrex::RealVect& a_u_std,
386374 });
387375 }
388376 }
389- #ifndef AMREX_USE_GPU
390- // The index order for initializing plasma particles is optimized for GPU.
391- // On CPU, especially with multiple particles per cell,
392- // reordering particles by cell gives better performance for plasma deposition and push.
393- SortParticlesByCell ();
394- #endif
395377}
396378
397379void
0 commit comments