Skip to content

Commit d47a7d7

Browse files
Use amrex::Scan::PrefixSum in InitPlasmaParticles (#1314)
1 parent ceb8c11 commit d47a7d7

2 files changed

Lines changed: 159 additions & 177 deletions

File tree

src/particles/plasma/PlasmaParticleContainerInit.cpp

Lines changed: 143 additions & 161 deletions
Original file line numberDiff line numberDiff line change
@@ -79,20 +79,6 @@ InitParticles (const amrex::RealVect& a_u_std,
7979
{
8080
amrex::Box tile_box = mfi.tilebox(box_nodal, box_grow);
8181

82-
if (a_radius != std::numeric_limits<amrex::Real>::max()) {
83-
amrex::IntVect lo_limit {
84-
static_cast<int>(std::round((-a_radius - plo[0])/dx[0] - 2)),
85-
static_cast<int>(std::round((-a_radius - plo[1])/dx[1] - 2)),
86-
tile_box.smallEnd(2)
87-
};
88-
amrex::IntVect hi_limit {
89-
static_cast<int>(std::round(( a_radius - plo[0])/dx[0] + 2)),
90-
static_cast<int>(std::round(( a_radius - plo[1])/dx[1] + 2)),
91-
tile_box.bigEnd(2)
92-
};
93-
tile_box &= amrex::Box(lo_limit, hi_limit, box_nodal);
94-
}
95-
9682
const amrex::Real radius_sq = a_radius == std::numeric_limits<amrex::Real>::max() ?
9783
std::numeric_limits<amrex::Real>::max() : a_radius * a_radius;
9884

@@ -105,7 +91,6 @@ InitParticles (const amrex::RealVect& a_u_std,
10591
const amrex::Real c_t = c_light * Hipace::m_physical_time;
10692
const amrex::Real min_density = m_min_density;
10793

108-
10994
amrex::BaseFab<int> fab_fine{};
11095
if (use_fine_patch) {
11196
fab_fine.resize(tile_box, 2);
@@ -151,37 +136,98 @@ InitParticles (const amrex::RealVect& a_u_std,
151136
}
152137
}
153138

139+
auto check_position = [=] AMREX_GPU_DEVICE (int i, int j, int i_part)
140+
-> amrex::GpuTuple<bool, amrex::Real, amrex::Real, amrex::Real>
141+
{
142+
amrex::Real r[2];
143+
bool do_init = false;
144+
ParticleUtil::get_position_unit_cell_fine(r, do_init, i_part,
145+
ppc_lev, fine_transition_cells,
146+
use_fine_patch ? arr_fine(i, j, comp_a) : 0);
147+
148+
if (!do_init) return {false, 0, 0, 0};
149+
150+
const amrex::Real x = plo[0] + (i + r[0] + x_offset)*dx[0];
151+
const amrex::Real y = plo[1] + (j + r[1] + y_offset)*dx[1];
152+
153+
const amrex::Real rsq = x*x + y*y;
154+
const amrex::Real density = density_func(x, y, c_t);
155+
if (x >= a_bounds.hi(0) || x < a_bounds.lo(0) ||
156+
y >= a_bounds.hi(1) || y < a_bounds.lo(1) ||
157+
rsq > radius_sq ||
158+
rsq < a_hollow_core_radius*a_hollow_core_radius ||
159+
density <= min_density) return {false, 0, 0, 0};
160+
161+
return {true, x, y, density};
162+
};
163+
164+
amrex::TypeMultiplier<amrex::ReduceOps,
165+
amrex::ReduceOpSum,
166+
amrex::ReduceOpMin[2*n_lev],
167+
amrex::ReduceOpMax[2*n_lev]> reduce_op;
168+
amrex::TypeMultiplier<amrex::ReduceData,
169+
amrex::Long,
170+
int[4*n_lev]> reduce_data(reduce_op);
171+
using ReduceTuple = typename decltype(reduce_data)::Type;
172+
154173
// Count the total number of particles so only one resize is needed
155-
amrex::Long total_num_particles = amrex::Reduce::Sum<amrex::Long>(tile_box.numPts(),
156-
[=] AMREX_GPU_DEVICE (amrex::Long idx) noexcept
174+
reduce_op.eval(tile_box, reduce_data,
175+
[=] AMREX_GPU_DEVICE (int i, int j, int) -> ReduceTuple
157176
{
158-
auto [i,j,k] = tile_box.atOffset3d(idx).arr;
159-
160177
amrex::Long num_particles_cell = 0;
161178
for (int i_part=0; i_part<max_ppc; ++i_part)
162179
{
163-
amrex::Real r[2];
164-
bool do_init = false;
165-
ParticleUtil::get_position_unit_cell_fine(r, do_init, i_part,
166-
ppc_lev, fine_transition_cells,
167-
use_fine_patch ? arr_fine(i, j, comp_a) : 0);
180+
auto [do_init, x, y, density] = check_position(i, j, i_part);
168181

169-
if (!do_init) continue;
182+
if (do_init) {
183+
num_particles_cell += 1;
184+
}
185+
}
186+
const bool is_lev_0 = num_particles_cell > 0;
187+
const bool is_lev_1 = num_particles_cell > ppc_lev[0][0] * ppc_lev[0][1];
188+
const bool is_lev_2 = num_particles_cell > ppc_lev[1][0] * ppc_lev[1][1];
189+
return {
190+
num_particles_cell,
191+
is_lev_0 ? i : tile_box.bigEnd(0), is_lev_0 ? j : tile_box.bigEnd(1),
192+
is_lev_1 ? i : tile_box.bigEnd(0), is_lev_1 ? j : tile_box.bigEnd(1),
193+
is_lev_2 ? i : tile_box.bigEnd(0), is_lev_2 ? j : tile_box.bigEnd(1),
194+
is_lev_0 ? i : tile_box.smallEnd(0), is_lev_0 ? j : tile_box.smallEnd(1),
195+
is_lev_1 ? i : tile_box.smallEnd(0), is_lev_1 ? j : tile_box.smallEnd(1),
196+
is_lev_2 ? i : tile_box.smallEnd(0), is_lev_2 ? j : tile_box.smallEnd(1)
197+
};
198+
});
170199

171-
amrex::Real x = plo[0] + (i + r[0] + x_offset)*dx[0];
172-
amrex::Real y = plo[1] + (j + r[1] + y_offset)*dx[1];
200+
auto [np_tup, lo_tup, hi_tup] = amrex::TupleSplit<1, 2*n_lev, 2*n_lev>(reduce_data.value());
173201

174-
const amrex::Real rsq = x*x + y*y;
175-
if (x >= a_bounds.hi(0) || x < a_bounds.lo(0) ||
176-
y >= a_bounds.hi(1) || y < a_bounds.lo(1) ||
177-
rsq > radius_sq ||
178-
rsq < a_hollow_core_radius*a_hollow_core_radius ||
179-
density_func(x, y, c_t) <= min_density) continue;
202+
amrex::Long total_num_particles = amrex::get<0>(np_tup);
203+
const amrex::Long total_non_mirrored_particles = total_num_particles;
204+
auto lo_arr = amrex::tupleToArray(lo_tup);
205+
auto hi_arr = amrex::tupleToArray(hi_tup);
180206

181-
num_particles_cell += 1;
182-
}
183-
return num_particles_cell;
184-
});
207+
amrex::Array<amrex::Box, n_lev> lev_bounds;
208+
209+
#ifdef AMREX_USE_GPU
210+
// The loop over particles is outside the loop over cells
211+
// so that particles in the same cell are far apart.
212+
// This makes current deposition faster.
213+
constexpr int idx_x = 0;
214+
constexpr int idx_y = 1;
215+
constexpr int idx_ppc = 2;
216+
#else
217+
constexpr int idx_ppc = 0;
218+
constexpr int idx_x = 1;
219+
constexpr int idx_y = 2;
220+
#endif
221+
222+
for (int ilev=0; ilev < n_lev; ++ilev) {
223+
lev_bounds[ilev].setSmall(idx_x, lo_arr[2*ilev]);
224+
lev_bounds[ilev].setSmall(idx_y, lo_arr[2*ilev+1]);
225+
lev_bounds[ilev].setSmall(idx_ppc,
226+
ilev == 0 ? 0 : ppc_lev[ilev-1][0] * ppc_lev[ilev-1][1]);
227+
lev_bounds[ilev].setBig(idx_x, hi_arr[2*ilev]);
228+
lev_bounds[ilev].setBig(idx_y, hi_arr[2*ilev+1]);
229+
lev_bounds[ilev].setBig(idx_ppc, ppc_lev[ilev][0] * ppc_lev[ilev][1] - 1);
230+
}
185231

186232
if (m_do_symmetrize) {
187233
total_num_particles *= 4;
@@ -190,131 +236,76 @@ InitParticles (const amrex::RealVect& a_u_std,
190236
}
191237
}
192238

193-
auto& particles = GetParticles(0);
194-
auto& particle_tile = particles[std::make_pair(mfi.index(), mfi.LocalTileIndex())];
239+
auto& particle_tile = DefineAndReturnParticleTile(0, mfi);
195240

196-
auto old_size = particle_tile.size();
197-
const auto new_size = old_size + total_num_particles;
198-
particle_tile.resize(new_size);
241+
AMREX_ALWAYS_ASSERT(particle_tile.size() == 0);
242+
unsigned int current_size = 0;
243+
particle_tile.resize(total_num_particles);
199244

200245
auto ptd = particle_tile.getParticleTileData();
201246
const int init_ion_lev = m_init_ion_lev;
202247

203-
// The loop over particles is outside the loop over cells
204-
// so that particles in the same cell are far apart.
205-
// This makes current deposition faster.
206-
for (int i_part=0; i_part<max_ppc; ++i_part)
207-
{
208-
amrex::Gpu::DeviceVector<unsigned int> counts(tile_box.numPts(), 0);
209-
unsigned int* pcount = counts.dataPtr();
210-
211-
amrex::Gpu::DeviceVector<unsigned int> offsets(tile_box.numPts());
212-
unsigned int* poffset = offsets.dataPtr();
213-
214-
amrex::ParallelFor(tile_box,
215-
[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
216-
{
217-
amrex::Real r[2];
218-
bool do_init = false;
219-
ParticleUtil::get_position_unit_cell_fine(r, do_init, i_part,
220-
ppc_lev, fine_transition_cells,
221-
use_fine_patch ? arr_fine(i, j, comp_a) : 0);
222-
223-
if (!do_init) return;
224-
225-
amrex::Real x = plo[0] + (i + r[0] + x_offset)*dx[0];
226-
amrex::Real y = plo[1] + (j + r[1] + y_offset)*dx[1];
227-
228-
const amrex::Real rsq = x*x + y*y;
229-
if (x >= a_bounds.hi(0) || x < a_bounds.lo(0) ||
230-
y >= a_bounds.hi(1) || y < a_bounds.lo(1) ||
231-
rsq > radius_sq ||
232-
rsq < a_hollow_core_radius*a_hollow_core_radius ||
233-
density_func(x, y, c_t) <= min_density) return;
234-
235-
int ix = i - lo.x;
236-
int iy = j - lo.y;
237-
int iz = k - lo.z;
238-
int nx = hi.x-lo.x+1;
239-
int ny = hi.y-lo.y+1;
240-
int nz = hi.z-lo.z+1;
241-
unsigned int uix = amrex::min(nx-1,amrex::max(0,ix));
242-
unsigned int uiy = amrex::min(ny-1,amrex::max(0,iy));
243-
unsigned int uiz = amrex::min(nz-1,amrex::max(0,iz));
244-
245-
// Ordering of axes from fastest to slowest:
246-
// x
247-
// y
248-
// z (not used)
249-
// ppc
250-
unsigned int cellid = (uiz * ny + uiy) * nx + uix;
251-
252-
pcount[cellid] = 1;
253-
});
254-
255-
unsigned int num_to_add =
256-
amrex::Scan::ExclusiveSum(counts.size(), counts.data(), offsets.data());
257-
258-
if (num_to_add == 0) continue;
259-
260-
amrex::ParallelForRNG(tile_box,
261-
[=] AMREX_GPU_DEVICE (int i, int j, int k, const amrex::RandomEngine& engine) noexcept
262-
{
263-
int ix = i - lo.x;
264-
int iy = j - lo.y;
265-
int iz = k - lo.z;
266-
int nx = hi.x-lo.x+1;
267-
int ny = hi.y-lo.y+1;
268-
int nz = hi.z-lo.z+1;
269-
unsigned int uix = amrex::min(nx-1,amrex::max(0,ix));
270-
unsigned int uiy = amrex::min(ny-1,amrex::max(0,iy));
271-
unsigned int uiz = amrex::min(nz-1,amrex::max(0,iz));
272-
273-
unsigned int cellid = (uiz * ny + uiy) * nx + uix;
274-
275-
const amrex::Long pidx = poffset[cellid] - poffset[0] + old_size;
276-
277-
amrex::Real r[2] = {0.,0.};
278-
bool do_init = false;
279-
ParticleUtil::get_position_unit_cell_fine(r, do_init, i_part,
280-
ppc_lev, fine_transition_cells,
281-
use_fine_patch ? arr_fine(i, j, comp_a) : 0);
282-
283-
if (!do_init) return;
248+
for (int ilev=0; ilev < n_lev; ++ilev) {
249+
const amrex::Box bx = lev_bounds[ilev];
250+
if (!bx.ok()) {
251+
continue;
252+
}
253+
const amrex::BoxIndexer bxi{bx};
254+
255+
current_size += amrex::Scan::PrefixSum<unsigned int>(bx.numPts(),
256+
[=] AMREX_GPU_DEVICE (amrex::Long idx) -> unsigned int {
257+
const auto iv = bxi.intVect(idx);
258+
const int i = iv[idx_x];
259+
const int j = iv[idx_y];
260+
const int i_part = iv[idx_ppc];
261+
262+
auto [do_init, x, y, density] = check_position(i, j, i_part);
263+
return do_init;
264+
},
265+
[=] AMREX_GPU_DEVICE (amrex::Long idx, unsigned int pidx) {
266+
const auto iv = bxi.intVect(idx);
267+
const int i = iv[idx_x];
268+
const int j = iv[idx_y];
269+
const int i_part = iv[idx_ppc];
270+
271+
auto [do_init, x, y, density] = check_position(i, j, i_part);
272+
if (!do_init) {
273+
return;
274+
}
284275

285-
amrex::Real x = plo[0] + (i + r[0] + x_offset)*dx[0];
286-
amrex::Real y = plo[1] + (j + r[1] + y_offset)*dx[1];
276+
pidx += current_size;
277+
ptd.id(pidx) = 1; // plasma id is only used to distinguish between valid/invalid
278+
ptd.cpu(pidx) = 0; // level 0
279+
ptd.rdata(PlasmaIdx::x)[pidx] = x;
280+
ptd.rdata(PlasmaIdx::y)[pidx] = y;
281+
282+
if (use_fine_patch) {
283+
const int fine_loc = arr_fine(i, j, comp_a);
284+
if (fine_loc == 0) {
285+
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[0];
286+
} else if (fine_loc <= fine_transition_cells + 1) {
287+
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[1];
288+
} else {
289+
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[2];
290+
}
291+
} else {
292+
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[0];
293+
}
294+
},
295+
amrex::Scan::Type::exclusive,
296+
amrex::Scan::retSum);
297+
}
287298

288-
const amrex::Real density = density_func(x, y, c_t);
299+
AMREX_ALWAYS_ASSERT(total_non_mirrored_particles == current_size);
289300

290-
const amrex::Real rsq = x*x + y*y;
291-
if (x >= a_bounds.hi(0) || x < a_bounds.lo(0) ||
292-
y >= a_bounds.hi(1) || y < a_bounds.lo(1) ||
293-
rsq > radius_sq ||
294-
rsq < a_hollow_core_radius*a_hollow_core_radius ||
295-
density <= min_density) return;
301+
amrex::ParallelForRNG(current_size,
302+
[=] AMREX_GPU_DEVICE (unsigned int pidx, const amrex::RandomEngine& engine) {
303+
const amrex::Real x = ptd.rdata(PlasmaIdx::x)[pidx];
304+
const amrex::Real y = ptd.rdata(PlasmaIdx::y)[pidx];
296305

297306
amrex::Real u[3] = {0.,0.,0.};
298307
ParticleUtil::get_gaussian_random_momentum(u, a_u_mean, a_u_std, engine);
299308

300-
ptd.id(pidx) = 1; // plasma id is only used to distinguish between valid/invalid
301-
ptd.cpu(pidx) = 0; // level 0
302-
ptd.rdata(PlasmaIdx::x)[pidx] = x;
303-
ptd.rdata(PlasmaIdx::y)[pidx] = y;
304-
305-
if (use_fine_patch) {
306-
const int fine_loc = arr_fine(i, j, comp_a);
307-
if (fine_loc == 0) {
308-
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[0];
309-
} else if (fine_loc <= fine_transition_cells + 1) {
310-
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[1];
311-
} else {
312-
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[2];
313-
}
314-
} else {
315-
ptd.rdata(PlasmaIdx::w)[pidx] = density * scale_fac_lev[0];
316-
}
317-
318309
ptd.rdata(PlasmaIdx::ux)[pidx] = u[0];
319310
ptd.rdata(PlasmaIdx::uy)[pidx] = u[1];
320311
ptd.rdata(PlasmaIdx::psi)[pidx] = plasma_psi(u[0], u[1], u[2],
@@ -333,14 +324,11 @@ InitParticles (const amrex::RealVect& a_u_std,
333324
ptd.idata(PlasmaIdx::ion_lev)[pidx] = init_ion_lev;
334325
});
335326

336-
old_size += num_to_add;
337-
}
338327
if (m_do_symmetrize) {
339328

340329
const amrex::Real x_mid2 = (a_bounds.lo(0) + a_bounds.hi(0));
341330
const amrex::Real y_mid2 = (a_bounds.lo(1) + a_bounds.hi(1));
342-
const amrex::Long mirror_offset = total_num_particles/4;
343-
amrex::ParallelFor(mirror_offset,
331+
amrex::ParallelFor(total_non_mirrored_particles,
344332
[=] AMREX_GPU_DEVICE (amrex::Long pidx) noexcept
345333
{
346334
const amrex::Real x = ptd.rdata(PlasmaIdx::x)[pidx];
@@ -355,7 +343,7 @@ InitParticles (const amrex::RealVect& a_u_std,
355343

356344
HIPACE_LOOP_UNROLL
357345
for (int imirror=0; imirror<3; ++imirror) {
358-
const amrex::Long midx = (imirror+1)*mirror_offset +pidx;
346+
const amrex::Long midx = (imirror+1)*total_non_mirrored_particles + pidx;
359347

360348
ptd.id(midx) = 1; // plasma id is only used to distinguish between valid/invalid
361349
ptd.cpu(midx) = 0; // level 0
@@ -386,12 +374,6 @@ InitParticles (const amrex::RealVect& a_u_std,
386374
});
387375
}
388376
}
389-
#ifndef AMREX_USE_GPU
390-
// The index order for initializing plasma particles is optimized for GPU.
391-
// On CPU, especially with multiple particles per cell,
392-
// reordering particles by cell gives better performance for plasma deposition and push.
393-
SortParticlesByCell();
394-
#endif
395377
}
396378

397379
void

0 commit comments

Comments
 (0)