Skip to content

Commit 521d61c

Browse files
committed
Kahan summation for void fraction smearing
1 parent d7c0801 commit 521d61c

2 files changed

Lines changed: 52 additions & 20 deletions

File tree

src/simulation/m_bubbles_EL.fpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,10 @@ module m_bubbles_EL
7676
real(wp) :: Rmax_glb, Rmin_glb !< Maximum and minimum bubbe size in the local domain
7777
!< Projection of the lagrangian particles in the Eulerian framework
7878
type(scalar_field), dimension(:), allocatable :: q_beta
79+
type(scalar_field), dimension(:), allocatable :: kahan_comp !< Kahan compensation for q_beta accumulation
7980
integer :: q_beta_idx !< Size of the q_beta vector field
8081
81-
$:GPU_DECLARE(create='[Rmax_glb,Rmin_glb,q_beta,q_beta_idx]')
82+
$:GPU_DECLARE(create='[Rmax_glb,Rmin_glb,q_beta,kahan_comp,q_beta_idx]')
8283
8384
integer, parameter :: LAG_EVOL_ID = 11 ! File id for lag_bubbles_evol_*.dat
8485
integer, parameter :: LAG_STATS_ID = 12 ! File id for stats_lag_bubbles_*.dat
@@ -135,12 +136,17 @@ contains
135136
$:GPU_UPDATE(device='[lag_num_ts, q_beta_idx]')
136137
137138
@:ALLOCATE(q_beta(1:q_beta_idx))
139+
@:ALLOCATE(kahan_comp(1:q_beta_idx))
138140
139141
do i = 1, q_beta_idx
140142
@:ALLOCATE(q_beta(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
141143
idwbuff(2)%beg:idwbuff(2)%end, &
142144
idwbuff(3)%beg:idwbuff(3)%end))
143145
@:ACC_SETUP_SFs(q_beta(i))
146+
@:ALLOCATE(kahan_comp(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
147+
idwbuff(2)%beg:idwbuff(2)%end, &
148+
idwbuff(3)%beg:idwbuff(3)%end))
149+
@:ACC_SETUP_SFs(kahan_comp(i))
144150
end do
145151
146152
! Allocating space for lagrangian variables
@@ -969,6 +975,7 @@ contains
969975
do k = idwbuff(2)%beg, idwbuff(2)%end
970976
do j = idwbuff(1)%beg, idwbuff(1)%end
971977
q_beta(i)%sf(j, k, l) = 0._wp
978+
kahan_comp(i)%sf(j, k, l) = 0._wp
972979
end do
973980
end do
974981
end do
@@ -979,7 +986,7 @@ contains
979986
call s_build_cell_list(n_el_bubs_loc, mtn_s)
980987

981988
call s_smoothfunction(n_el_bubs_loc, intfc_rad, intfc_vel, &
982-
mtn_s, mtn_pos, q_beta)
989+
mtn_s, mtn_pos, q_beta, kahan_comp)
983990

984991
! DEBUG: checksum after Gaussian smearing (before communication)
985992
! all_cells = sum over entire grid (interior + buffer) per rank, then MPI_SUM.
@@ -2337,8 +2344,10 @@ contains
23372344

23382345
do i = 1, q_beta_idx
23392346
@:DEALLOCATE(q_beta(i)%sf)
2347+
@:DEALLOCATE(kahan_comp(i)%sf)
23402348
end do
23412349
@:DEALLOCATE(q_beta)
2350+
@:DEALLOCATE(kahan_comp)
23422351

23432352
!Deallocating space
23442353
@:DEALLOCATE(lag_id)

src/simulation/m_bubbles_EL_kernels.fpp

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,19 @@ contains
3838
!! @param lbk_s Computational coordinates of the bubbles
3939
!! @param lbk_pos Spatial coordinates of the bubbles
4040
!! @param updatedvar Eulerian variable to be updated
41-
subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar)
41+
subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp)
4242

4343
integer, intent(in) :: nBubs
4444
real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos
4545
real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel
4646
type(scalar_field), dimension(:), intent(inout) :: updatedvar
47+
type(scalar_field), dimension(:), intent(inout) :: kcomp
4748

4849
smoothfunc:select case(lag_params%smooth_type)
4950
case (1)
50-
call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar)
51+
call s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp)
5152
case (2)
52-
call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar)
53+
call s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp)
5354
end select smoothfunc
5455

5556
end subroutine s_smoothfunction
@@ -118,18 +119,20 @@ contains
118119
!> Cell-centric delta-function smearing using the cell list (no GPU atomics).
119120
!! Each bubble only affects the cell it resides in. The outer GPU loop
120121
!! iterates over interior cells and sums contributions from resident bubbles.
121-
subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar)
122+
subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar, kcomp)
122123

123124
integer, intent(in) :: nBubs
124125
real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s
125126
real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel
126127
type(scalar_field), dimension(:), intent(inout) :: updatedvar
128+
type(scalar_field), dimension(:), intent(inout) :: kcomp
127129

128130
real(wp) :: strength_vel, strength_vol
129131
real(wp) :: volpart, Vol
132+
real(wp) :: y_kahan, t_kahan
130133
integer :: i, j, k, lb, bub_idx
131134

132-
$:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,lb,bub_idx,volpart,Vol,strength_vel,strength_vol]')
135+
$:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,lb,bub_idx,volpart,Vol,strength_vel,strength_vol,y_kahan,t_kahan]')
133136
do k = 0, p
134137
do j = 0, n
135138
do i = 0, m
@@ -153,16 +156,24 @@ contains
153156
strength_vol = volpart
154157
strength_vel = 4._wp*pi*lbk_rad(bub_idx, 2)**2._wp*lbk_vel(bub_idx, 2)
155158

156-
! Update void fraction field — no atomics needed
157-
updatedvar(1)%sf(i, j, k) = updatedvar(1)%sf(i, j, k) + real(strength_vol/Vol, kind=stp)
159+
! Kahan summation for void fraction
160+
y_kahan = real(strength_vol/Vol, kind=wp) - kcomp(1)%sf(i, j, k)
161+
t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan
162+
kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan
163+
updatedvar(1)%sf(i, j, k) = t_kahan
158164

159-
! Update time derivative of void fraction
160-
updatedvar(2)%sf(i, j, k) = updatedvar(2)%sf(i, j, k) + real(strength_vel/Vol, kind=stp)
165+
! Kahan summation for time derivative of void fraction
166+
y_kahan = real(strength_vel/Vol, kind=wp) - kcomp(2)%sf(i, j, k)
167+
t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan
168+
kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan
169+
updatedvar(2)%sf(i, j, k) = t_kahan
161170

162171
! Product of two smeared functions
163172
if (lag_params%cluster_type >= 4) then
164-
updatedvar(5)%sf(i, j, k) = updatedvar(5)%sf(i, j, k) + &
165-
real((strength_vol*strength_vel)/Vol, kind=stp)
173+
y_kahan = real((strength_vol*strength_vel)/Vol, kind=wp) - kcomp(5)%sf(i, j, k)
174+
t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan
175+
kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan
176+
updatedvar(5)%sf(i, j, k) = t_kahan
166177
end if
167178
end do
168179

@@ -176,18 +187,20 @@ contains
176187
!> Cell-centric gaussian smearing using the cell list (no GPU atomics).
177188
!! Each grid cell accumulates contributions from nearby bubbles looked up
178189
!! via cell_list_start/count/idx.
179-
subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar)
190+
subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar, kcomp)
180191

181192
integer, intent(in) :: nBubs
182193
real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos
183194
real(wp), dimension(1:lag_params%nBubs_glb, 1:2), intent(in) :: lbk_rad, lbk_vel
184195
type(scalar_field), dimension(:), intent(inout) :: updatedvar
196+
type(scalar_field), dimension(:), intent(inout) :: kcomp
185197

186198
real(wp), dimension(3) :: center, nodecoord, s_coord
187199
integer, dimension(3) :: cell, cellijk
188200
real(wp) :: stddsv, volpart
189201
real(wp) :: strength_vel, strength_vol
190202
real(wp) :: func, func2
203+
real(wp) :: y_kahan, t_kahan
191204
integer :: i, j, k, di, dj, dk, lb, bub_idx
192205
integer :: di_beg, di_end, dj_beg, dj_end, dk_beg, dk_end
193206
integer :: smear_x_beg, smear_x_end
@@ -203,7 +216,7 @@ contains
203216
smear_z_end = merge(p + mapCells + 1, p, p > 0)
204217

205218
$:GPU_PARALLEL_LOOP(collapse=3, &
206-
& private='[i,j,k,di,dj,dk,lb,bub_idx,center,nodecoord,s_coord,cell,cellijk,stddsv,volpart,strength_vel,strength_vol,func,func2,di_beg,di_end,dj_beg,dj_end,dk_beg,dk_end]', &
219+
& private='[i,j,k,di,dj,dk,lb,bub_idx,center,nodecoord,s_coord,cell,cellijk,stddsv,volpart,strength_vel,strength_vol,func,func2,y_kahan,t_kahan,di_beg,di_end,dj_beg,dj_end,dk_beg,dk_end]', &
207220
& copyin='[smear_x_beg,smear_x_end,smear_y_beg,smear_y_end,smear_z_beg,smear_z_end]')
208221
do k = smear_z_beg, smear_z_end
209222
do j = smear_y_beg, smear_y_end
@@ -253,14 +266,24 @@ contains
253266

254267
call s_applygaussian(center, cellijk, nodecoord, stddsv, 0._wp, func)
255268

256-
! Accumulate — no atomics needed (each thread owns its (i,j,k))
257-
updatedvar(1)%sf(i, j, k) = updatedvar(1)%sf(i, j, k) + real(func*strength_vol, kind=stp)
258-
updatedvar(2)%sf(i, j, k) = updatedvar(2)%sf(i, j, k) + real(func*strength_vel, kind=stp)
269+
! Kahan summation for void fraction
270+
y_kahan = real(func*strength_vol, kind=wp) - kcomp(1)%sf(i, j, k)
271+
t_kahan = updatedvar(1)%sf(i, j, k) + y_kahan
272+
kcomp(1)%sf(i, j, k) = (t_kahan - updatedvar(1)%sf(i, j, k)) - y_kahan
273+
updatedvar(1)%sf(i, j, k) = t_kahan
274+
275+
! Kahan summation for time derivative of void fraction
276+
y_kahan = real(func*strength_vel, kind=wp) - kcomp(2)%sf(i, j, k)
277+
t_kahan = updatedvar(2)%sf(i, j, k) + y_kahan
278+
kcomp(2)%sf(i, j, k) = (t_kahan - updatedvar(2)%sf(i, j, k)) - y_kahan
279+
updatedvar(2)%sf(i, j, k) = t_kahan
259280

260281
if (lag_params%cluster_type >= 4) then
261282
call s_applygaussian(center, cellijk, nodecoord, stddsv, 1._wp, func2)
262-
updatedvar(5)%sf(i, j, k) = updatedvar(5)%sf(i, j, k) + &
263-
real(func2*strength_vol*strength_vel, kind=stp)
283+
y_kahan = real(func2*strength_vol*strength_vel, kind=wp) - kcomp(5)%sf(i, j, k)
284+
t_kahan = updatedvar(5)%sf(i, j, k) + y_kahan
285+
kcomp(5)%sf(i, j, k) = (t_kahan - updatedvar(5)%sf(i, j, k)) - y_kahan
286+
updatedvar(5)%sf(i, j, k) = t_kahan
264287
end if
265288

266289
end do

0 commit comments

Comments
 (0)