@@ -115,11 +115,15 @@ end
115115 h = (h_a + h_b) / 2
116116
117117 rho_mean = (rho_a + rho_b) / 2
118-
119118 (; alpha, beta, epsilon) = viscosity
120- mu = h * vr / (distance^ 2 + epsilon * h^ 2 )
119+
120+ # Since this is one of the most performance critical functions, using fast divisions
121+ # here gives a significant speedup on GPUs.
122+ # See the docs page "Development" for more details on `div_fast`.
123+ mu = div_fast (h * vr, distance^ 2 + epsilon * h^ 2 )
121124 c = sound_speed
122- dv_viscosity = m_b * (alpha * c * mu + beta * mu^ 2 ) / rho_mean * grad_kernel
125+ # TODO why is m_b inside the `div_fast` faster on H100 than `m_b * div_fast(...)`?
126+ dv_viscosity = div_fast (m_b * alpha * c * mu + m_b * beta * mu^ 2 , rho_mean) * grad_kernel
123127 dv_particle[] += viscosity_correction * dv_viscosity
124128 end
125129
182186 mu_a = nu_a * rho_a
183187 mu_b = nu_b * rho_b
184188
185- dv_particle[] += viscosity_correction * m_b * (mu_a + mu_b) /
186- (rho_a * rho_b) * dot (pos_diff, grad_kernel) /
187- (distance^ 2 + epsilon * h^ 2 ) * v_diff
189+ # Since this is one of the most performance critical functions, using fast divisions
190+ # here gives a significant speedup on GPUs.
191+ # See the docs page "Development" for more details on `div_fast`.
192+ dv_viscosity = div_fast (m_b * (mu_a + mu_b) * dot (pos_diff, grad_kernel),
193+ rho_a * rho_b * (distance^ 2 + epsilon * h^ 2 )) * v_diff
194+ dv_particle[] += viscosity_correction * dv_viscosity
188195
189196 return dv_particle
190197end
@@ -209,19 +216,22 @@ struct ViscosityAdami{ELTYPE}
209216 end
210217end
211218
212- @inline function adami_viscosity_force! (dv_particle, smoothing_length_average , pos_diff,
213- distance, grad_kernel, m_a, m_b, rho_a, rho_b,
219+ @inline function adami_viscosity_force! (dv_particle, h , pos_diff, distance ,
220+ grad_kernel, m_a, m_b, rho_a, rho_b,
214221 v_diff, nu_a, nu_b, epsilon,
215222 viscosity_correction= 1 )
216223 eta_a = nu_a * rho_a
217224 eta_b = nu_b * rho_b
218225
219- eta_tilde = 2 * (eta_a * eta_b) / (eta_a + eta_b)
220-
221- tmp = eta_tilde / (distance^ 2 + epsilon * smoothing_length_average^ 2 )
226+ # Since this is one of the most performance critical functions, using fast divisions
227+ # here gives a significant speedup on GPUs.
228+ # See the docs page "Development" for more details on `div_fast`.
229+ volume_a = div_fast (m_a, rho_a)
230+ volume_b = div_fast (m_b, rho_b)
222231
223- volume_a = m_a / rho_a
224- volume_b = m_b / rho_b
232+ # eta_tilde = 2 * (eta_a * eta_b) / (eta_a + eta_b)
233+ # tmp = eta_tilde / (distance^2 + epsilon * h^2) / m_a
234+ tmp = div_fast (2 * eta_a * eta_b, (eta_a + eta_b) * (distance^ 2 + epsilon * h^ 2 ) * m_a)
225235
226236 # This formulation was introduced by Hu and Adams (2006). https://doi.org/10.1016/j.jcp.2005.09.001
227237 # They argued that the formulation is more flexible because of the possibility to formulate
232242 # Because when using this formulation for the pressure acceleration, it is not
233243 # energy conserving.
234244 # See issue: https://github.com/trixi-framework/TrixiParticles.jl/issues/394
235- visc = (volume_a^ 2 + volume_b^ 2 ) * dot (grad_kernel, pos_diff) * tmp / m_a
245+ visc = (volume_a^ 2 + volume_b^ 2 ) * dot (grad_kernel, pos_diff) * tmp
236246
237- dv_particle[] += viscosity_correction * visc . * v_diff
247+ dv_particle[] += viscosity_correction * visc * v_diff
238248
239249 return dv_particle
240250end
375385 # and then the Smagorinsky eddy viscosity:
376386 # ν_SGS = (C_S * h̄)^2 * S_mag.
377387 #
378- S_mag = norm (v_diff) / (distance + epsilon)
388+ # Since this is one of the most performance critical functions, using fast divisions
389+ # here gives a significant speedup on GPUs.
390+ # See the docs page "Development" for more details on `div_fast`.
391+ S_mag = div_fast (sqrt (dot (v_diff, v_diff)), (distance + epsilon))
379392 nu_SGS = (viscosity. C_S * smoothing_length_average)^ 2 * S_mag
380393
381394 # Effective kinematic viscosity is the sum of the standard and SGS parts.
459472
460473 smoothing_length_particle = smoothing_length (particle_system, particle)
461474 smoothing_length_neighbor = smoothing_length (neighbor_system, neighbor)
462- smoothing_length_average = (smoothing_length_particle + smoothing_length_neighbor) / 2
475+ h = (smoothing_length_particle + smoothing_length_neighbor) / 2
463476
464477 nu_a = kinematic_viscosity (particle_system,
465478 viscosity_model (neighbor_system, particle_system),
474487
475488 # SGS part: Compute the subgrid-scale eddy viscosity.
476489 # See comments above for `ViscosityAdamiSGS`.
477- S_mag = norm (v_diff) / (distance + epsilon)
478- nu_SGS = (viscosity. C_S * smoothing_length_average)^ 2 * S_mag
490+ # Since this is one of the most performance critical functions, using fast divisions
491+ # here gives a significant speedup on GPUs.
492+ # See the docs page "Development" for more details on `div_fast`.
493+ S_mag = div_fast (sqrt (dot (v_diff, v_diff)), (distance + epsilon))
494+ nu_SGS = (viscosity. C_S * h)^ 2 * S_mag
479495
480496 # Effective viscosities include the SGS term.
481497 nu_a_eff = nu_a + nu_SGS
485501 mu_a = nu_a_eff * rho_a
486502 mu_b = nu_b_eff * rho_b
487503
488- dv_particle[] += viscosity_correction * m_b * (mu_a + mu_b) /
489- (rho_a * rho_b) * (dot (pos_diff, grad_kernel)) /
490- (distance^ 2 + epsilon * smoothing_length_average^ 2 ) * v_diff
504+ # Since this is one of the most performance critical functions, using fast divisions
505+ # here gives a significant speedup on GPUs.
506+ # See the docs page "Development" for more details on `div_fast`.
507+ dv_viscosity = div_fast (m_b * (mu_a + mu_b) * dot (pos_diff, grad_kernel),
508+ rho_a * rho_b * (distance^ 2 + epsilon * h^ 2 )) * v_diff
509+ dv_particle[] += viscosity_correction * dv_viscosity
491510
492511 return dv_particle
493512end
546565 v_b = viscous_velocity (v_neighbor_system, neighbor_system, neighbor, v_b)
547566 v_diff = v_a - v_b
548567
549- gamma_dot = norm (v_diff) / (distance + epsilon)
568+ # Since this is one of the most performance critical functions, using fast divisions
569+ # here gives a significant speedup on GPUs.
570+ # See the docs page "Development" for more details on `div_fast`.
571+ gamma_dot = div_fast (sqrt (dot (v_diff, v_diff)), (distance + epsilon))
550572
551573 # Compute Carreau-Yasuda effective viscosity
552574 (; nu0, nu_inf, lambda, a, n) = viscosity
0 commit comments